diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000000..a4d3b96eb34
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,23 @@
+# More info: https://editorconfig.org
+
+# top-most EditorConfig file
+root = true
+
+# Use spaces with an indent size of 4 everywhere.
+# Also add a newline at the end of every file.
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+indent_style = space
+indent_size = 4
+
+# YAML and Markdown files use 2-space indentation for now.
+[*.{yml,md}]
+indent_style = space
+indent_size = 2
+
+# Makefiles require tabs
+[Makefile]
+indent_style = tab
diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml
new file mode 100644
index 00000000000..59935e07273
--- /dev/null
+++ b/.github/workflows/osx.yml
@@ -0,0 +1,35 @@
+name: OSX-build
+
+on: [push]
+
+jobs:
+  osx-clang-omp:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared"}
+        - {shared: "OFF", build_type: "Release", name: "omp/release/static"}
+    name: ${{ matrix.config.name }}
+    runs-on: [macos-latest]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: setup
+      run: brew install libomp
+    - name: info
+      run: |
+        g++ -v
+        cmake --version
+    - name: configure
+      run: |
+        mkdir build
+        cd build
+        cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }}
+        make -j8
+        ctest -j10 --output-on-failure
+    - name: install
+      run: |
+        cd build
+        make install
+        make test_install
diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
new file mode 100644
index 00000000000..66a81dda312
--- /dev/null
+++ b/.github/workflows/windows-build.yml
@@ -0,0 +1,158 @@
+name: Windows-build
+
+on: [push]
+
+jobs:
+  windows_cuda:
+    name: cuda102/release/shared (only compile)
+    runs-on: [windows-latest]
+    steps:
+    - uses: actions/checkout@v2
+    - name: setup
+      run: |
+        choco install cuda --version=10.2.89.20191206 -y
+    - name: configure
+      run: |
+        $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.."
+        Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+        refreshenv
+        mkdir build
+        cd build
+        $env:PATH="$pwd\windows_shared_library;$env:PATH"
+        cmake -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF ..
+        cmake --build . -j4 --config Release
+
+  windows_ref:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+        - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
+        - {shared: "OFF", build_type: "Release", name: "reference/release/static"}
+        # Debug static needs too much storage
+        # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
+    name: msvc/${{ matrix.config.name }}
+    runs-on: [windows-latest]
+    steps:
+    - uses: actions/checkout@v2
+    - name: shared_env
+      if: matrix.config.shared == 'ON'
+      run: |
+        echo "::set-env name=origin_path::$env:PATH"
+        echo "::add-path::$pwd\build\windows_shared_library"
+    - name: debug_env
+      if: matrix.config.build_type == 'Debug'
+      run: |
+        echo "::set-env name=CXXFLAGS::/bigobj"
+    - name: configure
+      run: |
+        mkdir build
+        cd build
+        cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
+        cmake --build . -j4 --config ${{ matrix.config.build_type }}
+        ctest . -C ${{ matrix.config.build_type }} --output-on-failure
+    - name: install_shared_env
+      if: matrix.config.shared == 'ON'
+      run: |
+        echo "::set-env name=PATH::C:\Program Files (x86)\Ginkgo\lib;$env:origin_path"
+    - name: install
+      run: |
+        cd build
+        cmake --install . --config ${{ matrix.config.build_type }}
+        cmake --build . --target test_install --config ${{ matrix.config.build_type }}
+  windows_mingw:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", cflags: "-O1"}
+        - {shared: "OFF", build_type: "Release", name: "omp/release/static", cflags: ""}
+    name: mingw/${{ matrix.config.name }}
+    runs-on: [windows-latest]
+    steps:
+    - uses: actions/checkout@v2
+    - name: shared_env
+      if: matrix.config.shared == 'ON'
+      run: |
+        echo "::set-env name=origin_path::$env:PATH"
+        echo "::add-path::$pwd\build\windows_shared_library"
+    - name: debug_env
+      if: matrix.config.build_type == 'Debug'
+      run: |
+        echo "::set-env name=CXXFLAGS::-Wa,-mbig-obj"
+    - name: configure
+    # Use cmd so the Git directories can be stripped from PATH with %PATH:old=new% substitution
+      run: |
+        set PATH=%PATH:C:\Program Files\Git\bin;=%
+        set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
+        bcdedit /set IncreaseUserVa 3072
+        editbin /LARGEADDRESSAWARE "C:\Program Files\Git\mingw64\bin\cc1plus.exe"
+        mkdir build
+        cd build
+        cmake -G "MinGW Makefiles" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} ..
+        cmake --build . -j4
+        ctest . --output-on-failure
+      shell: cmd
+    - name: install_shared_env
+      if: matrix.config.shared == 'ON'
+      run: |
+        echo "::set-env name=PATH::C:\Program Files (x86)\Ginkgo\lib;$env:origin_path"
+    - name: install
+      run: |
+        set PATH=%PATH:C:\Program Files\Git\bin;=%
+        set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
+        cd build
+        cmake --install .
+        cmake --build . --target test_install
+      shell: cmd
+
+  windows_cygwin:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", cflags: "-O1"}
+        - {shared: "OFF", build_type: "Release", name: "omp/release/static", cflags: ""}
+    name: cygwin/${{ matrix.config.name }}
+    runs-on: [windows-latest]
+    steps:
+    - run: git config --global core.autocrlf input
+    - uses: actions/checkout@v2
+    - name: setup
+      run: |
+        choco install cygwin -y
+        choco install cyg-get -y
+        cyg-get cmake make gcc-g++ git
+    - name: shared_static_env
+      run: |
+        echo "::set-env name=shared_ON_path::;$pwd\build\windows_shared_library"
+        echo "::set-env name=shared_OFF_path::"
+    - name: debug_env
+      if: matrix.config.build_type == 'Debug'
+      run: |
+        echo "::set-env name=CXXFLAGS::-Wa,-mbig-obj"
+    - name: configure
+      run: |
+        path C:\tools\cygwin\bin%shared_${{ matrix.config.shared }}_path%
+        mkdir build
+        cd build
+        bash -c "cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .."
+        bash -c "make -j4"
+        bash -c "ctest . --output-on-failure"
+      shell: cmd
+    - name: install_shared
+      if: matrix.config.shared == 'ON'
+      run: |
+        path C:\tools\cygwin\bin
+        cd build
+        bash -c "make install"
+        bash -c "export PATH=/usr/local/lib:$PATH && make test_install"
+      shell: cmd
+    - name: install_static
+      if: matrix.config.shared == 'OFF'
+      run: |
+        path C:\tools\cygwin\bin
+        cd build
+        bash -c "make install"
+        bash -c "make test_install"
+      shell: cmd
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index eaf638e04d3..008e88c45ed 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,5 +1,3 @@
-image: localhost:5000/gko-cuda100-gnu7-llvm60
-
 stages:
   - sync
   - build
@@ -15,22 +13,24 @@ stages:
 # Templates with reasonable defaults for builds and tests
 .variables_template: &default_variables
   BENCHMARK_SERVER: "FINECI"
-  C_COMPILER: gcc
-  CXX_COMPILER: g++
-  BUILD_TYPE: Debug
+  C_COMPILER: "gcc"
+  CXX_COMPILER: "g++"
+  CUDA_COMPILER: "nvcc"
+  BUILD_TYPE: "Debug"
   BUILD_SHARED_LIBS: "ON"
   BUILD_REFERENCE: "ON"
   BUILD_OMP: "OFF"
   BUILD_CUDA: "OFF"
+  BUILD_HIP: "OFF"
   CXX_FLAGS: ""
   EXTRA_CMAKE_FLAGS: ""
 
 .before_script_template: &default_before_script
-  - export OMP_NUM_THREADS=4
+  - export NUM_CORES=${CI_PARALLELISM}
+  - export OMP_NUM_THREADS=${NUM_CORES}
   - export CUDA_VISIBLE_DEVICES=0
 
 .before_script_git_template: &git_before_script
-    # set up identities
   - eval $(ssh-agent -s)
   - echo "${BOT_KEY}" | tr -d '\r' | ssh-add - >/dev/null
   - mkdir -p ~/.ssh
@@ -45,25 +45,57 @@ stages:
   before_script: *default_before_script
   script:
     - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME}
+    - if [ -n "${CUDA_ARCH}" ]; then
+      CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH};
+      CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER});
+      fi
     - cmake ${CI_PROJECT_DIR}
+        -GNinja
         -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER}
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}"
-        -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${EXTRA_CMAKE_FLAGS}
+        -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+        -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
+        ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR}
         -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE}
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
+        -DGINKGO_BUILD_HIP=${BUILD_HIP}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-    - make -j$(grep "core id" /proc/cpuinfo | sort -u | wc -l)
+    - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT}
+  dependencies: []
+  except:
+      - schedules
+
+.build_template: &default_build_with_test
+  stage: build
+  variables: *default_variables
+  before_script: *default_before_script
+  script:
+    - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME}
+    - if [ -n "${CUDA_ARCH}" ]; then
+      CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH};
+      CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER});
+      fi
+    - cmake ${CI_PROJECT_DIR}
+        -GNinja
+        -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER}
+        -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+        -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
+        ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR}
+        -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE}
+        -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
+        -DGINKGO_BUILD_HIP=${BUILD_HIP}
+        -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
+    - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install
     - |
         (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
     - ctest -V
-    - make install
-    - make test_install
+    - ninja test_install
   dependencies: []
   except:
       - schedules
 
 sync:
   stage: sync
+  image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     GIT_STRATEGY: none
     PRIVATE_REPO: git@gitlab.com:ginkgo-project/ginkgo.git
@@ -80,287 +112,444 @@ sync:
     - develop
   except:
     - schedules
+  tags:
+    - private_ci
+    - cpu
 
 
 # Build jobs
-build/cuda90/gcc/cuda/debug/shared:
-  <<: *default_build
+build/cuda90/gcc/all/debug/shared:
+  <<: *default_build_with_test
   image: localhost:5000/gko-cuda90-gnu5-llvm39
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
-    EXTRA_CMAKE_FLAGS: &cuda_flags
-      "-DGINKGO_CUDA_ARCHITECTURES=35 -DCMAKE_CUDA_HOST_COMPILER=${CXX_COMPILER}"
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Debug"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-build/cuda90/clang/cuda/release/static:
-  <<: *default_build
+build/cuda90/clang/all/release/static:
+  <<: *default_build_with_test
   image: localhost:5000/gko-cuda90-gnu5-llvm39
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Release
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
 # cuda 9.1 and friends
-build/cuda91/gcc/cuda/debug/static:
-  <<: *default_build
+build/cuda91/gcc/all/debug/static:
+  <<: *default_build_with_test
   image: localhost:5000/gko-cuda91-gnu6-llvm40
   variables:
     <<: *default_variables
+    BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Debug"
     BUILD_SHARED_LIBS: "OFF"
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-build/cuda91/clang/cuda/release/shared:
-  <<: *default_build
+build/cuda91/clang/all/release/shared:
+  <<: *default_build_with_test
   image: localhost:5000/gko-cuda91-gnu6-llvm40
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Release
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-build/cuda91/intel/cuda/debug/shared:
-  <<: *default_build
-  image: localhost:5000/gko-cuda91-gnu6-llvm40
-  variables:
-    <<: *default_variables
-    C_COMPILER: icc
-    CXX_COMPILER: icpc
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
-    EXTRA_CMAKE_FLAGS: *cuda_flags
-  tags:
-    - cuda
-    - gpu
 
 # cuda 9.2 and friends
-build/cuda92/gcc/cuda/release/shared:
-  <<: *default_build
-  image: localhost:5000/gko-cuda92-gnu7-llvm50
+build/cuda92/gcc/all/release/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Release
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-build/cuda92/clang/cuda/debug/static:
-  <<: *default_build
-  image: localhost:5000/gko-cuda92-gnu7-llvm50
+build/cuda92/clang/all/debug/static:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
+    BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Debug"
     BUILD_SHARED_LIBS: "OFF"
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
 build/cuda92/intel/cuda/release/static:
-  <<: *default_build
-  image: localhost:5000/gko-cuda92-gnu7-llvm50
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017
   variables:
     <<: *default_variables
-    C_COMPILER: icc
-    CXX_COMPILER: icpc
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Release
+    BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
 # cuda 10.0 and friends
-build/cuda100/gcc/cuda/debug/shared:
-  <<: *default_build
-  image: localhost:5000/gko-cuda100-gnu7-llvm60
+build/cuda100/gcc/all/debug/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda100-gnu7-llvm60-intel2018
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Debug"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-build/cuda100/clang/cuda/release/static:
-  <<: *default_build
-  image: localhost:5000/gko-cuda100-gnu7-llvm60
+build/cuda100/clang/all/release/static:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda100-gnu7-llvm60-intel2018
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Release
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
 build/cuda100/intel/cuda/release/shared:
-  <<: *default_build
-  image: localhost:5000/gko-cuda100-gnu7-llvm60
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda100-gnu7-llvm60-intel2018
   variables:
     <<: *default_variables
-    C_COMPILER: icc
-    CXX_COMPILER: icpc
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Release
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_TYPE: "Release"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
 # cuda 10.1 and friends
-build/cuda101/gcc/cuda/debug/shared:
-  <<: *default_build
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+build/cuda101/gcc/all/debug/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Debug"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-build/cuda101/clang/cuda/release/static:
-  <<: *default_build
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+build/cuda101/clang/all/release/static:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OFF"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
 build/cuda101/intel/cuda/debug/static:
-  <<: *default_build
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
-    C_COMPILER: icc
-    CXX_COMPILER: icpc
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: Debug
-    EXTRA_CMAKE_FLAGS: *cuda_flags
+    BUILD_TYPE: "Debug"
+    BUILD_SHARED_LIBS: "OFF"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
+# clang-cuda with cuda 10.1 and friends
+build/clang-cuda101/gcc/all/release/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
+  variables:
+    <<: *default_variables
+    CUDA_COMPILER: "clang++"
+    BUILD_OMP: "ON"
+    BUILD_CUDA: "ON"
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - cuda
+    - gpu
+
+build/clang-cuda101/clang/cuda/debug/static:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
+  variables:
+    <<: *default_variables
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
+    CUDA_COMPILER: "clang++"
+    BUILD_OMP: "ON"
+    BUILD_CUDA: "ON"
+    BUILD_TYPE: "Debug"
+    BUILD_SHARED_LIBS: "OFF"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - cuda
+    - gpu
+
+# HIP AMD
+build/amd/gcc/hip/debug/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-amd-gnu8-llvm7
+  variables:
+    <<: *default_variables
+    BUILD_OMP: "ON"
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Debug"
+  only:
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - amd
+    - gpu
+
+build/amd/clang/hip/release/static:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-amd-gnu8-llvm7
+  variables:
+    <<: *default_variables
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
+    BUILD_OMP: "ON"
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OFF"
+  only:
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - amd
+    - gpu
+
 # no cuda but latest gcc and clang
 build/nocuda/gcc/core/debug/static:
-  <<: *default_build
+  <<: *default_build_with_test
   image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
     BUILD_REFERENCE: "OFF"
-    BUILD_TYPE: Debug
+    BUILD_TYPE: "Debug"
     BUILD_SHARED_LIBS: "OFF"
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cpu
 
 build/nocuda/clang/core/release/shared:
-  <<: *default_build
+  <<: *default_build_with_test
   image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_REFERENCE: "OFF"
-    BUILD_TYPE: Release
+    BUILD_TYPE: "Release"
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cpu
 
 build/nocuda/intel/core/debug/shared:
-  <<: *default_build
-  image: localhost:5000/gko-nocuda-gnu8-llvm70
+  <<: *default_build_with_test
+  image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
-    C_COMPILER: icc
-    CXX_COMPILER: icpc
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
     BUILD_REFERENCE: "OFF"
-    BUILD_TYPE: Debug
+    BUILD_TYPE: "Debug"
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
+    - cuda
     - cpu
 
 build/nocuda/gcc/omp/release/shared:
-  <<: *default_build
+  <<: *default_build_with_test
   image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
-    BUILD_TYPE: Release
+    BUILD_TYPE: "Release"
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cpu
 
 build/nocuda/clang/omp/debug/static:
-  <<: *default_build
+  <<: *default_build_with_test
   image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
-    C_COMPILER: clang
-    CXX_COMPILER: clang++
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
-    BUILD_TYPE: Debug
+    BUILD_TYPE: "Debug"
     BUILD_SHARED_LIBS: "OFF"
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cpu
 
 build/nocuda/intel/omp/release/static:
-  <<: *default_build
-  image: localhost:5000/gko-nocuda-gnu8-llvm70
+  <<: *default_build_with_test
+  image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
-    C_COMPILER: icc
-    CXX_COMPILER: icpc
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
     BUILD_OMP: "ON"
-    BUILD_TYPE: Release
+    BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
+  only:
+    variables:
+      - $RUN_CI_TAG
   tags:
+    - private_ci
+    - cuda
     - cpu
 
 
@@ -368,15 +557,20 @@ build/nocuda/intel/omp/release/static:
 warnings:
   <<: *default_build
   stage: code_quality
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
+    BUILD_HIP: "ON"
     CXX_FLAGS: "-Werror=pedantic -pedantic-errors"
+  only:
+    variables:
+      - $RUN_CI_TAG
   dependencies: []
   allow_failure: yes
   tags:
+    - private_ci
     - cuda
     - gpu
 
@@ -384,15 +578,20 @@ warnings:
 no-circular-deps:
   <<: *default_build
   stage: code_quality
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
+    BUILD_HIP: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_CHECK_CIRCULAR_DEPS=on'
+  only:
+    variables:
+      - $RUN_CI_TAG
   dependencies: []
   allow_failure: no
   tags:
+    - private_ci
     - cuda
     - gpu
 
@@ -400,30 +599,40 @@ no-circular-deps:
 clang-tidy:
   <<: *default_build
   stage: code_quality
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
+    BUILD_HIP: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_CLANG_TIDY=ON'
+  only:
+    variables:
+      - $RUN_CI_TAG
   dependencies: []
   allow_failure: yes
   tags:
+    - private_ci
     - cuda
     - gpu
 
 iwyu:
   <<: *default_build
   stage: code_quality
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
+    BUILD_HIP: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON'
+  only:
+    variables:
+      - $RUN_CI_TAG
   dependencies: []
   allow_failure: yes
   tags:
+    - private_ci
     - cuda
     - gpu
 
@@ -431,7 +640,7 @@ iwyu:
 # For short living branches or PRs, try to detect an open PR
 sonarqube_cov_:
   stage: code_quality
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   before_script: *default_before_script
   script:
     - PR_ID=$(curl "https://api.github.com/search/issues?q=sha:${CI_COMMIT_SHA}"
@@ -453,7 +662,7 @@ sonarqube_cov_:
       -Dsonar.cfamily.build-wrapper-output=build/bw-output
       -Dsonar.cfamily.gcov.reportsPath=build/Testing/CoverageInfo
       ${sonar_branching}
-#    - bash <(curl -s https://codecov.io/bash) -X gcov -X xcode -f "!*examples*" -f "!*third_party*" -f "!*c\\+\\+*" -f "!*benchmark*"
+    - bash <(curl -s https://codecov.io/bash) -f "\!*examples*" -f "\!*third_party*" -f "\!*c\\+\\+*" -f "\!*benchmark*"
   dependencies: []
   except:
     refs:
@@ -462,8 +671,9 @@ sonarqube_cov_:
       - tags
   only:
     variables:
-      - $PUBLIC_CI_TAG
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
@@ -471,7 +681,7 @@ sonarqube_cov_:
 # (the one that was merged).
 sonarqube_cov:
   stage: code_quality
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019
   before_script: *default_before_script
   script:
     - ctest -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=COVERAGE
@@ -480,7 +690,7 @@ sonarqube_cov:
       -Dsonar.cfamily.build-wrapper-output=build/bw-output
       -Dsonar.cfamily.gcov.reportsPath=build/Testing/CoverageInfo
       -Dsonar.branch.name=${CI_COMMIT_REF_NAME}
-#    - bash <(curl -s https://codecov.io/bash) -X gcov -X xcode -f "!*test*" -f "!*examples*" -f "!*third_party*" -f "!*c\\+\\+*" -f "!*benchmark*"
+    - bash <(curl -s https://codecov.io/bash) -f "\!*examples*" -f "\!*third_party*" -f "\!*c\\+\\+*" -f "\!*benchmark*"
   dependencies: []
   only:
     refs:
@@ -488,8 +698,9 @@ sonarqube_cov:
       - master
       - tags
     variables:
-      - $PUBLIC_CI_TAG
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
@@ -507,10 +718,10 @@ gh-pages:
     - mkdir -p ${CI_JOB_NAME} && pushd ${CI_JOB_NAME}
     - cmake ${CI_PROJECT_DIR}
         -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER}
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DBUILD_SHARED_LIBS=ON
-        ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=OFF
-        -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF
-        -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF
+        -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+        -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF
+        -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF
+        -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF
         -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON
     - make usr
     - make pdf
@@ -534,18 +745,25 @@ gh-pages:
       - master
       - tags
     variables:
-      - $PUBLIC_CI_TAG
+      - $RUN_CI_TAG
   except:
       - schedules
+  tags:
+    - private_ci
+    - cpu
 
 
 threadsanitizer:
   stage: QoS_tools
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
   before_script: *default_before_script
   script:
-    - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN
-      -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer
+    - LD_PRELOAD=/usr/local/lib/libomp.so
+      CC=clang CXX=clang++
+        ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN
+        -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer
+        -DCTEST_MEMORYCHECK_SANITIZER_OPTIONS=ignore_noninstrumented_modules=1
+        --timeout 6000
   dependencies: []
   only:
     refs:
@@ -553,14 +771,35 @@ threadsanitizer:
       - develop
       - tags
     variables:
-      - $PUBLIC_CI_TAG
+      - $RUN_CI_TAG
   tags:
+    - private_ci
+    - cuda
+    - gpu
+
+leaksanitizer:
+  stage: QoS_tools
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
+  before_script: *default_before_script
+  script:
+    - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=LSAN
+      -DCTEST_MEMORYCHECK_TYPE=LeakSanitizer
+  dependencies: []
+  only:
+    refs:
+      - master
+      - develop
+      - tags
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
     - cuda
     - gpu
 
 addresssanitizer:
   stage: QoS_tools
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
   before_script: *default_before_script
   script:
     - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=ASAN
@@ -572,17 +811,21 @@ addresssanitizer:
       - develop
       - tags
     variables:
-      - $PUBLIC_CI_TAG
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
-valgrind:
+undefinedsanitizer:
   stage: QoS_tools
-  image: localhost:5000/gko-cuda101-gnu8-llvm70
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
   before_script: *default_before_script
   script:
-    - ctest -V -S cmake/CTestScript.cmake -DCTEST_MEMORYCHECK_TYPE=Valgrind
+    # The gold linker is required because of linker flag issues caused by UBSan
+    # in the Ubuntu setup we are using.
+    - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=UBSAN
+      -DCTEST_MEMORYCHECK_TYPE=UndefinedBehaviorSanitizer
   dependencies: []
   only:
     refs:
@@ -590,8 +833,9 @@ valgrind:
       - develop
       - tags
     variables:
-      - $PUBLIC_CI_TAG
+      - $RUN_CI_TAG
   tags:
+    - private_ci
     - cuda
     - gpu
 
@@ -617,10 +861,11 @@ valgrind:
 
 fineci-benchmark-build:
   stage: benchmark-build
+  image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
     BENCHMARK_SERVER: FINECI
-    BUILD_TYPE: Release
+    BUILD_TYPE: "Release"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
     PUBLIC_REPO: https://github.com/ginkgo-project/ginkgo.git
@@ -645,15 +890,20 @@ fineci-benchmark-build:
                   -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} \\
                   -DGINKGO_BUILD_OMP=${BUILD_OMP} \\
                   -DGINKGO_BUILD_CUDA=${BUILD_CUDA} \\
+                  -DGINKGO_BUILD_HIP=${BUILD_HIP} \\
                   -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF \\
                   -DGINKGO_BUILD_BENCHMARKS=ON
-        make -j$(grep 'core id' /proc/cpuinfo | sort -u | wc -l)
+        make -j${CI_PARALLELISM}
       EOT
   dependencies: []
   only:
     - schedules
 #    - develop
 #    - master
+  tags:
+    - private_ci
+    - cpu
+    - cuda
 
 
 # Benchmark runs
@@ -692,6 +942,7 @@ fineci-benchmark-build:
 
 fineci-benchmark-cuda:
   stage: benchmark-cuda
+  image: localhost:5000/gko-nocuda-gnu9-llvm8
   variables:
     <<: *default_variables
     BENCHMARK_SERVER: FINECI
@@ -700,6 +951,10 @@ fineci-benchmark-cuda:
     BENCHMARK_REPO: git@github.com:ginkgo-project/ginkgo-data.git
     SYSTEM_NAME: K20Xm
   <<: *default_benchmark
+  tags:
+    - private_ci
+    - cpu
+    - cuda
 
 # fineci-benchmark-omp:
 #   stage: benchmark-omp
@@ -725,6 +980,7 @@ fineci-benchmark-cuda:
 
 new-issue-on-failure:
   stage: on-failure
+  image: localhost:5000/gko-nocuda-gnu9-llvm8
   script: curl --request POST "https://gitlab.com/api/v4/projects/${PROJECT_ID}/issues?private_token=${BOT_ACCESS_TOKEN}&title=Error%20in%20${CI_PROJECT_NAME}%20with%20pipeline%20${CI_PIPELINE_ID}%20for%20commit%20${CI_COMMIT_SHA}&labels&description=${CI_PIPELINE_URL}"
   when: on_failure
   only:
@@ -732,3 +988,6 @@ new-issue-on-failure:
       - develop
       - master
   dependencies: []
+  tags:
+    - private_ci
+    - cpu
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32a05209929..bba805119bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,150 @@ commits. For a comprehensive list, use the following command:
 git log --first-parent
 ```
 
+## Version 1.2.0
+
+The Ginkgo team is proud to announce the new minor release of Ginkgo version
+1.2.0. This release brings full HIP support to Ginkgo, new preconditioners
+(ParILUT, ISAI), conversion between double and float for all LinOps, and many
+more features and fixes.
+
+Supported systems and requirements:
++ For all platforms, cmake 3.9+
++ Linux and MacOS
+  + gcc: 5.3+, 6.3+, 7.3+, all versions after 8.1+
+  + clang: 3.9+
+  + Intel compiler: 2017+
+  + Apple LLVM: 8.0+
+  + CUDA module: CUDA 9.0+
+  + HIP module: ROCm 2.8+
++ Windows
+  + MinGW and CygWin: gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+
+  + Microsoft Visual Studio: VS 2017 15.7+
+  + CUDA module: CUDA 9.0+, Microsoft Visual Studio
+  + OpenMP module: MinGW or CygWin.
+
+
+The current known issues can be found in the [known issues page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues).
+
+
+### Additions
+Here are the main additions to the Ginkgo library. Other thematic additions are listed below.
++ Add full HIP support to Ginkgo [#344](https://github.com/ginkgo-project/ginkgo/pull/344), [#357](https://github.com/ginkgo-project/ginkgo/pull/357), [#384](https://github.com/ginkgo-project/ginkgo/pull/384), [#373](https://github.com/ginkgo-project/ginkgo/pull/373), [#391](https://github.com/ginkgo-project/ginkgo/pull/391), [#396](https://github.com/ginkgo-project/ginkgo/pull/396), [#395](https://github.com/ginkgo-project/ginkgo/pull/395), [#393](https://github.com/ginkgo-project/ginkgo/pull/393), [#404](https://github.com/ginkgo-project/ginkgo/pull/404), [#439](https://github.com/ginkgo-project/ginkgo/pull/439), [#443](https://github.com/ginkgo-project/ginkgo/pull/443), [#567](https://github.com/ginkgo-project/ginkgo/pull/567)
++ Add a new ISAI preconditioner [#489](https://github.com/ginkgo-project/ginkgo/pull/489), [#502](https://github.com/ginkgo-project/ginkgo/pull/502), [#512](https://github.com/ginkgo-project/ginkgo/pull/512), [#508](https://github.com/ginkgo-project/ginkgo/pull/508), [#520](https://github.com/ginkgo-project/ginkgo/pull/520)
++ Add support for ParILUT and ParICT factorization with ILU preconditioners [#400](https://github.com/ginkgo-project/ginkgo/pull/400)
++ Add a new BiCG solver [#438](https://github.com/ginkgo-project/ginkgo/pull/438)
++ Add a new permutation matrix format [#352](https://github.com/ginkgo-project/ginkgo/pull/352), [#469](https://github.com/ginkgo-project/ginkgo/pull/469)
++ Add CSR SpGEMM support [#386](https://github.com/ginkgo-project/ginkgo/pull/386), [#398](https://github.com/ginkgo-project/ginkgo/pull/398), [#418](https://github.com/ginkgo-project/ginkgo/pull/418), [#457](https://github.com/ginkgo-project/ginkgo/pull/457)
++ Add CSR SpGEAM support [#556](https://github.com/ginkgo-project/ginkgo/pull/556)
++ Make all solvers and preconditioners transposable [#535](https://github.com/ginkgo-project/ginkgo/pull/535)
++ Add CsrBuilder and CooBuilder for intrusive access to matrix arrays [#437](https://github.com/ginkgo-project/ginkgo/pull/437)
++ Add a standard-compliant allocator based on the Executors [#504](https://github.com/ginkgo-project/ginkgo/pull/504)
++ Support conversions for all LinOp between double and float [#521](https://github.com/ginkgo-project/ginkgo/pull/521)
++ Add a new boolean to the CUDA and HIP executors to control DeviceReset (default off) [#557](https://github.com/ginkgo-project/ginkgo/pull/557)
++ Add a relaxation factor to IR to represent Richardson Relaxation [#574](https://github.com/ginkgo-project/ginkgo/pull/574)
++ Add two new stopping criteria, for relative (to `norm(b)`) and absolute residual norm [#577](https://github.com/ginkgo-project/ginkgo/pull/577)
+
+#### Example additions
++ Templatize all examples to simplify changing the precision [#513](https://github.com/ginkgo-project/ginkgo/pull/513)
++ Add a new adaptive precision block-Jacobi example [#507](https://github.com/ginkgo-project/ginkgo/pull/507)
++ Add a new IR example [#522](https://github.com/ginkgo-project/ginkgo/pull/522)
++ Add a new Mixed Precision Iterative Refinement example [#525](https://github.com/ginkgo-project/ginkgo/pull/525)
++ Add a new example on iterative trisolves in ILU preconditioning [#526](https://github.com/ginkgo-project/ginkgo/pull/526), [#536](https://github.com/ginkgo-project/ginkgo/pull/536), [#550](https://github.com/ginkgo-project/ginkgo/pull/550)
+
+#### Compilation and library changes
++ Auto-detect compilation settings based on environment [#435](https://github.com/ginkgo-project/ginkgo/pull/435), [#537](https://github.com/ginkgo-project/ginkgo/pull/537)
++ Add SONAME to shared libraries [#524](https://github.com/ginkgo-project/ginkgo/pull/524)
++ Add clang-cuda support [#543](https://github.com/ginkgo-project/ginkgo/pull/543)
+
+#### Other additions
++ Add sorting, searching and merging kernels for GPUs [#403](https://github.com/ginkgo-project/ginkgo/pull/403), [#428](https://github.com/ginkgo-project/ginkgo/pull/428), [#417](https://github.com/ginkgo-project/ginkgo/pull/417), [#455](https://github.com/ginkgo-project/ginkgo/pull/455)
++ Add `gko::as` support for smart pointers [#493](https://github.com/ginkgo-project/ginkgo/pull/493)
++ Add setters and getters for criterion factories [#527](https://github.com/ginkgo-project/ginkgo/pull/527)
++ Add a new method to check whether a solver uses `x` as an initial guess [#531](https://github.com/ginkgo-project/ginkgo/pull/531)
++ Add contribution guidelines [#549](https://github.com/ginkgo-project/ginkgo/pull/549)
+
+### Fixes
+#### Algorithms
++ Improve the classical CSR strategy's performance [#401](https://github.com/ginkgo-project/ginkgo/pull/401)
++ Improve the CSR automatic strategy [#407](https://github.com/ginkgo-project/ginkgo/pull/407), [#559](https://github.com/ginkgo-project/ginkgo/pull/559)
++ Memory, speed improvements to the ELL kernel [#411](https://github.com/ginkgo-project/ginkgo/pull/411)
++ Multiple improvements and fixes to ParILU [#419](https://github.com/ginkgo-project/ginkgo/pull/419), [#427](https://github.com/ginkgo-project/ginkgo/pull/427), [#429](https://github.com/ginkgo-project/ginkgo/pull/429), [#456](https://github.com/ginkgo-project/ginkgo/pull/456), [#544](https://github.com/ginkgo-project/ginkgo/pull/544)
++ Fix multiple issues with GMRES [#481](https://github.com/ginkgo-project/ginkgo/pull/481), [#523](https://github.com/ginkgo-project/ginkgo/pull/523), [#575](https://github.com/ginkgo-project/ginkgo/pull/575)
++ Optimize OpenMP matrix conversions [#505](https://github.com/ginkgo-project/ginkgo/pull/505)
++ Ensure the linearity of the ILU preconditioner [#506](https://github.com/ginkgo-project/ginkgo/pull/506)
++ Fix IR's use of the advanced apply [#522](https://github.com/ginkgo-project/ginkgo/pull/522)
++ Fix empty matrices conversions and add tests [#560](https://github.com/ginkgo-project/ginkgo/pull/560)
+
+#### Other core functionalities
++ Fix complex number support in our math header [#410](https://github.com/ginkgo-project/ginkgo/pull/410)
++ Fix CUDA compatibility of the main ginkgo header [#450](https://github.com/ginkgo-project/ginkgo/pull/450)
++ Fix isfinite issues [#465](https://github.com/ginkgo-project/ginkgo/pull/465)
++ Fix the Array::view memory leak and the array/view copy/move [#485](https://github.com/ginkgo-project/ginkgo/pull/485)
++ Fix typos preventing use of some interface functions [#496](https://github.com/ginkgo-project/ginkgo/pull/496)
++ Fix the `gko::dim` to abide by the C++ standard [#498](https://github.com/ginkgo-project/ginkgo/pull/498)
++ Simplify the executor copy interface [#516](https://github.com/ginkgo-project/ginkgo/pull/516)
++ Optimize intermediate storage for Composition [#540](https://github.com/ginkgo-project/ginkgo/pull/540)
++ Provide an initial guess for relevant Compositions [#561](https://github.com/ginkgo-project/ginkgo/pull/561)
++ Better management of nullptr as criterion [#562](https://github.com/ginkgo-project/ginkgo/pull/562)
++ Fix the norm calculations for complex support [#564](https://github.com/ginkgo-project/ginkgo/pull/564)
+
+#### CUDA and HIP specific
++ Use the return value of the atomic operations in our wrappers [#405](https://github.com/ginkgo-project/ginkgo/pull/405)
++ Improve the portability of warp lane masks [#422](https://github.com/ginkgo-project/ginkgo/pull/422)
++ Extract thread ID computation into a separate function [#464](https://github.com/ginkgo-project/ginkgo/pull/464)
++ Reorder kernel parameters for consistency [#474](https://github.com/ginkgo-project/ginkgo/pull/474)
++ Fix the use of `pragma unroll` in HIP [#492](https://github.com/ginkgo-project/ginkgo/pull/492)
+
+#### Other
++ Fix the Ginkgo CMake installation files [#414](https://github.com/ginkgo-project/ginkgo/pull/414), [#553](https://github.com/ginkgo-project/ginkgo/pull/553)
++ Fix the Windows compilation [#415](https://github.com/ginkgo-project/ginkgo/pull/415)
++ Always use demangled types in error messages [#434](https://github.com/ginkgo-project/ginkgo/pull/434), [#486](https://github.com/ginkgo-project/ginkgo/pull/486)
++ Add CUDA header dependency to appropriate tests [#452](https://github.com/ginkgo-project/ginkgo/pull/452)
++ Fix several sonarqube or compilation warnings [#453](https://github.com/ginkgo-project/ginkgo/pull/453), [#463](https://github.com/ginkgo-project/ginkgo/pull/463), [#532](https://github.com/ginkgo-project/ginkgo/pull/532), [#569](https://github.com/ginkgo-project/ginkgo/pull/569)
++ Add shuffle tests [#460](https://github.com/ginkgo-project/ginkgo/pull/460)
++ Fix MSVC C2398 error [#490](https://github.com/ginkgo-project/ginkgo/pull/490)
++ Fix missing interface tests in test install [#558](https://github.com/ginkgo-project/ginkgo/pull/558)
+
+### Tools and ecosystem
+#### Benchmarks
++ Add better norm support in the benchmarks [#377](https://github.com/ginkgo-project/ginkgo/pull/377)
++ Add CUDA 10.1 generic SpMV support in benchmarks [#468](https://github.com/ginkgo-project/ginkgo/pull/468), [#473](https://github.com/ginkgo-project/ginkgo/pull/473)
++ Add sparse library ILU in benchmarks [#487](https://github.com/ginkgo-project/ginkgo/pull/487)
++ Add overhead benchmarking capacities [#501](https://github.com/ginkgo-project/ginkgo/pull/501)
++ Allow benchmarking from a matrix list file [#503](https://github.com/ginkgo-project/ginkgo/pull/503)
++ Fix benchmarking issue with JSON and non-finite numbers [#514](https://github.com/ginkgo-project/ginkgo/pull/514)
++ Fix benchmark logger crashes with OpenMP [#565](https://github.com/ginkgo-project/ginkgo/pull/565)
+
+#### CI related
++ Improvements to the CI setup with HIP compilation [#421](https://github.com/ginkgo-project/ginkgo/pull/421), [#466](https://github.com/ginkgo-project/ginkgo/pull/466)
++ Add MacOSX CI support [#470](https://github.com/ginkgo-project/ginkgo/pull/470), [#488](https://github.com/ginkgo-project/ginkgo/pull/488)
++ Add Windows CI support [#471](https://github.com/ginkgo-project/ginkgo/pull/471), [#488](https://github.com/ginkgo-project/ginkgo/pull/488), [#510](https://github.com/ginkgo-project/ginkgo/pull/510), [#566](https://github.com/ginkgo-project/ginkgo/pull/566)
++ Use sanitizers instead of valgrind [#476](https://github.com/ginkgo-project/ginkgo/pull/476)
++ Add automatic container generation and update facilities [#499](https://github.com/ginkgo-project/ginkgo/pull/499)
++ Fix the CI parallelism settings [#517](https://github.com/ginkgo-project/ginkgo/pull/517), [#538](https://github.com/ginkgo-project/ginkgo/pull/538), [#539](https://github.com/ginkgo-project/ginkgo/pull/539)
++ Make the codecov patch check informational [#519](https://github.com/ginkgo-project/ginkgo/pull/519)
++ Add support for LLVM sanitizers with improved thread sanitizer support [#578](https://github.com/ginkgo-project/ginkgo/pull/578)
+
+#### Test suite
++ Add an assertion for sparsity pattern equality [#416](https://github.com/ginkgo-project/ginkgo/pull/416)
++ Add core and reference multiprecision tests support [#448](https://github.com/ginkgo-project/ginkgo/pull/448)
++ Speed up GPU tests by avoiding device reset [#467](https://github.com/ginkgo-project/ginkgo/pull/467)
++ Change test matrix location string [#494](https://github.com/ginkgo-project/ginkgo/pull/494)
+
+#### Other
++ Add Ginkgo badges from our tools [#413](https://github.com/ginkgo-project/ginkgo/pull/413)
++ Update the `create_new_algorithm.sh` script [#420](https://github.com/ginkgo-project/ginkgo/pull/420)
++ Bump copyright and improve license management [#436](https://github.com/ginkgo-project/ginkgo/pull/436), [#433](https://github.com/ginkgo-project/ginkgo/pull/433)
++ Set clang-format minimum requirement [#441](https://github.com/ginkgo-project/ginkgo/pull/441), [#484](https://github.com/ginkgo-project/ginkgo/pull/484)
++ Update git-cmake-format [#446](https://github.com/ginkgo-project/ginkgo/pull/446), [#484](https://github.com/ginkgo-project/ginkgo/pull/484)
++ Disable the development tools by default [#442](https://github.com/ginkgo-project/ginkgo/pull/442)
++ Add a script for automatic header formatting [#447](https://github.com/ginkgo-project/ginkgo/pull/447)
++ Add GDB pretty printer for `gko::Array` [#509](https://github.com/ginkgo-project/ginkgo/pull/509)
++ Improve compilation speed [#533](https://github.com/ginkgo-project/ginkgo/pull/533)
++ Add editorconfig support [#546](https://github.com/ginkgo-project/ginkgo/pull/546)
++ Add a compile-time check for header self-sufficiency [#552](https://github.com/ginkgo-project/ginkgo/pull/552)
+
+
 ## Version 1.1.1
 This version of Ginkgo provides a few fixes in Ginkgo's core routines. The
 supported systems and requirements are unchanged from version 1.1.0.
diff --git a/CITING.md b/CITING.md
new file mode 100644
index 00000000000..7f579d1a69f
--- /dev/null
+++ b/CITING.md
@@ -0,0 +1,94 @@
+# Citing Ginkgo                                           {#citing_ginkgo}
+
+The main Ginkgo paper describing Ginkgo's purpose, design and interface is
+available through the following reference:
+
+``` bibtex
+@misc{anzt2020ginkgo,
+    title={Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing},
+    author={Hartwig Anzt and Terry Cojean and Goran Flegar and Fritz Göbel and Thomas Grützmacher and Pratik Nayak and Tobias Ribizel and Yuhsiang Mike Tsai and Enrique S. Quintana-Ortí},
+    year={2020},
+    eprint={2006.16852},
+    archivePrefix={arXiv},
+    primaryClass={cs.MS}
+}
+```
+
+Multiple topical papers exist on Ginkgo and its algorithms. The following papers
+can be used to cite specific aspects of the Ginkgo project.
+
+### On Portability
+
+``` bibtex
+@misc{tsai2020amdportability,
+    title={Preparing Ginkgo for AMD GPUs -- A Testimonial on Porting CUDA Code to HIP},
+    author={Yuhsiang M. Tsai and Terry Cojean and Tobias Ribizel and Hartwig Anzt},
+    year={2020},
+    eprint={2006.14290},
+    archivePrefix={arXiv},
+    primaryClass={cs.MS}
+}
+```
+
+### On Software Sustainability
+
+``` bibtex
+@inproceedings{anzt2019pasccb,
+author = {Anzt, Hartwig and Chen, Yen-Chen and Cojean, Terry and Dongarra, Jack and Flegar, Goran and Nayak, Pratik and Quintana-Ort\'{\i}, Enrique S. and Tsai, Yuhsiang M. and Wang, Weichung},
+title = {Towards Continuous Benchmarking: An Automated Performance Evaluation Framework for High Performance Software},
+year = {2019},
+isbn = {9781450367707},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3324989.3325719},
+doi = {10.1145/3324989.3325719},
+booktitle = {Proceedings of the Platform for Advanced Scientific Computing Conference},
+articleno = {9},
+numpages = {11},
+keywords = {interactive performance visualization, healthy software lifecycle, continuous integration, automated performance benchmarking},
+location = {Zurich, Switzerland},
+series = {PASC ’19}
+}
+```
+
+### On SpMV performance
+
+``` bibtex
+@InProceedings{tsai2020amdspmv,
+author="Tsai, Yuhsiang M.
+and Cojean, Terry
+and Anzt, Hartwig",
+editor="Sadayappan, Ponnuswamy
+and Chamberlain, Bradford L.
+and Juckeland, Guido
+and Ltaief, Hatem",
+title="Sparse Linear Algebra on AMD and NVIDIA GPUs -- The Race Is On",
+booktitle="High Performance Computing",
+year="2020",
+publisher="Springer International Publishing",
+address="Cham",
+pages="309--327",
+abstract="Efficiently processing sparse matrices is a central and performance-critical part of many scientific simulation codes. Recognizing the adoption of manycore accelerators in HPC, we evaluate in this paper the performance of the currently best sparse matrix-vector product (SpMV) implementations on high-end GPUs from AMD and NVIDIA. Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB format taking the hardware characteristics of the latest GPU technologies into account. We compare for 2,800 test matrices the performance of our kernels against AMD's hipSPARSE library and NVIDIA's cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.",
+isbn="978-3-030-50743-5"
+}
+
+
+@article{anzt2020spmv,
+author = {Anzt, Hartwig and Cojean, Terry and Yen-Chen, Chen and Dongarra, Jack and Flegar, Goran and Nayak, Pratik and Tomov, Stanimire and Tsai, Yuhsiang M. and Wang, Weichung},
+title = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs},
+year = {2020},
+issue_date = {March 2020},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {7},
+number = {1},
+issn = {2329-4949},
+url = {https://doi.org/10.1145/3380930},
+doi = {10.1145/3380930},
+journal = {ACM Trans. Parallel Comput.},
+month = mar,
+articleno = {2},
+numpages = {26},
+keywords = {irregular matrices, GPUs, Sparse Matrix Vector Product (SpMV)}
+}
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e9af2bdd07..5835b7a27a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,17 +1,23 @@
 cmake_minimum_required(VERSION 3.9)
 
-project(Ginkgo LANGUAGES C CXX VERSION 1.1.1 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
+project(Ginkgo LANGUAGES C CXX VERSION 1.2.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
 set(Ginkgo_VERSION_TAG "master")
 set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG})
 
+# Determine which executors can be compiled
+include(cmake/hip_path.cmake)
+include(cmake/autodetect_executors.cmake)
+include(cmake/build_type_helpers.cmake)
+
 # Ginkgo configuration options
-option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" ON)
+option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF)
 option(GINKGO_BUILD_TESTS "Generate build files for unit tests" ON)
 option(GINKGO_BUILD_EXAMPLES "Build Ginkgo's examples" ON)
 option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON)
 option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON)
-option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" OFF)
-option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" OFF)
+option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP})
+option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
+option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
 option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
@@ -20,6 +26,9 @@ option(GINKGO_EXPORT_BUILD_DIR
     OFF)
 option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
 option(GINKGO_WITH_IWYU "Make Ginkgo call `iwyu` (Include What You Use) to find include issues." OFF)
+option(GINKGO_CHECK_CIRCULAR_DEPS
+    "Enable compile-time checks detecting circular dependencies between libraries and non-self-sufficient headers."
+    OFF)
 set(GINKGO_VERBOSE_LEVEL "1" CACHE STRING
     "Verbosity level. Put 0 to turn off. 1 activates a few important messages.")
 if(MSVC)
@@ -34,29 +43,47 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING
 set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING
     "A list of target NVIDIA GPU achitectures. See README.md for more detail.")
 option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF)
+set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING
+    "Set the required HIP compiler flags. Current default is an empty string.")
+set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING
+    "Set the required HIP nvcc compiler flags. Current default is an empty string.")
+set(GINKGO_HIP_HCC_COMPILER_FLAGS "" CACHE STRING
+    "Set the required HIP HCC compiler flags. Current default is an empty string.")
+set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING
+    "Set the required HIP CLANG compiler flags. Current default is an empty string.")
+set(GINKGO_HIP_AMDGPU "" CACHE STRING
+    "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).")
 option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF)
 option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON)
 
+set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined")
+
 if(BUILD_SHARED_LIBS AND (WIN32 OR CYGWIN) AND (GINKGO_BUILD_TESTS OR GINKGO_BUILD_EXAMPLES OR GINKGO_BUILD_BENCHMARKS))
     # Change shared libraries output only if this build has executable program with shared libraries.
     set(GINKGO_CHANGED_SHARED_LIBRARY TRUE)
     option(GINKGO_CHECK_PATH "Tell Ginkgo to check if the environment variable PATH is available for this build." ON)
     set(GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH "windows_shared_library" CACHE STRING
         "Set Ginkgo's shared library relative path in windows. Current default is `windows_shared_library`. \
-        This absoulte path ${PROJECT_BINARY_DIR}/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH must be in the environment variable PATH.")
+        This absolute path ${PROJECT_BINARY_DIR}/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH must be in the environment variable PATH.")
     set(GINKGO_WINDOWS_SHARED_LIBRARY_PATH ${PROJECT_BINARY_DIR}/${GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH})
 else()
     set(GINKGO_CHANGED_SHARED_LIBRARY FALSE)
 endif()
 
-if(GINKGO_BUILD_TESTS AND (GINKGO_BUILD_CUDA OR GINKGO_BUILD_OMP))
+if(GINKGO_BUILD_TESTS AND (GINKGO_BUILD_CUDA OR GINKGO_BUILD_OMP OR GINKGO_BUILD_HIP))
     message(STATUS "GINKGO_BUILD_TESTS is ON, enabling GINKGO_BUILD_REFERENCE")
     set(GINKGO_BUILD_REFERENCE ON CACHE BOOL "Compile reference CPU kernels" FORCE)
 endif()
 
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-  message(STATUS "Setting build type to 'Release' as none was specified.")
-  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+    message(STATUS "Setting build type to 'Release' as none was specified.")
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+endif()
+
+if(BUILD_SHARED_LIBS)
+    set(GINKGO_STATIC_OR_SHARED SHARED)
+else()
+    set(GINKGO_STATIC_OR_SHARED STATIC)
 endif()
 
 # Ensure we have a debug postfix
@@ -77,22 +104,15 @@ if(GINKGO_BUILD_TESTS)
     include(CTest)
 endif()
 
-if (GINKGO_WITH_CLANG_TIDY)
-  find_program(GINKGO_CLANG_TIDY_PATH clang-tidy)
+if(GINKGO_WITH_CLANG_TIDY)
+    find_program(GINKGO_CLANG_TIDY_PATH clang-tidy)
 endif()
 
-if (GINKGO_WITH_IWYU)
-  find_program(GINKGO_IWYU_PATH iwyu)
+if(GINKGO_WITH_IWYU)
+    find_program(GINKGO_IWYU_PATH iwyu)
 endif()
 
-
-# Load CMake helpers and modules
-include(cmake/build_helpers.cmake)
-include(cmake/build_type_helpers.cmake)
-include(cmake/create_test.cmake)
-include(cmake/install_helpers.cmake)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
-
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules/")
 
 # Find important header files, store the definitions in include/ginkgo/config.h.in
 # For details, see https://gitlab.kitware.com/cmake/community/wikis/doc/tutorials/How-To-Write-Platform-Checks
@@ -102,13 +122,41 @@ check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H)
 # Automatically find PAPI and search for the required 'sde' component
 set(GINKGO_HAVE_PAPI_SDE 0)
 find_package(PAPI OPTIONAL_COMPONENTS sde)
-if (PAPI_sde_FOUND)
+if(PAPI_sde_FOUND)
     set(GINKGO_HAVE_PAPI_SDE 1)
 endif()
 
+set(GINKGO_HIP_PLATFORM_NVCC 0)
+set(GINKGO_HIP_PLATFORM_HCC 0)
+
+if(GINKGO_BUILD_HIP)
+    # GINKGO_HIPCONFIG_PATH and HIP_PATH are set in cmake/hip_path.cmake
+    if(DEFINED ENV{HIP_PLATFORM})
+        set(GINKGO_HIP_PLATFORM "$ENV{HIP_PLATFORM}")
+    elseif(GINKGO_HIPCONFIG_PATH)
+        execute_process(COMMAND ${GINKGO_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE GINKGO_HIP_PLATFORM)
+    else()
+        message(FATAL_ERROR "No platform could be found for HIP. "
+            "Set and export the environment variable HIP_PLATFORM.")
+    endif()
+    message(STATUS "HIP platform set to ${GINKGO_HIP_PLATFORM}")
+
+    if (GINKGO_HIP_PLATFORM STREQUAL "hcc")
+        set(GINKGO_HIP_PLATFORM_HCC 1)
+    elseif (GINKGO_HIP_PLATFORM STREQUAL "nvcc")
+        set(GINKGO_HIP_PLATFORM_NVCC 1)
+    endif()
+endif()
+
 configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in
     ${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY)
 
+# Load CMake helpers
+include(cmake/build_helpers.cmake)
+include(cmake/hip_helpers.cmake)
+include(cmake/install_helpers.cmake)
+include(cmake/windows_helpers.cmake)
+
 # This is modified from https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace
 if(MSVC)
     if(BUILD_SHARED_LIBS)
@@ -129,19 +177,25 @@ ginkgo_find_package(gflags gflags FALSE 2.2.2)
 ginkgo_find_package(RapidJSON rapidjson TRUE 1.1.0)
 add_subdirectory(third_party)    # Third-party tools and libraries
 
+# Ginkgo core libraries
 # Needs to be first in order for `CMAKE_CUDA_DEVICE_LINK_EXECUTABLE` to be
 # propagated to the other parts of Ginkgo in case of building as static libraries
 if(GINKGO_BUILD_CUDA)
     add_subdirectory(cuda)       # High-performance kernels for NVIDIA GPUs
 endif()
-# Ginkgo core libraries
 add_subdirectory(core)           # Core Ginkgo types and top-level functions
+add_subdirectory(include)        # Public API self-contained check
 if (GINKGO_BUILD_REFERENCE)
     add_subdirectory(reference)  # Reference kernel implementations
 endif()
 if (GINKGO_BUILD_OMP)
     add_subdirectory(omp)        # High-performance omp kernels
 endif()
+# HIP needs to be last because it builds the GINKGO_RPATH_FOR_HIP variable
+# which needs to know the `ginkgo` target.
+if(GINKGO_BUILD_HIP)
+    add_subdirectory(hip)        # High-performance kernels for AMD or NVIDIA GPUs
+endif()
 
 # Non core directories and targets
 if(GINKGO_BUILD_EXAMPLES)
@@ -159,13 +213,33 @@ if(GINKGO_DEVEL_TOOLS)
     add_dependencies(format add_license)
 endif()
 
-# Generate the global `ginkgo/ginkgo.hpp` header with every call of make
-# when bash is present and the developer tools are enabled
+# MacOS needs to install bash, gnu-sed, findutils and coreutils
+# format_header needs clang-format 6.0.0+
 find_program(BASH bash)
 if(NOT "${BASH}" STREQUAL "BASH-NOTFOUND" AND GINKGO_DEVEL_TOOLS)
     add_custom_target(generate_ginkgo_header ALL
         COMMAND ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/update_ginkgo_header.sh
         WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR})
+    find_program(GIT git)
+    if(NOT "${GIT}" STREQUAL "GIT-NOTFOUND")
+        add_custom_target(format_header
+            COMMAND echo "format header on the modified code files except build/examples/third_party/ginkgo.hpp"
+            COMMAND bash -c "git diff --name-only origin/master...HEAD | \
+                grep -Ev 'build|examples|third_party|ginkgo.hpp' | \
+                grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \
+                xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh"
+            WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}
+            VERBATIM)
+    endif()
+    unset(GIT CACHE)
+    add_custom_target(format_header_all
+        COMMAND echo "format header on all code files except build/examples/third_party/ginkgo.hpp"
+        COMMAND bash -c "find * -type f | \
+                grep -Ev 'build|examples|third_party|ginkgo.hpp' | \
+                grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \
+                xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh"
+        WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}
+        VERBATIM)
 endif()
 unset(BASH CACHE)
 
@@ -182,15 +256,34 @@ endif()
 configure_file(${Ginkgo_SOURCE_DIR}/cmake/ginkgo.pc.in
     ${Ginkgo_BINARY_DIR}/ginkgo.pc @ONLY)
 
+# WINDOWS NVCC has " inside the string, add escape character to avoid config problem.
+ginkgo_modify_flags(CMAKE_CUDA_FLAGS)
+ginkgo_modify_flags(CMAKE_CUDA_FLAGS_DEBUG)
+ginkgo_modify_flags(CMAKE_CUDA_FLAGS_RELEASE)
 ginkgo_install()
 
+if(MSVC)
+    # Set path/command with $<CONFIG>
+    set(GINKGO_TEST_INSTALL_COMMAND "${Ginkgo_BINARY_DIR}/test_install/$<CONFIG>/test_install")
+    if(GINKGO_BUILD_CUDA)
+        set(GINKGO_TEST_INSTALL_COMMAND "${GINKGO_TEST_INSTALL_COMMAND}" "${Ginkgo_BINARY_DIR}/test_install/$<CONFIG>/test_install_cuda")
+    endif()
+else()
+    set(GINKGO_TEST_INSTALL_COMMAND "${Ginkgo_BINARY_DIR}/test_install/test_install")
+    if(GINKGO_BUILD_CUDA)
+        set(GINKGO_TEST_INSTALL_COMMAND "${GINKGO_TEST_INSTALL_COMMAND}" "${Ginkgo_BINARY_DIR}/test_install/test_install_cuda")
+    endif()
+endif()
 add_custom_target(test_install
     COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} -H${Ginkgo_SOURCE_DIR}/test_install
-        -B${Ginkgo_BINARY_DIR}/test_install
-        -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}/${GINKGO_INSTALL_CONFIG_DIR}
-        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    COMMAND ${CMAKE_COMMAND} --build ${Ginkgo_BINARY_DIR}/test_install
-    COMMAND ${Ginkgo_BINARY_DIR}/test_install/test_install
+    -B${Ginkgo_BINARY_DIR}/test_install
+    -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}/${GINKGO_INSTALL_CONFIG_DIR}
+    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}
+    # `--config cfg` is ignored by single-configuration generators.
+    # `$<CONFIG>` is always the same as `CMAKE_BUILD_TYPE` in single-configuration generators.
+    COMMAND ${CMAKE_COMMAND} --build ${Ginkgo_BINARY_DIR}/test_install --config $<CONFIG>
+    COMMAND ${GINKGO_TEST_INSTALL_COMMAND}
     COMMENT "Running a test on the installed binaries. This requires running `(sudo) make install` first.")
 
 # Setup CPack
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000000..9fcdc25ed13
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,610 @@
+# Contributing guidelines                         {#contributing_guidelines}
+
+We are glad that you are interested in contributing to Ginkgo. Please have a
+look at our coding guidelines before proposing a pull request.
+
+## Table of Contents
+
+[Most Important stuff](#most-important-stuff-a-tldr)
+
+[Project Structure](#project-structure)
+ * [Extended header files](#extended-header-files)
+ * [Using library classes](#using-library-classes)
+
+[Git related](#git-related)
+ * [Our git Workflow](#our-git-workflow)
+ * [Writing good commit messages](#writing-good-commit-messages)
+ * [Creating, Reviewing and Merging Pull
+   Requests](#creating-reviewing-and-merging-pull-requests)
+
+[Code Style](#code-style)
+ * [Automatic code formatting](#automatic-code-formatting)
+ * [Naming Scheme](#naming-scheme)
+ * [Whitespace](#whitespace)
+ * [Include statement grouping](#include-statement-grouping)
+ * [Other Code Formatting not handled by
+   ClangFormat](#other-code-formatting-not-handled-by-clangformat)
+ * [CMake coding style](#cmake-coding-style)
+
+[Helper Scripts](#helper-scripts)
+ * [Create a new algorithm](#create-a-new-algorithm)
+ * [Converting CUDA code to HIP code](#converting-cuda-code-to-hip-code)
+
+[Writing Tests](#writing-tests)
+ * [Testing know-how](#testing-know-how)
+ * [Some general rules](#some-general-rules)
+ * [Writing tests for kernels](#writing-tests-for-kernels)
+
+[Documentation style](#documentation-style)
+ * [Developer targeted notes](#developer-targeted-notes)
+ * [Whitespaces](#whitespaces)
+ * [Documenting examples](#documenting-examples)
+
+[Other programming comments](#other-programming-comments)
+ * [C++ standard stream objects](#c-standard-stream-objects)
+ * [Warnings](#warnings)
+ * [Avoiding circular dependencies](#avoiding-circular-dependencies)
+
+
+## Most important stuff (A TL;DR)
+
+* `GINKGO_DEVEL_TOOLS` needs to be set to `on` to commit. This requires
+  `clang-format` to be installed. See [Automatic code
+  formatting](#automatic-code-formatting) for more details. Once installed, you
+  can run `make format` in your `build/` folder to automatically format your
+  modified files. As `make format` unstages your files post-formatting, you must
+  stage the files again once you have verified that `make format` has done the
+  appropriate formatting, before committing the files.
+
+* See [Our git workflow](#our-git-workflow) to get a quick overview of our
+  workflow.
+
+* See [Creating, Reviewing and Merging Pull
+  Requests](#creating-reviewing-and-merging-pull-requests) on how to create a
+  Pull request.
+
+
+## Project structure
+
+Ginkgo is divided into a `core` module with common functionalities independent
+of the architecture, and several kernel modules (`reference`, `omp`, `cuda`,
+`hip`) which contain low-level computational routines for each supported
+architecture.
+
+### Extended header files
+
+Some header files from the core module have to be extended to include special
+functionality for specific architectures. An example of this is
+`core/base/math.hpp`, which has a GPU counterpart in `cuda/base/math.hpp`. For
+such files you should always include the version from the module you are working
+on, and this file will internally include its `core` counterpart.
+
+### Using library classes
+
+You can use and call functions of existing classes inside a kernel (that are
+defined and not just declared in a header file), however, you are not allowed to
+create new instances of a polymorphic class inside a kernel (or in general
+inside any kernel module like cuda/hip/omp/reference) as this creates circular
+dependencies between the `core` and the backend library. With this in mind, our
+CI contains a job which checks if such a circular dependency exists.
+These checks can be run manually using the `-DGINKGO_CHECK_CIRCULAR_DEPS=ON`
+option in the CMake configuration.
+
+For example, when creating a new matrix class `AB` by combining existing classes
+`A` and `B`, the `AB::apply()` function composed of invocations to `A::apply()`
+and `B::apply()` can only be defined in the core module, it is not possible to
+create instances of `A` and `B` inside the `AB` kernel files. This is to avoid
+the aforementioned circular dependency issue. An example for such a class is the
+`Hybrid` matrix format, which uses the `apply()` of the `Ell` and `Coo` matrix
+formats. Nevertheless, it is possible to call the kernels themselves directly
+within the same executor. For example, `cuda::dense::add_scaled()` can be called
+from any other `cuda` kernel.
+
+## Git related
+
+Ginkgo uses git, the distributed version control system to track code changes
+and coordinate work among its developers. A general guide to git can be found in
+[its extensive documentation](https://git-scm.com/docs).
+
+### Our git workflow
+
+In Ginkgo, we prioritize keeping a clean history over accurate tracking of
+commits. `git rebase` is hence our command of choice to make sure that we have a
+nice and linear history, especially for pulling the latest changes from the
+`develop` branch. More importantly, rebasing upon develop is **required** before
+the commits of the PR are merged into the `develop` branch.
+
+### Writing good commit messages
+
+With software sustainability and maintainability in mind, it is important to
+write commit messages that are short, clear and informative. Ideally, this would
+be the format to prefer:
+
+```sh
+Summary of the changes in a sentence, max 50 chars.
+
+More detailed comments:
++ Changes that have been added.
+- Changes that have been removed.
+
+Related PR: https://github.com/ginkgo-project/ginkgo/pull/<PR-number>
+```
+
+You can refer to [this informative
+guide](https://chris.beams.io/posts/git-commit/) for more details.
+
+#### Attributing credit
+
+Git has a nice feature where it allows you to add a co-author for your commit,
+if you would like to attribute credits for the changes made in the commit. This
+can be done by:
+
+```sh
+Commit message.
+
+Co-authored-by: Name <email@domain>
+```
+
+In the Ginkgo commit history, this is most commonly associated with suggested
+improvements from code reviews.
+
+### Creating, Reviewing and Merging Pull Requests
+
+* The `develop` branch is the default branch to submit PR's to. From time to
+  time, we merge the `develop` branch to the `master` branch and create tags on
+  the `master` to create new releases of Ginkgo. Therefore, all pull requests
+  must be merged into `develop`.
+* Please have a look at the labels and make sure to add the relevant labels.
+* You can mark the PR as a `WIP` if you are still working on it, `Ready for
+  Review` when it is ready for others to review it.
+* Assignees to the PR should be the ones responsible for merging that PR.
+  Currently, it is only possible to assign members within the `ginkgo-project`.
+* Each pull request requires at least two approvals before merging.
+* PR's created from within the repository will automatically trigger two CI
+  pipelines on pushing to the branch from which the PR has been created. The
+  Github Actions pipeline tests our framework on Mac OSX and on Windows
+  platforms. Another comprehensive Linux based pipeline is run from a [mirror on
+  gitlab](https://gitlab.com/ginkgo-project/ginkgo-public-ci/pipelines) and
+  contains additional checks like static analysis and test coverage.
+* Once a PR has been approved and the build has passed, one of the reviewers can
+  mark the PR as `READY TO MERGE`. At this point the creator/assignee of the PR
+  *needs to* verify that the branch is up to date with `develop` and rebase it
+  on `develop` if it is not.
+
+
+## Code style
+
+### Automatic code formatting
+
+Ginkgo uses [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html)
+(executable is usually named `clang-format`) and a custom `.clang-format`
+configuration file (mostly based on ClangFormat's _Google_ style) to
+automatically format your code. __Make sure you have ClangFormat set up and
+running properly__ (you should be able to run `make format` from Ginkgo's build
+directory) before committing anything that will end up in a pull request against
+`ginkgo-project/ginkgo` repository. In addition, you should __never__ modify the
+`.clang-format` configuration file shipped with Ginkgo. E.g. if ClangFormat has
+trouble reading this file on your system, you should install a newer version of
+ClangFormat, and avoid commenting out parts of the configuration file.
+
+ClangFormat is the primary tool that helps us achieve a uniform look of Ginkgo's
+codebase, while reducing the learning curve of potential contributors. However,
+ClangFormat configuration is not expressive enough to incorporate the entire
+coding style, so there are several additional rules that all contributed code
+should follow.
+
+_Note_: To learn more about how ClangFormat will format your code, see existing
+files in Ginkgo, `.clang-format` configuration file shipped with Ginkgo, and
+ClangFormat's documentation.
+
+### Naming scheme
+
+#### Filenames
+
+Filenames use `snake_case` and use the following extensions:
+*   C++ source files: `.cpp`
+*   C++ header files: `.hpp`
+*   CUDA source files: `.cu`
+*   CUDA header files: `.cuh`
+*   HIP source files: `.hip.cpp`
+*   HIP header files: `.hip.hpp`
+*   Common source files used by both CUDA and HIP: `.hpp.inc`
+*   CMake utility files: `.cmake`
+*   Shell scripts: `.sh`
+
+_Note:_ A C++ source/header file is considered a `CUDA` file if it contains CUDA
+code that is not guarded with `#if` guards that disable this code in non-CUDA
+compilers. I.e. if a file can be compiled by a general C++ compiler, it is not
+considered a CUDA file.
+
+#### Macros
+
+Macros (both object-like and function-like macros) use `CAPITAL_CASE`. They have
+to start with `GKO_` to avoid name clashes (even if they are `#undef`-ed in the
+same file!).
+
+#### Variables
+
+Variables use `snake_case`.
+
+#### Constants
+
+Constants use `snake_case`.
+
+#### Functions
+
+Functions use `snake_case`.
+
+#### Structures and classes
+
+Structures and classes which do not experience polymorphic behavior (i.e. do not
+contain virtual methods, nor members which experience polymorphic behavior) use
+`snake_case`.
+
+All other structures and classes use `CamelCase`.
+
+#### Members
+
+All structure / class members use the same naming scheme as they would if they
+were not members:
+*   methods use the naming scheme for functions
+*   data members use the naming scheme for variables or constants
+*   type members use the naming scheme for classes / structures
+
+Additionally, non-public data members end with an underscore (`_`).
+
+#### Namespaces
+
+Namespaces use `snake_case`.
+
+#### Template parameters
+
+* Type template parameters use `CamelCase`, for example `ValueType`.
+* Non-type template parameters use `snake_case`, for example `subwarp_size`.
+
+### Whitespace
+
+Spaces and tabs are handled by ClangFormat, but blank lines are only partially
+handled (the current configuration doesn't allow for more than 2 blank lines).
+Thus, contributors should be aware of the following rules for blank lines:
+
+1.  Top-level statements and statements directly within namespaces are separated
+    with 2 blank lines. The first / last statement of a namespace is separated
+    by two blank lines from the opening / closing brace of the namespace.
+    1.  _exception_: if the first __or__ the last statement in the namespace is
+    another namespace, then no blank lines are required
+        _example_:
+        ```c++
+        namespace foo {
+
+
+        struct x {
+        };
+
+
+        }  // namespace foo
+
+
+        namespace bar {
+        namespace baz {
+
+
+        void f();
+
+
+        }  // namespace baz
+        }  // namespace bar
+        ```
+
+    2.  _exception_: in header files whose only purpose is to _declare_ a bunch
+        of functions (e.g. the `*_kernel.hpp` files) these declarations can be
+        separated by only 1 blank line (note: standard rules apply for all other
+        statements that might be present in that file)
+    3.  _exception_: "related" statement can have 1 blank line between them.
+        "Related" is not a strictly defined adjective in this sense, but is in
+        general one of:
+
+        1.  overload of a same function,
+        2.  function / class template and its specializations,
+        3.  macro that modifies the meaning or adds functionality to the
+            previous / following statement.
+
+        However, simply calling function `f` from function `g` does not imply
+        that `f` and `g` are "related".
+2.  Statements within structures / classes are separated with 1 blank line.
+    There are no blank lines between the first / last statement in the
+    structure / class.
+    1.  _exception_: there is no blank line between an access modifier (`private`, `protected`, `public`) and the following statement.
+       _example_:
+        ```c++
+        class foo {
+        public:
+            int get_x() const noexcept { return x_; }
+
+            int &get_x() noexcept { return x_; }
+
+        private:
+            int x_;
+        };
+        ```
+
+3.  Function bodies cannot have multiple consecutive blank lines, and a single
+    blank line can only appear between two logical sections of the function.
+4. Unit tests should follow the [AAA](http://wiki.c2.com/?ArrangeActAssert)
+   pattern, and a single blank line must appear between consecutive "A"
+   sections. No other blank lines are allowed in unit tests.
+5.  Enumeration definitions should have no blank lines between consecutive
+    enumerators.
+
+
+### Include statement grouping
+
+In general, all include statements should be present on the top of the file,
+ordered in the following groups, with two blank lines between each group:
+
+1. Related header file (e.g. `core/foo/bar.hpp` included in `core/foo/bar.cpp`,
+   or in the unit test `core/test/foo/bar.cpp`)
+2. Standard library headers (e.g. `vector`)
+3. Executor specific library headers (e.g. `omp.h`)
+4. System third-party library headers (e.g. `papi.h`)
+5. Local third-party library headers
+6. Public Ginkgo headers
+7. Private Ginkgo headers
+
+_Example_: A file `core/base/my_file.cpp` might have an include list like this:
+
+```c++
+#include <ginkgo/core/base/my_file.hpp>
+
+
+#include <algorithm>
+#include <vector>
+#include <tuple>
+
+
+#include <omp.h>
+
+
+#include <papi.h>
+
+
+#include "third_party/blas/cblas.hpp"
+#include "third_party/lapack/lapack.hpp"
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "core/base/my_file_kernels.hpp"
+```
+
+#### Main header
+
+This section presents general rules used to define the main header attributed to
+the file. In the previous example, this would be ` #include
+<ginkgo/core/base/my_file.hpp>`.
+
+General rules:
+1. Some fixed main header.
+2. components:
+  - with `_kernel` suffix looks for the header in the same folder.
+  - without `_kernel` suffix looks for the header in `core`.
+3. `test/utils`: looks for the header in `core`
+4. `core`: looks for the header in `ginkgo`
+5. `test` or `base`: looks for the header in `ginkgo/core`
+6. others: looks for the header in `core`
+
+_Note_: Please see the detail in the `dev_tools/scripts/config`.
+
+#### Some general comments.
+
+1. Private headers of Ginkgo should not be included within the public Ginkgo header.
+2. It is a good idea to keep the headers self-sufficient. See the [Google Style guide for reasoning](https://google.github.io/styleguide/cppguide.html#Self_contained_Headers).
+When compiling with `GINKGO_CHECK_CIRCULAR_DEPS` enabled, this property is explicitly checked.
+3. The recommendations of the `iwyu` (Include what you use) tool can be used to make sure that the headers are self-sufficient and that the compiled files ( `.cu`, `.cpp`, `.hip.cpp` ) include only what they use. A [CI pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/jobs/584358356) is available that runs with the `iwyu` tool. Please be aware that this tool can be incorrect in some cases.
+
+#### Automatic header arrangement
+
+1. `dev_tools/scripts/format_header.sh` will take care of the grouping/sorting of
+   headers according to this guideline.
+2. `make format_header` arranges the header of the modified files in the branch.
+3. `make format_header_all` arranges the header of all files.
+
+
+### Other Code Formatting not handled by ClangFormat
+
+#### Control flow constructs
+
+Single line statements should be avoided in all cases. Use of brackets is
+mandatory for all control flow constructs (e.g. `if`, `for`, `while`, ...).
+
+#### Variable declarations
+
+C++ supports declaring / defining multiple variables using a single
+_type-specifier_. However, this is often very confusing as references and
+pointers exhibit strange behavior:
+
+```c++
+template <typename T> using pointer = T *;
+
+int *        x, y;  // x is a pointer, y is not
+pointer<int> x, y;  // both x and y are pointers
+```
+
+For this reason, __always__ declare each variable on a separate line, with its
+own _type-specifier_.
+
+### CMake coding style
+
+#### Whitespaces
+
+All alignment in CMake files should use four spaces.
+
+#### Use of macros vs functions
+
+Macros in CMake do not have a scope. This means that any variable set in this
+macro will be available to the whole project. In contrast, functions in CMake
+have local scope and therefore all set variables are local only. In general,
+wrap all pieces of algorithms using temporary variables in a function and use
+macros to propagate variables to the whole project.
+
+#### Naming style
+
+All Ginkgo specific variables should be prefixed with a `GINKGO_` and all
+functions by `ginkgo_`.
+
+
+## Helper scripts
+
+To facilitate easy development within Ginkgo and to encourage coders and
+scientists who do not want to get bogged down by the details of the Ginkgo library,
+but rather focus on writing the algorithms and the kernels, Ginkgo provides the
+developers with a few helper scripts.
+
+### Create a new algorithm
+
+A `create_new_algorithm.sh` script is available for developers to facilitate
+easy addition of new algorithms. The options it provides can be queried with
+
+```sh
+./create_new_algorithm.sh --help
+```
+The main objective of this script is to add files and boiler plate code for the
+new algorithm using a model and an instance of that model. For example, models
+can be any one of `factorization`, `matrix`, `preconditioner` or `solver`. For
+example to create a new solver named `my_solver` similar to `gmres`, you would
+set the `ModelType` to `solver` and set the `ModelName` to `gmres`. This would
+duplicate the core algorithm and kernels of the `gmres` algorithm and replace
+the naming to `my_solver`. Additionally, all the kernels of the new `my_solver`
+are marked as `GKO_NOT_IMPLEMENTED`. For easy navigation, a `.txt` file is created
+in the folder where the script is run, which lists all the TODO's. These TODO's can
+also be found in the corresponding files.
+
+### Converting CUDA code to HIP code
+We provide a `cuda2hip` script that converts `cuda` kernel code into `hip` kernel code.
+Internally, this script calls the [`hipify` script](https://github.com/ROCm-Developer-Tools/HIPIFY) provided by HIP, converting the CUDA syntax
+to HIP syntax. Additionally, it also automatically replaces the instances of
+CUDA with HIP as appropriate. Hence, this script can be called on a Ginkgo CUDA
+file. You can find this script in the `dev_tools/scripts/` folder.
+
+
+## Writing Tests
+
+Ginkgo uses the [GTest framework](https://github.com/google/googletest) for the
+unit test framework within Ginkgo. Writing good tests is extremely important to
+verify the functionality of the new code and to make sure that none of the
+existing code has been broken.
+
+### Testing know-how
+
+* GTest provides a [comprehensive
+  documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
+  of the functionality available within Gtest.
+* Reduce code duplication with [Testing Fixtures,
+  `TEST_F`](https://github.com/google/googletest/blob/master/googletest/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests)
+* Write templated tests using
+  [`TYPED_TEST`](https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#typed-tests).
+
+### Some general rules.
+
+* Unit tests must follow the [KISS
+  principle](https://en.wikipedia.org/wiki/KISS_principle).
+* Unit tests must follow the [AAA](http://wiki.c2.com/?ArrangeActAssert)
+  pattern, and a single blank line must appear between consecutive "A" sections.
+
+### Writing tests for kernels
+
+* Reference kernels, kernels on the `ReferenceExecutor`, are meant to be single
+  threaded reference implementations. Therefore, tests for reference kernels
+  need to be performed with data that can be as small as possible. For example,
+  matrices smaller than 5x5 are acceptable. This allows the reviewers to verify
+  the results for exactness with tools such as MATLAB.
+* OpenMP, CUDA and HIP kernels have to be tested against the reference kernels.
+  Hence data for the tests of these kernels can be generated in the test files
+  using helper functions or by using external files to be read through the
+  standard input. In particular for CUDA and HIP, the data size should be at
+  least bigger than the architecture's warp size to ensure there is no corner
+  case in the kernels.
+
+
+## Documentation style
+
+Documentation uses standard Doxygen.
+
+###  Developer targeted notes
+
+Make use of `@internal` doxygen tag. This can be used for any comment which is
+not intended for users, but is useful to better understand a piece of code.
+
+### Whitespaces
+
+#### After named tags such as `@param foo`
+
+The documentation tags which use an additional name should be followed by two
+spaces in order to better distinguish the text from the doxygen tag. It is also
+possible to use a line break instead.
+
+### Documenting examples
+
+There are two main steps:
+
+1. First, you can just copy over the
+   [`doc/`](https://github.com/ginkgo-project/ginkgo/tree/master/examples/simple-solver)
+   folder (you can copy it from the example most relevant to you) and adapt your
+   example names and such, then you can modify the actual documentation.
++ In `tooltip`: A short description of the example.
++ In `short-intro`: The name of the example.
++ In `results.dox`: Run the example and write the output you get.
++ In `kind`: The kind of the example. For different kinds see [the
+  documentation](https://ginkgo-project.github.io/ginkgo/doc/master/Examples.html).
+  Examples can be of `basic`, `techniques`, `logging`, `stopping_criteria` or
+  `preconditioners`. If your example does not fit any of these categories, feel
+  free to create one.
++ In `intro.dox`: You write an explanation of your code with some introduction
+  similar to what you see in an existing example most relevant to you.
++ In `builds-on`: You write the examples it builds on.
+
+2. You also need to modify the
+   [examples.hpp.in](https://github.com/ginkgo-project/ginkgo/blob/master/doc/examples/examples.hpp.in)
+   file. You add the name of the example in the main section and in the section
+   that you specified in the `doc/kind` file in the example documentation.
+
+
+## Other programming comments
+
+### C++ standard stream objects
+
+These are global objects and are shared inside the same translation unit.
+Therefore, whenever their state or formatting is changed (e.g. using `std::hex` or
+floating point formatting) inside library code, make sure to restore the state
+before returning the control to the user. See this [stackoverflow
+question](https://stackoverflow.com/questions/2273330/restore-the-state-of-stdcout-after-manipulating-it)
+for examples on how to do it correctly. This is extremely important for header
+files.
+
+### Warnings
+
+By default, the `-DGINKGO_COMPILER_FLAGS` is set to `-Wpedantic` and hence
+pedantic warnings are emitted by default. Some of these warnings are false
+positives and a complete list of the resolved warnings and their solutions is
+listed in [Issue 174](https://github.com/ginkgo-project/ginkgo/issues/174).
+Specifically, when macros are being used, we have the issue of having `extra ;`
+warnings, which is resolved by adding a `static_assert()`. The CI system
+additionally also has a step where it compiles for pedantic warnings to be
+errors.
+
+### Avoiding circular dependencies
+
+To facilitate finding circular dependencies issues (see [Using library
+classes](#using-library-classes) for more details), a CI step `no-circular-deps`
+was created. For more details on its usage, see [this
+pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/pipelines/52941979),
+where Ginkgo did not abide by this policy and [PR
+#278](https://github.com/ginkgo-project/ginkgo/pull/278) which fixed this. Note
+that doing so is not enough to guarantee with 100% accuracy that no circular
+dependency is present. For an example of such a case, take a look at [this
+pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/pipelines/53006772)
+where one of the compiler setups detected an incorrect dependency of the `cuda`
+module (due to jacobi) on the `core` module.
diff --git a/INSTALL.md b/INSTALL.md
index d542d4d27a9..a3456d24ff7 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -1,6 +1,6 @@
 Installation Instructions                      {#install_ginkgo}
 -------------------------------------
-### Building 
+### Building
 
 Use the standard cmake build procedure:
 
@@ -17,7 +17,7 @@ Ginkgo adds the following additional switches to control what is being built:
 
 *   `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development
     (requires clang-format, will also download git-cmake-format),
-    default is `ON`.
+    default is `OFF`.
 *   `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
     (will download googletest), default is `ON`.
 *   `-DGINKGO_BUILD_BENCHMARKS={ON, OFF}` builds Ginkgo's benchmarks
@@ -28,9 +28,15 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_BUILD_REFERENCE={ON, OFF}` build reference implementations of the
     kernels, useful for testing, default is `ON`
 *   `-DGINKGO_BUILD_OMP={ON, OFF}` builds optimized OpenMP versions of the kernels,
-    default is `OFF`
+    default is `ON` if the selected C++ compiler supports OpenMP, `OFF` otherwise.
 *   `-DGINKGO_BUILD_CUDA={ON, OFF}` builds optimized cuda versions of the kernels
-    (requires CUDA), default is `OFF`
+    (requires CUDA), default is `ON` if a CUDA compiler could be detected,
+    `OFF` otherwise.
+*   `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels
+    (requires HIP), default is `ON` if an installation of HIP could be detected,
+    `OFF` otherwise.
+*   `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable
+    passed to hipcc for the `hcc` HIP backend. The default is none (auto).
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.
 *   `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples
@@ -47,6 +53,9 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_WITH_IWYU={ON, OFF}` makes Ginkgo call `iwyu` to find include
     issues. The path can be manually controlled with the CMake variable
     `-DGINKGO_IWYU_PATH=<path>`. The default is `OFF`.
+*   `-DGINKGO_CHECK_CIRCULAR_DEPS={ON, OFF}` enables compile-time checks for
+    circular dependencies between different Ginkgo libraries and self-sufficient
+    headers. Should only be used for development purposes. The default is `OFF`.
 *   `-DGINKGO_VERBOSE_LEVEL=integer` sets the verbosity of Ginkgo.
     * `0` disables all output in the main libraries,
     * `1` enables a few important messages related to unexpected behavior (default).
@@ -54,8 +63,9 @@ Ginkgo adds the following additional switches to control what is being built:
     The default value is usually something like `/usr/local`.
 *   `-DCMAKE_BUILD_TYPE=type` specifies which configuration will be used for
     this build of Ginkgo. The default is `RELEASE`. Supported values are CMake's
-    standard build types such as `DEBUG` and `RELEASE` and the Ginkgo specific 
-	`COVERAGE`, `ASAN` (AddressSanitizer) and `TSAN` (ThreadSanitizer) types.
+    standard build types such as `DEBUG` and `RELEASE` and the Ginkgo specific
+    `COVERAGE`, `ASAN` (AddressSanitizer), `LSAN` (LeakSanitizer), `TSAN`
+    (ThreadSanitizer) and `UBSAN` (undefined behavior sanitizer) types.
 *   `-DBUILD_SHARED_LIBS={ON, OFF}` builds ginkgo as shared libraries (`OFF`)
     or as dynamic libraries (`ON`), default is `ON`.
 *   `-DGINKGO_JACOBI_FULL_OPTIMIZATIONS={ON, OFF}` use all the optimizations
@@ -99,14 +109,110 @@ For example, to build everything (in debug mode), use:
 
 ```cmake
 cmake  -G "Unix Makefiles" -H. -BDebug -DCMAKE_BUILD_TYPE=Debug -DGINKGO_DEVEL_TOOLS=ON \
-              -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_REFERENCE=ON -DGINKGO_BUILD_OMP=ON \
-	          -DGINKGO_BUILD_CUDA=ON 
+    -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_REFERENCE=ON -DGINKGO_BUILD_OMP=ON \
+    -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_HIP=ON
 cmake --build Debug
 ```
 
 NOTE: Ginkgo is known to work with the `Unix Makefiles` and `Ninja` based
 generators. Other CMake generators are untested.
 
+### Building Ginkgo with HIP support
+Ginkgo provides a [HIP](https://github.com/ROCm-Developer-Tools/HIP) backend.
+This allows compiling optimized versions of the kernels for either AMD or
+NVIDIA GPUs. The CMake configuration step will try to auto-detect the presence
+of HIP either at `/opt/rocm/hip` or at the path specified by `HIP_PATH` as a
+CMake parameter (`-DHIP_PATH=`) or environment variable (`export HIP_PATH=`),
+unless `-DGINKGO_BUILD_HIP=ON/OFF` is set explicitly.
+
+#### Correctly installing HIP toolkits and dependencies for Ginkgo
+In general, Ginkgo's HIP backend requires the following packages:
++ HIP,
++ hipBLAS,
++ hipSPARSE,
++ Thrust.
+
+It is necessary to provide some details about the different ways to
+procure and install these packages, in particular for NVIDIA systems since
+getting a correct, non bloated setup is not straightforward.
+
+For AMD systems, the simplest way is to follow the [instructions provided
+here](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md) which
+provide package installers for most Linux distributions. Ginkgo also needs the
+installation of the [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS)
+and [hipSPARSE](https://github.com/ROCmSoftwarePlatform/hipSPARSE) interfaces.
+Optionally if you do not already have a thrust installation, [the ROCm provided
+rocThrust package can be
+used](https://github.com/ROCmSoftwarePlatform/rocThrust).
+
+For NVIDIA systems, the traditional installation (package `hip_nvcc`), albeit
+working properly, is currently odd: it depends on all the `hcc` related packages,
+although the `nvcc` backend seems to entirely rely on the CUDA suite. [See this
+issue for more
+details](https://github.com/ROCmSoftwarePlatform/hipBLAS/issues/53). It is
+advised in this case to compile everything manually, including using forks of
+`hipBLAS` and `hipSPARSE` specifically made to not depend on the `hcc` specific
+packages. `Thrust` is often provided by CUDA and this Thrust version should work
+with `HIP`. Here is a sample procedure for installing `HIP`, `hipBLAS` and
+`hipSPARSE`.
+
+
+```bash
+# HIP
+git clone https://github.com/ROCm-Developer-Tools/HIP.git
+pushd HIP && mkdir build && pushd build
+cmake .. && make install
+popd && popd
+
+# hipBLAS
+git clone https://github.com/tcojean/hipBLAS.git
+pushd hipBLAS && mkdir build && pushd build
+cmake .. && make install
+popd && popd
+
+# hipSPARSE
+git clone https://github.com/tcojean/hipSPARSE.git
+pushd hipSPARSE && mkdir build && pushd build
+cmake -DBUILD_CUDA=ON .. && make install
+popd && popd
+```
+
+
+#### Changing the paths to search for HIP and other packages
+All HIP installation paths can be configured through the use of environment
+variables or CMake variables. This way of configuring the paths is currently
+imposed by the `HIP` tool suite. The variables are the following:
++ CMake `-DHIP_PATH=` or  environment `export HIP_PATH=`: sets the `HIP`
+  installation path. The default value is `/opt/rocm/hip`.
++ CMake `-DHIPBLAS_PATH=` or  environment `export HIPBLAS_PATH=`: sets the
+  `hipBLAS` installation path. The default value is `/opt/rocm/hipblas`.
++ CMake `-DHIPSPARSE_PATH=` or  environment `export HIPSPARSE_PATH=`: sets the
+  `hipSPARSE` installation path. The default value is `/opt/rocm/hipsparse`.
++ CMake `-DHCC_PATH=` or  environment `export HCC_PATH=`: sets the `HCC`
+  installation path, for AMD backends. The default value is `/opt/rocm/hcc`.
++ environment `export CUDA_PATH=`: where `hipcc` can find `CUDA` if it is not in
+  the default `/usr/local/cuda` path.
+
+
+#### HIP platform detection of AMD and NVIDIA
+By default, Ginkgo uses the output of `/opt/rocm/hip/bin/hipconfig --platform`
+to select the backend. The accepted values are either `hcc` (AMD) or `nvcc`
+(NVIDIA). When on an AMD or NVIDIA system, this should output the correct
+platform by default. When on a system without GPUs, this should output `hcc` by
+default. To change this value, export the environment variable `HIP_PLATFORM`
+like so:
+```bash
+export HIP_PLATFORM=nvcc
+```
+
+#### Setting platform specific compilation flags
+Platform specific compilation flags can be given through the following
+CMake variables:
++ `-DGINKGO_HIP_COMPILER_FLAGS=`: compilation flags given to all platforms.
++ `-DGINKGO_HIP_HCC_COMPILER_FLAGS=`: compilation flags given to AMD platforms.
++ `-DGINKGO_HIP_NVCC_COMPILER_FLAGS=`: compilation flags given to NVIDIA platforms.
+
+
 ### Third party libraries and packages
 
 Ginkgo relies on third party packages in different cases. These third party
diff --git a/LICENSE b/LICENSE
index efb4bb6d9bf..48867b57a87 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/README.md b/README.md
index 7a8ba605cdb..48d344e2c6e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
 ![Ginkgo](/assets/logo.png)
 
-[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/build.svg)](https://github.com/ginkgo-project/ginkgo/commits/master)
+[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/pipeline.svg)](https://github.com/ginkgo-project/ginkgo/commits/master)
+[![OSX-build](https://github.com/ginkgo-project/ginkgo/workflows/OSX-build/badge.svg?branch=master)](https://github.com/ginkgo-project/ginkgo/actions?query=workflow%3AOSX-build+branch%3Amaster)
+[![Windows-build](https://github.com/ginkgo-project/ginkgo/workflows/windows-build/badge.svg?branch=master)](https://github.com/ginkgo-project/ginkgo/actions?query=workflow%3AWindows-build+branch%3Amaster)
+[![codecov](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master/graph/badge.svg)](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master)
+[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)
+[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=reliability_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)
+
 [![CDash dashboard](https://img.shields.io/badge/CDash-Access-blue.svg)](http://my.cdash.org/index.php?project=Ginkgo+Project)
 [![Documentation](https://img.shields.io/badge/Documentation-latest-blue.svg)](https://ginkgo-project.github.io/ginkgo/doc/master/)
 [![License](https://img.shields.io/github/license/ginkgo-project/ginkgo.svg)](./LICENSE)
@@ -9,7 +15,7 @@
 Ginkgo is a high-performance linear algebra library for manycore systems, with a
 focus on sparse solution of linear systems. It is implemented using modern C++
 (you will need at least C++11 compliant compiler to build it), with GPU kernels
-implemented in CUDA.
+implemented in CUDA and HIP.
 
 
 Performance
@@ -31,7 +37,7 @@ For Ginkgo core library:
 
 *   _cmake 3.9+_
 *   C++11 compliant compiler, one of:
-    *   _gcc 5.3+, 6.3+, 7.3+, 8.1+_
+    *   _gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+_
     *   _clang 3.9+_
     *   _Intel compiler 2017+_
     *   _Apple LLVM 8.0+_ (__TODO__: verify)
@@ -44,20 +50,28 @@ The Ginkgo CUDA module has the following __additional__ requirements:
     [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
     or [CUDA installation guide for Mac Os X](https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html)
 
+
 In addition, if you want to contribute code to Ginkgo, you will also need the
 following:
 
-*   _clang-format 5.0.1+_ (ships as part of _clang_)
+*   _clang-format 5.0.0+_ (ships as part of _clang_)
 *   _clang-tidy_ (optional, when setting the flag `-DGINKGO_WITH_CLANG_TIDY=ON`)
 *   _iwyu_ (Include What You Use, optional, when setting the flag `-DGINKGO_WITH_IWYU=ON`)
 
+The Ginkgo HIP module has the following __additional__ requirements:
+
+*   _ROCm 2.8+_
+*   the HIP, hipBLAS and hipSPARSE packages compiled with either:
+    * _AMD_ backend
+    * _CUDA 9.0+_ backend. When using CUDA 10+, _cmake 3.12.2+_ is required.
+
 ### Windows
 
 The prequirement needs to be verified
 *   _cmake 3.9+_
 *   C++11 compliant 64-bits compiler:
-    *   _MinGW : gcc 5.3+, 6.3+, 7.3+, 8.1+_
-    *   _Cygwin : gcc 5.3+, 6.3+, 7.3+, 8.1+_
+    *   _MinGW : gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+_
+    *   _Cygwin : gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+_
     *   _Microsoft Visual Studio : VS 2017 15.7+_
 
 __NOTE:__ Need to add `--autocrlf=input` after `git clone` in _Cygwin_.
@@ -128,12 +142,10 @@ Name Surname <email@domain> Institution(s)
 
 #### Contributing guidelines
 
-Contributing guidelines can be accessed in our Wiki under the [Developer's
-Homepage](https://github.com/ginkgo-project/ginkgo/wiki/Developers-Homepage).
-This page also contains other information useful to developers, such as writing
-proper commit messages, understanding Ginkgo's library design, relevant C++
-information, and more. In general, always refer to this page for developer
-information.
+Contributing guidelines can be accessed in the [CONTRIBUTING.md
+page](./CONTRIBUTING.md). This page also contains other information useful to
+developers, such as writing proper commit messages, understanding Ginkgo's
+library design, relevant C++ information, and more.
 
 ### Support
 If you have any question, bug to report or would like to propose a new feature,
@@ -152,3 +164,23 @@ Depending on the configuration options used when building Ginkgo, third party
 software may be pulled as additional dependencies, which have their own
 licensing conditions. Refer to [ABOUT-LICENSING.md](ABOUT-LICENSING.md) for
 details.
+
+Citing Ginkgo
+-------------
+
+The main Ginkgo paper describing Ginkgo's purpose, design and interface is
+available through the following reference:
+
+``` bibtex
+@misc{anzt2020ginkgo,
+    title={Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing},
+    author={Hartwig Anzt and Terry Cojean and Goran Flegar and Fritz Göbel and Thomas Grützmacher and Pratik Nayak and Tobias Ribizel and Yuhsiang Mike Tsai and Enrique S. Quintana-Ortí},
+    year={2020},
+    eprint={2006.16852},
+    archivePrefix={arXiv},
+    primaryClass={cs.MS}
+}
+```
+
+For more information on topical subjects, please refer to the [CITING.md
+page](CITING.md).
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index cbfe9975edc..e786c63d5ed 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -4,6 +4,13 @@ if (NOT CMAKE_BUILD_TYPE STREQUAL "Release")
         "will be affected")
 endif()
 
+if (GINKGO_BUILD_CUDA AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "hcc")
+    message(FATAL_ERROR "Building the benchmarks for both HIP AMD and CUDA "
+        "at the same time is currently not supported. "
+        "Disable the benchmark build using `-DGINKGO_BUILD_BENCHMARKS=OFF` "
+        "or use `export HIP_PLATFORM=nvcc` in your build environment instead.")
+endif()
+
 function(ginkgo_benchmark_cusp_linops name)
     target_compile_definitions("${name}" PRIVATE HAS_CUDA=1)
     target_link_libraries("${name}" ginkgo ${CUDA_RUNTIME_LIBS}
@@ -14,6 +21,48 @@ function(ginkgo_benchmark_cusp_linops name)
     endif()
 endfunction()
 
+# Configures the benchmark target `name` for the HIP backend. It defines
+# HAS_HIP=1, applies the flags printed by `hipconfig --cpp_config`, adds the
+# HSA/HIP/hipBLAS/hipSPARSE include directories as SYSTEM includes and links
+# hip::device (hcc platform) or ${HIP_CUDA_LIBRARIES} (otherwise), followed by
+# ${HIPSPARSE_LIBRARIES}.
+# The keyword-less target_link_libraries signature is kept on purpose: the
+# same targets are already linked that way and CMake forbids mixing both.
+function(ginkgo_benchmark_hipsp_linops name)
+    target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
+    # Strip trailing whitespace and quote the value so COMPILE_FLAGS always
+    # receives exactly one argument, even when hipconfig prints nothing.
+    execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config
+        OUTPUT_VARIABLE HIP_CXX_FLAGS
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    set_target_properties("${name}" PROPERTIES COMPILE_FLAGS "${HIP_CXX_FLAGS}")
+    # for some reason, HIP creates a dependency on Threads::Threads here, so we
+    # need to find it
+    find_package(Threads REQUIRED)
+    find_package(HIP REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(hipsparse REQUIRED)
+    target_include_directories("${name}" SYSTEM PRIVATE
+        ${HSA_HEADER} ${HIP_INCLUDE_DIRS}
+        ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
+
+    if(GINKGO_HIP_PLATFORM MATCHES "hcc")
+        # NOTE(review): the ginkgo_hip_* helpers are defined elsewhere; they
+        # presumably remove hcc-only flags from the imported targets - confirm.
+        ginkgo_hip_ban_link_hcflag(hcc::hccrt)
+        ginkgo_hip_ban_link_hcflag(hcc::hc_am)
+        ginkgo_hip_ban_link_hcflag(hcc::mcwamp)
+        ginkgo_hip_ban_compile_hcflag(hcc::hccrt)
+        ginkgo_hip_ban_compile_hcflag(hcc::hc_am)
+        ginkgo_hip_ban_compile_hcflag(hcc::mcwamp)
+        ginkgo_hip_clang_ban_hip_device_flags()
+        target_link_libraries("${name}" hip::device)
+    else()
+        target_link_libraries("${name}" ${HIP_CUDA_LIBRARIES})
+    endif()
+    target_link_libraries("${name}" ${HIPSPARSE_LIBRARIES})
+endfunction()
+
 add_subdirectory(conversions)
 add_subdirectory(matrix_generator)
 add_subdirectory(matrix_statistics)
diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp
index 7921c31fa81..d2cc6c147e9 100644
--- a/benchmark/conversions/conversions.cpp
+++ b/benchmark/conversions/conversions.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -195,5 +195,5 @@ int main(int argc, char *argv[])
         }
     }
 
-    std::cout << test_cases;
+    std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp
index 9f127b11cff..7622d2cd4ed 100644
--- a/benchmark/matrix_generator/matrix_generator.cpp
+++ b/benchmark/matrix_generator/matrix_generator.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -72,6 +72,7 @@ std::string input_format =
 }  // namespace
 
 
+// clang-format off
 // input validation
 [[noreturn]] void print_config_error_and_exit(int code = 1)
 {
@@ -79,6 +80,7 @@ std::string input_format =
               << input_format << std::endl;
     std::exit(code);
 }
+// clang-format on
 
 
 void validate_option_object(const rapidjson::Value &value)
@@ -151,5 +153,5 @@ int main(int argc, char *argv[])
         }
     }
 
-    std::cout << configurations;
+    std::cout << configurations << std::endl;
 }
diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index c0f6a86f70c..72e899407d7 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -72,18 +72,21 @@ void compute_summary(const std::vector<gko::size_type> &dist,
     // clang-format on
 
     add_or_set_member(out, "min", dist[0], allocator);
-    add_or_set_member(out, "q1",
-                      coefs[r][0] * dist[positions[r][0]] +
-                          coefs[r][1] * dist[positions[r][1]],
-                      allocator);
-    add_or_set_member(out, "median",
-                      coefs[r][2] * dist[positions[r][2]] +
-                          coefs[r][3] * dist[positions[r][3]],
-                      allocator);
-    add_or_set_member(out, "q3",
-                      coefs[r][4] * dist[positions[r][4]] +
-                          coefs[r][5] * dist[positions[r][5]],
-                      allocator);
+    add_or_set_member(
+        out, "q1",
+        coefs[r][0] * static_cast<double>(dist[positions[r][0]]) +
+            coefs[r][1] * static_cast<double>(dist[positions[r][1]]),
+        allocator);
+    add_or_set_member(
+        out, "median",
+        coefs[r][2] * static_cast<double>(dist[positions[r][2]]) +
+            coefs[r][3] * static_cast<double>(dist[positions[r][3]]),
+        allocator);
+    add_or_set_member(
+        out, "q3",
+        coefs[r][4] * static_cast<double>(dist[positions[r][4]]) +
+            coefs[r][5] * static_cast<double>(dist[positions[r][5]]),
+        allocator);
     add_or_set_member(out, "max", dist[dist.size() - 1], allocator);
 }
 
@@ -94,11 +97,12 @@ double compute_moment(int degree, const std::vector<gko::size_type> &dist,
     if (normalization == 0.0) {
         return 0.0;
     }
-    auto moment = 0.0;
+    double moment = 0.0;
     for (const auto &x : dist) {
-        moment += std::pow(x - center, degree);
+        moment += std::pow(static_cast<double>(x) - center, degree);
     }
-    return moment / dist.size() / std::pow(normalization, degree);
+    return moment / static_cast<double>(dist.size()) /
+           std::pow(normalization, static_cast<double>(degree));
 }
 
 
@@ -208,5 +212,5 @@ int main(int argc, char *argv[])
         }
     }
 
-    std::cout << test_cases;
+    std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 0fc19054d85..11979fd6ba1 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -52,9 +52,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 DEFINE_uint32(max_block_size, 32,
               "Maximal block size of the block-Jacobi preconditioner");
 
-DEFINE_string(preconditioners, "jacobi",
-              "A comma-separated list of solvers to run."
-              "Supported values are: jacobi");
+DEFINE_uint32(num_iterations, 5,
+              "Number of iterations for the ParICT/ParILU(T) preconditioner");
+
+DEFINE_bool(
+    approx_select, true,
+    "Use approximate selection for the threshold filtering in ParICT/ParILUT");
+
+DEFINE_double(fill_limit, 2.0, "The fill-in limit used in ParICT/ParILUT");
+
+DEFINE_string(preconditioners, "jacobi,parilu,parilut,ilu",
+              "A comma-separated list of preconditioners to run. "
+              "Supported values are: jacobi, parict, parilu, parilut, ilu");
 
 DEFINE_string(storage_optimization, "0,0",
               "Defines the kind of storage optimization to perform on "
@@ -90,13 +99,55 @@ gko::precision_reduction parse_storage_optimization(const std::string &flag)
 const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                                 std::shared_ptr<const gko::Executor> exec)>>
     precond_factory{
-        {"jacobi", [](std::shared_ptr<const gko::Executor> exec) {
+        {"jacobi",
+         [](std::shared_ptr<const gko::Executor> exec) {
              return gko::preconditioner::Jacobi<etype>::build()
                  .with_max_block_size(FLAGS_max_block_size)
                  .with_storage_optimization(
                      parse_storage_optimization(FLAGS_storage_optimization))
                  .with_accuracy(FLAGS_accuracy)
                  .on(exec);
+         }},
+        {"parict",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto ict_fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::ParIct<etype>::build()
+                     .with_iterations(FLAGS_num_iterations)
+                     .with_approximate_select(FLAGS_approx_select)
+                     .with_fill_in_limit(FLAGS_fill_limit)
+                     .on(exec));
+             return gko::preconditioner::Ilu<>::build()
+                 .with_factorization_factory(ict_fact)
+                 .on(exec);
+         }},
+        {"parilu",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto ilu_fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::ParIlu<etype>::build()
+                     .with_iterations(FLAGS_num_iterations)
+                     .on(exec));
+             return gko::preconditioner::Ilu<>::build()
+                 .with_factorization_factory(ilu_fact)
+                 .on(exec);
+         }},
+        {"parilut",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto ilut_fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::ParIlut<etype>::build()
+                     .with_iterations(FLAGS_num_iterations)
+                     .with_approximate_select(FLAGS_approx_select)
+                     .with_fill_in_limit(FLAGS_fill_limit)
+                     .on(exec));
+             return gko::preconditioner::Ilu<>::build()
+                 .with_factorization_factory(ilut_fact)
+                 .on(exec);
+         }},
+        {"ilu", [](std::shared_ptr<const gko::Executor> exec) {
+             auto ilu_fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::Ilu<etype>::build().on(exec));
+             return gko::preconditioner::Ilu<>::build()
+                 .with_factorization_factory(ilu_fact)
+                 .on(exec);
          }}};
 
 
@@ -105,12 +156,34 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
 std::string encode_parameters(const char *precond_name)
 {
     static std::map<std::string, std::string (*)()> encoder{
-        {"jacobi", [] {
+        {"jacobi",
+         [] {
              std::ostringstream oss;
              oss << "jacobi-" << FLAGS_max_block_size << "-"
                  << FLAGS_storage_optimization;
              return oss.str();
-         }}};
+         }},
+        {"parict",
+         [] {
+             std::ostringstream oss;
+             oss << "parict-" << FLAGS_num_iterations << '-'
+                 << FLAGS_approx_select << '-' << FLAGS_fill_limit;
+             return oss.str();
+         }},
+        {"parilu",
+         [] {
+             std::ostringstream oss;
+             oss << "parilu-" << FLAGS_num_iterations;
+             return oss.str();
+         }},
+        {"parilut",
+         [] {
+             std::ostringstream oss;
+             oss << "parilut-" << FLAGS_num_iterations << '-'
+                 << FLAGS_approx_select << '-' << FLAGS_fill_limit;
+             return oss.str();
+         }},
+        {"ilu", [] { return std::string{"ilu"}; }}};
     return encoder[precond_name]();
 }
 
@@ -196,7 +269,8 @@ void run_preconditioner(const char *precond_name,
             auto x_clone = clone(x);
             auto precond = precond_factory.at(precond_name)(exec);
 
-            auto gen_logger = std::make_shared<OperationLogger>(exec);
+            auto gen_logger =
+                std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
             exec->add_logger(gen_logger);
             std::unique_ptr<gko::LinOp> precond_op;
             for (auto i = 0u; i < FLAGS_repetitions; ++i) {
@@ -207,7 +281,8 @@ void run_preconditioner(const char *precond_name,
             gen_logger->write_data(this_precond_data["generate"]["components"],
                                    allocator, FLAGS_repetitions);
 
-            auto apply_logger = std::make_shared<OperationLogger>(exec);
+            auto apply_logger =
+                std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
             exec->add_logger(apply_logger);
             for (auto i = 0u; i < FLAGS_repetitions; ++i) {
                 precond_op->apply(lend(b), lend(x_clone));
@@ -310,5 +385,5 @@ int main(int argc, char *argv[])
         }
     }
 
-    std::cout << test_cases;
+    std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh
index 64e2b5944b2..6a782bac322 100644
--- a/benchmark/run_all_benchmarks.sh
+++ b/benchmark/run_all_benchmarks.sh
@@ -30,6 +30,26 @@ if [ ! "${PRECONDS}" ]; then
     PRECONDS="none"
 fi
 
+if [ ! "${FORMATS}" ]; then
+    echo "FORMATS    environment variable not set - assuming \"csr,coo,ell,hybrid,sellp\"" 1>&2
+    FORMATS="csr,coo,ell,hybrid,sellp"
+fi
+
+if [ ! "${SOLVERS}" ]; then
+    echo "SOLVERS    environment variable not set - assuming \"bicgstab,cg,cgs,fcg,gmres\"" 1>&2
+    SOLVERS="bicgstab,cg,cgs,fcg,gmres"
+fi
+
+if [ ! "${SOLVERS_PRECISION}" ]; then
+    echo "SOLVERS_PRECISION    environment variable not set - assuming \"1e-6\"" 1>&2
+    SOLVERS_PRECISION=1e-6
+fi
+
+if [ ! "${SOLVERS_MAX_ITERATIONS}" ]; then
+    echo "SOLVERS_MAX_ITERATIONS    environment variable not set - assuming \"10000\"" 1>&2
+    SOLVERS_MAX_ITERATIONS=10000
+fi
+
 if [ ! "${SYSTEM_NAME}" ]; then
     echo "SYSTEM_MANE environment variable not set - assuming \"unknown\"" 1>&2
     SYSTEM_NAME="unknown"
@@ -40,6 +60,31 @@ if [ ! "${DEVICE_ID}" ]; then
     DEVICE_ID="0"
 fi
 
+# Control whether to run detailed benchmarks or not.
+# Default setting is detailed=false. To activate, set DETAILED=1.
+if  [ ! "${DETAILED}" ] || [ "${DETAILED}" -eq 0 ]; then
+    DETAILED_STR="--detailed=false"
+else
+    DETAILED_STR="--detailed=true"
+fi
+
+# This allows using a matrix list file for benchmarking.
+# The file should contain one SuiteSparse matrix per line.
+# The allowed formats to identify a SuiteSparse matrix are:
+#   id or group/name or name.
+# Example:
+# 1903
+# Freescale/circuit5M
+# thermal2
+if [ ! "${MATRIX_LIST_FILE}" ]; then
+    use_matrix_list_file=0
+elif [ -f "${MATRIX_LIST_FILE}" ]; then
+    use_matrix_list_file=1
+else
+    echo -e "A matrix list file was set to ${MATRIX_LIST_FILE} but it cannot be found." 1>&2
+    exit 1
+fi
+
 
 ################################################################################
 # Utilities
@@ -87,7 +132,7 @@ run_conversion_benchmarks() {
     [ "${DRY_RUN}" == "true" ] && return
     cp "$1" "$1.imd" # make sure we're not loosing the original input
     ./conversions/conversions --backup="$1.bkp" --double_buffer="$1.bkp2" \
-                --executor="${EXECUTOR}" --formats="csr,coo,hybrid,sellp,ell" \
+                --executor="${EXECUTOR}" --formats="${FORMATS}" \
                 --device_id="${DEVICE_ID}" \
                 <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
@@ -103,7 +148,7 @@ run_spmv_benchmarks() {
     [ "${DRY_RUN}" == "true" ] && return
     cp "$1" "$1.imd" # make sure we're not loosing the original input
     ./spmv/spmv --backup="$1.bkp" --double_buffer="$1.bkp2" \
-                --executor="${EXECUTOR}" --formats="csr,coo,hybrid,sellp,ell" \
+                --executor="${EXECUTOR}" --formats="${FORMATS}" \
                 --device_id="${DEVICE_ID}" \
                 <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
@@ -119,10 +164,10 @@ run_solver_benchmarks() {
     [ "${DRY_RUN}" == "true" ] && return
     cp "$1" "$1.imd" # make sure we're not loosing the original input
     ./solver/solver --backup="$1.bkp" --double_buffer="$1.bkp2" \
-                    --executor="${EXECUTOR}" --solvers="cg,bicgstab,cgs,fcg" \
+                    --executor="${EXECUTOR}" --solvers="${SOLVERS}" \
                     --preconditioners="${PRECONDS}" \
-                    --max_iters=10000 --rel_res_goal=1e-6 \
-                    --device_id="${DEVICE_ID}" \
+                    --max_iters=${SOLVERS_MAX_ITERATIONS} --rel_res_goal=${SOLVERS_PRECISION} \
+                    ${DETAILED_STR} --device_id="${DEVICE_ID}" \
                     <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -173,9 +218,51 @@ generate_suite_sparse_input() {
 EOT
 }
 
+# Translates the entries of a matrix list file into SuiteSparse matrix ids.
+#
+# Arguments:
+#   $1 - path of the list file; entries are separated by whitespace and are
+#        either a numeric id, a matrix name, or a group/name pair.
+# Output:
+#   The resolved ids, separated by spaces, on stdout. Names are resolved by
+#   querying ${SSGET}; unrecognized entries are reported on stderr and skipped.
+parse_matrix_list() {
+    local source_list_file=$1
+    local benchmark_list=""
+    local id=0
+    local mtx
+    for mtx in $(cat "${source_list_file}"); do
+        if [[ "$mtx" =~ ^[0-9]+$ ]]; then
+            id=$mtx
+        elif [[ "$mtx" =~ ^[a-zA-Z0-9_-]+$ ]]; then
+            id=$(${SSGET} -s "[ @name == $mtx ]")
+        elif [[ "$mtx" =~ ^([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)$ ]]; then
+            local group="${BASH_REMATCH[1]}"
+            local name="${BASH_REMATCH[2]}"
+            id=$(${SSGET} -s "[ @name == $name ] && [ @group == $group ]")
+        else
+            >&2 echo -e "Could not recognize entry $mtx."
+            # Skip it; otherwise the id of the previous entry would be added again.
+            continue
+        fi
+        benchmark_list="$benchmark_list $id"
+    done
+    echo "$benchmark_list"
+}
+
+if [ $use_matrix_list_file -eq 1 ]; then
+    MATRIX_LIST=($(parse_matrix_list "$MATRIX_LIST_FILE"))
+    NUM_PROBLEMS=${#MATRIX_LIST[@]}
+fi
+
 LOOP_START=$((1 + (${NUM_PROBLEMS}) * (${SEGMENT_ID} - 1) / ${SEGMENTS}))
 LOOP_END=$((1 + (${NUM_PROBLEMS}) * (${SEGMENT_ID}) / ${SEGMENTS}))
-for (( i=${LOOP_START}; i < ${LOOP_END}; ++i )); do
+for (( p=${LOOP_START}; p < ${LOOP_END}; ++p )); do
+    if [ $use_matrix_list_file -eq 1 ]; then
+        i=${MATRIX_LIST[$((p-1))]}
+    else
+        i=$p
+    fi
     if [ "${BENCHMARK}" == "preconditioner" ]; then
         break
     fi
diff --git a/benchmark/solver/CMakeLists.txt b/benchmark/solver/CMakeLists.txt
index fc1d203ca05..1faae042b24 100644
--- a/benchmark/solver/CMakeLists.txt
+++ b/benchmark/solver/CMakeLists.txt
@@ -2,4 +2,7 @@ add_executable(solver solver.cpp)
 target_link_libraries(solver ginkgo gflags rapidjson)
 if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusp_linops(solver)
-endif()
\ No newline at end of file
+endif()
+if (GINKGO_BUILD_HIP)
+    ginkgo_benchmark_hipsp_linops(solver)
+endif()
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index 7885c30511c..f043977ab9a 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <algorithm>
 #include <chrono>
+#include <cmath>
 #include <cstdlib>
 #include <exception>
 #include <fstream>
@@ -45,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/overhead_linop.hpp"
 
 
 // some Ginkgo shortcuts
@@ -57,21 +59,40 @@ DEFINE_uint32(max_iters, 1000,
 
 DEFINE_double(rel_res_goal, 1e-6, "The relative residual goal of the solver");
 
-DEFINE_string(solvers, "cg",
-              "A comma-separated list of solvers to run."
-              "Supported values are: bicgstab, cg, cgs, fcg, gmres");
+DEFINE_string(
+    solvers, "cg",
+    "A comma-separated list of solvers to run. "
+    "Supported values are: bicgstab, bicg, cg, cgs, fcg, gmres, overhead");
 
-DEFINE_string(preconditioners, "none",
-              "A comma-separated list of preconditioners to use."
-              "Supported values are: none, jacobi, adaptive-jacobi");
+DEFINE_string(
+    preconditioners, "none",
+    "A comma-separated list of preconditioners to use. "
+    "Supported values are: none, jacobi, adaptive-jacobi, parict, parilu, "
+    "parilut, ilu, overhead");
+
+DEFINE_uint32(parilu_iterations, 5,
+              "The number of iterations for ParICT/ParILU(T)");
+
+DEFINE_bool(parilut_approx_select, true,
+            "Use approximate selection for ParICT/ParILUT");
+
+DEFINE_double(parilut_limit, 2.0, "The fill-in limit for ParICT/ParILUT");
 
 DEFINE_uint32(
     nrhs, 1,
     "The number of right hand sides. Record the residual only when nrhs == 1.");
 
+// This allows benchmarking the overhead of a solver by using the following
+// data: A=[1.0], x=[0.0], b=[nan]. This data can be used to benchmark normal
+// solvers; alternatively, with the argument --solvers=overhead, a minimal
+// solver is launched which contains only a few kernel calls.
+DEFINE_bool(overhead, false,
+            "If set, uses dummy data to benchmark Ginkgo overhead");
+
 
 // input validation
-[[noreturn]] void print_config_error_and_exit() {
+[[noreturn]] void print_config_error_and_exit()
+{
     std::cerr << "Input has to be a JSON array of matrix configurations:\n"
               << "  [\n"
               << "    { \"filename\": \"my_file.mtx\",  \"optimal\": { "
@@ -116,10 +137,12 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                                 std::shared_ptr<const gko::Executor>,
                                 std::shared_ptr<const gko::LinOpFactory>)>>
     solver_factory{{"bicgstab", create_solver<gko::solver::Bicgstab<>>},
+                   {"bicg", create_solver<gko::solver::Bicg<>>},
                    {"cg", create_solver<gko::solver::Cg<>>},
                    {"cgs", create_solver<gko::solver::Cgs<>>},
                    {"fcg", create_solver<gko::solver::Fcg<>>},
-                   {"gmres", create_solver<gko::solver::Gmres<>>}};
+                   {"gmres", create_solver<gko::solver::Gmres<>>},
+                   {"overhead", create_solver<gko::Overhead<>>}};
 
 
 // TODO: Workaround until GPU matrix conversions are implemented
@@ -165,7 +188,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return std::unique_ptr<ReferenceFactoryWrapper>(
                  new ReferenceFactoryWrapper(f));
          }},
-        {"adaptive-jacobi", [](std::shared_ptr<const gko::Executor> exec) {
+        {"adaptive-jacobi",
+         [](std::shared_ptr<const gko::Executor> exec) {
              std::shared_ptr<const gko::LinOpFactory> f =
                  gko::preconditioner::Jacobi<>::build()
                      .with_storage_optimization(
@@ -173,6 +197,66 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                      .on(exec);
              return std::unique_ptr<ReferenceFactoryWrapper>(
                  new ReferenceFactoryWrapper(f));
+         }},
+        {"parict",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::ParIct<>::build()
+                     .with_iterations(FLAGS_parilu_iterations)
+                     .with_approximate_select(FLAGS_parilut_approx_select)
+                     .with_fill_in_limit(FLAGS_parilut_limit)
+                     .on(exec));
+             std::shared_ptr<const gko::LinOpFactory> f =
+                 gko::preconditioner::Ilu<>::build()
+                     .with_factorization_factory(fact)
+                     .on(exec);
+             return std::unique_ptr<ReferenceFactoryWrapper>(
+                 new ReferenceFactoryWrapper(f));
+         }},
+        {"parilu",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::ParIlu<>::build()
+                     .with_iterations(FLAGS_parilu_iterations)
+                     .on(exec));
+             std::shared_ptr<const gko::LinOpFactory> f =
+                 gko::preconditioner::Ilu<>::build()
+                     .with_factorization_factory(fact)
+                     .on(exec);
+             return std::unique_ptr<ReferenceFactoryWrapper>(
+                 new ReferenceFactoryWrapper(f));
+         }},
+        {"parilut",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::ParIlut<>::build()
+                     .with_iterations(FLAGS_parilu_iterations)
+                     .with_approximate_select(FLAGS_parilut_approx_select)
+                     .with_fill_in_limit(FLAGS_parilut_limit)
+                     .on(exec));
+             std::shared_ptr<const gko::LinOpFactory> f =
+                 gko::preconditioner::Ilu<>::build()
+                     .with_factorization_factory(fact)
+                     .on(exec);
+             return std::unique_ptr<ReferenceFactoryWrapper>(
+                 new ReferenceFactoryWrapper(f));
+         }},
+        {"ilu",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             auto fact = std::shared_ptr<gko::LinOpFactory>(
+                 gko::factorization::Ilu<>::build().on(exec));
+             std::shared_ptr<const gko::LinOpFactory> f =
+                 gko::preconditioner::Ilu<>::build()
+                     .with_factorization_factory(fact)
+                     .on(exec);
+             return std::unique_ptr<ReferenceFactoryWrapper>(
+                 new ReferenceFactoryWrapper(f));
+         }},
+        {"overhead", [](std::shared_ptr<const gko::Executor> exec) {
+             std::shared_ptr<const gko::LinOpFactory> f =
+                 gko::Overhead<>::build().on(exec);
+             return std::unique_ptr<ReferenceFactoryWrapper>(
+                 new ReferenceFactoryWrapper(f));
          }}};
 
 
@@ -244,8 +328,10 @@ void solve_system(const std::string &solver_name,
                           rapidjson::Value(rapidjson::kArrayType), allocator);
         add_or_set_member(solver_json, "true_residuals",
                           rapidjson::Value(rapidjson::kArrayType), allocator);
-        if (FLAGS_nrhs == 1) {
-            auto rhs_norm = compute_norm(lend(b));
+        add_or_set_member(solver_json, "iteration_timestamps",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        if (FLAGS_nrhs == 1 && !FLAGS_overhead) {
+            auto rhs_norm = compute_norm2(lend(b));
             add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator);
         }
         for (auto stage : {"generate", "apply"}) {
@@ -258,21 +344,28 @@ void solve_system(const std::string &solver_name,
         }
 
         // warm run
+        auto it_logger = std::make_shared<IterationLogger>(exec);
         for (unsigned int i = 0; i < FLAGS_warmup; i++) {
             auto x_clone = clone(x);
             auto precond = precond_factory.at(precond_name)(exec);
             auto solver = solver_factory.at(solver_name)(exec, give(precond))
                               ->generate(system_matrix);
+            solver->add_logger(it_logger);
             solver->apply(lend(b), lend(x_clone));
             exec->synchronize();
+            solver->remove_logger(gko::lend(it_logger));
+        }
+        if (FLAGS_warmup > 0) {
+            it_logger->write_data(solver_json["apply"], allocator);
         }
 
         // detail run
-        if (FLAGS_detailed) {
+        if (FLAGS_detailed && !FLAGS_overhead) {
             // slow run, get the time of each functions
             auto x_clone = clone(x);
 
-            auto gen_logger = std::make_shared<OperationLogger>(exec);
+            auto gen_logger =
+                std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
             exec->add_logger(gen_logger);
 
             auto precond = precond_factory.at(precond_name)(exec);
@@ -293,7 +386,8 @@ void solve_system(const std::string &solver_name,
                                    solver_json["preconditioner"], allocator);
             }
 
-            auto apply_logger = std::make_shared<OperationLogger>(exec);
+            auto apply_logger =
+                std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
             exec->add_logger(apply_logger);
 
             solver->apply(lend(b), lend(x_clone));
@@ -308,7 +402,8 @@ void solve_system(const std::string &solver_name,
                 auto res_logger = std::make_shared<ResidualLogger<etype>>(
                     exec, lend(system_matrix), b,
                     solver_json["recurrent_residuals"],
-                    solver_json["true_residuals"], allocator);
+                    solver_json["true_residuals"],
+                    solver_json["iteration_timestamps"], allocator);
                 solver->add_logger(res_logger);
                 solver->apply(lend(b), lend(x_clone));
             }
@@ -344,7 +439,8 @@ void solve_system(const std::string &solver_name,
             apply_time += std::chrono::duration_cast<std::chrono::nanoseconds>(
                 a_tac - a_tic);
 
-            if (FLAGS_nrhs == 1 && i == FLAGS_repetitions - 1) {
+            if (FLAGS_nrhs == 1 && i == FLAGS_repetitions - 1 &&
+                !FLAGS_overhead) {
                 auto residual = compute_residual_norm(lend(system_matrix),
                                                       lend(b), lend(x_clone));
                 add_or_set_member(solver_json, "residual_norm", residual,
@@ -406,9 +502,18 @@ int main(int argc, char *argv[])
         }
     }
 
-    rapidjson::IStreamWrapper jcin(std::cin);
     rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
+    if (!FLAGS_overhead) {
+        rapidjson::IStreamWrapper jcin(std::cin);
+        test_cases.ParseStream(jcin);
+    } else {
+        // Fake test case to run once
+        auto overhead_json = std::string() +
+                             " [{\"filename\": \"overhead.mtx\", \"optimal\": "
+                             "{ \"spmv\": \"csr\"}}]";
+        test_cases.Parse(overhead_json.c_str());
+    }
+
     if (!test_cases.IsArray()) {
         print_config_error_and_exit();
     }
@@ -435,15 +540,26 @@ int main(int argc, char *argv[])
             }
             std::clog << "Running test case: " << test_case << std::endl;
             std::ifstream mtx_fd(test_case["filename"].GetString());
-            auto data = gko::read_raw<etype>(mtx_fd);
 
-            auto system_matrix = share(formats::matrix_factory.at(
-                test_case["optimal"]["spmv"].GetString())(exec, data));
-            auto b = create_matrix<etype>(
-                exec, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs},
-                engine);
-            auto x = create_matrix<etype>(
-                exec, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs});
+            using Vec = gko::matrix::Dense<>;
+            std::shared_ptr<gko::LinOp> system_matrix;
+            std::unique_ptr<Vec> b;
+            std::unique_ptr<Vec> x;
+            if (FLAGS_overhead) {
+                system_matrix = gko::initialize<Vec>({1.0}, exec);
+                b = gko::initialize<Vec>({std::nan("")}, exec);
+                x = gko::initialize<Vec>({0.0}, exec);
+            } else {
+                auto data = gko::read_raw<etype>(mtx_fd);
+                system_matrix = share(formats::matrix_factory.at(
+                    test_case["optimal"]["spmv"].GetString())(exec, data));
+                b = create_matrix<etype>(
+                    exec, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs},
+                    engine);
+                x = create_matrix<etype>(
+                    exec,
+                    gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs});
+            }
 
             std::clog << "Matrix is of size (" << system_matrix->get_size()[0]
                       << ", " << system_matrix->get_size()[1] << ")"
@@ -467,5 +583,5 @@ int main(int argc, char *argv[])
         }
     }
 
-    std::cout << test_cases;
+    std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt
index 13e637097cf..222d3f750b4 100644
--- a/benchmark/spmv/CMakeLists.txt
+++ b/benchmark/spmv/CMakeLists.txt
@@ -3,3 +3,6 @@ target_link_libraries(spmv ginkgo gflags rapidjson)
 if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusp_linops(spmv)
 endif()
+if (GINKGO_BUILD_HIP)
+    ginkgo_benchmark_hipsp_linops(spmv)
+endif()
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index 69a3a9e90e0..07debcf9426 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -61,7 +61,8 @@ DEFINE_uint32(nrhs, 1, "The number of right hand sides");
 // calling it
 void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
                 const gko::matrix_data<etype> &data, const vec<etype> *b,
-                const vec<etype> *x, rapidjson::Value &test_case,
+                const vec<etype> *x, const vec<etype> *answer,
+                rapidjson::Value &test_case,
                 rapidjson::MemoryPoolAllocator<> &allocator)
 {
     try {
@@ -73,8 +74,20 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
         exec->add_logger(storage_logger);
         auto system_matrix =
             share(formats::matrix_factory.at(format_name)(exec, data));
+
         exec->remove_logger(gko::lend(storage_logger));
         storage_logger->write_data(spmv_case[format_name], allocator);
+        // check the residual
+        if (FLAGS_detailed) {
+            auto x_clone = clone(x);
+            exec->synchronize();
+            system_matrix->apply(lend(b), lend(x_clone));
+            exec->synchronize();
+            double max_relative_norm2 =
+                compute_max_relative_norm2(lend(x_clone), lend(answer));
+            add_or_set_member(spmv_case[format_name], "max_relative_norm2",
+                              max_relative_norm2, allocator);
+        }
         // warm run
         for (unsigned int i = 0; i < FLAGS_warmup; i++) {
             auto x_clone = clone(x);
@@ -172,9 +185,20 @@ int main(int argc, char *argv[])
                                     rapidjson::Value(rapidjson::kObjectType),
                                     allocator);
             }
+
+            // Compute the result from ginkgo::coo as the correct answer
+            auto answer = vec<etype>::create(exec);
+            if (FLAGS_detailed) {
+                auto system_matrix =
+                    share(formats::matrix_factory.at("coo")(exec, data));
+                answer->copy_from(lend(x));
+                exec->synchronize();
+                system_matrix->apply(lend(b), lend(answer));
+                exec->synchronize();
+            }
             for (const auto &format_name : formats) {
                 apply_spmv(format_name.c_str(), exec, data, lend(b), lend(x),
-                           test_case, allocator);
+                           lend(answer), test_case, allocator);
                 std::clog << "Current state:" << std::endl
                           << test_cases << std::endl;
                 if (spmv_case[format_name.c_str()]["completed"].GetBool()) {
@@ -199,5 +223,5 @@ int main(int argc, char *argv[])
         }
     }
 
-    std::cout << test_cases;
+    std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/utils/cuda_linops.hpp b/benchmark/utils/cuda_linops.hpp
index 105e0a3f4d5..7762a2439d2 100644
--- a/benchmark/utils/cuda_linops.hpp
+++ b/benchmark/utils/cuda_linops.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -37,14 +37,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/ginkgo.hpp>
 
 
+#include <memory>
+
+
+#include <cuda.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
-#include <memory>
 
 
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/device_guard.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
+#include "cuda/base/types.hpp"
 
 
 namespace detail {
@@ -54,7 +58,12 @@ class CuspBase : public gko::LinOp {
 public:
     cusparseMatDescr_t get_descr() const { return this->descr_.get(); }
 
-    const gko::CudaExecutor *get_gpu_exec() const { return gpu_exec_.get(); }
+    // Returns the executor as a shared pointer (not a raw pointer) so that
+    // cusp_generic_spmv can hand it to gko::Array when allocating its buffer.
+    std::shared_ptr<const gko::CudaExecutor> get_gpu_exec() const
+    {
+        return gpu_exec_;
+    }
 
 protected:
     void apply_impl(const gko::LinOp *, const gko::LinOp *, const gko::LinOp *,
@@ -91,11 +100,11 @@ class CuspBase : public gko::LinOp {
     void initialize_descr()
     {
         const auto id = this->gpu_exec_->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         this->descr_ = handle_manager<cusparseMatDescr>(
             gko::kernels::cuda::cusparse::create_mat_descr(),
             [id](cusparseMatDescr_t descr) {
-                gko::device_guard g{id};
+                gko::cuda::device_guard g{id};
                 gko::kernels::cuda::cusparse::destroy(descr);
             });
     }
@@ -141,7 +150,7 @@ class CuspCsrmp
         auto dx = dense_x->get_values();
 
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         gko::kernels::cuda::cusparse::spmv_mp(
             this->get_gpu_exec()->get_cusparse_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
@@ -201,7 +210,7 @@ class CuspCsr
         auto dx = dense_x->get_values();
 
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         gko::kernels::cuda::cusparse::spmv(
             this->get_gpu_exec()->get_cusparse_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
@@ -261,7 +270,7 @@ class CuspCsrmm
         auto dx = dense_x->get_values();
 
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         gko::kernels::cuda::cusparse::spmm(
             this->get_gpu_exec()->get_cusparse_handle(), trans_,
             this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
@@ -318,7 +327,7 @@ class CuspCsrEx
         const auto id = this->get_gpu_exec()->get_device_id();
         if (set_buffer_) {
             try {
-                gko::device_guard g{id};
+                gko::cuda::device_guard g{id};
                 GKO_ASSERT_NO_CUDA_ERRORS(cudaFree(buffer_));
             } catch (const std::exception &e) {
                 std::cerr
@@ -344,7 +353,7 @@ class CuspCsrEx
         gko::size_type buffer_size = 0;
 
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         auto handle = this->get_gpu_exec()->get_cusparse_handle();
         // This function seems to require the pointer mode to be set to HOST.
         // Ginkgo use pointer mode DEVICE by default, so we change this
@@ -416,7 +425,7 @@ class CuspHybrid
         this->set_size(gko::dim<2>{t_csr->get_size()});
 
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         gko::kernels::cuda::cusparse::csr2hyb(
             this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0],
             this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
@@ -428,7 +437,7 @@ class CuspHybrid
     {
         const auto id = this->get_gpu_exec()->get_device_id();
         try {
-            gko::device_guard g{id};
+            gko::cuda::device_guard g{id};
             GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_));
         } catch (const std::exception &e) {
             std::cerr << "Error when unallocating CuspHybrid hyb_ matrix: "
@@ -449,7 +458,7 @@ class CuspHybrid
         auto dx = dense_x->get_values();
 
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         gko::kernels::cuda::cusparse::spmv(
             this->get_gpu_exec()->get_cusparse_handle(), trans_,
             &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
@@ -462,7 +471,7 @@ class CuspHybrid
           trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
     {
         const auto id = this->get_gpu_exec()->get_device_id();
-        gko::device_guard g{id};
+        gko::cuda::device_guard g{id};
         GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_));
     }
 
@@ -475,6 +484,206 @@ class CuspHybrid
 };
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \
+    !(defined(_WIN32) || defined(__CYGWIN__))
+
+
+// Applies x = alpha * op(mat) * b + beta * x via the cuSPARSE generic SpMV
+// API; `scalars` stores {alpha, beta} and b, x must be Dense<ValueType>.
+template <typename ValueType>
+void cusp_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
+                       const cusparseSpMatDescr_t mat,
+                       const gko::Array<ValueType> &scalars,
+                       const gko::LinOp *b, gko::LinOp *x,
+                       cusparseOperation_t trans, cusparseSpMVAlg_t alg)
+{
+    using gko::kernels::cuda::as_culibs_type;
+    cudaDataType_t cu_type = gko::kernels::cuda::cuda_data_type<ValueType>();
+    const auto handle = gpu_exec->get_cusparse_handle();
+    const auto alpha = scalars.get_const_data();
+    const auto beta = alpha + 1;
+    const auto rhs = gko::as<gko::matrix::Dense<ValueType>>(b);
+    const auto sol = gko::as<gko::matrix::Dense<ValueType>>(x);
+    gko::cuda::device_guard guard{gpu_exec->get_device_id()};
+    cusparseDnVecDescr_t rhs_descr, sol_descr;
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnVec(
+        &sol_descr, sol->get_num_stored_elements(),
+        as_culibs_type(sol->get_values()), cu_type));
+    // cusparseCreateDnVec only accepts a non-const pointer, hence the cast
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnVec(
+        &rhs_descr, rhs->get_num_stored_elements(),
+        as_culibs_type(const_cast<ValueType *>(rhs->get_const_values())),
+        cu_type));
+    size_t workspace_bytes = 0;
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize(
+        handle, trans, alpha, mat, rhs_descr, beta, sol_descr, cu_type, alg,
+        &workspace_bytes));
+    gko::Array<char> workspace(gpu_exec, workspace_bytes);
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(
+        cusparseSpMV(handle, trans, alpha, mat, rhs_descr, beta, sol_descr,
+                     cu_type, alg, workspace.get_data()));
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(sol_descr));
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(rhs_descr));
+}
+
+
+// Wraps the cuSPARSE generic-API CSR SpMV (algorithm `Alg`) as a gko::LinOp.
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32,
+          cusparseSpMVAlg_t Alg = CUSPARSE_MV_ALG_DEFAULT>
+class CuspGenericCsr
+    : public gko::EnableLinOp<CuspGenericCsr<ValueType, IndexType, Alg>,
+                              CuspBase>,
+      public gko::EnableCreateMethod<CuspGenericCsr<ValueType, IndexType, Alg>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<CuspGenericCsr>;
+    friend class gko::EnablePolymorphicObject<CuspGenericCsr, CuspBase>;
+
+public:
+    using csr = gko::matrix::Csr<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+    cusparseIndexType_t cu_index =
+        gko::kernels::cuda::cusparse_index_type<IndexType>();
+    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
+
+    void read(const mat_data &data) override
+    {
+        using gko::kernels::cuda::as_culibs_type;
+        csr_->read(data);
+        this->set_size(gko::dim<2>{csr_->get_size()});
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseCreateCsr(&mat_, csr_->get_size()[0], csr_->get_size()[1],
+                              csr_->get_num_stored_elements(),
+                              as_culibs_type(csr_->get_row_ptrs()),
+                              as_culibs_type(csr_->get_col_idxs()),
+                              as_culibs_type(csr_->get_values()), cu_index,
+                              cu_index, CUSPARSE_INDEX_BASE_ZERO, cu_value));
+    }
+
+    gko::size_type get_num_stored_elements() const noexcept
+    {
+        return csr_->get_num_stored_elements();
+    }
+
+    ~CuspGenericCsr() override
+    {
+        try {
+            gko::cuda::device_guard g{this->get_gpu_exec()->get_device_id()};
+            if (mat_ != nullptr) {  // null if read() never created it
+                GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
+            }
+        } catch (const std::exception &e) {
+            std::cerr << "Error when unallocating CuspGenericCsr mat_ matrix: "
+                      << e.what() << std::endl;
+        }
+    }
+
+    CuspGenericCsr(const CuspGenericCsr &other) = delete;
+    CuspGenericCsr &operator=(const CuspGenericCsr &other) = default;
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
+                          Alg);
+    }
+
+    CuspGenericCsr(std::shared_ptr<const gko::Executor> exec,
+                   const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<CuspGenericCsr, CuspBase>(exec, size),
+          csr_(csr::create(exec, std::make_shared<typename csr::classical>())),
+          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+    {}
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    std::shared_ptr<csr> csr_;
+    cusparseOperation_t trans_;
+    cusparseSpMatDescr_t mat_{};  // stays null until read() creates it
+};
+
+
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32>
+class CuspGenericCoo
+    : public gko::EnableLinOp<CuspGenericCoo<ValueType, IndexType>, CuspBase>,
+      public gko::EnableCreateMethod<CuspGenericCoo<ValueType, IndexType>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<CuspGenericCoo>;
+    friend class gko::EnablePolymorphicObject<CuspGenericCoo, CuspBase>;
+
+public:
+    using coo = gko::matrix::Coo<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+    cusparseIndexType_t cu_index =
+        gko::kernels::cuda::cusparse_index_type<IndexType>();
+    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
+
+    void read(const mat_data &data) override
+    {
+        using gko::kernels::cuda::as_culibs_type;
+        coo_->read(data);
+        this->set_size(gko::dim<2>{coo_->get_size()});
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseCreateCoo(&mat_, coo_->get_size()[0], coo_->get_size()[1],
+                              coo_->get_num_stored_elements(),
+                              as_culibs_type(coo_->get_row_idxs()),
+                              as_culibs_type(coo_->get_col_idxs()),
+                              as_culibs_type(coo_->get_values()), cu_index,
+                              CUSPARSE_INDEX_BASE_ZERO, cu_value));
+    }
+
+    gko::size_type get_num_stored_elements() const noexcept
+    {
+        return coo_->get_num_stored_elements();
+    }
+
+    ~CuspGenericCoo() override
+    {
+        try {
+            gko::cuda::device_guard g{this->get_gpu_exec()->get_device_id()};
+            if (mat_ != nullptr) {  // null if read() never created it
+                GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
+            }
+        } catch (const std::exception &e) {
+            std::cerr << "Error when unallocating CuspGenericCoo mat_ matrix: "
+                      << e.what() << std::endl;
+        }
+    }
+
+    CuspGenericCoo(const CuspGenericCoo &other) = delete;
+    CuspGenericCoo &operator=(const CuspGenericCoo &other) = default;
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
+                          CUSPARSE_MV_ALG_DEFAULT);
+    }
+
+    CuspGenericCoo(std::shared_ptr<const gko::Executor> exec,
+                   const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<CuspGenericCoo, CuspBase>(exec, size),
+          coo_(coo::create(exec)),
+          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+    {}
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    std::shared_ptr<coo> coo_;
+    cusparseOperation_t trans_;
+    cusparseSpMatDescr_t mat_{};  // stays null until read() creates it
+};
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) &&
+        // !(defined(_WIN32) || defined(__CYGWIN__))
+
+
 }  // namespace detail
 
 
@@ -485,6 +694,20 @@ using cusp_csrmp = detail::CuspCsrmp<>;
 using cusp_csrmm = detail::CuspCsrmm<>;
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \
+    !(defined(_WIN32) || defined(__CYGWIN__))
+
+
+using cusp_gcsr = detail::CuspGenericCsr<>;
+using cusp_gcsr2 =
+    detail::CuspGenericCsr<double, gko::int32, CUSPARSE_CSRMV_ALG2>;
+using cusp_gcoo = detail::CuspGenericCoo<>;
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) &&
+        // !(defined(_WIN32) || defined(__CYGWIN__))
+
+
 using cusp_coo =
     detail::CuspHybrid<double, gko::int32, CUSPARSE_HYB_PARTITION_USER, 0>;
 using cusp_ell =
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index aa757030017..c4379f834f7 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,8 +45,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #ifdef HAS_CUDA
-#include "cuda_linops.hpp"
+#include "benchmark/utils/cuda_linops.hpp"
 #endif  // HAS_CUDA
+#ifdef HAS_HIP
+#include "benchmark/utils/hip_linops.hip.hpp"
+#endif  // HAS_HIP
 
 
 namespace formats {
@@ -60,6 +63,9 @@ std::string available_format =
     ", cusp_csr, cusp_csrex, cusp_csrmp, cusp_csrmm, cusp_coo, cusp_ell, "
     "cusp_hybrid"
 #endif  // HAS_CUDA
+#ifdef HAS_HIP
+    ", hipsp_csr, hipsp_csrmm, hipsp_coo, hipsp_ell, hipsp_hybrid"
+#endif  // HAS_HIP
     ".\n";
 
 std::string format_description =
@@ -91,7 +97,28 @@ std::string format_description =
     "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function.\n"
     "cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n"
     "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function."
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \
+    !(defined(_WIN32) || defined(__CYGWIN__))
+    "\n"
+    "cusp_gcsr: benchmark CuSPARSE with the generic csr with default "
+    "algorithm.\n"
+    "cusp_gcsr2: benchmark CuSPARSE with the generic csr with "
+    "CUSPARSE_CSRMV_ALG2.\n"
+    "cusp_gcoo: benchmark CuSPARSE with the generic coo with default "
+    "algorithm.\n"
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) &&
+        // !(defined(_WIN32) || defined(__CYGWIN__))
 #endif  // HAS_CUDA
+#ifdef HAS_HIP
+    "\n"
+    "hipsp_csr: benchmark HipSPARSE with the hipsparseXcsrmv function.\n"
+    "hipsp_csrmm: benchmark HipSPARSE with the hipsparseXcsrmv_mm function.\n"
+    "hipsp_hybrid: benchmark HipSPARSE spmv with hipsparseXhybmv and an "
+    "automatic partition.\n"
+    "hipsp_coo: use hipsparseXhybmv with a HIPSPARSE_HYB_PARTITION_USER "
+    "partition.\n"
+    "hipsp_ell: use hipsparseXhybmv with HIPSPARSE_HYB_PARTITION_MAX partition."
+#endif  // HAS_HIP
     ;
 
 std::string format_command =
@@ -148,6 +175,7 @@ std::unique_ptr<MatrixType> read_matrix_from_data(
     }
 
 
+// clang-format off
 const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
                                 std::shared_ptr<const gko::Executor>,
                                 const gko::matrix_data<> &)>>
@@ -166,7 +194,21 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
         {"cusp_coo", read_matrix_from_data<cusp_coo>},
         {"cusp_ell", read_matrix_from_data<cusp_ell>},
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \
+    !(defined(_WIN32) || defined(__CYGWIN__))
+        {"cusp_gcsr", read_matrix_from_data<cusp_gcsr>},
+        {"cusp_gcsr2", read_matrix_from_data<cusp_gcsr2>},
+        {"cusp_gcoo", read_matrix_from_data<cusp_gcoo>},
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) &&
+        // !(defined(_WIN32) || defined(__CYGWIN__))
 #endif  // HAS_CUDA
+#ifdef HAS_HIP
+        {"hipsp_csr", read_matrix_from_data<hipsp_csr>},
+        {"hipsp_csrmm", read_matrix_from_data<hipsp_csrmm>},
+        {"hipsp_hybrid", read_matrix_from_data<hipsp_hybrid>},
+        {"hipsp_coo", read_matrix_from_data<hipsp_coo>},
+        {"hipsp_ell", read_matrix_from_data<hipsp_ell>},
+#endif  // HAS_HIP
         {"hybrid", read_matrix_from_data<hybrid>},
         {"hybrid0",
          READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_limit>(0))},
@@ -194,8 +236,9 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
          READ_MATRIX(hybrid,
                      std::make_shared<hybrid::minimal_storage_limit>())},
         {"sellp", read_matrix_from_data<gko::matrix::Sellp<>>}};
+// clang-format on
 
 
 }  // namespace formats
 
-#endif  // GKO_BENCHMARK_UTILS_FORMATS_HPP_
\ No newline at end of file
+#endif  // GKO_BENCHMARK_UTILS_FORMATS_HPP_
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 267109f0dcc..2cb738ce1dd 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/ginkgo.hpp>
 
 
+#include <algorithm>
 #include <array>
 #include <fstream>
 #include <functional>
@@ -57,9 +58,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 // Global command-line arguments
-DEFINE_string(
-    executor, "reference",
-    "The executor used to run the benchmarks, one of: reference, omp, cuda");
+DEFINE_string(executor, "reference",
+              "The executor used to run the benchmarks, one of: reference, "
+              "omp, cuda, hip");
 
 DEFINE_uint32(device_id, 0, "ID of the device where to run the code");
 
@@ -79,6 +80,8 @@ DEFINE_string(double_buffer, "",
 DEFINE_bool(detailed, true,
             "If set, performs several runs to obtain more detailed results");
 
+DEFINE_bool(nested_names, false, "If set, separately logs nested operations");
+
 DEFINE_uint32(seed, 42, "Seed used for the random number generator");
 
 DEFINE_uint32(warmup, 2, "Warm-up repetitions");
@@ -167,7 +170,10 @@ std::ranlux24 &get_engine()
 std::ostream &operator<<(std::ostream &os, const rapidjson::Value &value)
 {
     rapidjson::OStreamWrapper jos(os);
-    rapidjson::PrettyWriter<rapidjson::OStreamWrapper> writer(jos);
+    rapidjson::PrettyWriter<rapidjson::OStreamWrapper, rapidjson::UTF8<>,
+                            rapidjson::UTF8<>, rapidjson::CrtAllocator,
+                            rapidjson::kWriteNanAndInfFlag>
+        writer(jos);
     value.Accept(writer);
     return os;
 }
@@ -251,9 +257,14 @@ const std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
     executor_factory{
         {"reference", [] { return gko::ReferenceExecutor::create(); }},
         {"omp", [] { return gko::OmpExecutor::create(); }},
-        {"cuda", [] {
+        {"cuda",
+         [] {
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::OmpExecutor::create());
+                                              gko::OmpExecutor::create(), true);
+         }},
+        {"hip", [] {
+             return gko::HipExecutor::create(FLAGS_device_id,
+                                             gko::OmpExecutor::create(), true);
          }}};
 
 
@@ -322,7 +333,7 @@ double get_norm(const vec<ValueType> *norm)
 
 
 template <typename ValueType>
-double compute_norm(const vec<ValueType> *b)
+double compute_norm2(const vec<ValueType> *b)
 {
     auto exec = b->get_executor();
     auto b_norm = gko::initialize<vec<ValueType>>({0.0}, exec);
@@ -340,8 +351,35 @@ double compute_residual_norm(const gko::LinOp *system_matrix,
     auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
     auto res = clone(b);
     system_matrix->apply(lend(one), lend(x), lend(neg_one), lend(res));
-    return compute_norm(lend(res));
+    return compute_norm2(lend(res));
+}
+
+
+// Returns max_j ||result_j - answer_j||_2 / ||answer_j||_2 over all columns j.
+// NOTE: `result` is overwritten with the difference `result - answer`.
+template <typename ValueType>
+double compute_max_relative_norm2(vec<ValueType> *result,
+                                  const vec<ValueType> *answer)
+{
+    auto exec = answer->get_executor();
+    const gko::dim<2> norm_size{1, answer->get_size()[1]};
+    auto answer_norm = vec<ValueType>::create(exec, norm_size);
+    answer->compute_norm2(lend(answer_norm));
+    auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
+    result->add_scaled(lend(neg_one), lend(answer));
+    auto absolute_norm = vec<ValueType>::create(exec, norm_size);
+    result->compute_norm2(lend(absolute_norm));
+    // norms live on the executor; bring them to the host before reading them
+    auto host_answer_norm = clone(exec->get_master(), answer_norm);
+    auto host_absolute_norm = clone(exec->get_master(), absolute_norm);
+    double max_relative_norm2 = 0;
+    for (gko::size_type i = 0; i < norm_size[1]; i++) {
+        const double relative_norm2 =
+            host_absolute_norm->at(0, i) / host_answer_norm->at(0, i);
+        max_relative_norm2 = std::max(relative_norm2, max_relative_norm2);
+    }
+    return max_relative_norm2;
 }
 
 
-#endif  // GKO_BENCHMARK_UTILS_GENERAL_HPP_
\ No newline at end of file
+#endif  // GKO_BENCHMARK_UTILS_GENERAL_HPP_
diff --git a/benchmark/utils/hip_linops.hip.hpp b/benchmark/utils/hip_linops.hip.hpp
new file mode 100644
index 00000000000..5d62d605d24
--- /dev/null
+++ b/benchmark/utils/hip_linops.hip.hpp
@@ -0,0 +1,334 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_HIP_LINOPS_HIP_HPP_
+#define GKO_BENCHMARK_UTILS_HIP_LINOPS_HIP_HPP_
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <memory>
+
+
+#include <hipsparse.h>
+
+
+#include "hip/base/device_guard.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+
+
+namespace detail {
+
+
+struct hipsparseMatDescr;
+
+
+class HipspBase : public gko::LinOp {
+public:
+    hipsparseMatDescr_t get_descr() const { return this->descr_.get(); }
+
+    const gko::HipExecutor *get_gpu_exec() const { return gpu_exec_.get(); }
+
+protected:
+    void apply_impl(const gko::LinOp *, const gko::LinOp *, const gko::LinOp *,
+                    gko::LinOp *) const override
+    {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    HipspBase(std::shared_ptr<const gko::Executor> exec,
+              const gko::dim<2> &size = gko::dim<2>{})
+        : gko::LinOp(exec, size)
+    {
+        gpu_exec_ = std::dynamic_pointer_cast<const gko::HipExecutor>(exec);
+        if (gpu_exec_ == nullptr) {
+            GKO_NOT_IMPLEMENTED;
+        }
+        this->initialize_descr();
+    }
+
+    ~HipspBase() = default;
+
+    HipspBase(const HipspBase &other) = delete;
+
+    HipspBase &operator=(const HipspBase &other)
+    {
+        if (this != &other) {
+            gko::LinOp::operator=(other);
+            this->gpu_exec_ = other.gpu_exec_;
+            this->initialize_descr();
+        }
+        return *this;
+    }
+
+    void initialize_descr()
+    {
+        const auto id = this->gpu_exec_->get_device_id();
+        gko::hip::device_guard g{id};
+        this->descr_ = handle_manager<hipsparseMatDescr>(
+            reinterpret_cast<hipsparseMatDescr *>(
+                gko::kernels::hip::hipsparse::create_mat_descr()),
+            [id](hipsparseMatDescr *descr) {
+                gko::hip::device_guard g{id};
+                gko::kernels::hip::hipsparse::destroy(descr);
+            });
+    }
+
+private:
+    std::shared_ptr<const gko::HipExecutor> gpu_exec_;
+    template <typename T>
+    using handle_manager = std::unique_ptr<T, std::function<void(T *)>>;
+    handle_manager<hipsparseMatDescr> descr_;
+};
+
+
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32>
+class HipspCsr
+    : public gko::EnableLinOp<HipspCsr<ValueType, IndexType>, HipspBase>,
+      public gko::EnableCreateMethod<HipspCsr<ValueType, IndexType>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<HipspCsr>;
+    friend class gko::EnablePolymorphicObject<HipspCsr, HipspBase>;
+
+public:
+    using csr = gko::matrix::Csr<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+
+    void read(const mat_data &data) override
+    {
+        csr_->read(data);
+        this->set_size(gko::dim<2>{csr_->get_size()});
+    }
+
+    gko::size_type get_num_stored_elements() const noexcept
+    {
+        return csr_->get_num_stored_elements();
+    }
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
+        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
+        auto db = dense_b->get_const_values();
+        auto dx = dense_x->get_values();
+
+        const auto id = this->get_gpu_exec()->get_device_id();
+        gko::hip::device_guard g{id};
+        gko::kernels::hip::hipsparse::spmv(
+            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_size()[0], this->get_size()[1],
+            csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
+            this->get_descr(), csr_->get_const_values(),
+            csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db,
+            &scalars.get_const_data()[1], dx);
+    }
+
+    // Creates the wrapped CSR matrix (classical strategy), non-transposed op.
+    HipspCsr(std::shared_ptr<const gko::Executor> exec,
+             const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<HipspCsr, HipspBase>(exec, size),
+          csr_(csr::create(exec, std::make_shared<typename csr::classical>())),
+          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+    {}
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    std::shared_ptr<csr> csr_;
+    hipsparseOperation_t trans_;
+};
+
+
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32>
+class HipspCsrmm
+    : public gko::EnableLinOp<HipspCsrmm<ValueType, IndexType>, HipspBase>,
+      public gko::EnableCreateMethod<HipspCsrmm<ValueType, IndexType>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<HipspCsrmm>;
+    friend class gko::EnablePolymorphicObject<HipspCsrmm, HipspBase>;
+
+public:
+    using csr = gko::matrix::Csr<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+
+    void read(const mat_data &data) override
+    {
+        csr_->read(data);
+        this->set_size(gko::dim<2>{csr_->get_size()});
+    }
+
+    gko::size_type get_num_stored_elements() const noexcept
+    {
+        return csr_->get_num_stored_elements();
+    }
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
+        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
+        auto db = dense_b->get_const_values();
+        auto dx = dense_x->get_values();
+
+        const auto id = this->get_gpu_exec()->get_device_id();
+        gko::hip::device_guard g{id};
+        gko::kernels::hip::hipsparse::spmm(
+            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
+            csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
+            this->get_descr(), csr_->get_const_values(),
+            csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db,
+            dense_b->get_size()[0], &scalars.get_const_data()[1], dx,
+            dense_x->get_size()[0]);
+    }
+
+    // Creates the wrapped CSR matrix (classical strategy), non-transposed op.
+    HipspCsrmm(std::shared_ptr<const gko::Executor> exec,
+               const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<HipspCsrmm, HipspBase>(exec, size),
+          csr_(csr::create(exec, std::make_shared<typename csr::classical>())),
+          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+    {}
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    std::shared_ptr<csr> csr_;
+    hipsparseOperation_t trans_;
+};
+
+
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32,
+          hipsparseHybPartition_t Partition = HIPSPARSE_HYB_PARTITION_AUTO,
+          int Threshold = 0>
+class HipspHybrid
+    : public gko::EnableLinOp<
+          HipspHybrid<ValueType, IndexType, Partition, Threshold>, HipspBase>,
+      public gko::EnableCreateMethod<
+          HipspHybrid<ValueType, IndexType, Partition, Threshold>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<HipspHybrid>;
+    friend class gko::EnablePolymorphicObject<HipspHybrid, HipspBase>;
+
+public:
+    using csr = gko::matrix::Csr<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+
+    void read(const mat_data &data) override
+    {
+        auto t_csr = csr::create(this->get_executor(),
+                                 std::make_shared<typename csr::classical>());
+        t_csr->read(data);
+        this->set_size(gko::dim<2>{t_csr->get_size()});
+
+        const auto id = this->get_gpu_exec()->get_device_id();
+        gko::hip::device_guard g{id};
+        gko::kernels::hip::hipsparse::csr2hyb(
+            this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0],
+            this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
+            t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_,
+            Threshold, Partition);
+    }
+
+    ~HipspHybrid() override
+    {
+        const auto id = this->get_gpu_exec()->get_device_id();
+        try {
+            gko::hip::device_guard g{id};
+            GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyHybMat(hyb_));
+        } catch (const std::exception &e) {
+            std::cerr << "Error when unallocating HipspHybrid hyb_ matrix: "
+                      << e.what() << std::endl;
+        }
+    }
+
+    HipspHybrid(const HipspHybrid &other) = delete;
+
+    HipspHybrid &operator=(const HipspHybrid &other) = default;
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
+        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
+        auto db = dense_b->get_const_values();
+        auto dx = dense_x->get_values();
+
+        const auto id = this->get_gpu_exec()->get_device_id();
+        gko::hip::device_guard g{id};
+        gko::kernels::hip::hipsparse::spmv(
+            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
+            &scalars.get_const_data()[1], dx);
+    }
+
+    HipspHybrid(std::shared_ptr<const gko::Executor> exec,
+                const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<HipspHybrid, HipspBase>(exec, size),
+          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+    {
+        const auto id = this->get_gpu_exec()->get_device_id();
+        gko::hip::device_guard g{id};
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_));
+    }
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    hipsparseOperation_t trans_;
+    hipsparseHybMat_t hyb_;
+};
+
+
+}  // namespace detail
+
+
+// Some shortcuts
+using hipsp_csr = detail::HipspCsr<>;
+using hipsp_csrmm = detail::HipspCsrmm<>;
+
+
+using hipsp_coo =
+    detail::HipspHybrid<double, gko::int32, HIPSPARSE_HYB_PARTITION_USER, 0>;
+using hipsp_ell =
+    detail::HipspHybrid<double, gko::int32, HIPSPARSE_HYB_PARTITION_MAX, 0>;
+using hipsp_hybrid = detail::HipspHybrid<>;
+
+#endif  // GKO_BENCHMARK_UTILS_HIP_LINOPS_HIP_HPP_
diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index 69ec16bd769..ea6bbea2797 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <chrono>
+#include <mutex>
 #include <regex>
 #include <unordered_map>
 
@@ -104,6 +105,7 @@ struct OperationLogger : gko::log::Logger {
                     rapidjson::MemoryPoolAllocator<> &alloc,
                     gko::uint32 repetitions)
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         for (const auto &entry : total) {
             add_or_set_member(
                 object, entry.first.c_str(),
@@ -115,37 +117,48 @@ struct OperationLogger : gko::log::Logger {
         }
     }
 
-    OperationLogger(std::shared_ptr<const gko::Executor> exec)
-        : gko::log::Logger(exec)
+    OperationLogger(std::shared_ptr<const gko::Executor> exec, bool nested_name)
+        : gko::log::Logger(exec), use_nested_name{nested_name}
     {}
 
 private:
     void start_operation(const gko::Executor *exec,
                          const std::string &name) const
     {
-        nested.emplace_back(0);
         exec->synchronize();
-        start[name] = std::chrono::steady_clock::now();
+        const std::lock_guard<std::mutex> lock(mutex);
+        auto nested_name = nested.empty() || !use_nested_name
+                               ? name
+                               : nested.back().first + "::" + name;
+        nested.emplace_back(nested_name, std::chrono::steady_clock::duration{});
+        start[nested_name] = std::chrono::steady_clock::now();
     }
 
     void end_operation(const gko::Executor *exec, const std::string &name) const
     {
         exec->synchronize();
+        const std::lock_guard<std::mutex> lock(mutex);
+        // if operations are properly nested, nested_name now ends with name
+        auto nested_name = nested.back().first;
         const auto end = std::chrono::steady_clock::now();
-        const auto diff = end - start[name];
+        const auto diff = end - start[nested_name];
         // make sure timings for nested operations are not counted twice
-        total[name] += diff - nested.back();
+        total[nested_name] += diff - nested.back().second;
         nested.pop_back();
-        if (nested.size() > 0) {
-            nested.back() += diff;
+        if (!nested.empty()) {
+            nested.back().second += diff;
         }
     }
 
+    bool use_nested_name;
+    mutable std::mutex mutex;
     mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
     mutable std::map<std::string, std::chrono::steady_clock::duration> total;
     // the position i of this vector holds the total time spend on child
     // operations on nesting level i
-    mutable std::vector<std::chrono::steady_clock::duration> nested;
+    mutable std::vector<
+        std::pair<std::string, std::chrono::steady_clock::duration>>
+        nested;
 };
 
 
@@ -154,18 +167,21 @@ struct StorageLogger : gko::log::Logger {
                                  const gko::size_type &num_bytes,
                                  const gko::uintptr &location) const override
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         storage[location] = num_bytes;
     }
 
     void on_free_completed(const gko::Executor *,
                            const gko::uintptr &location) const override
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         storage[location] = 0;
     }
 
     void write_data(rapidjson::Value &output,
                     rapidjson::MemoryPoolAllocator<> &allocator)
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         gko::size_type total{};
         for (const auto &e : storage) {
             total += e.second;
@@ -178,6 +194,7 @@ struct StorageLogger : gko::log::Logger {
     {}
 
 private:
+    mutable std::mutex mutex;
     mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
 };
 
@@ -190,12 +207,17 @@ struct ResidualLogger : gko::log::Logger {
                                const gko::LinOp *solution,
                                const gko::LinOp *residual_norm) const override
     {
+        timestamps.PushBack(
+            std::chrono::duration_cast<std::chrono::nanoseconds>(
+                std::chrono::steady_clock::now() - start)
+                .count(),
+            alloc);
         if (residual_norm) {
             rec_res_norms.PushBack(
                 get_norm(gko::as<vec<ValueType>>(residual_norm)), alloc);
         } else {
             rec_res_norms.PushBack(
-                compute_norm(gko::as<vec<ValueType>>(residual)), alloc);
+                compute_norm2(gko::as<vec<ValueType>>(residual)), alloc);
         }
         if (solution) {
             true_res_norms.PushBack(
@@ -211,22 +233,52 @@ struct ResidualLogger : gko::log::Logger {
                    const gko::LinOp *matrix, const vec<ValueType> *b,
                    rapidjson::Value &rec_res_norms,
                    rapidjson::Value &true_res_norms,
+                   rapidjson::Value &timestamps,
                    rapidjson::MemoryPoolAllocator<> &alloc)
         : gko::log::Logger(exec, gko::log::Logger::iteration_complete_mask),
           matrix{matrix},
           b{b},
+          start{std::chrono::steady_clock::now()},
           rec_res_norms{rec_res_norms},
           true_res_norms{true_res_norms},
+          timestamps{timestamps},
           alloc{alloc}
     {}
 
 private:
     const gko::LinOp *matrix;
     const vec<ValueType> *b;
+    std::chrono::steady_clock::time_point start;
     rapidjson::Value &rec_res_norms;
     rapidjson::Value &true_res_norms;
+    rapidjson::Value &timestamps;
     rapidjson::MemoryPoolAllocator<> &alloc;
 };
 
 
+// Logs the number of iterations executed
+struct IterationLogger : gko::log::Logger {
+    void on_iteration_complete(const gko::LinOp *,
+                               const gko::size_type &num_iterations,
+                               const gko::LinOp *, const gko::LinOp *,
+                               const gko::LinOp *) const override
+    {
+        this->num_iters = num_iterations;
+    }
+
+    IterationLogger(std::shared_ptr<const gko::Executor> exec)
+        : gko::log::Logger(exec, gko::log::Logger::iteration_complete_mask)
+    {}
+
+    void write_data(rapidjson::Value &output,
+                    rapidjson::MemoryPoolAllocator<> &allocator)
+    {
+        add_or_set_member(output, "iterations", this->num_iters, allocator);
+    }
+
+private:
+    mutable gko::size_type num_iters{0};
+};
+
+
 #endif  // GKO_BENCHMARK_UTILS_LOGGERS_HPP_
diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp
new file mode 100644
index 00000000000..8db715e553f
--- /dev/null
+++ b/benchmark/utils/overhead_linop.hpp
@@ -0,0 +1,226 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_OVERHEAD_LINOP_HPP_
+#define GKO_BENCHMARK_UTILS_OVERHEAD_LINOP_HPP_
+
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/stop/criterion.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace overhead {
+
+
+#define GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(_type, _num)            \
+    static volatile std::uintptr_t val_operation_##_num = 0;          \
+    template <typename _type>                                         \
+    void operation##_num(std::shared_ptr<const DefaultExecutor> exec, \
+                         const matrix::Dense<_type> *b,               \
+                         matrix::Dense<_type> *x)                     \
+    {                                                                 \
+        val_operation_##_num = reinterpret_cast<std::uintptr_t>(x);   \
+    }
+
+
+#define GKO_DECLARE_ALL                                                      \
+    GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 1)                      \
+    GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 2)                      \
+    GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 3)                      \
+    GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 4)                      \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
+}  // namespace overhead
+
+
+namespace omp {
+namespace overhead {
+
+GKO_DECLARE_ALL;
+
+}  // namespace overhead
+}  // namespace omp
+
+
+namespace cuda {
+namespace overhead {
+
+GKO_DECLARE_ALL;
+
+}  // namespace overhead
+}  // namespace cuda
+
+
+namespace reference {
+namespace overhead {
+
+GKO_DECLARE_ALL;
+
+}  // namespace overhead
+}  // namespace reference
+
+
+namespace hip {
+namespace overhead {
+
+GKO_DECLARE_ALL;
+
+}  // namespace overhead
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL
+
+
+}  // namespace kernels
+
+
+namespace overhead {
+
+
+GKO_REGISTER_OPERATION(operation1, overhead::operation1);
+GKO_REGISTER_OPERATION(operation2, overhead::operation2);
+GKO_REGISTER_OPERATION(operation3, overhead::operation3);
+GKO_REGISTER_OPERATION(operation4, overhead::operation4);
+
+
+}  // namespace overhead
+
+
+template <typename ValueType = default_precision>
+class Overhead : public EnableLinOp<Overhead<ValueType>>,
+                 public Preconditionable {
+    friend class EnableLinOp<Overhead>;
+    friend class EnablePolymorphicObject<Overhead, LinOp>;
+
+public:
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * Criterion factories.
+         */
+        std::vector<std::shared_ptr<const stop::CriterionFactory>>
+            GKO_FACTORY_PARAMETER(criteria, nullptr);
+
+        /**
+         * Preconditioner factory.
+         */
+        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER(
+            preconditioner, nullptr);
+
+        /**
+         * Already generated preconditioner. If one is provided, the factory
+         * `preconditioner` will be ignored.
+         */
+        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER(
+            generated_preconditioner, nullptr);
+    };
+
+    GKO_ENABLE_LIN_OP_FACTORY(Overhead, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    void apply_impl(const LinOp *b, LinOp *x) const override
+    {
+        using Vector = matrix::Dense<ValueType>;
+
+        auto exec = this->get_executor();
+        auto dense_b = as<const Vector>(b);
+        auto dense_x = as<Vector>(x);
+
+        system_matrix_->apply(dense_b, dense_x);
+        get_preconditioner()->apply(dense_b, dense_x);
+
+        exec->run(overhead::make_operation1(dense_b, dense_x));
+        exec->run(overhead::make_operation2(dense_b, dense_x));
+        exec->run(overhead::make_operation3(dense_b, dense_x));
+        exec->run(overhead::make_operation4(dense_b, dense_x));
+    }
+
+    void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta,
+                    LinOp *x) const override
+    {
+        auto dense_x = as<matrix::Dense<ValueType>>(x);
+
+        auto x_clone = dense_x->clone();
+        this->apply(b, x_clone.get());
+        dense_x->scale(beta);
+        dense_x->add_scaled(alpha, x_clone.get());
+    }
+
+    explicit Overhead(std::shared_ptr<const Executor> exec)
+        : EnableLinOp<Overhead>(std::move(exec))
+    {}
+
+    explicit Overhead(const Factory *factory,
+                      std::shared_ptr<const LinOp> system_matrix)
+        : EnableLinOp<Overhead>(factory->get_executor(),
+                                transpose(system_matrix->get_size())),
+          parameters_{factory->get_parameters()},
+          system_matrix_{std::move(system_matrix)}
+    {
+        if (parameters_.generated_preconditioner) {
+            GKO_ASSERT_EQUAL_DIMENSIONS(parameters_.generated_preconditioner,
+                                        this);
+            set_preconditioner(parameters_.generated_preconditioner);
+        } else if (parameters_.preconditioner) {
+            set_preconditioner(
+                parameters_.preconditioner->generate(system_matrix_));
+        } else {
+            set_preconditioner(matrix::Identity<ValueType>::create(
+                this->get_executor(), this->get_size()[0]));
+        }
+        stop_criterion_factory_ =
+            stop::combine(std::move(parameters_.criteria));
+    }
+
+private:
+    std::shared_ptr<const LinOp> system_matrix_{};
+    std::shared_ptr<const stop::CriterionFactory> stop_criterion_factory_{};
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_BENCHMARK_UTILS_OVERHEAD_LINOP_HPP_
diff --git a/benchmark/utils/spmv_common.hpp b/benchmark/utils/spmv_common.hpp
index f027d52c0ce..34cd51067ae 100644
--- a/benchmark/utils/spmv_common.hpp
+++ b/benchmark/utils/spmv_common.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
 #define GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
 
+
 #include <ginkgo/ginkgo.hpp>
 
 
@@ -50,7 +51,8 @@ using csr = gko::matrix::Csr<>;
 /**
  * Function which outputs the input format for benchmarks similar to the spmv.
  */
-[[noreturn]] void print_config_error_and_exit() {
+[[noreturn]] void print_config_error_and_exit()
+{
     std::cerr << "Input has to be a JSON array of matrix configurations:\n"
               << "  [\n"
               << "    { \"filename\": \"my_file.mtx\" },\n"
@@ -74,4 +76,4 @@ void validate_option_object(const rapidjson::Value &value)
 }
 
 
-#endif  // GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
\ No newline at end of file
+#endif  // GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
diff --git a/cmake/CTestCustom.cmake.in b/cmake/CTestCustom.cmake.in
index afcaf60fb28..1b26cadf225 100644
--- a/cmake/CTestCustom.cmake.in
+++ b/cmake/CTestCustom.cmake.in
@@ -5,15 +5,15 @@ list(APPEND CTEST_CUSTOM_COVERAGE_EXCLUDE
   # Exclude try_compile sources from coverage results:
   "/CMakeFiles/CMakeTmp/"
 
-  "third_party"
+  ".*/third_party/.*"
 
-  "test"
+  ".*/doc/.*"
 
-  "benchmark"
+  ".*/benchmark/.*"
 
-  "examples"
+  ".*/examples/.*"
 
-  "c\\+\\+"
+  ".*/c\\+\\+/.*"
 )
 
 set(CTEST_SOURCE_DIRECTORY "@Ginkgo_SOURCE_DIR@" CACHE STRING "" FORCE)
diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake
index ff4828d01ce..27f03bce996 100644
--- a/cmake/CTestScript.cmake
+++ b/cmake/CTestScript.cmake
@@ -8,13 +8,17 @@
 # CDash dashboard. The supported runs are:
 # + With or without coverage, requires the gcov tool.
 # + With or without address sanitizers.
+# + With or without memory sanitizers.
 # + With or without thread sanitizers.
+# + With or without leak sanitizers.
+# + With or without undefined behavior (UB) sanitizers.
 # + With or without valgrind, requires the valgrind tool.
 #
 # Note that only one of these can be ran at once, as the build types
-# conflict. Ginkgo is always configured with CUDA, OpenMP and Reference
-# support. The results are always sent to the dashboard:
-# https://my.cdash.org/index.php?project=Ginkgo+Project
+# conflict. Ginkgo is always configured with CUDA, HIP, OpenMP and Reference
+# support, except for ThreadSanitizer, AddressSanitizer, LeakSanitizer and
+# UndefinedBehaviorSanitizer builds. The results are always sent to the
+# dashboard: https://my.cdash.org/index.php?project=Ginkgo+Project
 #
 # Running the script
 # ^^^^^^^^^^^^^^^^^^
@@ -46,11 +50,13 @@
 # A string to describe the machine this is ran on. Default FineCI.
 #
 # ``CTEST_CMAKE_GENERATOR``
-# Which generator should be used for the build. Default `Unix Makefiles`
+# Which generator should be used for the build. Default `Ninja`, except
+# for COVERAGE builds where `Unix Makefiles` is used.
 #
 # ``CTEST_BUILD_CONFIGURATION``
 # Which configuration should Ginkgo be built with. Default `DEBUG`.
-# The supported values are: COVERAGE, ASAN, TSAN, DEBUG and RELEASE.
+# The supported values are: COVERAGE, ASAN, LSAN, TSAN, UBSAN, DEBUG and
+# RELEASE.
 #
 # ``CTEST_TEST_MODEL``
 # Which CTest test model should be used. Default `Continuous`.
@@ -61,8 +67,9 @@
 # The name of the build being ran. Default: `CTEST_BUILD_CONFIGURATION`
 #
 # ``CTEST_MEMORYCHECK_TYPE``
-# Whether memorycheck should be ran. Default: `None`. Supported values are:
-# Valgrind, ThreadSanitizer, AddressSanitizer and None.
+# Whether memorycheck should be run. Default: `NONE`. Supported values are:
+# Valgrind, AddressSanitizer, LeakSanitizer, ThreadSanitizer,
+# UndefinedBehaviorSanitizer and NONE.
 #
 
 if (NOT DEFINED CTEST_SOURCE_DIRECTORY)
@@ -78,10 +85,14 @@ if (NOT DEFINED CTEST_SITE)
 endif()
 
 if (NOT DEFINED CTEST_CMAKE_GENERATOR)
-    set(CTEST_CMAKE_GENERATOR "Unix Makefiles")
+    if (CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
+        set(CTEST_CMAKE_GENERATOR "Unix Makefiles")
+    else()
+        set(CTEST_CMAKE_GENERATOR "Ninja")
+    endif()
 endif()
 
-# Supported: COVERAGE, ASAN, TSAN, DEBUG and RELEASE
+# Supported: COVERAGE, ASAN, LSAN, TSAN, UBSAN, DEBUG and RELEASE
 if (NOT DEFINED CTEST_BUILD_CONFIGURATION)
     set(CTEST_BUILD_CONFIGURATION "DEBUG")
 endif()
@@ -94,9 +105,10 @@ if (NOT DEFINED CTEST_BUILD_NAME)
     set(CTEST_BUILD_NAME "${CTEST_BUILD_CONFIGURATION}")
 endif()
 
-#Supported: Valgrind, ThreadSanitizer, AddressSanitizer.
+#Supported: Valgrind, ThreadSanitizer, AddressSanitizer, LeakSanitizer
+#and UndefinedBehaviorSanitizer.
 if (NOT DEFINED CTEST_MEMORYCHECK_TYPE)
-    set(CTEST_MEMORYCHECK_TYPE "None")
+    set(CTEST_MEMORYCHECK_TYPE "NONE")
 endif()
 
 # Find coverage and valgrind tools
@@ -112,28 +124,39 @@ if(CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
 endif()
 
 if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind")
-    set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "verbosity=1")
+    set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "${CTEST_MEMORYCHECK_SANITIZER_OPTIONS}:allocator_may_return_null=1:verbosity=1")
 endif()
 
 include(ProcessorCount)
 ProcessorCount(PROC_COUNT)
 if(NOT PROC_COUNT EQUAL 0)
-    if (PROC_COUNT GREATER 10)
-        set(PROCT_COUNT 10)
+    if (DEFINED ENV{CI_PARALLELISM})
+        set(PROC_COUNT "$ENV{CI_PARALLELISM}")
+    elseif(PROC_COUNT LESS 4)
+        set(PROC_COUNT 1)
+    else()
+        set(PROC_COUNT 4)
     endif()
     if(NOT WIN32)
         set(CTEST_BUILD_FLAGS "-j${PROC_COUNT}")
     endif(NOT WIN32)
 endif()
 
+
 ctest_start("${CTEST_TEST_MODEL}")
 ctest_submit(PARTS Start)
 
-if(CTEST_MEMORYCHECK_TYPE STREQUAL "AddressSanitizer" OR CTEST_MEMORYCHECK_TYPE STREQUAL "ThreadSanitizer")
-    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
+if((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
 else()
-    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
 endif()
+
+# UBSAN needs gold linker
+if (CTEST_MEMORYCHECK_TYPE STREQUAL "UndefinedBehaviorSanitizer")
+    set(GINKGO_CONFIGURE_OPTIONS "${GINKGO_CONFIGURE_OPTIONS};-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=gold;-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=gold")
+endif()
+
 ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" OPTIONS "${GINKGO_CONFIGURE_OPTIONS}" APPEND)
 ctest_submit(PARTS Configure)
 
@@ -146,7 +169,7 @@ ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
 ctest_submit(PARTS Build)
 
 
-if (CTEST_MEMORYCHECK_TYPE STREQUAL "None")
+if (CTEST_MEMORYCHECK_TYPE STREQUAL "NONE")
     ctest_test(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
     ctest_submit(PARTS Test)
 endif()
@@ -156,7 +179,7 @@ if (CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
     ctest_submit(PARTS Coverage)
 endif()
 
-if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "None")
+if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE")
     ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
     ctest_submit(PARTS MemCheck)
 endif()
diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index e944ff6e933..0348f956e7b 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -35,6 +35,7 @@ set(GINKGO_EXE_LINKER_FLAGS_RELEASE "@CMAKE_EXE_LINKER_FLAGS_RELEASE@")
 set(GINKGO_BUILD_REFERENCE @GINKGO_BUILD_REFERENCE@)
 set(GINKGO_BUILD_OMP @GINKGO_BUILD_OMP@)
 set(GINKGO_BUILD_CUDA @GINKGO_BUILD_CUDA@)
+set(GINKGO_BUILD_HIP @GINKGO_BUILD_HIP@)
 
 set(GINKGO_DEVEL_TOOLS @GINKGO_DEVEL_TOOLS@)
 set(GINKGO_BUILD_TESTS @GINKGO_BUILD_TESTS@)
@@ -59,8 +60,16 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@)
 set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@)
 
 set(GINKGO_CUDA_ARCHITECTURES @GINKGO_CUDA_ARCHITECTURES@)
+set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@)
 set(GINKGO_CUDA_HOST_COMPILER @CMAKE_CUDA_HOST_COMPILER@)
 
+set(GINKGO_HIP_COMPILER_FLAGS @GINKGO_HIP_COMPILER_FLAGS@)
+set(GINKGO_HIP_HCC_COMPILER_FLAGS @GINKGO_HIP_HCC_COMPILER_FLAGS@)
+set(GINKGO_HIP_NVCC_COMPILER_FLAGS @GINKGO_HIP_NVCC_COMPILER_FLAGS@)
+set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@)
+set(GINKGO_HIP_AMDGPU @GINKGO_HIP_AMDGPU@)
+set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@)
+
 set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@)
 
 # Ginkgo external package variables
@@ -87,7 +96,10 @@ set(GINKGO_INSTALL_LIBRARY_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_LIBRARY
 set(GINKGO_INSTALL_PKGCONFIG_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_PKGCONFIG_DIR@")
 set(GINKGO_INSTALL_CONFIG_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_CONFIG_DIR@")
 set(GINKGO_INSTALL_MODULE_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_MODULE_DIR@")
-set(CMAKE_MODULE_PATH "${GINKGO_INSTALL_MODULE_DIR}")
+
+# Forward Ginkgo's MODULE PATH and the PREFIX PATH for HIP and more
+list(APPEND CMAKE_MODULE_PATH "@CMAKE_MODULE_PATH@" "${GINKGO_INSTALL_MODULE_DIR}")
+list(APPEND CMAKE_PREFIX_PATH "@CMAKE_PREFIX_PATH@")
 
 
 set(GINKGO_INTERFACE_LINK_LIBRARIES "@GINKGO_INTERFACE_LINK_LIBRARIES@")
@@ -104,9 +116,9 @@ set(GINKGO_CUSPARSE_LIBRARIES @CUSPARSE@)
 set(GINKGO_CUDA_LIBRARIES @CUDA_RUNTIME_LIBS@)
 set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@")
 
-set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@")
-set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG@")
-set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE@")
+set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@")
+set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG_MODIFY@")
+set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@")
 
 # OpenMP
 set(GINKGO_OPENMP_VERSION @OpenMP_CXX_VERSION@)
@@ -116,10 +128,47 @@ set(GINKGO_OPENMP_LIBRARIES @OpenMP_CXX_LIBRARIES@)
 
 set(GINKGO_OPENMP_FLAGS "@OpenMP_CXX_FLAGS@")
 
-# Modulepath configuration
+# Provide useful HIP helper functions
+include(${CMAKE_CURRENT_LIST_DIR}/hip_helpers.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/windows_helpers.cmake)
 
 # NOTE: we do not export benchmarks, examples, tests or devel tools
 #     so `third_party` libraries are currently unneeded.
 
+# propagate CUDA_HOST_COMPILER if Ginkgo was built with CUDA
+if (GINKGO_BUILD_CUDA AND GINKGO_CUDA_HOST_COMPILER AND NOT CMAKE_CUDA_HOST_COMPILER)
+    message(STATUS "Ginkgo: Setting CUDA host compiler to ${GINKGO_CXX_COMPILER}")
+    set(CMAKE_CUDA_HOST_COMPILER "${GINKGO_CXX_COMPILER}" CACHE STRING "" FORCE)
+endif()
+
+if(GINKGO_HAVE_PAPI_SDE)
+    find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde)
+endif()
+
+# HIP depends on Threads::Threads in some circumstances, but doesn't find it
+if (GINKGO_BUILD_HIP)
+    find_package(Threads REQUIRED)
+endif()
+
+# Needed because of a known issue with CUDA while linking statically.
+# For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614
+if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA)
+    enable_language(CUDA)
+endif()
+
+if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
+    find_package(HIP REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(hipsparse REQUIRED)
+    if(GINKGO_HIP_PLATFORM MATCHES "hcc")
+        ginkgo_hip_ban_link_hcflag(hcc::hccrt)
+        ginkgo_hip_ban_link_hcflag(hcc::hc_am)
+        ginkgo_hip_ban_link_hcflag(hcc::mcwamp)
+        ginkgo_hip_ban_compile_hcflag(hcc::hccrt)
+        ginkgo_hip_ban_compile_hcflag(hcc::hc_am)
+        ginkgo_hip_ban_compile_hcflag(hcc::mcwamp)
+    endif()
+endif()
+
 
 include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake)
diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake
new file mode 100644
index 00000000000..1f90640acb9
--- /dev/null
+++ b/cmake/autodetect_executors.cmake
@@ -0,0 +1,27 @@
+set(GINKGO_HAS_OMP OFF)
+set(GINKGO_HAS_CUDA OFF)
+set(GINKGO_HAS_HIP OFF)
+find_package(OpenMP)
+include(CheckLanguage)
+check_language(CUDA)
+
+if(OpenMP_CXX_FOUND)
+    if(NOT DEFINED GINKGO_BUILD_OMP)
+        message(STATUS "Enabling OpenMP executor")
+    endif()
+    set(GINKGO_HAS_OMP ON)
+endif()
+
+if(CMAKE_CUDA_COMPILER)
+    if(NOT DEFINED GINKGO_BUILD_CUDA)
+        message(STATUS "Enabling CUDA executor")
+    endif()
+    set(GINKGO_HAS_CUDA ON)
+endif()
+
+if(GINKGO_HIPCONFIG_PATH)
+    if(NOT DEFINED GINKGO_BUILD_HIP)
+        message(STATUS "Enabling HIP executor")
+    endif()
+    set(GINKGO_HAS_HIP ON)
+endif()
diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index 8a8ad047d56..860926add11 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -17,6 +17,9 @@ function(ginkgo_compile_features name)
     if(GINKGO_WITH_IWYU AND GINKGO_IWYU_PATH)
         set_property(TARGET "${name}" PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${GINKGO_IWYU_PATH})
     endif()
+    # Set an appropriate SONAME
+    set_property(TARGET "${name}" PROPERTY
+        SOVERSION "${Ginkgo_VERSION}")
     if(GINKGO_CHANGED_SHARED_LIBRARY)
         # Put all shared libraries and corresponding imported libraries into the specified path
         set_property(TARGET "${name}" PROPERTY
@@ -37,6 +40,61 @@ function(ginkgo_compile_features name)
             ginkgo_check_shared_library("${CMAKE_SHARED_LIBRARY_PREFIX}${name}${CMAKE_SHARED_LIBRARY_SUFFIX}")
         endif()
     endif()
+
+    if (GINKGO_CHECK_CIRCULAR_DEPS)
+        target_link_libraries("${name}" PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+    endif()
+
+    set_target_properties("${name}" PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endfunction()
+
+function(ginkgo_check_headers target)
+    # "Compile" every header found below the current source directory by writing
+    # one proxy source per header that only includes it; `test*` paths are skipped.
+    file(GLOB_RECURSE CUDA_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" CONFIGURE_DEPENDS "*.cuh")
+    file(GLOB_RECURSE HIP_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" CONFIGURE_DEPENDS "*.hip.hpp")
+    file(GLOB_RECURSE CXX_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" CONFIGURE_DEPENDS "*.hpp")
+    list(FILTER CXX_HEADERS EXCLUDE REGEX ".*\\.hip\\.hpp$")
+    list(FILTER CXX_HEADERS EXCLUDE REGEX "^test.*")
+    list(FILTER CUDA_HEADERS EXCLUDE REGEX "^test.*")
+    list(FILTER HIP_HEADERS EXCLUDE REGEX "^test.*")
+
+    set(SOURCES "")
+    foreach(HEADER ${CUDA_HEADERS})
+        set(HEADER_SOURCEFILE "${CMAKE_CURRENT_BINARY_DIR}/${HEADER}.cu")
+        file(WRITE "${HEADER_SOURCEFILE}" "#include \"${HEADER}\"")
+        list(APPEND SOURCES "${HEADER_SOURCEFILE}")
+    endforeach()
+
+    foreach(HEADER ${CXX_HEADERS})
+        set(HEADER_SOURCEFILE "${CMAKE_CURRENT_BINARY_DIR}/${HEADER}.cpp")
+        file(WRITE "${HEADER_SOURCEFILE}" "#include \"${HEADER}\"")
+        list(APPEND SOURCES "${HEADER_SOURCEFILE}")
+    endforeach()
+    if (SOURCES)
+        add_library(${target}_headers OBJECT ${SOURCES})
+        target_link_libraries(${target}_headers PRIVATE ${target})
+        target_include_directories(${target}_headers PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+    endif()
+
+    set(HIP_SOURCES "")
+    foreach(HEADER ${HIP_HEADERS})
+        set(HEADER_SOURCEFILE "${CMAKE_CURRENT_BINARY_DIR}/${HEADER}.hip.cpp")
+        file(WRITE "${HEADER_SOURCEFILE}" "#include \"${HEADER}\"")
+        list(APPEND HIP_SOURCES "${HEADER_SOURCEFILE}")
+    endforeach()
+    if (HIP_SOURCES)
+        set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
+        hip_add_library(${target}_headers_hip ${HIP_SOURCES}) # the compiler options get set by linking to ginkgo_hip
+        target_link_libraries(${target}_headers_hip PRIVATE ${target} roc::hipblas roc::hipsparse)
+        target_include_directories(${target}_headers_hip
+            PRIVATE
+            "${CMAKE_CURRENT_SOURCE_DIR}"
+            "${GINKGO_HIP_THRUST_PATH}"
+            "${HIPBLAS_INCLUDE_DIRS}"
+            "${HIPSPARSE_INCLUDE_DIRS}"
+            "${ROCPRIM_INCLUDE_DIRS}")
+    endif()
 endfunction()
 
 function(ginkgo_check_shared_library name)
@@ -73,25 +131,8 @@ function(ginkgo_check_shared_library name)
     endif()
 endfunction()
 
-function(ginkgo_switch_windows_link lang from to)
-    foreach(flag_var
-        "CMAKE_${lang}_FLAGS" "CMAKE_${lang}_FLAGS_DEBUG" "CMAKE_${lang}_FLAGS_RELEASE"
-        "CMAKE_${lang}_FLAGS_MINSIZEREL" "CMAKE_${lang}_FLAGS_RELWITHDEBINFO"
-        )
-        if(${flag_var} MATCHES "/${from}")
-            string(REGEX REPLACE "/${from}" "/${to}" ${flag_var} "${${flag_var}}")
-        endif(${flag_var} MATCHES "/${from}")
-        if(${flag_var} MATCHES "-${from}")
-            string(REGEX REPLACE "-${from}" "-${to}" ${flag_var} "${${flag_var}}")
-        endif(${flag_var} MATCHES "-${from}")
-        set(${flag_var} "${${flag_var}}" CACHE STRING "" FORCE)
-    endforeach()
-endfunction()
-
-macro(ginkgo_switch_to_windows_static lang)
-    ginkgo_switch_windows_link(${lang} "MD" "MT")
-endmacro()
-
-macro(ginkgo_switch_to_windows_dynamic lang)
-    ginkgo_switch_windows_link(${lang} "MT" "MD")
+macro(ginkgo_modify_flags name)
+    # add escape before "
+    # the result var is ${name}_MODIFY
+    string(REPLACE "\"" "\\\"" ${name}_MODIFY "${${name}}")
 endmacro()
diff --git a/cmake/build_type_helpers.cmake b/cmake/build_type_helpers.cmake
index eba35b828d6..f3366b031e0 100644
--- a/cmake/build_type_helpers.cmake
+++ b/cmake/build_type_helpers.cmake
@@ -27,18 +27,52 @@
 
 include(CMakeDependentOption)
 
-set(${PROJECT_NAME}_CUSTOM_BUILD_TYPES "COVERAGE;TSAN;ASAN" CACHE INTERNAL "")
+set(${PROJECT_NAME}_CUSTOM_BUILD_TYPES      "COVERAGE;TSAN;ASAN;LSAN;UBSAN" CACHE INTERNAL "")
+
+# LLVM provides all sanitizers in a single library, but they are separate in GCC
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    set(GKO_TSAN_LIBRARIES "-static-libsan")
+    set(GKO_UBSAN_LIBRARIES "-static-libsan")
+else()
+    set(GKO_TSAN_LIBRARIES "-static-libtsan")
+    set(GKO_UBSAN_LIBRARIES "-static-libubsan")
+endif()
 
 set(${PROJECT_NAME}_COVERAGE_COMPILER_FLAGS "-g -O0 --coverage" CACHE INTERNAL "")
 set(${PROJECT_NAME}_COVERAGE_LINKER_FLAGS   "--coverage"        CACHE INTERNAL "")
-set(${PROJECT_NAME}_TSAN_COMPILER_FLAGS "-g -O1 -fsanitize=thread -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "")
-set(${PROJECT_NAME}_TSAN_LINKER_FLAGS   "-fsanitize=thread -static-libtsan -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "")
-set(${PROJECT_NAME}_ASAN_COMPILER_FLAGS "-g -O1 -fsanitize=address -fno-omit-frame-pointer" CACHE INTERNAL "")
-set(${PROJECT_NAME}_ASAN_LINKER_FLAGS   "-fsanitize=address -fno-omit-frame-pointer"        CACHE INTERNAL "")
+set(${PROJECT_NAME}_TSAN_COMPILER_FLAGS     "-g -O1 -fsanitize=thread -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "")
+set(${PROJECT_NAME}_TSAN_LINKER_FLAGS       "-fsanitize=thread ${GKO_TSAN_LIBRARIES} -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "")
+set(${PROJECT_NAME}_ASAN_COMPILER_FLAGS     "-g -O1 -fsanitize=address -fno-omit-frame-pointer" CACHE INTERNAL "")
+set(${PROJECT_NAME}_ASAN_LINKER_FLAGS       "-fsanitize=address -fno-omit-frame-pointer"        CACHE INTERNAL "")
+set(${PROJECT_NAME}_LSAN_COMPILER_FLAGS     "-g -O1 -fsanitize=leak" CACHE INTERNAL "")
+set(${PROJECT_NAME}_LSAN_LINKER_FLAGS       "-fsanitize=leak"        CACHE INTERNAL "")
+set(${PROJECT_NAME}_UBSAN_COMPILER_FLAGS    "-g -O1 -fsanitize=undefined ${GKO_UBSAN_LIBRARIES}" CACHE INTERNAL "")
+set(${PROJECT_NAME}_UBSAN_LINKER_FLAGS      "-fsanitize=undefined ${GKO_UBSAN_LIBRARIES}"        CACHE INTERNAL "")
+
+# We need to wrap all flags with `-Xcompiler` for HIP when using the NVCC backend
+function(GKO_XCOMPILER varname varlist)
+    set(tmp "")
+    foreach(item IN LISTS varlist)
+        set(tmp "${tmp} -Xcompiler \\\\\\\"${item}\\\\\\\"")
+    endforeach()
+    set(${varname} "${tmp}" CACHE INTERNAL "")
+endfunction()
+
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_COVERAGE_COMPILER_FLAGS "-g;-O0;--coverage")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_COVERAGE_LINKER_FLAGS   "--coverage")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_TSAN_COMPILER_FLAGS     "-g;-O1;-fsanitize=thread;-fno-omit-frame-pointer;-fPIC")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_TSAN_LINKER_FLAGS       "-fsanitize=thread;-static-libtsan;-fno-omit-frame-pointer;-fPIC")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_ASAN_COMPILER_FLAGS     "-g;-O1;-fsanitize=address;-fno-omit-frame-pointer")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_ASAN_LINKER_FLAGS       "-fsanitize=address;-fno-omit-frame-pointer")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_LSAN_COMPILER_FLAGS     "-g;-O1;-fsanitize=leak")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_LSAN_LINKER_FLAGS       "-fsanitize=leak")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_UBSAN_COMPILER_FLAGS    "-g;-O1;-fsanitize=undefined;-static-libubsan")
+GKO_XCOMPILER(${PROJECT_NAME}_NVCC_UBSAN_LINKER_FLAGS      "-fsanitize=undefined;-static-libubsan")
+
 
 get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
 
-foreach(_LANG IN LISTS ENABLED_LANGUAGES)
+foreach(_LANG IN LISTS ENABLED_LANGUAGES ITEMS "HIP")
     include(Check${_LANG}CompilerFlag OPTIONAL)
     foreach(_TYPE IN LISTS ${PROJECT_NAME}_CUSTOM_BUILD_TYPES)
         # Required for check_<LANG>_compiler_flag. Caution, this can break several
@@ -49,7 +83,7 @@ foreach(_LANG IN LISTS ENABLED_LANGUAGES)
         if(_LANG STREQUAL "C")
             check_c_compiler_flag("${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}"
                 ${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED)
-        elseif(_LANG STREQUAL "CXX")
+        elseif(_LANG STREQUAL "CXX" OR _LANG STREQUAL "HIP")
             check_cxx_compiler_flag("${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}"
                 ${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED)
         else()
@@ -60,13 +94,23 @@ foreach(_LANG IN LISTS ENABLED_LANGUAGES)
 		        continue()
         endif()
         if(${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED)
-            set(CMAKE_${_LANG}_FLAGS_${_TYPE}
-                ${${PROJECT_NAME}_${_TYPE}_COMPILER_FLAGS}
-                CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE
-            )
-            mark_as_advanced(CMAKE_${_LANG}_FLAGS_${_TYPE})
-            set(${PROJECT_NAME}_${_TYPE}_SUPPORTED TRUE CACHE
-                STRING "Whether or not coverage is supported by at least one compiler." FORCE)
+            if(_LANG STREQUAL "HIP" AND GINKGO_HIP_PLATFORM STREQUAL "nvcc")
+                set(CMAKE_${_LANG}_FLAGS_${_TYPE}
+                    ${${PROJECT_NAME}_NVCC_${_TYPE}_COMPILER_FLAGS}
+                    CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE
+                )
+                mark_as_advanced(CMAKE_${_LANG}_FLAGS_${_TYPE})
+                set(${PROJECT_NAME}_${_TYPE}_SUPPORTED TRUE CACHE
+                    STRING "Whether or not coverage is supported by at least one compiler." FORCE)
+            else()
+                set(CMAKE_${_LANG}_FLAGS_${_TYPE}
+                    ${${PROJECT_NAME}_${_TYPE}_COMPILER_FLAGS}
+                    CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE
+                )
+                mark_as_advanced(CMAKE_${_LANG}_FLAGS_${_TYPE})
+                set(${PROJECT_NAME}_${_TYPE}_SUPPORTED TRUE CACHE
+                    STRING "Whether or not coverage is supported by at least one compiler." FORCE)
+            endif()
         endif()
         set(CMAKE_REQUIRED_LIBRARIES ${_CMAKE_REQUIRED_LIBRARIES})
     endforeach()
@@ -74,21 +118,6 @@ endforeach()
 
 
 foreach(_TYPE IN LISTS ${PROJECT_NAME}_CUSTOM_BUILD_TYPES)
-    if(${PROJECT_NAME}_${_TYPE}_SUPPORTED)
-        set(CMAKE_EXE_LINKER_FLAGS_${_TYPE}
-            "${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}"
-            CACHE STRING "Flags used for linking binaries during ${_TYPE} builds." FORCE
-        )
-        set(CMAKE_SHARED_LINKER_FLAGS_${_TYPE}
-            "${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}"
-            CACHE STRING "Flags used by the shared libraries linker during ${_TYPE} builds." FORCE
-        )
-        mark_as_advanced(
-           CMAKE_EXE_LINKER_FLAGS_${_TYPE}
-           CMAKE_SHARED_LINKER_FLAGS_${_TYPE}
-        )
-    endif()
-
     cmake_dependent_option(${PROJECT_NAME}_${_TYPE}_IN_CONFIGURATION_TYPES
         "Should the ${_TYPE} target be in the CMAKE_CONFIGURATION_TYPES list if supported ?" ON
         # No need for this option if we are not using a multi-config generator
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index f6e2e165437..ca639fe5278 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -1,29 +1,158 @@
 function(ginkgo_create_test test_name)
     file(RELATIVE_PATH REL_BINARY_DIR
-         ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
     string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
     add_executable(${TEST_TARGET_NAME} ${test_name}.cpp)
     target_include_directories("${TEST_TARGET_NAME}"
         PRIVATE
-            "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
+        "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
         )
     set_target_properties(${TEST_TARGET_NAME} PROPERTIES
         OUTPUT_NAME ${test_name})
-    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::GTest GTest::Main ${ARGN})
+    if (GINKGO_CHECK_CIRCULAR_DEPS)
+        target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+    endif()
+    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN})
     add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME})
 endfunction(ginkgo_create_test)
 
+function(ginkgo_create_thread_test test_name)
+    set(THREADS_PREFER_PTHREAD_FLAG ON)
+    find_package(Threads REQUIRED)
+    file(RELATIVE_PATH REL_BINARY_DIR
+        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+    string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
+    add_executable(${TEST_TARGET_NAME} ${test_name}.cpp)
+    target_include_directories("${TEST_TARGET_NAME}"
+        PRIVATE
+        "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
+        )
+    set_target_properties(${TEST_TARGET_NAME} PROPERTIES
+        OUTPUT_NAME ${test_name})
+    if (GINKGO_CHECK_CIRCULAR_DEPS)
+        target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+    endif()
+    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest Threads::Threads ${ARGN})
+    add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME})
+endfunction(ginkgo_create_thread_test)
+
+function(ginkgo_create_test_cpp_cuda_header test_name)
+    file(RELATIVE_PATH REL_BINARY_DIR
+        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+    string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
+    add_executable(${TEST_TARGET_NAME} ${test_name}.cpp)
+    target_include_directories("${TEST_TARGET_NAME}"
+        PRIVATE
+        "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
+        "${CUDA_INCLUDE_DIRS}"
+        )
+    set_target_properties(${TEST_TARGET_NAME} PROPERTIES
+        OUTPUT_NAME ${test_name})
+    if (GINKGO_CHECK_CIRCULAR_DEPS)
+        target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+    endif()
+    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN})
+    add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME})
+endfunction(ginkgo_create_test_cpp_cuda_header)
+
 function(ginkgo_create_cuda_test test_name)
     file(RELATIVE_PATH REL_BINARY_DIR
-         ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
     string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
     add_executable(${TEST_TARGET_NAME} ${test_name}.cu)
     target_include_directories("${TEST_TARGET_NAME}"
         PRIVATE
-            "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
+        "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
         )
     set_target_properties(${TEST_TARGET_NAME} PROPERTIES
         OUTPUT_NAME ${test_name})
-    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::GTest GTest::Main ${ARGN})
+
+    if (GINKGO_CHECK_CIRCULAR_DEPS)
+        target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+    endif()
+    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN})
     add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME})
 endfunction(ginkgo_create_cuda_test)
+
+function(ginkgo_create_hip_test_special_linkage test_name)
+    # use gcc to compile but use hip to link
+    file(RELATIVE_PATH REL_BINARY_DIR
+        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+    string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
+    add_executable(${TEST_TARGET_NAME} ${test_name}.cpp)
+    # Fix the missing metadata when building static library.
+    if(GINKGO_HIP_PLATFORM MATCHES "hcc" AND NOT BUILD_SHARED_LIBS)
+        set_target_properties(${TEST_TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    endif()
+    target_include_directories("${TEST_TARGET_NAME}"
+        PRIVATE
+        "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
+        )
+    set_target_properties(${TEST_TARGET_NAME} PROPERTIES
+        OUTPUT_NAME ${test_name})
+    if (GINKGO_CHECK_CIRCULAR_DEPS)
+        target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+    endif()
+    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN})
+    add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME})
+endfunction(ginkgo_create_hip_test_special_linkage)
+
+function(ginkgo_create_hip_test test_name)
+    file(RELATIVE_PATH REL_BINARY_DIR
+        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+    string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
+
+    set_source_files_properties(${test_name}.hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
+    # CLANG_OPTIONS is only passed for HIP >= 3.5; compare as a version, not as a number.
+    if (HIP_VERSION VERSION_GREATER_EQUAL "3.5")
+        hip_add_executable(${TEST_TARGET_NAME} ${test_name}.hip.cpp
+            HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
+            NVCC_OPTIONS  ${GINKGO_HIP_NVCC_OPTIONS}
+            HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS}
+            CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS})
+    else()
+        hip_add_executable(${TEST_TARGET_NAME} ${test_name}.hip.cpp
+            HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
+            NVCC_OPTIONS  ${GINKGO_HIP_NVCC_OPTIONS}
+            HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS})
+    endif()
+
+    # Let's really not use nvcc for linking here
+    if (GINKGO_HIP_PLATFORM MATCHES "nvcc")
+        set_target_properties(${TEST_TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+    endif()
+
+    target_include_directories("${TEST_TARGET_NAME}"
+        PRIVATE
+        "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
+        # Only `math` requires it so far, but it's much easier
+        # to put these this way.
+        ${GINKGO_HIP_THRUST_PATH}
+        # Only `exception_helpers` requires these so far, but it's much easier
+        # to put these this way.
+        ${HIPBLAS_INCLUDE_DIRS}
+        ${HIPSPARSE_INCLUDE_DIRS}
+        )
+    set_target_properties(${TEST_TARGET_NAME} PROPERTIES
+        OUTPUT_NAME ${test_name})
+
+    # Pass in the `--amdgpu-target` flags if asked
+    if(GINKGO_HIP_AMDGPU AND GINKGO_HIP_PLATFORM MATCHES "hcc")
+        foreach(target ${GINKGO_HIP_AMDGPU})
+            target_link_libraries(${TEST_TARGET_NAME} PRIVATE --amdgpu-target=${target})
+        endforeach()
+    endif()
+
+    # GINKGO_RPATH_FOR_HIP needs to be populated before calling this for the linker to include
+    # our libraries path into the executable's runpath.
+    if(BUILD_SHARED_LIBS)
+        target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_RPATH_FOR_HIP}")
+
+        if (GINKGO_CHECK_CIRCULAR_DEPS)
+            target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
+        endif()
+    endif()
+
+    target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN})
+    add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME})
+endfunction(ginkgo_create_hip_test)
diff --git a/cmake/hip_helpers.cmake b/cmake/hip_helpers.cmake
new file mode 100644
index 00000000000..c296ffc1228
--- /dev/null
+++ b/cmake/hip_helpers.cmake
@@ -0,0 +1,30 @@
+macro(ginkgo_hip_ban_link_hcflag target)  # strips "-hc " from ${target}'s INTERFACE_LINK_LIBRARIES
+    if(TARGET ${target})
+        get_property(GINKGO_TARGET_ILL TARGET ${target} PROPERTY INTERFACE_LINK_LIBRARIES)  # empty, not "-NOTFOUND", when unset
+        string(REPLACE "-hc " "" GINKGO_TARGET_NEW_ILL "${GINKGO_TARGET_ILL}")
+        set_target_properties(${target} PROPERTIES INTERFACE_LINK_LIBRARIES "${GINKGO_TARGET_NEW_ILL}")
+    endif()
+endmacro()
+
+macro(ginkgo_hip_ban_compile_hcflag target)  # strips "-hc" from ${target}'s INTERFACE_COMPILE_OPTIONS
+    if(TARGET ${target})
+        get_property(GINKGO_TARGET_ILL TARGET ${target} PROPERTY INTERFACE_COMPILE_OPTIONS)  # empty, not "-NOTFOUND", when unset
+        string(REPLACE "-hc" "" GINKGO_TARGET_NEW_ILL "${GINKGO_TARGET_ILL}")
+        set_target_properties(${target} PROPERTIES INTERFACE_COMPILE_OPTIONS "${GINKGO_TARGET_NEW_ILL}")
+    endif()
+endmacro()
+
+macro(ginkgo_hip_clang_ban_hip_device_flags)
+    if (GINKGO_HIP_VERSION VERSION_GREATER_EQUAL "3.5")
+        # Compile options somehow add hip-clang specific flags. Wipe them.
+        # Currently, the flags wiped out should be:
+        # -x;hip;--hip-device-lib-path=/opt/rocm/lib;--cuda-gpu-arch=gfx900;
+        # --cuda-gpu-arch=gfx906
+        set_target_properties(hip::device PROPERTIES INTERFACE_COMPILE_OPTIONS "")
+        # In addition, link libraries have a similar problem. We only keep
+        # `hip::host`. Currently, the flags should be:
+        # hip::host;--hip-device-lib-path=/opt/rocm/lib;--hip-link;
+        # --cuda-gpu-arch=gfx900;--cuda-gpu-arch=gfx906
+        set_target_properties(hip::device PROPERTIES INTERFACE_LINK_LIBRARIES "hip::host")
+    endif()
+endmacro()
diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake
new file mode 100644
index 00000000000..aa0e116527b
--- /dev/null
+++ b/cmake/hip_path.cmake
@@ -0,0 +1,19 @@
+# Determines the HIP installation prefix `HIP_PATH` and locates `hipconfig`.
+#
+# When HIP_PATH is not already defined, it is cached from the HIP_PATH
+# environment variable if that is set; otherwise it defaults to /opt/rocm/hip
+# and that default is also exported to the environment of the configure run.
+if(NOT DEFINED HIP_PATH)
+    if(DEFINED ENV{HIP_PATH})
+        set(HIP_PATH "$ENV{HIP_PATH}" CACHE PATH "Path to which HIP has been installed")
+    else()
+        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
+        set(ENV{HIP_PATH} "${HIP_PATH}")
+    endif()
+endif()
+
+# `${HIP_PATH}/bin` is searched in addition to the default locations.
+find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
+if(GINKGO_HIPCONFIG_PATH)
+    message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}")
+endif()
diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake
index 13b3f85061d..e128fb5869a 100644
--- a/cmake/information_helpers.cmake
+++ b/cmake/information_helpers.cmake
@@ -18,12 +18,12 @@ macro(ginkgo_git_information)
                 OUTPUT_VARIABLE GINKGO_GIT_BRANCH
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
             execute_process(
-                COMMAND ${GIT_EXECUTABLE} log -1 --format=%H ${Gingko_SOURCE_DIR}
+                COMMAND ${GIT_EXECUTABLE} log -1 --format=%H ${Ginkgo_SOURCE_DIR}
                 WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}
                 OUTPUT_VARIABLE GINKGO_GIT_REVISION
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
             execute_process(
-                COMMAND ${GIT_EXECUTABLE} log -1 --format=%h ${Gingko_SOURCE_DIR}
+                COMMAND ${GIT_EXECUTABLE} log -1 --format=%h ${Ginkgo_SOURCE_DIR}
                 WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}
                 OUTPUT_VARIABLE GINKGO_GIT_SHORTREV
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake
index fd0c90d383f..ba7ea3fd468 100644
--- a/cmake/install_helpers.cmake
+++ b/cmake/install_helpers.cmake
@@ -9,12 +9,23 @@ set(GINKGO_INSTALL_CONFIG_DIR "lib/cmake/Ginkgo")
 set(GINKGO_INSTALL_MODULE_DIR "lib/cmake/Ginkgo/Modules")
 
 function(ginkgo_install_library name subdir)
-    # install .so and .a files
-    install(TARGETS "${name}"
-        EXPORT Ginkgo
-        LIBRARY DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
-        ARCHIVE DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
+    # Install the library target `name` and add it to the `Ginkgo` export set.
+    if (WIN32 OR CYGWIN)
+        # a .dll counts as a RUNTIME artifact, so it needs a RUNTIME destination
+        install(TARGETS "${name}"
+            EXPORT Ginkgo
+            LIBRARY DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
+            ARCHIVE DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
+            RUNTIME DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
+            )
+    else ()
+        # install .so (LIBRARY) and .a (ARCHIVE) files
+        install(TARGETS "${name}"
+            EXPORT Ginkgo
+            LIBRARY DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
+            ARCHIVE DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR}
         )
+    endif ()
 endfunction()
 
 function(ginkgo_install)
@@ -59,9 +70,14 @@ function(ginkgo_install)
     install(FILES
         "${Ginkgo_BINARY_DIR}/GinkgoConfig.cmake"
         "${Ginkgo_BINARY_DIR}/GinkgoConfigVersion.cmake"
-        "${Ginkgo_BINARY_DIR}/GinkgoTargets.cmake"
+        "${Ginkgo_SOURCE_DIR}/cmake/hip_helpers.cmake"
+        "${Ginkgo_SOURCE_DIR}/cmake/windows_helpers.cmake"
         DESTINATION "${GINKGO_INSTALL_CONFIG_DIR}"
         )
+      install(EXPORT Ginkgo
+        NAMESPACE Ginkgo::
+        FILE GinkgoTargets.cmake
+        DESTINATION "${GINKGO_INSTALL_CONFIG_DIR}")
 
     # Export package for use from the build tree
     if (GINKGO_EXPORT_BUILD_DIR)
diff --git a/cmake/windows_helpers.cmake b/cmake/windows_helpers.cmake
new file mode 100644
index 00000000000..5f517a555ad
--- /dev/null
+++ b/cmake/windows_helpers.cmake
@@ -0,0 +1,30 @@
+# Rewrites a compiler switch in the flag variables of language `lang`.
+#
+# For CMAKE_<lang>_FLAGS and its DEBUG, RELEASE, MINSIZEREL and RELWITHDEBINFO
+# variants, every `/<from>` or `-<from>` is replaced by `/<to>` or `-<to>`
+# respectively (`from` is interpreted as a regular expression), and the result
+# is force-written to the cache.
+function(ginkgo_switch_windows_link lang from to)
+    set(flag_vars
+        "CMAKE_${lang}_FLAGS"
+        "CMAKE_${lang}_FLAGS_DEBUG"
+        "CMAKE_${lang}_FLAGS_RELEASE"
+        "CMAKE_${lang}_FLAGS_MINSIZEREL"
+        "CMAKE_${lang}_FLAGS_RELWITHDEBINFO"
+        )
+    foreach(flag_var IN LISTS flag_vars)
+        # The capture group keeps the original prefix character (`/` or `-`).
+        string(REGEX REPLACE "([/-])${from}" "\\1${to}" updated_flags "${${flag_var}}")
+        set(${flag_var} "${updated_flags}" CACHE STRING "" FORCE)
+    endforeach()
+endfunction()
+
+# Replaces the `MD` switch by `MT` in the flags of language `lang`.
+macro(ginkgo_switch_to_windows_static lang)
+    ginkgo_switch_windows_link(${lang} "MD" "MT")
+endmacro()
+
+# Replaces the `MT` switch by `MD` in the flags of language `lang`.
+macro(ginkgo_switch_to_windows_dynamic lang)
+    ginkgo_switch_windows_link(${lang} "MT" "MD")
+endmacro()
diff --git a/codecov.yml b/codecov.yml
index a065fc381f9..bdb86838644 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -5,10 +5,12 @@ coverage:
         target: auto
         threshold: 5
         base: auto
+        informational: True
     project:
       default:
         target: auto
         threshold: 2
         base: auto
   ignore:
-    - "**/test/"
+    - "examples"
+    - "benchmark"
diff --git a/common/base/executor.hpp.inc b/common/base/executor.hpp.inc
new file mode 100644
index 00000000000..5b85069c4d8
--- /dev/null
+++ b/common/base/executor.hpp.inc
@@ -0,0 +1,85 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace {
+
+
+// Returns the number of cores per SM for compute capability `major.minor`.
+// Adapted from _ConvertSMVer2Cores of cuda-9.2/samples/common/inc/helper_cuda.h
+inline int convert_sm_ver_to_cores(int major, int minor)
+{
+    // Associates an SM version with its number of cores per SM.
+    struct sm_to_cores {
+        // 0xMm (hexadecimal notation), M = SM major version,
+        // and m = SM minor version
+        int sm;
+        int cores;
+    };
+
+    // The last entry doubles as the fallback for unknown SM versions.
+    const sm_to_cores cores_per_sm[] = {
+        {0x30, 192},  // Kepler Generation (SM 3.0) GK10x class
+        {0x32, 192},  // Kepler Generation (SM 3.2) GK10x class
+        {0x35, 192},  // Kepler Generation (SM 3.5) GK11x class
+        {0x37, 192},  // Kepler Generation (SM 3.7) GK21x class
+        {0x50, 128},  // Maxwell Generation (SM 5.0) GM10x class
+        {0x52, 128},  // Maxwell Generation (SM 5.2) GM20x class
+        {0x53, 128},  // Maxwell Generation (SM 5.3) GM20x class
+        {0x60, 64},   // Pascal Generation (SM 6.0) GP100 class
+        {0x61, 128},  // Pascal Generation (SM 6.1) GP10x class
+        {0x62, 128},  // Pascal Generation (SM 6.2) GP10x class
+        {0x70, 64},   // Volta Generation (SM 7.0) GV100 class
+        {0x72, 64},   // Volta Generation (SM 7.2) GV11b class
+        {0x75, 64},   // Turing Generation (SM 7.5) TU1xx class
+    };
+
+    const int sm_version = (major << 4) + minor;
+    for (const auto &entry : cores_per_sm) {
+        if (entry.sm == sm_version) {
+            return entry.cores;
+        }
+    }
+
+    // If we don't find the values, we use the last valid value by default
+    // to allow proper execution
+    const int num_entries = sizeof(cores_per_sm) / sizeof(cores_per_sm[0]);
+    const int fallback_cores = cores_per_sm[num_entries - 1].cores;
+#if GKO_VERBOSE_LEVEL >= 1
+    std::cerr << "MapSMtoCores for SM " << major << "." << minor
+              << " is undefined. The default value of " << fallback_cores
+              << " Cores/SM is used." << std::endl;
+#endif
+    return fallback_cores;
+}
+
+
+}  // namespace
\ No newline at end of file
diff --git a/common/base/math.hpp.inc b/common/base/math.hpp.inc
new file mode 100644
index 00000000000..3ba49b585c3
--- /dev/null
+++ b/common/base/math.hpp.inc
@@ -0,0 +1,63 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// We need this struct, because otherwise we would call a __host__ function in a
+// __device__ function (even though it is constexpr)
+template <typename T>
+struct device_numeric_limits {
+    static constexpr auto inf = std::numeric_limits<T>::infinity();
+    static constexpr auto max = std::numeric_limits<T>::max();
+    static constexpr auto min = std::numeric_limits<T>::min();
+};
+
+
+namespace detail {
+
+
+template <typename T>
+struct remove_complex_impl<thrust::complex<T>> {
+    using type = T;
+};
+
+
+template <typename T>
+struct is_complex_impl<thrust::complex<T>>
+    : public std::integral_constant<bool, true> {};
+
+
+template <typename T>
+struct truncate_type_impl<thrust::complex<T>> {
+    using type = thrust::complex<typename truncate_type_impl<T>::type>;
+};
+
+
+}  // namespace detail
\ No newline at end of file
diff --git a/common/components/atomic.hpp.inc b/common/components/atomic.hpp.inc
new file mode 100644
index 00000000000..e36306e04d9
--- /dev/null
+++ b/common/components/atomic.hpp.inc
@@ -0,0 +1,152 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+namespace detail {
+
+
+template <typename ValueType, typename = void>
+struct atomic_helper {
+    __forceinline__ __device__ static ValueType atomic_add(ValueType *,
+                                                           ValueType)
+    {
+        static_assert(sizeof(ValueType) == 0,
+                      "This default function is not implemented, only the "
+                      "specializations are.");
+        // TODO: add proper implementation of generic atomic add
+    }
+};
+
+
+template <typename ResultType, typename ValueType>
+__forceinline__ __device__ ResultType reinterpret(ValueType val)
+{
+    static_assert(sizeof(ValueType) == sizeof(ResultType),
+                  "The type to reinterpret to must be of the same size as the "
+                  "original type.");
+    return reinterpret_cast<ResultType &>(val);
+}
+
+
+#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE)                     \
+    template <typename ValueType>                                            \
+    struct atomic_helper<ValueType,                                          \
+                         gko::xstd::enable_if_t<(sizeof(ValueType) ==        \
+                                                 sizeof(CONVERTER_TYPE))>> { \
+        __forceinline__ __device__ static ValueType atomic_add(              \
+            ValueType *__restrict__ addr, ValueType val)                     \
+        {                                                                    \
+            CONVERTER_TYPE *address_as_converter =                           \
+                reinterpret_cast<CONVERTER_TYPE *>(addr);                    \
+            CONVERTER_TYPE old = *address_as_converter;                      \
+            CONVERTER_TYPE assumed;                                          \
+            do {                                                             \
+                assumed = old;                                               \
+                old = atomicCAS(address_as_converter, assumed,               \
+                                reinterpret<CONVERTER_TYPE>(                 \
+                                    val + reinterpret<ValueType>(assumed))); \
+            } while (assumed != old);                                        \
+            return reinterpret<ValueType>(old);                              \
+        }                                                                    \
+    };
+
+// Support 64-bit ATOMIC_ADD
+GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
+// Support 32-bit ATOMIC_ADD
+GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
+
+
+#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010))
+// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
+GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
+#endif
+
+#undef GKO_BIND_ATOMIC_HELPER_STRUCTURE
+
+
+}  // namespace detail
+
+
+template <typename T>
+__forceinline__ __device__ T atomic_add(T *__restrict__ addr, T val)
+{
+    return detail::atomic_helper<T>::atomic_add(addr, val);
+}
+
+
+#define GKO_BIND_ATOMIC_ADD(ValueType)               \
+    __forceinline__ __device__ ValueType atomic_add( \
+        ValueType *__restrict__ addr, ValueType val) \
+    {                                                \
+        return atomicAdd(addr, val);                 \
+    }
+
+GKO_BIND_ATOMIC_ADD(int);
+GKO_BIND_ATOMIC_ADD(unsigned int);
+GKO_BIND_ATOMIC_ADD(unsigned long long int);
+GKO_BIND_ATOMIC_ADD(float);
+
+
+#if !defined(__HIPCC__) || \
+    (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
+
+
+#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
+      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
+// CUDA 8.0 starts supporting 64-bit double atomicAdd on devices of compute
+// capability 6.x and higher
+GKO_BIND_ATOMIC_ADD(double);
+#endif
+
+#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
+      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices
+// of compute capability 7.x and higher.
+GKO_BIND_ATOMIC_ADD(__half);
+#endif
+
+#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
+      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
+// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on
+// devices of compute capability 6.x and higher. note: The atomicity of the
+// __half2 add operation is guaranteed separately for each of the two __half
+// elements; the entire __half2 is not guaranteed to be atomic as a single
+// 32-bit access.
+GKO_BIND_ATOMIC_ADD(__half2);
+#endif
+
+
+#endif  // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) &&
+        // GINKGO_HIP_PLATFORM_NVCC)
+
+
+#undef GKO_BIND_ATOMIC_ADD
\ No newline at end of file
diff --git a/common/components/diagonal_block_manipulation.hpp.inc b/common/components/diagonal_block_manipulation.hpp.inc
new file mode 100644
index 00000000000..2270bc293d7
--- /dev/null
+++ b/common/components/diagonal_block_manipulation.hpp.inc
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ *
+ * @note assumes that block dimensions are in "standard format":
+ *       (subwarp_size, config::warp_size / subwarp_size, z)
+ */
+template <
+    int max_block_size, int warps_per_block, typename Group, typename ValueType,
+    typename IndexType,
+    typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
+__device__ __forceinline__ void extract_transposed_diag_blocks(
+    const Group &group, int processed_blocks,
+    const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks,
+    ValueType *__restrict__ block_row, int increment,
+    ValueType *__restrict__ workspace)
+{
+    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+    const auto warp = group::tiled_partition<config::warp_size>(group);
+    auto bid = static_cast<size_type>(blockIdx.x) * warps_per_block *
+                   processed_blocks +
+               threadIdx.z * processed_blocks;
+    auto bstart = (bid < num_blocks) ? block_ptrs[bid] : zero<IndexType>();
+    IndexType bsize = 0;
+#pragma unroll
+    for (int b = 0; b < processed_blocks; ++b, ++bid) {
+        if (bid < num_blocks) {
+            bstart += bsize;
+            bsize = block_ptrs[bid + 1] - bstart;
+#pragma unroll
+            for (int i = 0; i < max_block_size; ++i) {
+                if (i < bsize) {
+                    if (threadIdx.y == b && threadIdx.x < max_block_size) {
+                        workspace[threadIdx.x] = zero<ValueType>();
+                    }
+                    warp.sync();
+                    const auto row = bstart + i;
+                    const auto rstart = row_ptrs[row] + tid;
+                    const auto rend = row_ptrs[row + 1];
+                    // use the entire warp to ensure coalesced memory access
+                    for (auto j = rstart; j < rend; j += config::warp_size) {
+                        const auto col = col_idxs[j] - bstart;
+                        if (col >= bsize) {
+                            break;
+                        }
+                        if (col >= 0) {
+                            workspace[col] = values[j];
+                        }
+                    }
+                    warp.sync();
+                    if (threadIdx.y == b && threadIdx.x < bsize) {
+                        block_row[i * increment] = workspace[threadIdx.x];
+                    }
+                    warp.sync();
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/common/components/fill_array.hpp.inc b/common/components/fill_array.hpp.inc
new file mode 100644
index 00000000000..04e6fe67b79
--- /dev/null
+++ b/common/components/fill_array.hpp.inc
@@ -0,0 +1,48 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+namespace kernel {
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void fill_array(
+    size_type n, ValueType *__restrict__ array, ValueType val)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx < n) {
+        array[tidx] = val;
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/components/intrinsics.hpp.inc b/common/components/intrinsics.hpp.inc
new file mode 100644
index 00000000000..f89fa434eb4
--- /dev/null
+++ b/common/components/intrinsics.hpp.inc
@@ -0,0 +1,66 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ * Returns the number of set bits in the given mask.
+ */
+__forceinline__ __device__ int popcnt(uint32 mask) { return __popc(mask); }
+
+/** @copydoc popcnt */
+__forceinline__ __device__ int popcnt(uint64 mask) { return __popcll(mask); }
+
+
+/**
+ * @internal
+ * Returns the (1-based!) index of the first set bit in the given mask,
+ * starting from the least significant bit.
+ */
+__forceinline__ __device__ int ffs(uint32 mask) { return __ffs(mask); }
+
+/** @copydoc ffs */
+__forceinline__ __device__ int ffs(uint64 mask)
+{
+    // the cast is necessary, as the overloads defined by HIP are ambiguous
+    return __ffsll(static_cast<unsigned long long int>(mask));
+}
+
+
+/**
+ * @internal
+ * Returns the number of zero bits before the first set bit in the given mask,
+ * starting from the most significant bit.
+ */
+__forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); }
+
+/** @copydoc clz */
+__forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }
diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc
new file mode 100644
index 00000000000..c91f76e1fd4
--- /dev/null
+++ b/common/components/merging.hpp.inc
@@ -0,0 +1,310 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace detail {
+
+
+/**
+ * @internal
+ * Per-thread result of a single @ref group_merge_step invocation.
+ */
+template <typename ValueType>
+struct merge_result {
+    /** The element of `a` that the current thread compares. */
+    ValueType a_val;
+    /** The element of `b` that the current thread compares. */
+    ValueType b_val;
+    /** The group-local index (source lane) of `a_val` within `a`. */
+    int a_idx;
+    /** The group-local index (source lane) of `b_val` within `b`. */
+    int b_idx;
+    /** The number of elements of `a` merged by the whole group. */
+    int a_advance;
+    /** The number of elements of `b` merged by the whole group. */
+    int b_advance;
+};
+
+}  // namespace detail
+
+
+/**
+ * @internal
+ * Warp-parallel merge algorithm that merges the first `warp_size` elements from
+ * two ranges, where each thread stores a single element from each range.
+ * It assumes that the elements are sorted in ascending order, i.e. for i < j,
+ * the value of `a` at thread i is smaller or equal to the value at thread j,
+ * and the same holds for `b`.
+ *
+ * This implementation is based on ideas from Green et al.,
+ * "GPU merge path: a GPU merging algorithm", but uses random-access warp
+ * shuffles instead of shared-memory to exchange values of a and b.
+ *
+ * @tparam group_size  the binary search range, expected to be `group.size()`
+ * @param a      the element from the first range
+ * @param b      the element from the second range
+ * @param group  the cooperative group that executes the merge
+ * @return  a structure containing the merge result distributed over the group.
+ */
+template <int group_size, typename ValueType, typename Group>
+__forceinline__ __device__ detail::merge_result<ValueType> group_merge_step(
+    ValueType a, ValueType b, Group group)
+{
+    // thread i takes care of the i-th element of the merged sequence `c`
+    auto i = int(group.thread_rank());
+
+    // we want to find the smallest index `x` such that a[x] >= b[i - x - 1]
+    // or `i` if no such index exists
+    //
+    // if x = i then c[0...i - 1] = a[0...i - 1]
+    //     => merge a[i] with b[0]
+    // if x = 0 then c[0...i - 1] = b[0...i - 1]
+    //     => merge a[0] with b[i]
+    // otherwise c[0...i - 1] contains a[0...x - 1] and b[0...i - x - 1]
+    //   because the minimality of `x` implies
+    //   b[i - x] >= a[x - 1]
+    //   and a[x] >= a[0...x - 1], b[0...i - x - 1]
+    //     => merge a[x] with b[i - x]
+    auto minx = synchronous_fixed_binary_search<group_size>([&](int x) {
+        auto a_remote = group.shfl(a, x);
+        auto b_remote = group.shfl(b, max(i - x - 1, 0));
+        return a_remote >= b_remote || x >= i;
+    });
+
+    auto a_idx = minx;
+    auto b_idx = max(i - minx, 0);
+    auto a_val = group.shfl(a, a_idx);
+    auto b_val = group.shfl(b, b_idx);
+    auto cmp = a_val < b_val;
+    auto a_advance = popcnt(group.ballot(cmp));  // elements taken from a
+    auto b_advance = int(group.size()) - a_advance;  // the rest is from b
+
+    return {a_val, b_val, a_idx, b_idx, a_advance, b_advance};
+}
+
+
+/**
+ * @internal
+ * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary
+ * size. `merge_fn` will be called for each merged element.
+ *
+ * @param a       the first range
+ * @param a_size  the size of the first range
+ * @param b       the second range
+ * @param b_size  the size of the second range
+ * @param group   the group that executes the merge
+ * @param merge_fn  the callback that is being called for each merged element.
+ *                  It takes six parameters:
+ *                  `IndexType a_idx, ValueType a_val, IndexType b_idx,
+ *                   ValueType b_val, IndexType c_index, bool valid`.
+ *                  `*_val` and `*_idx` are the values resp. the indices of the
+ *                  values from a/b being compared at output index `c_index`.
+ *                  `valid` is `false` for threads whose `c_index` lies past
+ *                  the merged range; they still take part in the call so that
+ *                  shfl and ballot operations stay group-wide. The merge
+ *                  stops once no valid thread of the group returns `true`.
+ */
+template <int group_size, typename ValueType, typename IndexType,
+          typename Group, typename Callback>
+__forceinline__ __device__ void group_merge(const ValueType *__restrict__ a,
+                                            IndexType a_size,
+                                            const ValueType *__restrict__ b,
+                                            IndexType b_size, Group group,
+                                            Callback merge_fn)
+{
+    auto c_size = a_size + b_size;
+    IndexType a_begin{};
+    IndexType b_begin{};
+    auto lane = static_cast<IndexType>(group.thread_rank());
+    auto sentinel = device_numeric_limits<IndexType>::max;
+    auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
+    auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
+    for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
+        auto merge_result = group_merge_step<group_size>(a_cur, b_cur, group);
+        auto valid = c_begin + lane < c_size;
+        auto cont = merge_fn(merge_result.a_idx + a_begin, merge_result.a_val,
+                             merge_result.b_idx + b_begin, merge_result.b_val,
+                             c_begin + lane, valid);
+        if (!group.any(cont && valid)) {
+            break;
+        }
+        auto a_advance = merge_result.a_advance;
+        auto b_advance = merge_result.b_advance;
+        a_begin += a_advance;
+        b_begin += b_advance;
+
+        // shuffle the unmerged elements to the front
+        a_cur = group.shfl_down(a_cur, a_advance);
+        b_cur = group.shfl_down(b_cur, b_advance);
+        /*
+         * To optimize memory access, new elements for `a` and `b` are fetched
+         * with a single load instruction: lanes below `a_advance` load for `a`,
+         * the other lanes for `b`; `load_lane` is the part-local lane index.
+         * As both advances add up to `group.size()`, `a_begin + b_advance` resp.
+         * `b_begin + a_advance` is the first element not yet held by the group.
+         * The elements for `a` have to be shuffled up afterwards.
+         */
+        auto load_a = lane < a_advance;
+        auto load_lane = load_a ? lane : lane - a_advance;
+        auto load_source = load_a ? a : b;
+        auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance;
+        auto load_size = load_a ? a_size : b_size;
+
+        auto load_idx = load_begin + load_lane;
+        auto loaded = checked_load(load_source, load_idx, load_size, sentinel);
+        // move the freshly loaded `a` values behind the unmerged ones
+        auto lower_loaded = group.shfl_up(loaded, b_advance);
+        a_cur = lane < b_advance ? a_cur : lower_loaded;
+        b_cur = lane < a_advance ? b_cur : loaded;
+    }
+}
+
+
+/**
+ * @internal
+ * Warp-parallel algorithm that reports matching elements from two sorted
+ * ranges of arbitrary size. `match_fn` is called by all threads in every merge
+ * step; its `valid` argument marks the threads that found a matching pair.
+ *
+ * @param a       the first range
+ * @param a_size  the size of the first range
+ * @param b       the second range
+ * @param b_size  the size of the second range
+ * @param group   the group that executes the merge
+ * @param match_fn  the callback that is being called for each matching pair.
+ *                  It takes five parameters:
+ *                  `ValueType val, IndexType a_idx, IndexType b_idx,
+ *                   lane_mask_type match_mask, bool valid`.
+ *                  `val` is the candidate element, `*_idx` are the indices of
+ *                  the compared values from a and b, match_mask is a lane mask
+ *                  that is 1 for every subwarp lane that found a match.
+ *                  `valid` is true iff the calling thread found a match
+ *                  (necessary for warp-synchronous operations).
+ */
+template <int group_size, typename IndexType, typename ValueType,
+          typename Group, typename Callback>
+__forceinline__ __device__ void group_match(const ValueType *__restrict__ a,
+                                            IndexType a_size,
+                                            const ValueType *__restrict__ b,
+                                            IndexType b_size, Group group,
+                                            Callback match_fn)
+{
+    auto report = [&](IndexType a_idx, ValueType a_val, IndexType b_idx,
+                      ValueType b_val, IndexType, bool valid) {
+        const bool is_match = a_val == b_val && valid;
+        auto match_mask = group.ballot(is_match);
+        match_fn(a_val, a_idx, b_idx, match_mask, is_match);
+        return a_idx < a_size && b_idx < b_size;
+    };
+    group_merge<group_size>(a, a_size, b, b_size, group, report);
+}
+
+
+/**
+ * @internal
+ * Sequential merge algorithm that merges two sorted ranges of arbitrary
+ * size. `merge_fn` will be called once for each merge step.
+ *
+ * @param a  the first range
+ * @param a_size the size of the first range
+ * @param b  the second range
+ * @param b_size the size of the second range
+ * @param merge_fn  the callback that will be called for each merge step.
+ *                  It takes five parameters:
+ *                  `IndexType a_idx, ValueType a_val,
+ *                   IndexType b_idx, ValueType b_val, IndexType c_idx`.
+ *                  `*_val` and `*_idx` are the values resp. the indices of
+ *                  the values from a/b being compared in step `c_idx`.
+ *                  It must return `false` iff the merge should stop.
+ */
+template <typename ValueType, typename IndexType, typename Callback>
+__forceinline__ __device__ void sequential_merge(
+    const ValueType *__restrict__ a, IndexType a_size,
+    const ValueType *__restrict__ b, IndexType b_size, Callback merge_fn)
+{
+    auto c_size = a_size + b_size;
+    IndexType a_begin{};
+    IndexType b_begin{};
+    auto sentinel = device_numeric_limits<IndexType>::max;
+    auto a_cur = checked_load(a, a_begin, a_size, sentinel);
+    auto b_cur = checked_load(b, b_begin, b_size, sentinel);
+    for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
+        auto cont = merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin);
+        if (!cont) {
+            break;
+        }
+        auto a_advance = a_cur < b_cur;  // ties advance b
+        auto b_advance = !a_advance;
+        a_begin += a_advance;
+        b_begin += b_advance;
+        // reload the current element of the range that advanced
+        auto load = a_advance ? a : b;
+        auto load_size = a_advance ? a_size : b_size;
+        auto load_idx = a_advance ? a_begin : b_begin;
+        auto loaded = checked_load(load, load_idx, load_size, sentinel);
+        a_cur = a_advance ? loaded : a_cur;
+        b_cur = b_advance ? loaded : b_cur;
+    }
+}
+
+
+/**
+ * @internal
+ * Sequential algorithm that finds the elements two sorted ranges of
+ * arbitrary size have in common. `match_fn` will be called for each pair of
+ * matching elements.
+ *
+ * @param a  the first range
+ * @param a_size the size of the first range
+ * @param b  the second range
+ * @param b_size the size of the second range
+ * @param match_fn  the callback that is being called for each match.
+ *                  It takes three parameters:
+ *                  `ValueType val, IndexType a_idx, IndexType b_idx`.
+ *                  `val` is the matching element, `*_idx` are the
+ *                  indices of the matching values from a and b.
+ */
+template <typename IndexType, typename ValueType, typename Callback>
+__forceinline__ __device__ void sequential_match(const ValueType *a,
+                                                 IndexType a_size,
+                                                 const ValueType *b,
+                                                 IndexType b_size,
+                                                 Callback match_fn)
+{
+    auto report = [&](IndexType a_idx, ValueType a_val, IndexType b_idx,
+                      ValueType b_val, IndexType) {
+        if (a_val == b_val) {
+            match_fn(a_val, a_idx, b_idx);
+        }
+        return a_idx < a_size && b_idx < b_size;
+    };
+    sequential_merge(a, a_size, b, b_size, report);
+}
\ No newline at end of file
diff --git a/common/components/precision_conversion.hpp.inc b/common/components/precision_conversion.hpp.inc
new file mode 100644
index 00000000000..c486354f156
--- /dev/null
+++ b/common/components/precision_conversion.hpp.inc
@@ -0,0 +1,41 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+/** @internal Copies `in` to `out`, converting each of the `size` values. */
+template <typename SourceType, typename TargetType>
+__global__ void convert_precision(size_type size, const SourceType *in,
+                                  TargetType *out)
+{
+    const auto stride = thread::get_thread_num_flat();
+    for (auto idx = thread::get_thread_id_flat(); idx < size; idx += stride) {
+        out[idx] = in[idx];
+    }
+}
\ No newline at end of file
diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc
new file mode 100644
index 00000000000..9db51a3dc4f
--- /dev/null
+++ b/common/components/prefix_sum.hpp.inc
@@ -0,0 +1,186 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+/**
+ * @internal
+ * Computes the prefix sum and total sum of `element` over a subwarp.
+ *
+ * @param element     the element over which we compute the prefix sum.
+ * @param prefix_sum  will be set to the sum of all `element`s from lower
+ *                    lanes, plus the local `element` if `inclusive` is `true`.
+ * @param total_sum   will be set to the total sum of `element` in this subwarp.
+ * @param subwarp     the cooperative group representing the subwarp.
+ *
+ * @tparam inclusive  if this is true, the computed prefix sum will be
+ *                    inclusive, otherwise it will be exclusive.
+ *
+ * @note For this function to work on architectures with independent thread
+ * scheduling, all threads of the subwarp have to execute it.
+ */
+template <bool inclusive, typename ValueType, typename Group>
+__forceinline__ __device__ void subwarp_prefix_sum(ValueType element,
+                                                   ValueType &prefix_sum,
+                                                   ValueType &total_sum,
+                                                   Group subwarp)
+{
+    prefix_sum = inclusive ? element : zero<ValueType>();
+    total_sum = element;
+#pragma unroll
+    // hypercube prefix sum: every round pairs lane l with lane l ^ step
+    for (auto step = 1; step < subwarp.size(); step *= 2) {
+        auto neighbor = subwarp.shfl_xor(total_sum, step);
+        total_sum += neighbor;  // partial sum of both paired halves
+        prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0;
+    }
+}
+
+/**
+ * @internal
+ * Computes the prefix sum of `element` over a subwarp, discarding the total.
+ *
+ * @param element     the value contributed by the calling thread.
+ * @param prefix_sum  receives the sum of the `element`s of all lower lanes,
+ *                    plus the local `element` if `inclusive` is `true`.
+ * @param subwarp     the cooperative group representing the subwarp.
+ *
+ * @tparam inclusive  selects an inclusive (`true`) or exclusive (`false`)
+ *                    prefix sum.
+ *
+ * @note All threads of the subwarp have to execute this function for it to work
+ *       (and not dead-lock on newer architectures).
+ */
+template <bool inclusive, typename ValueType, typename Group>
+__forceinline__ __device__ void subwarp_prefix_sum(ValueType element,
+                                                   ValueType &prefix_sum,
+                                                   Group subwarp)
+{
+    ValueType unused_total{};
+    subwarp_prefix_sum<inclusive>(element, prefix_sum, unused_total, subwarp);
+}
+
+
+/**
+ * @internal
+ * First step of the calculation of a prefix sum. Calculates the exclusive
+ * prefix sum in-place on each `block_size`-sized part of `elements`.
+ *
+ * @param num_elements  total number of entries in `elements`
+ * @param elements  array on which the prefix sum is to be calculated
+ * @param block_sum  array which stores the total sum of each block, requires at
+ *                   least `ceildiv(num_elements, block_size)` elements
+ *
+ * @tparam block_size  thread block size for this kernel, also size of blocks on
+ *                     which this kernel calculates the prefix sum in-place
+ *
+ * @note To calculate the prefix sum over an array of size bigger than
+ *       `block_size`, `finalize_prefix_sum` has to be used as well.
+ */
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void start_prefix_sum(
+    size_type num_elements, ValueType *__restrict__ elements,
+    ValueType *__restrict__ block_sum)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto element_id = threadIdx.x;
+    // NOTE(review): the scratch array is size_type, not ValueType - confirm
+    __shared__ size_type prefix_helper[block_size];
+    prefix_helper[element_id] =
+        (tidx < num_elements) ? elements[tidx] : zero<ValueType>();
+    auto this_block = group::this_thread_block();
+    this_block.sync();
+
+    // Up-sweep: tree reduction, the block total ends up in the last entry
+#pragma unroll
+    for (int i = 1; i < block_size; i <<= 1) {
+        const auto ai = i * (2 * element_id + 1) - 1;
+        const auto bi = i * (2 * element_id + 2) - 1;
+        if (bi < block_size) {
+            prefix_helper[bi] += prefix_helper[ai];
+        }
+        this_block.sync();
+    }
+
+    if (element_id == 0) {
+        // Store the total sum, then reset it to seed the down-sweep
+        block_sum[blockIdx.x] = prefix_helper[block_size - 1];
+        prefix_helper[block_size - 1] = zero<ValueType>();
+    }
+    this_block.sync();
+
+    // Perform the down-sweep phase to get the true prefix sum
+#pragma unroll
+    for (int i = block_size >> 1; i > 0; i >>= 1) {
+        const auto ai = i * (2 * element_id + 1) - 1;
+        const auto bi = i * (2 * element_id + 2) - 1;
+        if (bi < block_size) {
+            auto tmp = prefix_helper[ai];
+            prefix_helper[ai] = prefix_helper[bi];
+            prefix_helper[bi] += tmp;
+        }
+        this_block.sync();
+    }
+    if (tidx < num_elements) {
+        elements[tidx] = prefix_helper[element_id];
+    }
+}
+
+
+/**
+ * @internal
+ * Second step of the calculation of a prefix sum. Adds to every entry of
+ * `elements` the total sum of all blocks preceding the entry's block.
+ *
+ * @param num_elements  total number of entries in `elements`
+ * @param elements  array on which the prefix sum is to be calculated
+ * @param block_sum  array storing the total sum of each block
+ *
+ * @tparam block_size  thread block size for this kernel, has to be the same as
+ *                    for `start_prefix_sum`
+ *
+ * @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
+ */
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void finalize_prefix_sum(
+    size_type num_elements, ValueType *__restrict__ elements,
+    const ValueType *__restrict__ block_sum)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    if (!(global_id < num_elements)) {
+        return;
+    }
+    ValueType preceding_sum = zero<ValueType>();
+    for (size_type block = 0; block < blockIdx.x; ++block) {
+        preceding_sum += block_sum[block];
+    }
+    elements[global_id] += preceding_sum;
+}
\ No newline at end of file
diff --git a/common/components/reduction.hpp.inc b/common/components/reduction.hpp.inc
new file mode 100644
index 00000000000..0bc44e08bb4
--- /dev/null
+++ b/common/components/reduction.hpp.inc
@@ -0,0 +1,177 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ * Computes a reduction using the binary operation `reduce_op` on a group
+ * `group`. Each thread contributes with one element `local_data`. The local
+ * thread element is always passed as the first parameter to the `reduce_op`.
+ * The function returns the result of the reduction on all threads, using a
+ * butterfly exchange: every round combines lanes whose ranks differ in one bit.
+ *
+ * @note The function is guaranteed to return the correct value on all threads
+ *       only if `reduce_op` is commutative (in addition to being associative).
+ *       Otherwise, the correct value is returned only to the thread with
+ *       subwarp index 0.
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ ValueType reduce(const Group &group,
+                                            ValueType local_data,
+                                            Operator reduce_op = Operator{})
+{
+#pragma unroll
+    for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) {
+        const auto remote_data = group.shfl_xor(local_data, bitmask);
+        local_data = reduce_op(local_data, remote_data);  // local value first
+    }
+    return local_data;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the rank of the thread holding the element of largest magnitude
+ * among all threads of the group that passed `is_pivoted == false`.
+ * Pivoted threads enter the comparison with magnitude -1, so they can only
+ * be selected when every thread of the group is already pivoted.
+ */
+template <
+    typename Group, typename ValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ int choose_pivot(const Group &group,
+                                            ValueType local_data,
+                                            bool is_pivoted)
+{
+    using real = remove_complex<ValueType>;
+    real lmag = is_pivoted ? -one<real>() : abs(local_data);
+    const auto pivot =
+        reduce(group, group.thread_rank(), [&](int lidx, int ridx) {
+            const auto rmag = group.shfl(lmag, ridx);
+            if (rmag > lmag) {
+                lmag = rmag;  // keep the running maximum
+                lidx = ridx;
+            }
+            return lidx;
+        });
+    // the pivot operator is not commutative, so broadcast the result of lane 0
+    return group.shfl(pivot, 0);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on an entire
+ * block. The input is taken from the `data` array, which has to be of size
+ * `block_size` and accessible from all threads. `data` doubles as work space
+ * (its content is destroyed) and receives the result in `data[0]`.
+ * No synchronization follows the final write, so threads other than rank 0
+ * have to synchronize the group before they read `data[0]`.
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
+__device__ void reduce(const Group &__restrict__ group,
+                       ValueType *__restrict__ data,
+                       Operator reduce_op = Operator{})
+{
+    const auto local_id = group.thread_rank();
+    // tree reduction in `data` until `warp_size` partial results remain
+    for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
+        group.sync();
+        if (local_id < k) {
+            data[local_id] = reduce_op(data[local_id], data[local_id + k]);
+        }
+    }
+
+    const auto warp = group::tiled_partition<config::warp_size>(group);
+    const auto warp_id = group.thread_rank() / warp.size();
+    if (warp_id > 0) {
+        return;  // the first warp alone finishes the reduction
+    }
+    auto result = reduce(warp, data[warp.thread_rank()], reduce_op);
+    if (warp.thread_rank() == 0) {
+        data[0] = result;
+    }
+}
+
+
+/**
+ * @internal
+ * Computes a reduction using the binary operation `reduce_op` on an array
+ * `source` of any size and stores the block-wide result in `result[0]`.
+ * Every thread starts from `zero<ValueType>()`, so `reduce_op` should treat
+ * zero as neutral. Results of several blocks need a second call on `result`.
+ */
+template <typename Operator, typename ValueType>
+__device__ void reduce_array(size_type size,
+                             const ValueType *__restrict__ source,
+                             ValueType *__restrict__ result,
+                             Operator reduce_op = Operator{})
+{
+    const auto tidx = thread::get_thread_id_flat();
+    auto thread_result = zero<ValueType>();
+    for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) {
+        thread_result = reduce_op(thread_result, source[i]);
+    }
+    result[threadIdx.x] = thread_result;
+
+    group::this_thread_block().sync();
+
+    // Stores the result of the reduction inside `result[0]`
+    reduce(group::this_thread_block(), result, reduce_op);
+}
+
+
+/**
+ * @internal
+ * Computes a reduction using the add operation (+) on an array `source` of
+ * any size and writes the sum of each thread block to `result[blockIdx.x]`.
+ * Has to be called a second time on `result` to reduce an array larger than
+ * `default_block_size`.
+ */
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void reduce_add_array(
+    size_type size, const ValueType *__restrict__ source,
+    ValueType *__restrict__ result)
+{
+    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
+    reduce_array(size, source, static_cast<ValueType *>(block_sum),
+                 [](const ValueType &x, const ValueType &y) { return x + y; });
+    // block_sum[0] now holds the block's total; thread 0 writes it out
+    if (threadIdx.x == 0) {
+        result[blockIdx.x] = block_sum[0];
+    }
+}
diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc
new file mode 100644
index 00000000000..e7e558508f0
--- /dev/null
+++ b/common/components/searching.hpp.inc
@@ -0,0 +1,238 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ * Generic binary search for the partition point of a predicate, i.e., the
+ * first index at which the predicate is `true`.
+ * The predicate has to split the range [offset, offset + length) into two
+ * subranges [offset, middle), [middle, offset + length): it is `false` on
+ * every element of the first and `true` on every element of the second one.
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * The implementation is based on Stepanov & McJones, "Elements of Programming".
+ *
+ * @param offset  the starting index of the partitioned range
+ * @param length  the length of the partitioned range
+ * @param p  side-effect-free predicate that maps `IndexType` to `bool`
+ * @returns  the index of `middle`, i.e., the partition point
+ */
+template <typename IndexType, typename Predicate>
+__forceinline__ __device__ IndexType binary_search(IndexType offset,
+                                                   IndexType length,
+                                                   Predicate p)
+{
+    while (length > 0) {
+        const auto lower_len = length / 2;
+        const auto upper_len = length - (lower_len + 1);
+        const auto probe = offset + lower_len;
+        const auto hit = p(probe);
+        offset = hit ? offset : probe + 1;
+        length = hit ? lower_len : upper_len;
+    }
+    return offset;
+}
+
+
+/**
+ * @internal
+ * Generic implementation of a fixed-size binary search.
+ * The implementation makes sure that the number of predicate evaluations only
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false`
+ * for all elements in the first range and `true` for all elements in the
+ * second range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
+ *
+ * @tparam size  the length of the partitioned range - must be a positive
+ *               power of two (enforced by `static_assert`)
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `int` to `bool`
+ * @returns  the index of `middle`, i.e., the partition point
+ */
+template <int size, typename Predicate>
+__forceinline__ __device__ int synchronous_fixed_binary_search(Predicate p)
+{
+    // an empty range (`size == 0`) is rejected at compile time below, so no
+    // runtime check for it is needed
+    static_assert(size > 0, "size must be positive");
+    static_assert(!(size & (size - 1)), "size must be a power of two");
+    int begin{};
+#pragma unroll
+    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
+        auto half_size = cur_size / 2;
+        auto mid = begin + half_size;
+        // invariant: [begin, begin + cur_size] contains partition point
+        begin = p(mid) ? begin : mid;
+    }
+    // cur_size is now 1, so the partition point is either begin or begin + 1
+    return p(begin) ? begin : begin + 1;
+}
+
+
+/**
+ * @internal
+ * Generic implementation of a synchronous binary search.
+ * The number of predicate evaluations depends only on `size`, never on the
+ * actual position of the partition point.
+ * The predicate is assumed to partition the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that it is `false` for all
+ * elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
+ *
+ * @param size  the length of the partitioned range - must be a power of two
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `int` to `bool`
+ * @returns  the index of `middle`, i.e., the partition point
+ */
+template <typename Predicate>
+__forceinline__ __device__ int synchronous_binary_search(int size, Predicate p)
+{
+    if (size == 0) {
+        return 0;
+    }
+    int lower = 0;
+    int width = size;
+    while (width > 1) {
+        width /= 2;
+        const int probe = lower + width;
+        lower = p(probe) ? lower : probe;
+    }
+    // only one candidate is left: the partition point is lower or lower + 1
+    return lower + (p(lower) ? 0 : 1);
+}
+
+
+/**
+ * @internal
+ * Generic search that finds the first index where a predicate is true.
+ * It assumes that the predicate partitions the range [offset, offset + length)
+ * into two subranges [offset, middle), [middle, offset + length) such that
+ * the predicate is `false` for all elements in the first range and `true` for
+ * all elements in the second range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ *
+ * It executes `log2(length / group.size())` coalescing calls to `p`.
+ *
+ * This implementation is based on the w-wide search mentioned in
+ * Green et al., "GPU merge path: a GPU merging algorithm"
+ *
+ * @param offset  the starting index of the partitioned range
+ * @param length  the length of the partitioned range
+ * @param group   the coalescing group executing the search
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `IndexType` to `bool`
+ * @returns  the index of `middle`, i.e., the partition point
+ */
+template <typename IndexType, typename Group, typename Predicate>
+__forceinline__ __device__ IndexType group_wide_search(IndexType offset,
+                                                       IndexType length,
+                                                       Group group, Predicate p)
+{
+    // binary search on the first index of each group-sized block
+    IndexType num_blocks = (length + group.size() - 1) / group.size();
+    auto group_pos = binary_search(IndexType{}, num_blocks, [&](IndexType i) {
+        auto idx = i * group.size();
+        return p(offset + idx);
+    });
+    // case 1: p is true everywhere or the range is empty: middle is `offset`
+    if (group_pos == 0) {
+        return offset;
+    }
+    /*
+     * case 2: p is false somewhere:
+     *
+     * p(group_pos * g.size()) is true (or that index is past the end), so the
+     * partition point is this index or one of the g.size() - 1 previous ones.
+     *   |block group_pos-1|
+     * 0 | 0 * * * * * * * | 1
+     *       ^               ^
+     *       we load this range, with the 1 acting as a sentinel for ffs(...)
+     * indices >= length count as true without evaluating p, so p is never
+     * called out-of-bounds
+     */
+    auto base_idx = (group_pos - 1) * group.size() + 1;
+    auto idx = base_idx + group.thread_rank();
+    auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1;
+    return offset + base_idx + pos;
+}
+
+
+/**
+ * @internal
+ * Generic search that finds the first index where a predicate is true.
+ * It assumes that the predicate partitions the range [offset, offset + length)
+ * into two subranges [offset, middle), [middle, offset + length) such that
+ * the predicate is `false` for all elements in the first range and `true` for
+ * all elements in the second range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ *
+ * It executes `log2(length) / log2(group.size())` calls to `p` that effectively
+ * follow a random-access pattern.
+ *
+ * This implementation is based on the w-partition search mentioned in
+ * Green et al., "GPU merge path: a GPU merging algorithm"
+ *
+ * @param offset  the starting index of the partitioned range
+ * @param length  the length of the partitioned range
+ * @param group   the coalescing group executing the search
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `IndexType` to `bool`
+ * @returns  the index of `middle`, i.e., the partition point
+ */
+template <typename IndexType, typename Group, typename Predicate>
+__forceinline__ __device__ IndexType group_ary_search(IndexType offset,
+                                                      IndexType length,
+                                                      Group group, Predicate p)
+{
+    IndexType end = offset + length;  // end of the full search range
+    // invariant: [offset, offset + length] contains middle
+    while (length > group.size()) {
+        auto stride = length / group.size();
+        auto idx = offset + group.thread_rank() * stride;
+        auto mask = group.ballot(p(idx));
+        // mask == 0: the partition point is in the last block, mask == ~0: in
+        // the first block, otherwise: in the last block that returned a 0.
+        // The last block also covers the remainder of length / group.size().
+        auto pos = mask == 0 ? group.size() - 1 : ffs(mask >> 1) - 1;
+        auto last_length = length - stride * (group.size() - 1);
+        length = pos == group.size() - 1 ? last_length : stride;
+        offset += stride * pos;
+    }
+    auto idx = offset + group.thread_rank();
+    // threads at or past `end` vote true without evaluating p; if the mask is
+    // still 0, middle is offset + group.size(), otherwise the first set bit
+    auto mask = group.ballot(idx >= end || p(idx));
+    auto pos = mask == 0 ? group.size() : ffs(mask) - 1;
+    return offset + pos;
+}
diff --git a/common/components/segment_scan.hpp.inc b/common/components/segment_scan.hpp.inc
new file mode 100644
index 00000000000..3aac34832dc
--- /dev/null
+++ b/common/components/segment_scan.hpp.inc
@@ -0,0 +1,63 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+/**
+ * @internal
+ * Computes a segment scan using the add operation (+) within a subwarp: each
+ * run of lanes sharing the same `ind` performs a suffix sum on `*val`. Lanes
+ * are addressed by their rank in `group` (not `threadIdx.x`). Returns whether
+ * the calling thread is the first element of its segment.
+ */
+template <size_type subwarp_size, typename ValueType, typename IndexType>
+__device__ __forceinline__ bool segment_scan(
+    const group::thread_block_tile<subwarp_size> &group, const IndexType ind,
+    ValueType *__restrict__ val)
+{
+    bool head = true;
+#pragma unroll
+    for (int i = 1; i < subwarp_size; i <<= 1) {
+        const IndexType add_ind = group.shfl_up(ind, i);
+        ValueType add_val = zero<ValueType>();
+        if (add_ind == ind && group.thread_rank() >= i) {
+            add_val = *val;
+            if (i == 1) {
+                head = false;
+            }
+        }
+        add_val = group.shfl_down(add_val, i);
+        if (group.thread_rank() < subwarp_size - i) {
+            *val += add_val;
+        }
+    }
+    return head;
+}
\ No newline at end of file
diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc
new file mode 100644
index 00000000000..cc042a08d90
--- /dev/null
+++ b/common/components/sorting.hpp.inc
@@ -0,0 +1,320 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace detail {
+
+
+/**
+ * @internal
+ * Compare-and-swap step of the bitonic network for two elements.
+ *
+ * @param reverse  if `false`, the smaller element ends up in `a` (ascending);
+ *                 if `true`, the larger one does (descending).
+ */
+template <typename ValueType>
+__forceinline__ __device__ void bitonic_cas(ValueType &a, ValueType &b,
+                                            bool reverse)
+{
+    const bool keep = (a < b) != reverse;
+    const auto first = keep ? a : b;
+    b = keep ? b : a;
+    a = first;
+}
+
+
+/**
+ * @internal
+ * This is a recursive implementation of a bitonic sorting network,
+ * executed sequentially on locally stored data.
+ *
+ * Based on Batcher, "Sorting Networks and Their Applications", 1968.
+ */
+template <typename ValueType, int num_elements>
+struct bitonic_local {
+    using half = bitonic_local<ValueType, num_elements / 2>;
+    static_assert(num_elements > 0, "number of elements must be positive");
+    static_assert((num_elements & (num_elements - 1)) == 0,
+                  "number of elements must be a power of two");
+
+    // merges two bitonic sequences els[0, n / 2), els[n / 2, n)
+    // device-only, like `bitonic_cas` and the single-element specialization
+    __forceinline__ __device__ static void merge(ValueType *els, bool reverse)
+    {
+        auto els_mid = els + (num_elements / 2);
+        for (auto i = 0; i < num_elements / 2; ++i) {
+            bitonic_cas(els[i], els_mid[i], reverse);
+        }
+        half::merge(els, reverse);
+        half::merge(els_mid, reverse);
+    }
+
+    // sorts an unsorted sequence els [0, n)
+    __forceinline__ __device__ static void sort(ValueType *els, bool reverse)
+    {
+        auto els_mid = els + (num_elements / 2);
+        // sort first half normally
+        half::sort(els, reverse);
+        // sort second half reversed
+        half::sort(els_mid, !reverse);
+        // merge two halves
+        merge(els, reverse);
+    }
+};
+
+template <typename ValueType>
+struct bitonic_local<ValueType, 1> {
+    // nothing to do for a single element
+    __forceinline__ __device__ static void merge(ValueType *, bool) {}
+    __forceinline__ __device__ static void sort(ValueType *, bool) {}
+};
+
+
+/**
+ * @internal
+ * This is a recursive implementation of a bitonic sorting network,
+ * executed in parallel within a warp using lane shuffle instructions.
+ * Each of the `num_threads` threads holds `num_local` elements in `els`.
+ * Based on Hou et al., "Fast Segmented Sort on GPUs", 2017.
+ */
+template <typename ValueType, int num_local, int num_threads>
+struct bitonic_warp {
+    constexpr static auto num_elements = num_local * num_threads;
+    using half = bitonic_warp<ValueType, num_local, num_threads / 2>;
+    static_assert(num_threads > 0, "number of threads must be positive");
+    static_assert(num_local > 0, "number of local elements must be positive");
+    static_assert(
+        config::warp_size % num_threads == 0 &&
+            num_threads <= config::warp_size,
+        "number of threads must be a power of two smaller than warp_size");
+
+    // check if we are in the upper half of all threads in this group
+    // this is important as
+    // 1. for sorting, we have to reverse the sort order in the upper half
+    // 2. for merging, we have to determine for the XOR shuffle if we are
+    //    the "smaller" thread, as this thread gets the "smaller" element.
+    __forceinline__ __device__ static bool upper_half()
+    {
+        return bool(threadIdx.x & (num_threads / 2));
+    }
+
+    __forceinline__ __device__ static void merge(ValueType *els, bool reverse)
+    {
+        auto tile = group::thread_block_tile<num_threads>{};
+        auto new_reverse = reverse != upper_half();  // flips in the upper half
+        for (auto i = 0; i < num_local; ++i) {
+            auto other = tile.shfl_xor(els[i], num_threads / 2);
+            bitonic_cas(els[i], other, new_reverse);
+        }
+        half::merge(els, reverse);  // recurse on half the threads
+    }
+
+    __forceinline__ __device__ static void sort(ValueType *els, bool reverse)
+    {
+        auto new_reverse = reverse != upper_half();
+        half::sort(els, new_reverse);  // halves sorted in opposite orders
+        merge(els, reverse);
+    }
+};
+
+template <typename ValueType, int NumLocalElements>
+struct bitonic_warp<ValueType, NumLocalElements, 1> {
+    using local = bitonic_local<ValueType, NumLocalElements>;  // single thread
+    __forceinline__ __device__ static void merge(ValueType *els, bool reverse)
+    {
+        local::merge(els, reverse);
+    }
+    __forceinline__ __device__ static void sort(ValueType *els, bool reverse)
+    {
+        local::sort(els, reverse);
+    }
+};
+
+
+/**
+ * @internal
+ * This is a recursive implementation of a bitonic sorting network,
+ * executed in parallel in a thread block using shared memory.
+ * It covers `num_local * num_threads * num_groups` elements in total.
+ * We use a tiled storage pattern to avoid memory bank collisions on shared
+ * memory accesses, see @ref shared_idx.
+ */
+template <typename ValueType, int num_local, int num_threads, int num_groups,
+          int num_total_threads>
+struct bitonic_global {
+    constexpr static auto num_elements = num_local * num_threads * num_groups;
+    using half = bitonic_global<ValueType, num_local, num_threads,
+                                num_groups / 2, num_total_threads>;
+    static_assert(num_groups > 0, "number of groups must be positive");
+    static_assert(num_threads > 0,
+                  "number of threads per group must be positive");
+    static_assert(num_local > 0, "number of local elements must be positive");
+    static_assert(num_total_threads > 0, "number of threads must be positive");
+    static_assert(32 % num_groups == 0,
+                  "num_groups must be a power of two <= 32");
+
+    __forceinline__ __device__ static int shared_idx(int local)
+    {
+        auto rank = group::this_thread_block().thread_rank();
+        // use the same memory-bank to avoid bank conflicts
+        return rank + local * num_total_threads;
+    }
+
+    // check if we are in the upper half of all groups in this block
+    // this is important as for sorting, we have to reverse the sort order in
+    // the upper half
+    __forceinline__ __device__ static bool upper_half()
+    {
+        auto rank = group::this_thread_block().thread_rank();
+        return bool(rank & (num_groups * num_threads / 2));
+    }
+
+    __forceinline__ __device__ static void merge(ValueType *local_els,
+                                                 ValueType *shared_els,
+                                                 bool reverse)
+    {
+        group::this_thread_block().sync();
+        auto upper_shared_els = shared_els + (num_groups * num_threads / 2);
+        // only threads in the lower half of the groups execute the CAS
+        if (!upper_half()) {
+            for (auto i = 0; i < num_local; ++i) {
+                auto j = shared_idx(i);
+                bitonic_cas(shared_els[j], upper_shared_els[j], reverse);
+            }
+        }
+        half::merge(local_els, shared_els, reverse);  // recurse: half the groups
+    }
+
+    __forceinline__ __device__ static void sort(ValueType *local_els,
+                                                ValueType *shared_els,
+                                                bool reverse)
+    {
+        auto new_reverse = reverse != upper_half();
+        half::sort(local_els, shared_els, new_reverse);  // upper half: reversed
+        merge(local_els, shared_els, reverse);
+    }
+};
+
+template <typename ValueType, int num_local, int num_threads,
+          int num_total_threads>
+struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
+    using warp = bitonic_warp<ValueType, num_local, num_threads>;
+
+    __forceinline__ __device__ static int shared_idx(int local)
+    {
+        // use the indexing from the general struct
+        return bitonic_global<ValueType, num_local, num_threads, 2,
+                              num_total_threads>::shared_idx(local);
+    }
+
+    __forceinline__ __device__ static void merge(ValueType *local_els,
+                                                 ValueType *shared_els,
+                                                 bool reverse)
+    {
+        group::this_thread_block().sync();
+        // load this thread's elements, merge them in the warp, store them back
+        for (auto i = 0; i < num_local; ++i) {
+            local_els[i] = shared_els[shared_idx(i)];
+        }
+        warp::merge(local_els, reverse);
+        for (auto i = 0; i < num_local; ++i) {
+            shared_els[shared_idx(i)] = local_els[i];
+        }
+    }
+
+    __forceinline__ __device__ static void sort(ValueType *local_els,
+                                                ValueType *shared_els,
+                                                bool reverse)
+    {
+        // This is the first step, so we don't need to load from shared memory
+        warp::sort(local_els, reverse);
+        // store the sorted elements in shared memory
+        for (auto i = 0; i < num_local; ++i) {
+            shared_els[shared_idx(i)] = local_els[i];
+        }
+    }
+};
+
+
+}  // namespace detail
+
+
+/**
+ * @internal
+ *
+ * Sorts the elements held by the threads of a thread block.
+ *
+ * Every thread passes its local array of elements plus a pointer to a shared
+ * buffer of size `num_elements`. On return, the thread with rank `i` in the
+ * thread block (determined by `group::this_thread_block().thread_rank()`)
+ * holds, at index 0 up to `num_local - 1` of its `local_elements`, the
+ * elements at index `num_local * i` up to `num_local * i + (num_local - 1)`
+ * of the sorted sequence.
+ *
+ * @note The shared-memory buffer uses a striped layout to limit bank
+ *       collisions, so the sorted sequence should not be read from it
+ *       directly. If `num_elements <= num_local * warp_size`, the buffer is
+ *       not used at all, so a null pointer may be passed for it.
+ *
+ * @param local_elements  the `num_local` elements of this thread, used as
+ *                        both input and output.
+ * @param shared_elements  the shared-memory buffer of size `num_elements`
+ * @tparam num_elements  the number of elements - it must be a power of two!
+ * @tparam num_local  the number of elements stored per thread - it must be a
+ *                    power of two!
+ * @tparam ValueType  the type of the elements to be sorted - it must implement
+ *                    the less-than operator!
+ */
+template <int num_elements, int num_local, typename ValueType>
+__forceinline__ __device__ void bitonic_sort(ValueType *local_elements,
+                                             ValueType *shared_elements)
+{
+    constexpr auto thread_count = num_elements / num_local;
+    constexpr auto warp_count = thread_count / config::warp_size;
+    constexpr bool multi_warp = warp_count > 1;
+    static_assert(thread_count <= config::max_block_size,
+                  "bitonic_sort exceeds thread block");
+    // Both paths below are instantiated whichever one runs, because a plain
+    // `if` is not resolved at compile time. The template arguments of the
+    // path that is not taken are thus clamped to values that still compile.
+    if (!multi_warp) {
+        constexpr auto warp_threads = multi_warp ? 1 : thread_count;
+        detail::bitonic_warp<ValueType, num_local, warp_threads>::sort(
+            local_elements, false);
+        return;
+    }
+    constexpr auto block_warps = multi_warp ? warp_count : 1;
+    constexpr auto block_threads =
+        thread_count > config::warp_size ? thread_count : config::warp_size;
+    detail::bitonic_global<ValueType, num_local, config::warp_size, block_warps,
+                           block_threads>::sort(local_elements, shared_elements,
+                                                false);
+}
diff --git a/common/components/thread_ids.hpp.inc b/common/components/thread_ids.hpp.inc
new file mode 100644
index 00000000000..3a28dad5326
--- /dev/null
+++ b/common/components/thread_ids.hpp.inc
@@ -0,0 +1,272 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ *
+ * Returns the ID of the block group this thread belongs to.
+ *
+ * @return the block group ID, computed as `blockIdx.z * gridDim.y + blockIdx.y`
+ *
+ * @note Assumes that grid dimensions are in standard format:
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`
+ */
+__device__ __forceinline__ size_type get_block_group_id()
+{
+    return static_cast<size_type>(blockIdx.z) * gridDim.y + blockIdx.y;
+}
+
+/**
+ * @internal
+ *
+ * Returns the ID of the block this thread belongs to.
+ *
+ * @return the block ID, i.e. `get_block_group_id() * gridDim.x + blockIdx.x`
+ *
+ * @note Assumes that grid dimensions are in standard format:
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`
+ */
+__device__ __forceinline__ size_type get_block_id()
+{
+    return get_block_group_id() * gridDim.x + blockIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the warp (relative to the block) this thread belongs
+ * to.
+ *
+ * @return the local warp ID, which is `threadIdx.z` under the block layout
+ *         described in the note below
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)`
+ */
+__device__ __forceinline__ size_type get_local_warp_id()
+{
+    return static_cast<size_type>(threadIdx.z);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the sub-warp (relative to the block) this thread
+ * belongs to.
+ *
+ * @tparam subwarp_size  size of the subwarp; `config::warp_size / subwarp_size`
+ *                       is used as the number of sub-warps per warp
+ *
+ * @return the local ID of the sub-warp (relative to the block)
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)`
+ */
+template <int subwarp_size>
+__device__ __forceinline__ size_type get_local_subwarp_id()
+{
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    return get_local_warp_id() * subwarps_per_warp + threadIdx.y;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the thread (relative to the block), computed as
+ * `get_local_subwarp_id<subwarp_size>() * subwarp_size + threadIdx.x`.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the local ID of the thread (relative to the block)
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)`
+ */
+template <int subwarp_size>
+__device__ __forceinline__ size_type get_local_thread_id()
+{
+    return get_local_subwarp_id<subwarp_size>() * subwarp_size + threadIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the warp this thread belongs to.
+ *
+ * @tparam warps_per_block  number of warps within each block
+ *
+ * @return the ID `get_block_id() * warps_per_block + get_local_warp_id()`
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int warps_per_block>
+__device__ __forceinline__ size_type get_warp_id()
+{
+    return get_block_id() * warps_per_block + get_local_warp_id();
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the sub-warp this thread belongs to.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ * @tparam warps_per_block  number of warps within each block
+ *
+ * @return the global ID of the sub-warp this thread belongs to.
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format: `(subwarp_size, config::warp_size / subwarp_size,
+ *       block_size / config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int subwarp_size, int warps_per_block>
+__device__ __forceinline__ size_type get_subwarp_id()
+{
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    return get_warp_id<warps_per_block>() * subwarps_per_warp + threadIdx.y;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ * @tparam warps_per_block  number of warps within each block
+ *
+ * @return the global ID of the thread.
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format: `(subwarp_size, config::warp_size / subwarp_size,
+ *       block_size / config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int subwarp_size, int warps_per_block>
+__device__ __forceinline__ size_type get_thread_id()
+{
+    return get_subwarp_id<subwarp_size, warps_per_block>() * subwarp_size +
+           threadIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @tparam IndexType  the index type; `blockDim.x` is cast to it before scaling
+ *
+ * @return `threadIdx.x + blockDim.x * blockIdx.x` in the given index type.
+ */
+template <typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_thread_id_flat()
+{
+    return threadIdx.x + static_cast<IndexType>(blockDim.x) * blockIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of threads in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @tparam IndexType  the index type; `gridDim.x` is cast to it before scaling
+ *
+ * @return `blockDim.x * gridDim.x` in the given index type.
+ */
+template <typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_thread_num_flat()
+{
+    return blockDim.x * static_cast<IndexType>(gridDim.x);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the subwarp in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @tparam subwarp_size  subwarp size; a static_assert requires a power of two
+ * @tparam IndexType  the index type
+ *
+ * @return the global ID of the subwarp in the given index type.
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_subwarp_id_flat()
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return threadIdx.x / subwarp_size +
+           static_cast<IndexType>(blockDim.x / subwarp_size) * blockIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of subwarps in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @tparam subwarp_size  subwarp size; a static_assert requires a power of two
+ * @tparam IndexType  the index type
+ *
+ * @return `(blockDim.x / subwarp_size) * gridDim.x` in the given index type.
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_subwarp_num_flat()
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return blockDim.x / subwarp_size * static_cast<IndexType>(gridDim.x);
+}
\ No newline at end of file
diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc
new file mode 100644
index 00000000000..ced072c40f4
--- /dev/null
+++ b/common/components/uninitialized_array.hpp.inc
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+/**
+ * Stores an array with uninitialized contents.
+ *
+ * This class is needed for data types that have a non-empty constructor when
+ * using them as shared memory, for example `thrust::complex<float>`.
+ *
+ * @tparam ValueType the type of values
+ * @tparam size the size of the array
+ */
+template <typename ValueType, size_type size>
+class UninitializedArray {
+public:
+    /**
+     * Operator for casting a const UninitializedArray into a pointer to its
+     * (read-only) values.
+     *
+     * @return the const pointer to the first entry of the array.
+     */
+    constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    /**
+     * Operator for casting an UninitializedArray into its non-const value
+     * pointer.
+     *
+     * @return the non-const pointer to the first entry of the array.
+     */
+    GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; }
+
+    /**
+     * Const array access operator.
+     *
+     * @param pos  array index; anything outside [0, size) is undefined behavior
+     *
+     * @return a const reference to the array entry at the given index.
+     */
+    constexpr GKO_ATTRIBUTES const ValueType &operator[](
+        size_type pos) const noexcept
+    {
+        return reinterpret_cast<const ValueType *>(data_)[pos];
+    }
+
+    /**
+     * Non-const array access operator.
+     *
+     * @param pos  array index; anything outside [0, size) is undefined behavior
+     *
+     * @return a reference to the array entry at the given index.
+     */
+    GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept
+    {
+        return reinterpret_cast<ValueType *>(data_)[pos];
+    }
+
+private:
+    // Raw storage, aligned for ValueType so the pointers handed out are aligned.
+    alignas(ValueType) unsigned char data_[sizeof(ValueType) * size];
+};
diff --git a/common/components/warp_blas.hpp.inc b/common/components/warp_blas.hpp.inc
new file mode 100644
index 00000000000..d99b009d9bb
--- /dev/null
+++ b/common/components/warp_blas.hpp.inc
@@ -0,0 +1,391 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ *
+ * Selects the postprocessing applied to the result of a function call:
+ * `and_return` keeps it as is, `and_transpose` swaps row and column indices.
+ *
+ * @note This functionality should become obsolete once accessors and ranges
+ *       are in place, as they will define the storage scheme.
+ */
+enum postprocess_transformation { and_return, and_transpose };
+
+
+/**
+ * @internal
+ *
+ * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan
+ * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using the
+ * thread group `group`. Each thread contributes one `row` of the matrix, and
+ * rows exchange data through `group.shfl`. The transform uses the `key_row`-th
+ * row and `key_col`-th column of the matrix. If the pivot element is zero,
+ * `status` is set to false and the matrix is left unchanged by this step.
+ */
+template <
+    int max_problem_size, typename Group, typename ValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ void apply_gauss_jordan_transform(
+    const Group &__restrict__ group, int32 key_row, int32 key_col,
+    ValueType *__restrict__ row, bool &__restrict__ status)
+{
+    auto key_col_elem = group.shfl(row[key_col], key_row);
+    if (key_col_elem == zero<ValueType>()) {
+        // TODO: implement error handling for GPUs to be able to properly
+        //       report it here
+        status = false;
+        return;
+    }
+    if (group.thread_rank() == key_row) {
+        key_col_elem = one<ValueType>() / key_col_elem;
+    } else {
+        key_col_elem = -row[key_col] / key_col_elem;
+    }
+#pragma unroll
+    for (int32 i = 0; i < max_problem_size; ++i) {
+        const auto key_row_elem = group.shfl(row[i], key_row);
+        if (group.thread_rank() == key_row) {
+            row[i] = zero<ValueType>();
+        }
+        row[i] += key_col_elem * key_row_elem;
+    }
+    row[key_col] = key_col_elem;
+}
+
+
+/**
+ * @internal
+ *
+ * Inverts a matrix using Gauss-Jordan elimination. The inversion is
+ * done in-place, so the original matrix will be overwritten with the inverse.
+ * The inversion routine uses implicit pivoting, so the returned matrix will be
+ * a permuted inverse (from both sides). To obtain the correct inverse, the
+ * rows of the result should be permuted with $P$, and the columns with
+ * $ P^T $ (i.e. $ A^{-1} = P X P $, where $ X $ is the returned matrix). These
+ * permutation matrices are returned compressed as vectors `perm`
+ * and `trans_perm`, respectively. The `i`-th value of each of the vectors is
+ * returned to the thread of the group with rank `i`.
+ *
+ * @tparam max_problem_size  the maximum problem size that will be passed to the
+ *                           inversion routine (a tighter bound results in
+ *                           faster code)
+ * @tparam Group  type of the group of threads
+ * @tparam ValueType  type of values stored in the matrix
+ *
+ * @param group  the group of threads which participate in the inversion
+ * @param problem_size  the actual size of the matrix (cannot be larger than
+ *                      max_problem_size)
+ * @param row  a pointer to the matrix row (i-th thread in the group should
+ *             pass the pointer to the i-th row), has to have at least
+ *             max_problem_size elements
+ * @param perm  a value to hold an element of permutation matrix $ P $
+ * @param trans_perm  a value to hold an element of permutation matrix $ P^T $
+ *
+ * @return true if the inversion succeeded, false otherwise
+ */
+template <
+    int max_problem_size, typename Group, typename ValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ bool invert_block(const Group &__restrict__ group,
+                                             uint32 problem_size,
+                                             ValueType *__restrict__ row,
+                                             uint32 &__restrict__ perm,
+                                             uint32 &__restrict__ trans_perm)
+{
+    GKO_ASSERT(problem_size <= max_problem_size);
+    // mark rows at or beyond problem_size as pivoted so they are never chosen
+    auto pivoted = group.thread_rank() >= problem_size;
+    auto status = true;
+#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
+#pragma unroll
+#else
+#pragma unroll 1
+#endif
+    for (int32 i = 0; i < max_problem_size; ++i) {
+        if (i < problem_size) {
+            const auto piv = choose_pivot(group, row[i], pivoted);
+            if (group.thread_rank() == piv) {
+                perm = i;
+                pivoted = true;
+            }
+            if (group.thread_rank() == i) {
+                trans_perm = piv;
+            }
+            apply_gauss_jordan_transform<max_problem_size>(group, piv, i, row,
+                                                           status);
+        }
+    }
+    return status;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns `row * stride + col`, or `col * stride + row` for `and_transpose`.
+ */
+template <postprocess_transformation mod, typename T1, typename T2, typename T3>
+__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col,
+                                                             T3 stride) ->
+    typename std::enable_if<
+        mod != and_transpose,
+        typename std::decay<decltype(row * stride + col)>::type>::type
+{
+    return row * stride + col;
+}
+
+
+template <postprocess_transformation mod, typename T1, typename T2, typename T3>
+__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col,
+                                                             T3 stride) ->
+    typename std::enable_if<
+        mod == and_transpose,
+        typename std::decay<decltype(col * stride + row)>::type>::type
+{
+    return col * stride + row;
+}
+
+
+/**
+ * @internal
+ *
+ * Copies a matrix stored as a collection of rows in different threads of the
+ * warp into a block of memory accessible by all threads in row-major order.
+ * Optionally permutes rows and columns of the matrix in the process.
+ *
+ * @tparam max_problem_size  maximum problem size passed to the routine
+ * @tparam mod  the transformation to perform on the return data
+ * @tparam Group  type of the group of threads
+ * @tparam SourceValueType  type of values stored in the source matrix
+ * @tparam ResultValueType  type of values stored in the result matrix
+ *
+ * @param group  group of threads participating in the copy
+ * @param problem_size  actual size of the matrix
+ *                      (`problem_size <= max_problem_size`)
+ * @param source_row  pointer to memory storing a row of the source matrix;
+ *                    `i`-th thread of the sub-warp should pass in the `i`-th
+ *                    row of the matrix
+ * @param increment  offset between two consecutive elements of the row
+ * @param row_perm  permutation vector to apply on the rows of the matrix
+ *                  (thread `i` supplies the `i`-th value of the vector)
+ * @param col_perm  permutation vector to apply on the columns of the matrix
+ *                  (thread `i` supplies the `i`-th value of the vector)
+ * @param destination  pointer to memory where the result will be stored
+ *                     (all threads supply the same value)
+ * @param stride  offset between two consecutive rows of the matrix
+ */
+template <
+    int max_problem_size, postprocess_transformation mod = and_return,
+    typename Group, typename SourceValueType, typename ResultValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ void copy_matrix(
+    const Group &__restrict__ group, uint32 problem_size,
+    const SourceValueType *__restrict__ source_row, uint32 increment,
+    uint32 row_perm, uint32 col_perm, ResultValueType *__restrict__ destination,
+    size_type stride)
+{
+    GKO_ASSERT(problem_size <= max_problem_size);
+#pragma unroll
+    for (int32 i = 0; i < max_problem_size; ++i) {
+        if (i < problem_size) {
+            const auto idx = group.shfl(col_perm, i);
+            if (group.thread_rank() < problem_size) {
+                // Load the element into a local variable first; otherwise
+                // hip uses a lot of VGPRs in the unrolled loop, which might
+                // lead to problems.
+                const auto val = source_row[i * increment];
+                destination[get_row_major_index<mod>(idx, row_perm, stride)] =
+                    static_cast<ResultValueType>(val);
+            }
+        }
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Multiplies a transposed vector and a matrix stored in column-major order.
+ *
+ * In mathematical terms, performs the operation $ res^T = vec^T \cdot mtx$.
+ *
+ * @tparam max_problem_size  maximum problem size passed to the routine
+ * @tparam Group  type of the group of threads
+ * @tparam MatrixValueType  type of values stored in the matrix
+ * @tparam VectorValueType  type of values stored in the vectors
+ *
+ * @param group  group of threads participating in the operation
+ * @param problem_size  actual size of the matrix
+ *                      (`problem_size <= max_problem_size`)
+ * @param vec  input vector to multiply (thread `i` supplies the `i`-th value of
+ *             the vector)
+ * @param mtx_row  pointer to memory used to store a row of the input matrix,
+ *                    `i`-th thread of the sub-warp should pass in the
+ *                    `i`-th row of the matrix
+ * @param mtx_increment  offset between two consecutive elements of the row
+ * @param res  pointer to a block of memory where the result will be written
+ *             (only thread 0 of the group has to supply a valid value)
+ * @param res_increment  offset between two consecutive elements of the result
+ */
+template <
+    int max_problem_size, typename Group, typename MatrixValueType,
+    typename VectorValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ void multiply_transposed_vec(
+    const Group &__restrict__ group, uint32 problem_size,
+    const VectorValueType &__restrict__ vec,
+    const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment,
+    VectorValueType *__restrict__ res, uint32 res_increment)
+{
+    GKO_ASSERT(problem_size <= max_problem_size);
+    auto mtx_elem = zero<VectorValueType>();
+#pragma unroll
+    for (int32 i = 0; i < max_problem_size; ++i) {
+        if (i < problem_size) {
+            if (group.thread_rank() < problem_size) {
+                mtx_elem =
+                    static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
+            }
+            const auto out = reduce(
+                group, mtx_elem * vec,
+                [](VectorValueType x, VectorValueType y) { return x + y; });
+            if (group.thread_rank() == 0) {
+                res[i * res_increment] = out;
+            }
+        }
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Multiplies a matrix and a vector stored in column-major order.
+ *
+ * In mathematical terms, performs the operation $res = mtx \cdot vec$.
+ *
+ * @tparam max_problem_size  maximum problem size passed to the routine
+ * @tparam Group  type of the group of threads
+ * @tparam MatrixValueType  type of values stored in the matrix
+ * @tparam VectorValueType  type of values stored in the vectors
+ * @tparam Closure  type of the function used to write the result
+ *
+ * @param group  group of threads participating in the operation
+ * @param problem_size  actual size of the matrix
+ *                      (`problem_size <= max_problem_size`)
+ * @param vec  input vector to multiply (thread `i` supplies the `i`-th value of
+ *             the vector)
+ * @param mtx_row  pointer to memory used to store a row of the input matrix,
+ *                    `i`-th thread of the sub-warp should pass in the
+ *                    `i`-th row of the matrix
+ * @param mtx_increment  offset between two consecutive elements of the row
+ * @param res  pointer to a block of memory where the result will be written
+ *             (threads with rank below `problem_size` write through it)
+ * @param res_increment  offset between two consecutive elements of the result
+ * @param closure_op  Operation that is performed when writing to
+ *     `res[group.thread_rank() * res_increment]`, invoked as
+ *     `closure_op(res[group.thread_rank() * res_increment], out)`
+ *     where `out` is the result of $mtx \cdot vec$.
+ */
+template <
+    int max_problem_size, typename Group, typename MatrixValueType,
+    typename VectorValueType, typename Closure,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ void multiply_vec(
+    const Group &__restrict__ group, uint32 problem_size,
+    const VectorValueType &__restrict__ vec,
+    const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment,
+    VectorValueType *__restrict__ res, uint32 res_increment, Closure closure_op)
+{
+    GKO_ASSERT(problem_size <= max_problem_size);
+    auto mtx_elem = zero<VectorValueType>();
+    auto out = zero<VectorValueType>();
+#pragma unroll
+    for (int32 i = 0; i < max_problem_size; ++i) {
+        if (i < problem_size) {
+            if (group.thread_rank() < problem_size) {
+                mtx_elem =
+                    static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
+            }
+            out += mtx_elem * group.shfl(vec, i);
+        }
+    }
+    if (group.thread_rank() < problem_size) {
+        closure_op(res[group.thread_rank() * res_increment], out);
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Computes the infinity norm of a matrix. Each thread in the group supplies
+ * one row of the matrix.
+ *
+ * @tparam max_problem_size  maximum problem size passed to the routine
+ * @tparam Group  type of the group of threads
+ * @tparam ValueType  type of values stored in the matrix
+ *
+ * @param group  group of threads participating in the operation
+ * @param num_rows  number of rows of the matrix
+ *                  (`num_rows <= max_problem_size`)
+ * @param num_cols  number of columns of the matrix
+ * @param row  pointer to memory used to store a row of the input matrix,
+ *             `i`-th thread of the group should pass in the `i`-th row of the
+ *             matrix
+ *
+ * @return the infinity norm (largest row sum of absolute values)
+ */
+template <
+    int max_problem_size, typename Group, typename ValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
+    const Group &group, uint32 num_rows, uint32 num_cols, const ValueType *row)
+{
+    using result_type = remove_complex<ValueType>;
+    auto sum = zero<result_type>();
+    if (group.thread_rank() < num_rows) {
+#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
+#pragma unroll
+#else
+#pragma unroll 1
+#endif
+        for (uint32 i = 0; i < max_problem_size; ++i) {
+            if (i < num_cols) {
+                sum += abs(row[i]);
+            }
+        }
+    }
+    return reduce(group, sum,
+                  [](result_type x, result_type y) { return max(x, y); });
+}
diff --git a/common/factorization/factorization_kernels.hpp.inc b/common/factorization/factorization_kernels.hpp.inc
new file mode 100644
index 00000000000..7050c5ce116
--- /dev/null
+++ b/common/factorization/factorization_kernels.hpp.inc
@@ -0,0 +1,364 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+namespace detail {
+
+
+// Default implementation for the unsorted case: the group scans the range
+// [first, last) in chunks of its own size.
+template <bool IsSorted>
+struct find_helper {
+    template <typename Group, typename IndexType>
+    static __forceinline__ __device__ bool find(Group subwarp_grp,
+                                                const IndexType *first,
+                                                const IndexType *last,
+                                                IndexType value)
+    {
+        const auto lane = subwarp_grp.thread_rank();
+        for (auto chunk = first; chunk < last; chunk += subwarp_grp.size()) {
+            const auto candidate = chunk + lane;
+            const bool lane_hit = candidate < last && *candidate == value;
+            // group-wide vote: stop as soon as any thread found `value`
+            if (subwarp_grp.any(lane_hit)) {
+                return true;
+            }
+        }
+        // the whole range was scanned without a match
+        return false;
+    }
+};
+
+
+// Improved version in case the CSR matrix is sorted: group-wide search
+template <>
+struct find_helper<true> {
+    template <typename Group, typename IndexType>
+    static __forceinline__ __device__ bool find(Group subwarp_grp,
+                                                const IndexType *first,
+                                                const IndexType *last,
+                                                IndexType value)
+    {
+        const auto count = static_cast<IndexType>(last - first);
+        const auto reached = [&](IndexType i) { return first[i] >= value; };
+        const auto hit =
+            group_wide_search(IndexType{}, count, subwarp_grp, reached);
+        return hit < count && first[hit] == value;
+    }
+};
+
+
+}  // namespace detail
+
+
+// Determines for every row of a CSR matrix whether its diagonal entry is
+// missing from the stored column indices:
+// `elements_to_add_per_row[row]` is set to 1 for such rows and to 0 otherwise,
+// and `*changes_required` is set to true if at least one row was flagged
+// (it is never reset to false here).
+// SubwarpSize needs to be a power of 2
+// Each subwarp works on one row
+template <bool IsSorted, int SubwarpSize, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void find_missing_diagonal_elements(
+        IndexType num_rows, IndexType num_cols,
+        const IndexType *__restrict__ col_idxs,
+        const IndexType *__restrict__ row_ptrs,
+        IndexType *__restrict__ elements_to_add_per_row,
+        bool *__restrict__ changes_required)
+{
+    auto subwarp_grp =
+        group::tiled_partition<SubwarpSize>(group::this_thread_block());
+    const auto lane = subwarp_grp.thread_rank();
+    const bool is_writer = lane == 0;
+
+    // stride over the rows in case fewer subwarps than rows were launched
+    const auto first_row =
+        thread::get_subwarp_id_flat<SubwarpSize, IndexType>();
+    const auto row_stride =
+        thread::get_subwarp_num_flat<SubwarpSize, IndexType>();
+
+    bool any_missing{false};
+    for (auto row = first_row; row < num_rows; row += row_stride) {
+        bool diag_missing{false};
+        if (row < num_cols) {
+            const auto *cols_begin = col_idxs + row_ptrs[row];
+            const auto *cols_end = col_idxs + row_ptrs[row + 1];
+            diag_missing = !detail::find_helper<IsSorted>::find(
+                subwarp_grp, cols_begin, cols_end, row);
+        }
+        // rows with `row >= num_cols` have no diagonal and stay unflagged
+        if (is_writer) {
+            elements_to_add_per_row[row] = diag_missing ? 1 : 0;
+        }
+        any_missing = any_missing || diag_missing;
+    }
+    // Could also be reduced (not sure if that leads to a performance benefit)
+    if (any_missing && is_writer) {
+        *changes_required = true;
+    }
+}
+
+
+// Copies a CSR matrix into new arrays, adding a zero diagonal entry to every
+// row that grows. SubwarpSize must be a power of 2; one subwarp per row.
+template <int SubwarpSize, typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void add_missing_diagonal_elements(
+        IndexType num_rows, const ValueType *__restrict__ old_values,
+        const IndexType *__restrict__ old_col_idxs,
+        const IndexType *__restrict__ old_row_ptrs,
+        ValueType *__restrict__ new_values,
+        IndexType *__restrict__ new_col_idxs,
+        const IndexType *__restrict__ row_ptrs_addition)
+{
+    // Precaution in case not enough threads were created
+    const auto total_subwarp_count =
+        thread::get_subwarp_num_flat<SubwarpSize, IndexType>();
+    const auto begin_row =
+        thread::get_subwarp_id_flat<SubwarpSize, IndexType>();
+
+    auto thread_block = group::this_thread_block();
+    auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
+    const auto subwarp_idx = subwarp_grp.thread_rank();
+
+    for (auto row = begin_row; row < num_rows; row += total_subwarp_count) {
+        const IndexType old_row_start{old_row_ptrs[row]};
+        const IndexType old_row_end{old_row_ptrs[row + 1]};
+        const IndexType new_row_start{old_row_start + row_ptrs_addition[row]};
+        const IndexType new_row_end{old_row_end + row_ptrs_addition[row + 1]};
+
+        // if no element needs to be added, do a simple copy of the whole row
+        if (new_row_end - new_row_start == old_row_end - old_row_start) {
+            for (IndexType i = subwarp_idx; i < new_row_end - new_row_start;
+                 i += SubwarpSize) {
+                const IndexType new_idx = new_row_start + i;
+                const IndexType old_idx = old_row_start + i;
+                new_values[new_idx] = old_values[old_idx];
+                new_col_idxs[new_idx] = old_col_idxs[old_idx];
+            }
+        } else {
+            IndexType new_idx = new_row_start + subwarp_idx;  // output slot
+            bool diagonal_added{false};
+            for (IndexType old_idx_start = old_row_start;
+                 old_idx_start < old_row_end;
+                 old_idx_start += SubwarpSize, new_idx += SubwarpSize) {
+                const auto old_idx = old_idx_start + subwarp_idx;
+                bool thread_is_active = old_idx < old_row_end;
+                const auto col_idx =
+                    thread_is_active ? old_col_idxs[old_idx] : IndexType{};
+                // inactive threads vote false: their col_idx is IndexType{}
+                bool diagonal_add_required = !diagonal_added && row < col_idx;
+                auto ballot = subwarp_grp.ballot(diagonal_add_required);
+
+                if (ballot) {
+                    auto first_subwarp_idx = ffs(ballot) - 1;
+                    if (first_subwarp_idx == subwarp_idx) {
+                        new_values[new_idx] = zero<ValueType>();
+                        new_col_idxs[new_idx] = row;
+                    }
+                    if (thread_is_active) {
+                        // threads at or after the inserted diagonal write one
+                        // slot further back
+                        bool is_thread_after_diagonal =
+                            (first_subwarp_idx <= subwarp_idx);
+                        new_idx += is_thread_after_diagonal;
+                        new_values[new_idx] = old_values[old_idx];
+                        new_col_idxs[new_idx] = col_idx;
+                        // threads before the diagonal write in place and only
+                        // shift their slot for the following iterations
+                        new_idx += !is_thread_after_diagonal;
+                    }
+                    diagonal_added = true;
+                } else if (thread_is_active) {
+                    new_values[new_idx] = old_values[old_idx];
+                    new_col_idxs[new_idx] = col_idx;
+                }
+            }
+            if (!diagonal_added && subwarp_idx == 0) {
+                new_idx = new_row_end - 1;  // diagonal becomes last entry
+                new_values[new_idx] = zero<ValueType>();
+                new_col_idxs[new_idx] = row;
+            }
+        }
+    }
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void update_row_ptrs(
+    IndexType num_rows, IndexType *__restrict__ row_ptrs,
+    IndexType *__restrict__ row_ptr_addition)
+{
+    // adds `row_ptr_addition[i]` to `row_ptrs[i]` for all `i < num_rows`
+    const auto stride = thread::get_thread_num_flat<IndexType>();
+    for (auto i = thread::get_thread_id_flat<IndexType>(); i < num_rows;
+         i += stride) {
+        row_ptrs[i] = row_ptrs[i] + row_ptr_addition[i];
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row,
+    IndexType *__restrict__ u_nnz_row)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+    // both counts start at one for the diagonal entry each factor row gets
+    IndexType below_diag{1};
+    IndexType above_diag{1};
+    for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+        const auto col = col_idxs[nz];
+        below_diag += col < row ? 1 : 0;
+        above_diag += row < col ? 1 : 0;
+    }
+    l_nnz_row[row] = below_diag;
+    u_nnz_row[row] = above_diag;
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void initialize_l_u(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values,
+    const IndexType *__restrict__ l_row_ptrs,
+    IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values,
+    const IndexType *__restrict__ u_row_ptrs,
+    IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+    // L keeps its diagonal in the last slot of the row, U in the first one
+    const auto l_diag_pos = l_row_ptrs[row + 1] - 1;
+    const auto u_diag_pos = u_row_ptrs[row];
+    auto l_out = l_row_ptrs[row];
+    auto u_out = u_diag_pos + 1;
+    // value of the U diagonal if the row stores no diagonal entry
+    auto u_diag = one<ValueType>();
+    for (size_type nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+        const auto col = col_idxs[nz];
+        const auto val = values[nz];
+        if (col < row) {
+            l_col_idxs[l_out] = col;
+            l_values[l_out] = val;
+            ++l_out;
+        } else if (row < col) {
+            u_col_idxs[u_out] = col;
+            u_values[u_out] = val;
+            ++u_out;
+        } else {
+            u_diag = val;
+        }
+    }
+    // the diagonal entries are written last: L gets a unit diagonal, U the
+    // stored diagonal value (or one, see above)
+    l_col_idxs[l_diag_pos] = row;
+    l_values[l_diag_pos] = one<ValueType>();
+    u_col_idxs[u_diag_pos] = row;
+    u_values[u_diag_pos] = u_diag;
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+    // start at one: the diagonal entry is always part of the L row
+    IndexType strictly_lower{1};
+    for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+        // the loop only counts the entries left of the diagonal
+        strictly_lower += col_idxs[nz] < row ? 1 : 0;
+    }
+    l_nnz_row[row] = strictly_lower;
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void initialize_l(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values,
+    const IndexType *__restrict__ l_row_ptrs,
+    IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values,
+    bool use_sqrt)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+    auto l_out = l_row_ptrs[row];
+    // stays one if the row stores no diagonal entry
+    auto diag = one<ValueType>();
+    for (size_type nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+        const auto col = col_idxs[nz];
+        const auto val = values[nz];
+        if (col < row) {
+            // strictly lower entries are copied in their original order
+            l_col_idxs[l_out] = col;
+            l_values[l_out] = val;
+            ++l_out;
+        } else if (col == row) {
+            diag = val;
+        }
+    }
+    if (use_sqrt) {
+        // compute square root with sentinel: non-finite roots become one
+        diag = sqrt(diag);
+        if (!is_finite(diag)) {
+            diag = one<ValueType>();
+        }
+    }
+    // the diagonal occupies the last slot of the L row
+    const auto l_diag_pos = l_row_ptrs[row + 1] - 1;
+    l_col_idxs[l_diag_pos] = row;
+    l_values[l_diag_pos] = diag;
+}
+
+
+}  // namespace kernel
diff --git a/common/factorization/par_ict_spgeam_kernels.hpp.inc b/common/factorization/par_ict_spgeam_kernels.hpp.inc
new file mode 100644
index 00000000000..7a9febf3f03
--- /dev/null
+++ b/common/factorization/par_ict_spgeam_kernels.hpp.inc
@@ -0,0 +1,237 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+template <int subwarp_size, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_nnz(
+    const IndexType *__restrict__ llt_row_ptrs,
+    const IndexType *__restrict__ llt_col_idxs,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    IndexType *__restrict__ l_new_row_ptrs, IndexType num_rows)
+{
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+    // column indices and lengths of row `row` of A and of L*L^T
+    const auto *a_cols = a_col_idxs + a_row_ptrs[row];
+    const auto a_len = a_row_ptrs[row + 1] - a_row_ptrs[row];
+    const auto *llt_cols = llt_col_idxs + llt_row_ptrs[row];
+    const auto llt_len = llt_row_ptrs[row + 1] - llt_row_ptrs[row];
+    IndexType lower_nnz{};
+    group_merge<subwarp_size>(
+        a_cols, a_len, llt_cols, llt_len, subwarp,
+        [&](IndexType, IndexType a_col, IndexType, IndexType llt_col,
+            IndexType, bool valid) {
+            // count the number of unique elements being merged: only valid
+            // positions up to the diagonal whose two columns differ
+            const bool in_lower = min(a_col, llt_col) <= row;
+            const bool counted = in_lower && a_col != llt_col && valid;
+            lower_nnz += popcnt(subwarp.ballot(counted));
+            return true;
+        });
+    if (subwarp.thread_rank() == 0) {
+        l_new_row_ptrs[row] = lower_nnz;
+    }
+}
+
+// Merges row `row` of A and L*L^T (up to the diagonal) into the new L factor.
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
+    const IndexType *__restrict__ llt_row_ptrs,
+    const IndexType *__restrict__ llt_col_idxs,
+    const ValueType *__restrict__ llt_vals,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_col_idxs,
+    const ValueType *__restrict__ l_vals,
+    const IndexType *__restrict__ l_new_row_ptrs,
+    IndexType *__restrict__ l_new_col_idxs, ValueType *__restrict__ l_new_vals,
+    IndexType num_rows)
+{
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+
+    auto lane = static_cast<IndexType>(subwarp.thread_rank());
+    auto lanemask_eq = config::lane_mask_type{1} << lane;
+    auto lanemask_lt = lanemask_eq - 1;  // bits of all lower lanes
+
+    // merge lower triangle of A, L*L^T (and L)
+    auto l_begin = l_row_ptrs[row];
+    auto l_end = l_row_ptrs[row + 1];
+
+    auto llt_begin = llt_row_ptrs[row];
+    auto llt_end = llt_row_ptrs[row + 1];
+    auto llt_size = llt_end - llt_begin;
+
+    auto a_begin = a_row_ptrs[row];
+    auto a_end = a_row_ptrs[row + 1];
+    auto a_size = a_end - a_begin;
+
+    IndexType out_begin{};  // merged positions handled so far
+    auto out_size = llt_size + a_size;  // length of the full merge
+
+    IndexType l_new_begin = l_new_row_ptrs[row];
+
+    constexpr auto sentinel = device_numeric_limits<IndexType>::max;
+    // load column indices and values for the first merge step
+    auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
+    auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
+    auto llt_col =
+        checked_load(llt_col_idxs, llt_begin + lane, llt_end, sentinel);
+    auto llt_val =
+        checked_load(llt_vals, llt_begin + lane, llt_end, zero<ValueType>());
+    auto l_col = checked_load(l_col_idxs, l_begin + lane, l_end, sentinel);
+    auto l_val = checked_load(l_vals, l_begin + lane, l_end, zero<ValueType>());
+    bool skip_first{};  // last lane of the previous step matched
+    while (out_begin < out_size) {
+        // merge subwarp.size() elements from A and L*L^T
+        auto merge_result =
+            group_merge_step<subwarp_size>(a_col, llt_col, subwarp);
+        auto a_cur_col = merge_result.a_val;
+        auto llt_cur_col = merge_result.b_val;
+        auto a_cur_val = subwarp.shfl(a_val, merge_result.a_idx);
+        auto llt_cur_val = subwarp.shfl(llt_val, merge_result.b_idx);
+        auto valid = out_begin + lane < out_size;
+        // check if the previous thread has matching columns
+        auto equal_mask = subwarp.ballot(a_cur_col == llt_cur_col && valid);
+        auto prev_equal_mask = equal_mask << 1 | skip_first;
+        skip_first = bool(equal_mask >> (subwarp_size - 1));
+        auto prev_equal = bool(prev_equal_mask & lanemask_eq);
+
+        auto r_col = min(a_cur_col, llt_cur_col);
+        // find matching entry of L
+        // S(L) is a subset of S(A - L * L^T) since L has a diagonal
+        auto l_source = synchronous_fixed_binary_search<subwarp_size>(
+            [&](int i) { return subwarp.shfl(l_col, i) >= r_col; });
+        auto l_cur_col = subwarp.shfl(l_col, l_source);
+        auto l_cur_val = subwarp.shfl(l_val, l_source);
+
+        // determine actual values of A and L*L^T at r_col
+        if (r_col != a_cur_col) {
+            a_cur_val = zero<ValueType>();
+        }
+        if (r_col != llt_cur_col) {
+            llt_cur_val = zero<ValueType>();
+        }
+        auto r_val = a_cur_val - llt_cur_val;
+
+        // leave the loop once every lane is past the diagonal
+        if (subwarp.all(r_col > row)) {
+            break;
+        }
+
+        // determine which threads will write output to L
+        auto use_l = l_cur_col == r_col;
+        auto do_write = !prev_equal && valid && r_col <= row;
+        auto l_new_advance_mask = subwarp.ballot(do_write);
+        // store values; `diag` is the last stored entry of row r_col of L
+        if (do_write) {
+            auto diag = l_vals[l_row_ptrs[r_col + 1] - 1];
+            auto out_val = use_l ? l_cur_val : r_val / diag;
+            auto ofs = popcnt(l_new_advance_mask & lanemask_lt);
+            l_new_col_idxs[l_new_begin + ofs] = r_col;
+            l_new_vals[l_new_begin + ofs] = out_val;
+        }
+
+        // advance *_begin offsets
+        auto a_advance = merge_result.a_advance;
+        auto llt_advance = merge_result.b_advance;
+        auto l_advance = popcnt(subwarp.ballot(do_write && use_l));
+        auto l_new_advance = popcnt(l_new_advance_mask);
+        a_begin += a_advance;
+        llt_begin += llt_advance;
+        l_begin += l_advance;
+        l_new_begin += l_new_advance;
+        out_begin += subwarp_size;
+
+        // shuffle the unmerged elements to the front
+        a_col = subwarp.shfl_down(a_col, a_advance);
+        a_val = subwarp.shfl_down(a_val, a_advance);
+        llt_col = subwarp.shfl_down(llt_col, llt_advance);
+        llt_val = subwarp.shfl_down(llt_val, llt_advance);
+        l_col = subwarp.shfl_down(l_col, l_advance);
+        l_val = subwarp.shfl_down(l_val, l_advance);
+        /*
+         * To optimize memory access, we load the new elements for `a` and `llt`
+         * with a single load instruction:
+         * the lower part of the group loads new elements for `a`
+         * the upper part of the group loads new elements for `llt`
+         * `load_lane` is the part-local lane idx
+         * The elements for `a` have to be shuffled up afterwards.
+         */
+        auto load_a = lane < a_advance;
+        auto load_lane = load_a ? lane : lane - a_advance;
+        auto load_source_col = load_a ? a_col_idxs : llt_col_idxs;
+        auto load_source_val = load_a ? a_vals : llt_vals;
+        auto load_begin =
+            load_a ? a_begin + llt_advance : llt_begin + a_advance;
+        auto load_end = load_a ? a_end : llt_end;
+
+        auto load_idx = load_begin + load_lane;
+        auto loaded_col =
+            checked_load(load_source_col, load_idx, load_end, sentinel);
+        auto loaded_val = checked_load(load_source_val, load_idx, load_end,
+                                       zero<ValueType>());
+        // shuffle the `a` values to the end of the warp
+        auto lower_loaded_col = subwarp.shfl_up(loaded_col, llt_advance);
+        auto lower_loaded_val = subwarp.shfl_up(loaded_val, llt_advance);
+        if (lane >= llt_advance) {
+            a_col = lower_loaded_col;
+            a_val = lower_loaded_val;
+        }
+        if (lane >= a_advance) {
+            llt_col = loaded_col;
+            llt_val = loaded_val;
+        }
+        // load the new values for l
+        if (lane >= subwarp_size - l_advance) {
+            auto l_idx = l_begin + lane;
+            l_col = checked_load(l_col_idxs, l_idx, l_end, sentinel);
+            l_val = checked_load(l_vals, l_idx, l_end, zero<ValueType>());
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ict_sweep_kernels.hpp.inc b/common/factorization/par_ict_sweep_kernels.hpp.inc
new file mode 100644
index 00000000000..060bacb2144
--- /dev/null
+++ b/common/factorization/par_ict_sweep_kernels.hpp.inc
@@ -0,0 +1,103 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+// Updates a single stored entry of L (one subwarp per entry): for the
+// nonzero `l_nz` at position (row, col) the subwarp computes
+//   sum = sum over k < col of L(row, k) * L(col, k)
+// and stores sqrt(A(row, col) - sum) for a diagonal entry, or
+// (A(row, col) - sum) divided by the last stored entry of row `col` of L
+// otherwise. Non-finite results are discarded (the old value is kept).
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void ict_sweep(
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_row_idxs,
+    const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_vals,
+    IndexType l_nnz)
+{
+    auto l_nz = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (l_nz >= l_nnz) {
+        return;
+    }
+    auto row = l_row_idxs[l_nz];
+    auto col = l_col_idxs[l_nz];
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // find entry of A at (row, col)
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto lt_col_begin = l_row_ptrs[col];
+    auto lt_col_size = l_row_ptrs[col + 1] - lt_col_begin;
+    ValueType sum{};
+    auto last_entry = col;
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lt_col_begin,
+        lt_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType lt_idx,
+            IndexType lt_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            // NOTE(review): no conjugation is applied; confirm for complex types
+            if (l_col == lt_row && l_col < last_entry) {
+                sum +=
+                    l_vals[l_idx + l_row_begin] * l_vals[lt_idx + lt_col_begin];
+            }
+            return true;
+        });
+    // accumulate result from all threads
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {
+        auto to_write = row == col
+                            ? sqrt(a_val - sum)
+                            : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1];
+        if (is_finite(to_write)) {
+            l_vals[l_nz] = to_write;
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilu_kernels.hpp.inc b/common/factorization/par_ilu_kernels.hpp.inc
new file mode 100644
index 00000000000..af28012cf81
--- /dev/null
+++ b/common/factorization/par_ilu_kernels.hpp.inc
@@ -0,0 +1,82 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
+    size_type num_elements, const IndexType *__restrict__ row_idxs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values,
+    const IndexType *__restrict__ u_row_ptrs,
+    const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
+{
+    // one thread per input entry (row, col, value); it updates one L or U value
+    const auto tid = thread::get_thread_id_flat<IndexType>();
+    if (!(tid < num_elements)) {
+        return;
+    }
+    const auto row = row_idxs[tid];
+    const auto col = col_idxs[tid];
+    const auto l_end = l_row_ptrs[row + 1];
+    const auto u_end = u_row_ptrs[col + 1];
+    auto l_pos = l_row_ptrs[row];
+    auto u_pos = u_row_ptrs[col];
+    ValueType residual{values[tid]};
+    ValueType last_op{};
+    while (l_pos < l_end && u_pos < u_end) {
+        const auto l_col = l_col_idxs[l_pos];
+        const auto u_col = u_col_idxs[u_pos];
+        const bool match = l_col == u_col;
+        last_op = match ? l_values[l_pos] * u_values[u_pos] : zero<ValueType>();
+        if (match) {
+            residual -= last_op;
+        }
+        l_pos += (l_col <= u_col) ? 1 : 0;
+        u_pos += (u_col <= l_col) ? 1 : 0;
+    }
+    residual += last_op;  // undo the last operation
+    if (row > col) {
+        const auto scaled = residual / u_values[u_end - 1];
+        if (is_finite(scaled)) {
+            l_values[l_pos - 1] = scaled;
+        }
+    } else if (is_finite(residual)) {
+        u_values[u_pos - 1] = residual;
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc
new file mode 100644
index 00000000000..43addc2504b
--- /dev/null
+++ b/common/factorization/par_ilut_filter_kernels.hpp.inc
@@ -0,0 +1,191 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+// Filters one row per subwarp; all results are passed to the three callbacks.
+template <int subwarp_size, typename IndexType, typename Predicate,
+          typename BeginCallback, typename StepCallback,
+          typename FinishCallback>
+__device__ void abstract_filter_impl(const IndexType *row_ptrs,
+                                     IndexType num_rows, Predicate pred,
+                                     BeginCallback begin_cb,
+                                     StepCallback step_cb,
+                                     FinishCallback finish_cb)
+{
+    auto subwarp = group::thread_block_tile<subwarp_size>();
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    auto lane = subwarp.thread_rank();
+    auto lane_prefix_mask = (config::lane_mask_type(1) << lane) - 1;
+    if (row >= num_rows) {
+        return;  // no callback runs for out-of-range rows
+    }
+    // step_cb gets (row, idx, keep, #kept in this step, #kept in lower lanes)
+    auto begin = row_ptrs[row];
+    auto end = row_ptrs[row + 1];
+    begin_cb(row);  // invoked once, before any entry is tested
+    auto num_steps = ceildiv(end - begin, subwarp_size);
+    for (auto step = 0; step < num_steps; ++step) {
+        auto idx = begin + lane + step * subwarp_size;  // entry of this lane
+        auto keep = idx < end && pred(idx, begin, end);  // false past the row
+        auto mask = subwarp.ballot(keep);  // keep votes of the whole subwarp
+        step_cb(row, idx, keep, popcnt(mask), popcnt(mask & lane_prefix_mask));
+    }
+    finish_cb(row, lane);  // invoked once, after the last step
+}
+
+// Stores in nnz[row] how many entries of each row satisfy `pred`.
+template <int subwarp_size, typename Predicate, typename IndexType>
+__device__ void abstract_filter_nnz(const IndexType *__restrict__ row_ptrs,
+                                    IndexType num_rows, Predicate pred,
+                                    IndexType *__restrict__ nnz)
+{
+    IndexType kept{};
+    auto reset = [&](IndexType) { kept = 0; };
+    auto accumulate = [&](IndexType, IndexType, bool, IndexType step_total,
+                          IndexType) { kept += step_total; };
+    auto store = [&](IndexType row, IndexType lane) {
+        if (row < num_rows && lane == 0) {
+            nnz[row] = kept;
+        }
+    };
+    abstract_filter_impl<subwarp_size>(row_ptrs, num_rows, pred, reset,
+                                       accumulate, store);
+}
+
+// Copies the entries accepted by `pred` into the new (compacted) arrays.
+template <int subwarp_size, typename Predicate, typename IndexType,
+          typename ValueType>
+__device__ void abstract_filter(const IndexType *__restrict__ old_row_ptrs,
+                                const IndexType *__restrict__ old_col_idxs,
+                                const ValueType *__restrict__ old_vals,
+                                IndexType num_rows, Predicate pred,
+                                const IndexType *__restrict__ new_row_ptrs,
+                                IndexType *__restrict__ new_row_idxs,
+                                IndexType *__restrict__ new_col_idxs,
+                                ValueType *__restrict__ new_vals)
+{
+    IndexType written{};
+    IndexType out_base{};
+    auto start_row = [&](IndexType row) {
+        out_base = new_row_ptrs[row];
+        written = 0;
+    };
+    auto copy_kept = [&](IndexType row, IndexType idx, bool keep,
+                         IndexType step_total, IndexType lane_rank) {
+        const auto dst = out_base + lane_rank + written;
+        written += step_total;
+        if (!keep) {
+            return;
+        }
+        if (new_row_idxs) {  // row indices are optional
+            new_row_idxs[dst] = row;
+        }
+        new_col_idxs[dst] = old_col_idxs[idx];
+        new_vals[dst] = old_vals[idx];
+    };
+    abstract_filter_impl<subwarp_size>(old_row_ptrs, num_rows, pred, start_row,
+                                       copy_kept, [](IndexType, IndexType) {});
+}
+
+// Counts per row the entries with |value| >= threshold, plus the diagonal.
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
+    const IndexType *__restrict__ row_ptrs, const ValueType *vals,
+    IndexType num_rows, remove_complex<ValueType> threshold,
+    IndexType *__restrict__ nnz, bool lower)
+{
+    // diag_idx: last entry of the row if `lower`, otherwise the first one
+    auto keep_entry = [&](IndexType idx, IndexType row_begin,
+                          IndexType row_end) {
+        const auto diag_idx = lower ? row_end - 1 : row_begin;
+        return abs(vals[idx]) >= threshold || idx == diag_idx;
+    };
+    abstract_filter_nnz<subwarp_size>(row_ptrs, num_rows, keep_entry, nnz);
+}
+
+// Copies entries with |value| >= threshold, and each diagonal, to new arrays.
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void threshold_filter(
+    const IndexType *__restrict__ old_row_ptrs,
+    const IndexType *__restrict__ old_col_idxs,
+    const ValueType *__restrict__ old_vals, IndexType num_rows,
+    remove_complex<ValueType> threshold,
+    const IndexType *__restrict__ new_row_ptrs,
+    IndexType *__restrict__ new_row_idxs, IndexType *__restrict__ new_col_idxs,
+    ValueType *__restrict__ new_vals, bool lower)
+{
+    auto pred = [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+        const auto diag_idx = lower ? row_end - 1 : row_begin;
+        return abs(old_vals[idx]) >= threshold || idx == diag_idx;
+    };
+    abstract_filter<subwarp_size>(
+        old_row_ptrs, old_col_idxs, old_vals, num_rows, pred, new_row_ptrs,
+        new_row_idxs, new_col_idxs, new_vals);
+}
+
+// Counts per row the entries whose bucket is >= `bucket`, plus the last one.
+template <int subwarp_size, typename IndexType, typename BucketType>
+__global__ __launch_bounds__(default_block_size) void bucket_filter_nnz(
+    const IndexType *__restrict__ row_ptrs, const BucketType *buckets,
+    IndexType num_rows, BucketType bucket, IndexType *__restrict__ nnz)
+{
+    // the last entry of every row is kept regardless of its bucket
+    auto keep_entry = [&](IndexType idx, IndexType row_begin,
+                          IndexType row_end) {
+        return buckets[idx] >= bucket || idx == row_end - 1;
+    };
+    abstract_filter_nnz<subwarp_size>(row_ptrs, num_rows, keep_entry, nnz);
+}
+
+// Copies entries whose bucket is >= `bucket`, plus each row's last entry.
+template <int subwarp_size, typename ValueType, typename IndexType,
+          typename BucketType>
+__global__ __launch_bounds__(default_block_size) void bucket_filter(
+    const IndexType *__restrict__ old_row_ptrs,
+    const IndexType *__restrict__ old_col_idxs,
+    const ValueType *__restrict__ old_vals, const BucketType *buckets,
+    IndexType num_rows, BucketType bucket,
+    const IndexType *__restrict__ new_row_ptrs,
+    IndexType *__restrict__ new_row_idxs, IndexType *__restrict__ new_col_idxs,
+    ValueType *__restrict__ new_vals)
+{
+    auto pred = [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+        return buckets[idx] >= bucket || idx == row_end - 1;
+    };
+    abstract_filter<subwarp_size>(
+        old_row_ptrs, old_col_idxs, old_vals, num_rows, pred, new_row_ptrs,
+        new_row_idxs, new_col_idxs, new_vals);
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc
new file mode 100644
index 00000000000..a7a6b5a01f7
--- /dev/null
+++ b/common/factorization/par_ilut_select_kernels.hpp.inc
@@ -0,0 +1,308 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+// search tree layout: `width` leaves (buckets) and `width - 1` inner nodes
+constexpr auto searchtree_width = 1 << sampleselect_searchtree_height;
+constexpr auto searchtree_inner_size = searchtree_width - 1;
+constexpr auto searchtree_size = searchtree_width + searchtree_inner_size;
+// number of input elements sampled to build the search tree
+constexpr auto sample_size = searchtree_width * sampleselect_oversampling;
+// basecase_select: elements sorted in total / per thread / threads per block
+constexpr auto basecase_size = 1024;
+constexpr auto basecase_local_size = 4;
+constexpr auto basecase_block_size = basecase_size / basecase_local_size;
+
+
+// must be launched with one thread block and block size == searchtree_width
+/**
+ * @internal
+ *
+ * Reads `sample_size` evenly strided input elements, sorts their magnitudes
+ * and stores one splitter per thread in a binary search tree (plus leaves).
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(searchtree_width) void build_searchtree(
+    const ValueType *__restrict__ input, IndexType size,
+    remove_complex<ValueType> *__restrict__ tree_output)
+{
+    using AbsType = remove_complex<ValueType>;
+    auto idx = threadIdx.x;
+    AbsType samples[sampleselect_oversampling];  // this thread's samples
+    // assuming rounding towards zero
+    auto stride = double(size) / sample_size;
+#pragma unroll
+    for (auto i = 0; i < sampleselect_oversampling; ++i) {
+        auto lidx = idx * sampleselect_oversampling + i;
+        auto val = input[static_cast<IndexType>(lidx * stride)];
+        samples[i] = abs(val);
+    }
+    __shared__ AbsType sh_samples[sample_size];  // buffer for bitonic_sort
+    bitonic_sort<sample_size, sampleselect_oversampling>(samples, sh_samples);
+    if (idx > 0) {  // thread 0 only writes its leaf below
+        // root has level 0
+        auto level = sampleselect_searchtree_height - ffs(threadIdx.x);
+        // we get the in-level index by removing trailing 10000...
+        auto idx_in_level = threadIdx.x >> ffs(threadIdx.x);
+        // we get the global index by adding previous levels
+        auto previous_levels = (1 << level) - 1;
+        tree_output[idx_in_level + previous_levels] = samples[0];
+    }
+    tree_output[threadIdx.x + searchtree_inner_size] = samples[0];
+}
+
+
+// must be launched with default_block_size >= searchtree_width
+/**
+ * @internal
+ *
+ * Counts, per thread block, how many input elements fall into each bucket of
+ * the splitter search tree and records every element's bucket in `oracles`.
+ * The block-local counts are stored packed by bucket idx.
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_buckets(
+    const ValueType *__restrict__ input, IndexType size,
+    const remove_complex<ValueType> *__restrict__ tree, IndexType *counter,
+    unsigned char *oracles, int items_per_thread)
+{
+    // load tree into shared memory, initialize counters
+    __shared__ remove_complex<ValueType> sh_tree[searchtree_inner_size];
+    __shared__ IndexType sh_counter[searchtree_width];
+    if (threadIdx.x < searchtree_inner_size) {
+        sh_tree[threadIdx.x] = tree[threadIdx.x];
+    }
+    if (threadIdx.x < searchtree_width) {
+        sh_counter[threadIdx.x] = 0;
+    }
+    group::this_thread_block().sync();  // shared tree and counters are ready
+
+    // work distribution: each thread block gets a consecutive index range
+    auto begin = threadIdx.x + default_block_size *
+                                   static_cast<IndexType>(blockIdx.x) *
+                                   items_per_thread;
+    auto block_end = default_block_size *
+                     static_cast<IndexType>(blockIdx.x + 1) * items_per_thread;
+    auto end = min(block_end, size);  // the last block may be cut short
+    for (IndexType i = begin; i < end; i += default_block_size) {
+        // traverse the search tree with the input element
+        auto el = abs(input[i]);
+        IndexType tree_idx{};
+#pragma unroll
+        for (auto level = 0; level < sampleselect_searchtree_height; ++level) {
+            auto cmp = !(el < sh_tree[tree_idx]);  // true: go to right child
+            tree_idx = 2 * tree_idx + 1 + cmp;  // children: 2i + 1, 2i + 2
+        }
+        // increment the bucket counter and store the bucket index
+        uint32 bucket = tree_idx - searchtree_inner_size;
+        // post-condition: sample[bucket] <= el < sample[bucket + 1]
+        atomic_add<IndexType>(sh_counter + bucket, 1);
+        oracles[i] = bucket;  // stored as unsigned char
+    }
+    group::this_thread_block().sync();  // all shared increments are done
+
+    // write back the block-wide counts to global memory
+    if (threadIdx.x < searchtree_width) {
+        counter[blockIdx.x + threadIdx.x * gridDim.x] = sh_counter[threadIdx.x];
+    }
+}
+
+
+// must be launched with default_block_size threads per block
+/**
+ * @internal
+ *
+ * Each thread block handles one bucket: it replaces that bucket's block-local
+ * counts by their prefix sum (in place) and stores the overall sum in
+ * `totals`. The results serve as base offsets for the following filter step.
+ */
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void block_prefix_sum(
+    IndexType *__restrict__ counters, IndexType *__restrict__ totals,
+    IndexType num_blocks)
+{
+    constexpr auto num_warps = default_block_size / config::warp_size;
+    static_assert(num_warps < config::warp_size,
+                  "block size needs to be smaller");
+    __shared__ IndexType warp_sums[num_warps];
+
+    auto block = group::this_thread_block();
+    auto warp = group::tiled_partition<config::warp_size>(block);
+
+    auto bucket = blockIdx.x;
+    auto local_counters = counters + num_blocks * bucket;
+    auto work_per_warp = ceildiv(num_blocks, warp.size());
+    auto warp_idx = threadIdx.x / warp.size();
+    auto warp_lane = warp.thread_rank();
+    // NOTE(review): warp 0 alone spans work_per_warp * warp.size() >= num_blocks
+    // compute prefix sum over warp-sized blocks
+    IndexType total{};
+    auto base_idx = warp_idx * work_per_warp * warp.size();
+    for (auto step = 0; step < work_per_warp; ++step) {
+        auto idx = warp_lane + step * warp.size() + base_idx;
+        auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
+        IndexType warp_total{};
+        IndexType warp_prefix{};
+        // warp-level scan; TODO confirm whether <false> means exclusive
+        subwarp_prefix_sum<false>(val, warp_prefix, warp_total, warp);
+
+        if (idx < num_blocks) {
+            local_counters[idx] = warp_prefix + total;
+        }
+        total += warp_total;
+    }
+
+    // store total sum
+    if (warp_lane == 0) {
+        warp_sums[warp_idx] = total;
+    }
+
+    // compute prefix sum over all warps in a single warp
+    block.sync();
+    if (warp_idx == 0) {
+        auto in_bounds = warp_lane < num_warps;
+        auto val = in_bounds ? warp_sums[warp_lane] : zero<IndexType>();
+        IndexType prefix_sum{};
+        IndexType total_sum{};
+        // warp-level scan; TODO confirm whether <false> means exclusive
+        subwarp_prefix_sum<false>(val, prefix_sum, total_sum, warp);
+        if (in_bounds) {
+            warp_sums[warp_lane] = prefix_sum;
+        }
+        if (warp_lane == 0) {
+            totals[bucket] = total_sum;
+        }
+    }
+
+    // add block prefix sum to each warp's block of data
+    block.sync();
+    auto warp_prefixsum = warp_sums[warp_idx];
+    for (auto step = 0; step < work_per_warp; ++step) {
+        auto idx = warp_lane + step * warp.size() + base_idx;
+        // the partial result from the first pass is updated in place
+        if (idx < num_blocks) {
+            local_counters[idx] += warp_prefixsum;
+        }
+    }
+}
+
+
+// must be launched with default_block_size >= searchtree_width
+/**
+ * @internal
+ *
+ * Copies the magnitudes of all input elements of one bucket to the output.
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void filter_bucket(
+    const ValueType *__restrict__ input, IndexType size, unsigned char bucket,
+    const unsigned char *oracles, const IndexType *block_offsets,
+    remove_complex<ValueType> *__restrict__ output, int items_per_thread)
+{
+    // initialize the counter with the block prefix sum.
+    __shared__ IndexType counter;  // next output position of this block
+    if (threadIdx.x == 0) {
+        counter = block_offsets[blockIdx.x + bucket * gridDim.x];
+    }
+    group::this_thread_block().sync();  // counter is set for all threads
+
+    // same work-distribution as in count_buckets
+    auto begin = threadIdx.x + default_block_size *
+                                   static_cast<IndexType>(blockIdx.x) *
+                                   items_per_thread;
+    auto block_end = default_block_size *
+                     static_cast<IndexType>(blockIdx.x + 1) * items_per_thread;
+    auto end = min(block_end, size);
+    for (IndexType i = begin; i < end; i += default_block_size) {
+        // only copy the element when it belongs to the target bucket
+        auto found = bucket == oracles[i];  // oracles[i]: bucket of element i
+        auto ofs = atomic_add<IndexType>(&counter, found);  // adds 0 or 1
+        if (found) {
+            output[ofs] = abs(input[i]);
+        }
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Sorts a small array (padded with infinity) and outputs its `rank`th entry.
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(basecase_block_size) void basecase_select(
+    const ValueType *__restrict__ input, IndexType size, IndexType rank,
+    ValueType *__restrict__ out)
+{
+    constexpr auto padding = device_numeric_limits<ValueType>::inf;
+    __shared__ ValueType sh_scratch[basecase_size];
+    ValueType chunk[basecase_local_size];
+    for (int slot = 0; slot < basecase_local_size; ++slot) {
+        const auto src = threadIdx.x + slot * basecase_block_size;
+        chunk[slot] = src < size ? input[src] : padding;
+    }
+    bitonic_sort<basecase_size, basecase_local_size>(chunk, sh_scratch);
+    if (threadIdx.x == rank / basecase_local_size) {
+        *out = chunk[rank % basecase_local_size];
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Searches the bucket whose prefix-sum range contains `rank` and overwrites
+ * prefix_sum[0..2] with that bucket's index, its base rank and its size
+ * (only lane 0 of the warp writes).
+ */
+template <typename IndexType>
+__global__ __launch_bounds__(config::warp_size) void find_bucket(
+    IndexType *prefix_sum, IndexType rank)
+{
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto idx = group_wide_search(0, searchtree_width, warp, [&](int i) {
+        return prefix_sum[i + 1] > rank;  // bucket i ends beyond rank
+    });  // presumably the first i for which the predicate holds - confirm
+    if (warp.thread_rank() == 0) {
+        auto base = prefix_sum[idx];  // start of the bucket's range
+        auto size = prefix_sum[idx + 1] - base;  // length of that range
+        // don't overwrite anything before having loaded everything!
+        prefix_sum[0] = idx;
+        prefix_sum[1] = base;
+        prefix_sum[2] = size;
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/factorization/par_ilut_spgeam_kernels.hpp.inc
new file mode 100644
index 00000000000..903968bf4a6
--- /dev/null
+++ b/common/factorization/par_ilut_spgeam_kernels.hpp.inc
@@ -0,0 +1,276 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+// Per row: number of merged A/LU columns with col <= row (L), col >= row (U).
+template <int subwarp_size, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void tri_spgeam_nnz(
+    const IndexType *__restrict__ lu_row_ptrs,
+    const IndexType *__restrict__ lu_col_idxs,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    IndexType *__restrict__ l_new_row_ptrs,
+    IndexType *__restrict__ u_new_row_ptrs, IndexType num_rows)
+{
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+    // row ranges of the two column index lists that are merged below
+    auto lu_begin = lu_row_ptrs[row];
+    auto lu_size = lu_row_ptrs[row + 1] - lu_begin;
+    auto a_begin = a_row_ptrs[row];
+    auto a_size = a_row_ptrs[row + 1] - a_begin;
+    IndexType l_count{};
+    IndexType u_count{};
+    group_merge<subwarp_size>(
+        a_col_idxs + a_begin, a_size, lu_col_idxs + lu_begin, lu_size, subwarp,
+        [&](IndexType a_nz, IndexType a_col, IndexType lu_nz, IndexType lu_col,
+            IndexType out_nz, bool valid) {
+            auto col = min(a_col, lu_col);  // smaller of the two columns
+            // count unique columns: positions with a_col == lu_col are skipped
+            l_count +=
+                popcnt(subwarp.ballot(col <= row && a_col != lu_col && valid));
+            u_count +=
+                popcnt(subwarp.ballot(col >= row && a_col != lu_col && valid));
+            return true;  // presumably asks group_merge to continue - confirm
+        });
+    if (subwarp.thread_rank() == 0) {
+        l_new_row_ptrs[row] = l_count;  // per-row count, written by lane 0
+        u_new_row_ptrs[row] = u_count;
+    }
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
+    const IndexType *__restrict__ lu_row_ptrs,
+    const IndexType *__restrict__ lu_col_idxs,
+    const ValueType *__restrict__ lu_vals,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_col_idxs,
+    const ValueType *__restrict__ l_vals,
+    const IndexType *__restrict__ u_row_ptrs,
+    const IndexType *__restrict__ u_col_idxs,
+    const ValueType *__restrict__ u_vals,
+    const IndexType *__restrict__ l_new_row_ptrs,
+    IndexType *__restrict__ l_new_col_idxs, ValueType *__restrict__ l_new_vals,
+    const IndexType *__restrict__ u_new_row_ptrs,
+    IndexType *__restrict__ u_new_col_idxs, ValueType *__restrict__ u_new_vals,
+    IndexType num_rows)
+{
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+
+    auto lane = static_cast<IndexType>(subwarp.thread_rank());
+    auto lanemask_eq = config::lane_mask_type{1} << lane;
+    auto lanemask_lt = lanemask_eq - 1;
+
+    // merge A, L*U (and L+U)
+    auto l_begin = l_row_ptrs[row];
+    auto l_end = l_row_ptrs[row + 1] - 1;  // ignore diagonal
+    auto l_size = l_end - l_begin;
+
+    auto u_begin = u_row_ptrs[row];
+    auto u_end = u_row_ptrs[row + 1];
+    auto u_size = u_end - u_begin;
+
+    // lpu_* stores the entries of L + U with the diagonal from U
+    // this allows us to act as if L and U were a single matrix
+    auto lpu_begin = l_begin;
+    auto lpu_end = lpu_begin + l_size + u_size;
+    auto lpu_col_idxs =
+        lpu_begin + lane < l_end ? l_col_idxs : u_col_idxs + u_begin - l_end;
+    auto lpu_vals =
+        lpu_begin + lane < l_end ? l_vals : u_vals + u_begin - l_end;
+
+    auto lu_begin = lu_row_ptrs[row];
+    auto lu_end = lu_row_ptrs[row + 1];
+    auto lu_size = lu_end - lu_begin;
+
+    auto a_begin = a_row_ptrs[row];
+    auto a_end = a_row_ptrs[row + 1];
+    auto a_size = a_end - a_begin;
+
+    IndexType out_begin{};
+    auto out_size = lu_size + a_size;
+
+    IndexType l_new_begin = l_new_row_ptrs[row];
+    IndexType u_new_begin = u_new_row_ptrs[row];
+
+    constexpr auto sentinel = device_numeric_limits<IndexType>::max;
+    // load column indices and values for the first merge step
+    auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
+    auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
+    auto lu_col = checked_load(lu_col_idxs, lu_begin + lane, lu_end, sentinel);
+    auto lu_val =
+        checked_load(lu_vals, lu_begin + lane, lu_end, zero<ValueType>());
+    auto lpu_col =
+        checked_load(lpu_col_idxs, lpu_begin + lane, lpu_end, sentinel);
+    auto lpu_val =
+        checked_load(lpu_vals, lpu_begin + lane, lpu_end, zero<ValueType>());
+    bool skip_first{};
+    while (out_begin < out_size) {
+        // merge subwarp.size() elements from A and L*U
+        auto merge_result =
+            group_merge_step<subwarp_size>(a_col, lu_col, subwarp);
+        auto a_cur_col = merge_result.a_val;
+        auto lu_cur_col = merge_result.b_val;
+        auto a_cur_val = subwarp.shfl(a_val, merge_result.a_idx);
+        auto lu_cur_val = subwarp.shfl(lu_val, merge_result.b_idx);
+        auto valid = out_begin + lane < out_size;
+        // check if the previous thread has matching columns
+        auto equal_mask = subwarp.ballot(a_cur_col == lu_cur_col && valid);
+        auto prev_equal_mask = equal_mask << 1 | skip_first;
+        skip_first = bool(equal_mask >> (subwarp_size - 1));
+        auto prev_equal = bool(prev_equal_mask & lanemask_eq);
+
+        auto r_col = min(a_cur_col, lu_cur_col);
+        // find matching entry of L+U
+        // S(L + U) is a subset of S(A - L * U) since L and U have a diagonal
+        auto lpu_source = synchronous_fixed_binary_search<subwarp_size>(
+            [&](int i) { return subwarp.shfl(lpu_col, i) >= r_col; });
+        auto lpu_cur_col = subwarp.shfl(lpu_col, lpu_source);
+        auto lpu_cur_val = subwarp.shfl(lpu_val, lpu_source);
+
+        // determine actual values of A and L*U at r_col
+        if (r_col != a_cur_col) {
+            a_cur_val = zero<ValueType>();
+        }
+        if (r_col != lu_cur_col) {
+            lu_cur_val = zero<ValueType>();
+        }
+        auto r_val = a_cur_val - lu_cur_val;
+
+        // determine which threads will write output to L or U
+        auto use_lpu = lpu_cur_col == r_col;
+        auto l_new_advance_mask =
+            subwarp.ballot(r_col <= row && !prev_equal && valid);
+        auto u_new_advance_mask =
+            subwarp.ballot(r_col >= row && !prev_equal && valid);
+        // store values
+        if (!prev_equal && valid) {
+            auto diag =
+                r_col < row ? u_vals[u_row_ptrs[r_col]] : one<ValueType>();
+            auto out_val = use_lpu ? lpu_cur_val : r_val / diag;
+            if (r_col <= row) {
+                auto ofs = popcnt(l_new_advance_mask & lanemask_lt);
+                l_new_col_idxs[l_new_begin + ofs] = r_col;
+                l_new_vals[l_new_begin + ofs] =
+                    r_col == row ? one<ValueType>() : out_val;
+            }
+            if (r_col >= row) {
+                auto ofs = popcnt(u_new_advance_mask & lanemask_lt);
+                u_new_col_idxs[u_new_begin + ofs] = r_col;
+                u_new_vals[u_new_begin + ofs] = out_val;
+            }
+        }
+
+        // advance *_begin offsets
+        auto a_advance = merge_result.a_advance;
+        auto lu_advance = merge_result.b_advance;
+        auto lpu_advance =
+            popcnt(subwarp.ballot(use_lpu && !prev_equal && valid));
+        auto l_new_advance = popcnt(l_new_advance_mask);
+        auto u_new_advance = popcnt(u_new_advance_mask);
+        a_begin += a_advance;
+        lu_begin += lu_advance;
+        lpu_begin += lpu_advance;
+        l_new_begin += l_new_advance;
+        u_new_begin += u_new_advance;
+        out_begin += subwarp_size;
+
+        // shuffle the unmerged elements to the front
+        a_col = subwarp.shfl_down(a_col, a_advance);
+        a_val = subwarp.shfl_down(a_val, a_advance);
+        lu_col = subwarp.shfl_down(lu_col, lu_advance);
+        lu_val = subwarp.shfl_down(lu_val, lu_advance);
+        lpu_col = subwarp.shfl_down(lpu_col, lpu_advance);
+        lpu_val = subwarp.shfl_down(lpu_val, lpu_advance);
+        /*
+         * To optimize memory access, we load the new elements for `a` and `lu`
+         * with a single load instruction:
+         * the lower part of the group loads new elements for `a`
+         * the upper part of the group loads new elements for `lu`
+         * `load_lane` is the part-local lane idx
+         * The elements for `a` have to be shuffled up afterwards.
+         */
+        auto load_a = lane < a_advance;
+        auto load_lane = load_a ? lane : lane - a_advance;
+        auto load_source_col = load_a ? a_col_idxs : lu_col_idxs;
+        auto load_source_val = load_a ? a_vals : lu_vals;
+        auto load_begin = load_a ? a_begin + lu_advance : lu_begin + a_advance;
+        auto load_end = load_a ? a_end : lu_end;
+
+        auto load_idx = load_begin + load_lane;
+        auto loaded_col =
+            checked_load(load_source_col, load_idx, load_end, sentinel);
+        auto loaded_val = checked_load(load_source_val, load_idx, load_end,
+                                       zero<ValueType>());
+        // shuffle the `a` values to the end of the warp
+        auto lower_loaded_col = subwarp.shfl_up(loaded_col, lu_advance);
+        auto lower_loaded_val = subwarp.shfl_up(loaded_val, lu_advance);
+        if (lane >= lu_advance) {
+            a_col = lower_loaded_col;
+            a_val = lower_loaded_val;
+        }
+        if (lane >= a_advance) {
+            lu_col = loaded_col;
+            lu_val = loaded_val;
+        }
+        // load the new values for lpu
+        if (lane >= subwarp_size - lpu_advance) {
+            auto lpu_idx = lpu_begin + lane;
+            // update lpu pointer if we move from l to u
+            if (lpu_idx >= l_end) {
+                lpu_col_idxs = u_col_idxs + u_begin - l_end;
+                lpu_vals = u_vals + u_begin - l_end;
+            }
+            lpu_col = checked_load(lpu_col_idxs, lpu_idx, lpu_end, sentinel);
+            lpu_val =
+                checked_load(lpu_vals, lpu_idx, lpu_end, zero<ValueType>());
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilut_sweep_kernels.hpp.inc b/common/factorization/par_ilut_sweep_kernels.hpp.inc
new file mode 100644
index 00000000000..96cfc951b64
--- /dev/null
+++ b/common/factorization/par_ilut_sweep_kernels.hpp.inc
@@ -0,0 +1,121 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+/** Each subwarp recomputes one stored entry of L or U from A, L and ut_*. */
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void sweep(
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_row_idxs,
+    const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_vals,
+    IndexType l_nnz, const IndexType *__restrict__ u_row_idxs,
+    const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_vals,
+    const IndexType *__restrict__ ut_col_ptrs,
+    const IndexType *__restrict__ ut_row_idxs, ValueType *__restrict__ ut_vals,
+    IndexType u_nnz)
+{
+    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (tidx >= l_nnz + u_nnz) {
+        return;
+    }
+    // subwarps [0, l_nnz) update entries of L, all later ones entries of U
+    auto l_nz = tidx;
+    auto u_nz = l_nz - l_nnz;  // negative exactly for the entries of L
+    auto lower = u_nz < 0;
+    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
+    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
+    if (lower && row == col) {
+        // skip the diagonal stored in L; the diagonal stored in U is updated
+        return;
+    }
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // find entry of A at (row, col)
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();  // 0 if not in A
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto ut_col_begin = ut_col_ptrs[col];
+    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
+    ValueType sum{};    // this lane's partial sum of l * ut products
+    IndexType ut_nz{};  // position of the entry with row index row in ut_*
+    auto last_entry = min(row, col);  // only indices below this contribute
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
+        ut_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
+            IndexType ut_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == ut_row && l_col < last_entry) {
+                sum += l_vals[l_idx + l_row_begin] *
+                       ut_vals[ut_idx + ut_col_begin];
+            }
+            // remember where ut_* stores the entry whose row index is `row`
+            auto found_transp = subwarp.ballot(ut_row == row);
+            if (found_transp) {
+                ut_nz =
+                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // accumulate result from all threads
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {  // a single lane writes the result
+        if (lower) {  // L: divide by the last stored entry of ut column col
+            auto to_write = (a_val - sum) / ut_vals[ut_col_ptrs[col + 1] - 1];
+            if (is_finite(to_write)) {  // non-finite updates are dropped
+                l_vals[l_nz] = to_write;
+            }
+        } else {  // U: update both the u_* and the ut_* copy of the entry
+            auto to_write = a_val - sum;
+            if (is_finite(to_write)) {
+                u_vals[u_nz] = to_write;
+                ut_vals[ut_nz] = to_write;
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/matrix/coo_kernels.hpp.inc b/common/matrix/coo_kernels.hpp.inc
new file mode 100644
index 00000000000..b4dad369eb2
--- /dev/null
+++ b/common/matrix/coo_kernels.hpp.inc
@@ -0,0 +1,275 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+namespace {
+
+
+/**
+ * The device function of COO spmv
+ *
+ * @param nnz  the number of nonzeros in the matrix
+ * @param num_lines  the maximum number of subwarp-wide rounds per subwarp
+ * @param val  the value array of the matrix
+ * @param col  the column index array of the matrix
+ * @param row  the row index array of the matrix
+ * @param b  the input dense vector
+ * @param b_stride  the stride of the input dense vector
+ * @param c  the output dense vector
+ * @param c_stride  the stride of the output dense vector
+ * @param scale  the function applied to a partial sum before it is added to c
+ * @tparam subwarp_size  the number of threads in one cooperating tile
+ * @tparam ValueType  type of values stored in the matrix
+ * @tparam IndexType  type of matrix indexes stored in the structure
+ * @tparam Closure  type of the function used to write the result
+ */
+template <int subwarp_size = config::warp_size, typename ValueType,
+          typename IndexType, typename Closure>
+__device__ void spmv_kernel(const size_type nnz, const size_type num_lines,
+                            const ValueType *__restrict__ val,
+                            const IndexType *__restrict__ col,
+                            const IndexType *__restrict__ row,
+                            const ValueType *__restrict__ b,
+                            const size_type b_stride, ValueType *__restrict__ c,
+                            const size_type c_stride, Closure scale)
+{
+    ValueType temp_val = zero<ValueType>();
+    const auto start = static_cast<size_type>(blockDim.x) * blockIdx.x *
+                           blockDim.y * num_lines +
+                       threadIdx.y * blockDim.x * num_lines;
+    const auto column_id = blockIdx.y;  // column of b and c handled here
+    size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size);
+    num = min(num, num_lines);  // rounds this subwarp actually performs
+    const IndexType ind_start = start + threadIdx.x;
+    const IndexType ind_end = ind_start + (num - 1) * subwarp_size;
+    IndexType ind = ind_start;
+    IndexType curr_row = (ind < nnz) ? row[ind] : 0;
+    const auto tile_block =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    for (; ind < ind_end; ind += subwarp_size) {
+        temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id]
+                                : zero<ValueType>();
+        auto next_row =
+            (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1];
+        // flush by segmented scan once any lane is about to change its row
+        if (tile_block.any(curr_row != next_row)) {
+            bool is_first_in_segment =
+                segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
+            if (is_first_in_segment) {
+                atomic_add(&(c[curr_row * c_stride + column_id]),
+                           scale(temp_val));
+            }
+            temp_val = zero<ValueType>();
+        }
+        curr_row = next_row;
+    }
+    if (num > 0) {  // last round, handled outside the loop
+        ind = ind_end;
+        temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id]
+                                : zero<ValueType>();
+        // the last round always ends with a segmented scan and a flush
+        bool is_first_in_segment =
+            segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
+        if (is_first_in_segment) {
+            atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val));
+        }
+    }
+}
+
+/** Kernel entry point: runs spmv_kernel and adds the sums to c unscaled. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
+    const size_type nnz, const size_type num_lines,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const IndexType *__restrict__ row, const ValueType *__restrict__ b,
+    const size_type b_stride, ValueType *__restrict__ c,
+    const size_type c_stride)
+{
+    const auto id = [](const ValueType &x) { return x; };
+    spmv_kernel(nnz, num_lines, val, col, row, b, b_stride, c, c_stride, id);
+}
+
+/** Kernel entry point: runs spmv_kernel, scaling each sum by alpha[0]. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
+    const size_type nnz, const size_type num_lines,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col, const IndexType *__restrict__ row,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    ValueType factor = alpha[0];
+    auto scaled = [&factor](const ValueType &x) { return factor * x; };
+    spmv_kernel(nnz, num_lines, val, col, row, b, b_stride, c, c_stride,
+                scaled);
+}
+
+
+/**
+ * The device function of COO spmm
+ *
+ * @param nnz  the number of nonzeros in the matrix
+ * @param num_elems  the maximum number of consecutive nonzeros per thread
+ * @param val  the value array of the matrix
+ * @param col  the column index array of the matrix
+ * @param row  the row index array of the matrix
+ * @param num_cols  the number of columns of b and c that are processed
+ * @param b  the input dense matrix, read at b[col * b_stride + column]
+ * @param b_stride  the stride between consecutive rows of b
+ * @param c  the output dense matrix, updated at c[row * c_stride + column]
+ * @param c_stride  the stride between consecutive rows of c
+ * @param scale  the function applied to a row sum before it is added to c
+ *
+ * @tparam ValueType  type of values stored in the matrix
+ * @tparam IndexType  type of matrix indexes stored in the structure
+ * @tparam Closure  type of the function used to write the result
+ */
+template <typename ValueType, typename IndexType, typename Closure>
+__device__ void spmm_kernel(const size_type nnz, const size_type num_elems,
+                            const ValueType *__restrict__ val,
+                            const IndexType *__restrict__ col,
+                            const IndexType *__restrict__ row,
+                            const size_type num_cols,
+                            const ValueType *__restrict__ b,
+                            const size_type b_stride, ValueType *__restrict__ c,
+                            const size_type c_stride, Closure scale)
+{
+    ValueType temp = zero<ValueType>();  // running sum for curr_row
+    const auto coo_idx =
+        (static_cast<size_type>(blockDim.y) * blockIdx.x + threadIdx.y) *
+        num_elems;
+    const auto column_id = blockIdx.y * blockDim.x + threadIdx.x;
+    const auto coo_end =
+        (coo_idx + num_elems > nnz) ? nnz : coo_idx + num_elems;
+    if (column_id < num_cols && coo_idx < nnz) {
+        auto curr_row = row[coo_idx];
+        auto idx = coo_idx;
+        for (; idx < coo_end - 1; idx++) {  // all but the last nonzero
+            temp += val[idx] * b[col[idx] * b_stride + column_id];
+            const auto next_row = row[idx + 1];
+            if (next_row != curr_row) {  // row finished: flush its sum
+                atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp));
+                curr_row = next_row;
+                temp = zero<ValueType>();
+            }
+        }
+        temp += val[idx] * b[col[idx] * b_stride + column_id];  // last nonzero
+        atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp));
+    }
+}
+
+/** Kernel entry point: runs spmm_kernel and adds the sums to c unscaled. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_spmm(
+    const size_type nnz, const size_type num_elems,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const IndexType *__restrict__ row, const size_type num_cols,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    spmm_kernel(nnz, num_elems, val, col, row, num_cols, b, b_stride, c,
+                c_stride, [](const ValueType &v) -> ValueType { return v; });
+}
+
+/** Kernel entry point: runs spmm_kernel, scaling each sum by alpha[0]. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_spmm(
+    const size_type nnz, const size_type num_elems,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col, const IndexType *__restrict__ row,
+    const size_type num_cols, const ValueType *__restrict__ b,
+    const size_type b_stride, ValueType *__restrict__ c,
+    const size_type c_stride)
+{
+    ValueType factor = alpha[0];
+    auto scaled = [&factor](const ValueType &x) { return factor * x; };
+    spmm_kernel(nnz, num_elems, val, col, row, num_cols, b, b_stride, c,
+                c_stride, scaled);
+}
+
+
+}  // namespace
+
+
+namespace kernel {
+
+/** Converts sorted row indices `idxs` into `length` row pointers `ptrs`. */
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
+    const IndexType *__restrict__ idxs, size_type num_nonzeros,
+    IndexType *__restrict__ ptrs, size_type length)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx == 0) {
+        ptrs[0] = 0;
+        ptrs[length - 1] = num_nonzeros;
+    }
+    if (tidx >= num_nonzeros) return;
+    // rows in (previous stored row, idxs[tidx]] start at entry tidx; entry 0
+    // takes row 0 as predecessor, so leading empty rows get pointer 0
+    const auto prev_row = tidx > 0 ? idxs[tidx - 1] : IndexType{};
+    for (auto i = prev_row + 1; i <= idxs[tidx]; i++) ptrs[i] = tidx;
+    // the last entry (tidx + 1 == num_nonzeros) also closes trailing empty rows
+    if (tidx + 1 == num_nonzeros) {
+        for (auto i = idxs[tidx] + 1; i + 1 < length; i++) ptrs[i] = tidx + 1;
+    }
+}
+
+/** Zeroes the num_rows x num_cols part of `result`, one thread per entry. */
+template <typename ValueType>
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto col = blockDim.x * blockIdx.x + threadIdx.x;
+    const auto row = blockDim.y * blockIdx.y + threadIdx.y;
+    if (row < num_rows && col < num_cols) {
+        result[row * stride + col] = zero<ValueType>();
+    }
+}
+
+/** Copies stored entry i to result[stride * row_idxs[i] + col_idxs[i]]. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_dense(
+    size_type nnz, const IndexType *__restrict__ row_idxs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto entry = thread::get_thread_id_flat();
+    if (nnz > entry) {
+        result[col_idxs[entry] + stride * row_idxs[entry]] = values[entry];
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/matrix/csr_kernels.hpp.inc b/common/matrix/csr_kernels.hpp.inc
new file mode 100644
index 00000000000..0ee4c34dad6
--- /dev/null
+++ b/common/matrix/csr_kernels.hpp.inc
@@ -0,0 +1,922 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+/** Returns nom / denom rounded up (non-negative nom, positive denom). */
+template <typename T>
+__host__ __device__ __forceinline__ T ceildivT(T nom, T denom)
+{
+    return (nom + (denom - 1ll)) / denom;  // widen before adding: no T overflow
+}
+
+/** Block-wide segmented sum of val by ind; true for a segment's last thread. */
+template <typename ValueType, typename IndexType>
+__device__ __forceinline__ bool block_segment_scan_reverse(
+    const IndexType *__restrict__ ind, ValueType *__restrict__ val)
+{
+    bool last = true;  // cleared if the right neighbour has the same index
+    const auto reg_ind = ind[threadIdx.x];  // this thread's segment key
+#pragma unroll
+    for (int i = 1; i < spmv_block_size; i <<= 1) {
+        if (i == 1 && threadIdx.x < spmv_block_size - 1 &&
+            reg_ind == ind[threadIdx.x + 1]) {
+            last = false;
+        }
+        auto temp = zero<ValueType>();  // contribution from distance i
+        if (threadIdx.x >= i && reg_ind == ind[threadIdx.x - i]) {
+            temp = val[threadIdx.x - i];
+        }
+        group::this_thread_block().sync();  // all reads done before writing
+        val[threadIdx.x] += temp;
+        group::this_thread_block().sync();  // writes done before next reads
+    }
+
+    return last;
+}
+
+/** Moves (*row, *row_end) forward to the row that contains nonzero `ind`. */
+template <bool overflow, typename IndexType>
+__device__ __forceinline__ void find_next_row(
+    const IndexType num_rows, const IndexType data_size, const IndexType ind,
+    IndexType *__restrict__ row, IndexType *__restrict__ row_end,
+    const IndexType row_predict, const IndexType row_predict_end,
+    const IndexType *__restrict__ row_ptr)
+{
+    if (!overflow || ind < data_size) {  // bound is tested only if overflow
+        if (ind >= *row_end) {  // ind has left the current row
+            *row = row_predict;  // restart from the predicted row ...
+            *row_end = row_predict_end;
+            for (; ind >= *row_end; *row_end = row_ptr[++*row + 1])
+                ;  // ... and step row by row until ind < *row_end
+        }
+
+    } else {
+        *row = num_rows - 1;  // ind is past the data: use the last row
+        *row_end = data_size;
+    }
+}
+
+/** Tile-level segment scan of *val by row; adds to c only if force_write. */
+template <size_type subwarp_size, typename ValueType, typename IndexType,
+          typename Closure>
+__device__ __forceinline__ void warp_atomic_add(
+    const group::thread_block_tile<subwarp_size> &group, bool force_write,
+    ValueType *__restrict__ val, const IndexType row, ValueType *__restrict__ c,
+    const size_type c_stride, const IndexType column_id, Closure scale)
+{
+    // do a local scan to avoid atomic collisions
+    const bool need_write = segment_scan(group, row, val);
+    if (need_write && force_write) {
+        atomic_add(&(c[row * c_stride + column_id]), scale(*val));
+    }
+    if (!need_write || force_write) {  // kept only if need_write && !force
+        *val = zero<ValueType>();
+    }
+}
+
+/** One nonzero per lane: locate its row, flush finished rows, accumulate. */
+template <bool last, size_type subwarp_size, typename ValueType,
+          typename IndexType, typename Closure>
+__device__ __forceinline__ void process_window(
+    const group::thread_block_tile<subwarp_size> &group,
+    const IndexType num_rows, const IndexType data_size, const IndexType ind,
+    IndexType *__restrict__ row, IndexType *__restrict__ row_end,
+    IndexType *__restrict__ nrow, IndexType *__restrict__ nrow_end,
+    ValueType *__restrict__ temp_val, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b,
+    const size_type b_stride, ValueType *__restrict__ c,
+    const size_type c_stride, const IndexType column_id, Closure scale)
+{
+    const IndexType curr_row = *row;  // row before the lookup below
+    find_next_row<last>(num_rows, data_size, ind, row, row_end, *nrow,
+                        *nrow_end, row_ptrs);
+    // some lane moved to another row: scan and flush the lanes that moved
+    if (group.any(curr_row != *row)) {
+        warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c,
+                        c_stride, column_id, scale);
+        *nrow = group.shfl(*row, subwarp_size - 1);  // last lane's row
+        *nrow_end = group.shfl(*row_end, subwarp_size - 1);
+    }
+
+    if (!last || ind < data_size) {  // bound is tested only when last is set
+        const auto col = col_idxs[ind];
+        *temp_val += val[ind] * b[col * b_stride + column_id];
+    }
+}
+
+/** Start index of warp warp_idx when nnz is split into wsize-sized chunks. */
+template <typename IndexType>
+__device__ __forceinline__ IndexType get_warp_start_idx(
+    const IndexType nwarps, const IndexType nnz, const IndexType warp_idx)
+{
+    const long long chunks = ceildivT<IndexType>(nnz, wsize);
+    return wsize * (warp_idx * chunks / nwarps);
+}
+
+/** CSR SpMV where each warp handles one contiguous range of nonzeros. */
+template <typename ValueType, typename IndexType, typename Closure>
+__device__ __forceinline__ void spmv_kernel(
+    const IndexType nwarps, const IndexType num_rows,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride, Closure scale)
+{
+    const IndexType warp_idx = blockIdx.x * warps_in_block + threadIdx.y;
+    const IndexType column_id = blockIdx.y;  // column of b and c handled here
+    if (warp_idx >= nwarps) {
+        return;
+    }
+    const IndexType data_size = row_ptrs[num_rows];  // number of nonzeros
+    const IndexType start = get_warp_start_idx(nwarps, data_size, warp_idx);
+    const IndexType end =
+        min(get_warp_start_idx(nwarps, data_size, warp_idx + 1),
+            ceildivT<IndexType>(data_size, wsize) * wsize);
+    auto row = srow[warp_idx];  // row to start the row search from
+    auto row_end = row_ptrs[row + 1];
+    auto nrow = row;
+    auto nrow_end = row_end;
+    ValueType temp_val = zero<ValueType>();
+    IndexType ind = start + threadIdx.x;
+    find_next_row<true>(num_rows, data_size, ind, &row, &row_end, nrow,
+                        nrow_end, row_ptrs);
+    const IndexType ind_end = end - wsize;  // last window is handled below
+    const auto tile_block =
+        group::tiled_partition<wsize>(group::this_thread_block());
+    for (; ind < ind_end; ind += wsize) {
+        process_window<false>(tile_block, num_rows, data_size, ind, &row,
+                              &row_end, &nrow, &nrow_end, &temp_val, val,
+                              col_idxs, row_ptrs, b, b_stride, c, c_stride,
+                              column_id, scale);
+    }
+    process_window<true>(tile_block, num_rows, data_size, ind, &row, &row_end,
+                         &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs,
+                         b, b_stride, c, c_stride, column_id, scale);
+    warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id,
+                    scale);  // flush whatever is still accumulated
+}
+
+/** Kernel entry point: runs spmv_kernel and adds the sums to c unscaled. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
+    const IndexType nwarps, const IndexType num_rows,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c,
+                c_stride, [](const ValueType &v) -> ValueType { return v; });
+}
+
+/** Kernel entry point: runs spmv_kernel, scaling each sum by alpha[0]. */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
+    const IndexType nwarps, const IndexType num_rows,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    // only the first element of alpha is read
+    ValueType factor = alpha[0];
+    auto scaled = [&factor](const ValueType &x) { return factor * x; };
+    spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c,
+                c_stride, scaled);
+}
+
+/** Sets the first nnz entries of val to zero, one thread per entry. */
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void set_zero(
+    const size_type nnz, ValueType *__restrict__ val)
+{
+    const auto entry = thread::get_thread_id_flat();
+    if (nnz > entry) {
+        val[entry] = zero<ValueType>();
+    }
+}
+
+/** Binary search for the split (*x, *y) with *x + *y == diagonal. */
+template <typename IndexType>
+__forceinline__ __device__ void merge_path_search(
+    const IndexType diagonal, const IndexType a_len, const IndexType b_len,
+    const IndexType *__restrict__ a, const IndexType offset_b,
+    IndexType *__restrict__ x, IndexType *__restrict__ y)
+{
+    auto lo = max(diagonal - b_len, zero<IndexType>());
+    auto hi = min(diagonal, a_len);
+    while (lo < hi) {
+        const auto mid = lo + (hi - lo) / 2;
+        const bool take_a = a[mid] <= offset_b + diagonal - mid - 1;
+        if (take_a) {
+            lo = mid + 1;
+        } else {
+            hi = mid;
+        }
+    }
+    *x = min(lo, a_len);
+    *y = diagonal - lo;
+}
+
+
+/**
+ * Adds the partial results stored in (last_row, last_val) to the output c.
+ *
+ * The nwarps entries are split into contiguous chunks, one per thread. Each
+ * thread sums consecutive entries that share a row index and adds
+ * alpha_op(sum) to c[row * c_stride] whenever the row index changes. The sum
+ * left over at the end of each chunk is combined across threads through
+ * block_segment_scan_reverse before it is added to c.
+ *
+ * NOTE(review): indexing uses only threadIdx.x, so this presumably expects
+ * to run in a single block, and nwarps >= 1 because last_row[nwarps - 1] is
+ * read unconditionally - confirm at the launch site.
+ *
+ * @param nwarps  number of entries in last_val / last_row
+ * @param last_val  partial values to accumulate
+ * @param last_row  row index belonging to each partial value
+ * @param c  output values; only entries c[row * c_stride] are updated
+ * @param c_stride  factor applied to the row index when addressing c
+ * @param alpha_op  applied to every sum before it is added to c
+ */
+template <typename ValueType, typename IndexType, typename Alpha_op>
+__device__ void merge_path_reduce(const IndexType nwarps,
+                                  const ValueType *__restrict__ last_val,
+                                  const IndexType *__restrict__ last_row,
+                                  ValueType *__restrict__ c,
+                                  const size_type c_stride, Alpha_op alpha_op)
+{
+    // number of entries each thread is responsible for
+    const IndexType cache_lines = ceildivT<IndexType>(nwarps, spmv_block_size);
+    const IndexType tid = threadIdx.x;
+    // this thread's chunk [start, end), clipped to the number of entries
+    const IndexType start = min(tid * cache_lines, nwarps);
+    const IndexType end = min((tid + 1) * cache_lines, nwarps);
+    // defaults used by threads whose chunk is empty: zero value, last row
+    ValueType value = zero<ValueType>();
+    IndexType row = last_row[nwarps - 1];
+    if (start < nwarps) {
+        value = last_val[start];
+        row = last_row[start];
+        for (IndexType i = start + 1; i < end; i++) {
+            if (last_row[i] != row) {
+                // row changed: flush the finished sum and start a new one
+                c[row * c_stride] += alpha_op(value);
+                row = last_row[i];
+                value = last_val[i];
+            } else {
+                value += last_val[i];
+            }
+        }
+    }
+    // publish each thread's trailing (row, value) pair to the whole block
+    __shared__ UninitializedArray<IndexType, spmv_block_size> tmp_ind;
+    __shared__ UninitializedArray<ValueType, spmv_block_size> tmp_val;
+    tmp_val[threadIdx.x] = value;
+    tmp_ind[threadIdx.x] = row;
+    group::this_thread_block().sync();
+    // combines the per-thread pairs; the returned flag selects the threads
+    // that write their combined value below
+    bool last = block_segment_scan_reverse(static_cast<IndexType *>(tmp_ind),
+                                           static_cast<ValueType *>(tmp_val));
+    group::this_thread_block().sync();
+    if (last) {
+        c[row * c_stride] += alpha_op(tmp_val[threadIdx.x]);
+    }
+}
+
+
+/**
+ * Processes one thread block's share of a merge-path based product of the
+ * matrix (val, col_idxs, row_ptrs) with b.
+ *
+ * The num_rows row ends and the nnz nonzeros are treated as one sequence of
+ * num_rows + nnz merge items. Every block takes up to spmv_block_size *
+ * items_per_thread of them and every thread items_per_thread; the starting
+ * coordinates are located with merge_path_search. Rows that end inside a
+ * thread's items are written as alpha_op(sum) + beta_op(c). The sums left
+ * over at the end of each thread are combined with
+ * block_segment_scan_reverse; the last thread stores its combined value and
+ * row in val_out / row_out, the other flagged threads add theirs to c.
+ *
+ * Only the entries b[col * b_stride] and c[row * c_stride] are accessed.
+ * The parameter srow is not used in this function.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param val  nonzero values
+ * @param col_idxs  column index of each nonzero
+ * @param row_ptrs  row pointers (num_rows + 1 entries are read)
+ * @param srow  unused
+ * @param b  input values
+ * @param b_stride  factor applied to the column index when addressing b
+ * @param c  output values
+ * @param c_stride  factor applied to the row index when addressing c
+ * @param row_out  per-block row index of the carried partial sum
+ * @param val_out  per-block carried partial sum
+ * @param alpha_op  applied to accumulated sums
+ * @param beta_op  applied to the previous content of c for finished rows
+ */
+template <int items_per_thread, typename ValueType, typename IndexType,
+          typename Alpha_op, typename Beta_op>
+__device__ void merge_path_spmv(
+    const IndexType num_rows, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride,
+    IndexType *__restrict__ row_out, ValueType *__restrict__ val_out,
+    Alpha_op alpha_op, Beta_op beta_op)
+{
+    // row_end_ptrs[i] is the end offset of row i
+    const auto *row_end_ptrs = row_ptrs + 1;
+    const auto nnz = row_ptrs[num_rows];
+    const IndexType num_merge_items = num_rows + nnz;
+    const auto block_items = spmv_block_size * items_per_thread;
+    __shared__ IndexType shared_row_ptrs[block_items];
+    // first and one-past-last merge item of this block, clipped to the total
+    const IndexType diagonal =
+        min(IndexType(block_items * blockIdx.x), num_merge_items);
+    const IndexType diagonal_end = min(diagonal + block_items, num_merge_items);
+    IndexType block_start_x;
+    IndexType block_start_y;
+    IndexType end_x;
+    IndexType end_y;
+    // x is a row coordinate, y a nonzero coordinate
+    merge_path_search(diagonal, num_rows, nnz, row_end_ptrs, zero<IndexType>(),
+                      &block_start_x, &block_start_y);
+    merge_path_search(diagonal_end, num_rows, nnz, row_end_ptrs,
+                      zero<IndexType>(), &end_x, &end_y);
+    const IndexType block_num_rows = end_x - block_start_x;
+    const IndexType block_num_nonzeros = end_y - block_start_y;
+    // stage the row end offsets covered by this block in shared memory
+    for (int i = threadIdx.x;
+         i < block_num_rows && block_start_x + i < num_rows;
+         i += spmv_block_size) {
+        shared_row_ptrs[i] = row_end_ptrs[block_start_x + i];
+    }
+    group::this_thread_block().sync();
+
+    // locate this thread's start inside the block; the staged offsets are
+    // global, hence block_start_y is passed as the offset of the second range
+    IndexType start_x;
+    IndexType start_y;
+    merge_path_search(IndexType(items_per_thread * threadIdx.x), block_num_rows,
+                      block_num_nonzeros, shared_row_ptrs, block_start_y,
+                      &start_x, &start_y);
+
+
+    IndexType ind = block_start_y + start_y;
+    IndexType row_i = block_start_x + start_x;
+    ValueType value = zero<ValueType>();
+#pragma unroll
+    for (IndexType i = 0; i < items_per_thread; i++) {
+        if (row_i < num_rows) {
+            if (start_x == block_num_rows || ind < shared_row_ptrs[start_x]) {
+                // merge item is a nonzero: accumulate its contribution
+                value += val[ind] * b[col_idxs[ind] * b_stride];
+                ind++;
+            } else {
+                // merge item is a row end: write the row and move on
+                c[row_i * c_stride] =
+                    alpha_op(value) + beta_op(c[row_i * c_stride]);
+                start_x++;
+                row_i++;
+                value = zero<ValueType>();
+            }
+        }
+    }
+    // all threads are done reading shared_row_ptrs, so its storage is reused:
+    // the first spmv_block_size entries hold row indices, the memory behind
+    // them is reinterpreted as values
+    group::this_thread_block().sync();
+    IndexType *tmp_ind = shared_row_ptrs;
+    ValueType *tmp_val =
+        reinterpret_cast<ValueType *>(shared_row_ptrs + spmv_block_size);
+    tmp_val[threadIdx.x] = value;
+    tmp_ind[threadIdx.x] = row_i;
+    group::this_thread_block().sync();
+    bool last = block_segment_scan_reverse(static_cast<IndexType *>(tmp_ind),
+                                           static_cast<ValueType *>(tmp_val));
+    if (threadIdx.x == spmv_block_size - 1) {
+        // the last thread hands its combined value over via row_out/val_out
+        // instead of writing to c
+        row_out[blockIdx.x] = min(end_x, num_rows - 1);
+        val_out[blockIdx.x] = tmp_val[threadIdx.x];
+    } else if (last) {
+        c[row_i * c_stride] += alpha_op(tmp_val[threadIdx.x]);
+    }
+}
+
+/**
+ * Kernel wrapper around merge_path_spmv without scaling factors.
+ *
+ * The alpha closure returns its input unchanged and the beta closure always
+ * returns zero, so the previous content of c does not enter the values
+ * written for finished rows.
+ */
+template <int items_per_thread, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv(
+    const IndexType num_rows, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride,
+    IndexType *__restrict__ row_out, ValueType *__restrict__ val_out)
+{
+    auto keep = [](ValueType &x) { return x; };
+    auto discard = [](ValueType &x) { return zero<ValueType>(); };
+    merge_path_spmv<items_per_thread>(num_rows, val, col_idxs, row_ptrs, srow,
+                                      b, b_stride, c, c_stride, row_out,
+                                      val_out, keep, discard);
+}
+
+
+/**
+ * Kernel wrapper around merge_path_spmv with scaling factors.
+ *
+ * Reads the scalars alpha[0] and beta[0] and passes closures multiplying by
+ * them as alpha_op and beta_op.
+ */
+template <int items_per_thread, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv(
+    const IndexType num_rows, const ValueType *__restrict__ alpha,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
+    const size_type c_stride, IndexType *__restrict__ row_out,
+    ValueType *__restrict__ val_out)
+{
+    const auto alpha_scalar = alpha[0];
+    const auto beta_scalar = beta[0];
+    auto times_alpha = [&alpha_scalar](ValueType &x) {
+        return alpha_scalar * x;
+    };
+    auto times_beta = [&beta_scalar](ValueType &x) { return beta_scalar * x; };
+    merge_path_spmv<items_per_thread>(num_rows, val, col_idxs, row_ptrs, srow,
+                                      b, b_stride, c, c_stride, row_out,
+                                      val_out, times_alpha, times_beta);
+}
+
+
+/**
+ * Kernel wrapper around merge_path_reduce that adds the partial results to c
+ * without scaling them.
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_reduce(
+    const IndexType nwarps, const ValueType *__restrict__ last_val,
+    const IndexType *__restrict__ last_row, ValueType *__restrict__ c,
+    const size_type c_stride)
+{
+    auto keep = [](ValueType &x) { return x; };
+    merge_path_reduce(nwarps, last_val, last_row, c, c_stride, keep);
+}
+
+
+/**
+ * Kernel wrapper around merge_path_reduce that multiplies every partial
+ * result by alpha[0] before it is added to c.
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_reduce(
+    const IndexType nwarps, const ValueType *__restrict__ last_val,
+    const IndexType *__restrict__ last_row, const ValueType *__restrict__ alpha,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    const auto alpha_scalar = alpha[0];
+    auto times_alpha = [&alpha_scalar](ValueType &x) {
+        return alpha_scalar * x;
+    };
+    merge_path_reduce(nwarps, last_val, last_row, c, c_stride, times_alpha);
+}
+
+
+/**
+ * Row-wise product of the matrix (val, col_idxs, row_ptrs) with b where each
+ * row is handled by one subwarp of subwarp_size threads.
+ *
+ * The lanes of a subwarp stride over the nonzeros of the row, their partial
+ * sums are added up with reduce, and lane 0 stores
+ * scale(row_sum, previous c entry). blockIdx.y selects the column of b and c.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param val  nonzero values
+ * @param col_idxs  column index of each nonzero
+ * @param row_ptrs  row pointers
+ * @param b  input values, addressed as b[col * b_stride + column]
+ * @param b_stride  row stride of b
+ * @param c  output values, addressed as c[row * c_stride + column]
+ * @param c_stride  row stride of c
+ * @param scale  closure combining the row sum with the old entry of c
+ */
+template <size_type subwarp_size, typename ValueType, typename IndexType,
+          typename Closure>
+__device__ void device_classical_spmv(const size_type num_rows,
+                                      const ValueType *__restrict__ val,
+                                      const IndexType *__restrict__ col_idxs,
+                                      const IndexType *__restrict__ row_ptrs,
+                                      const ValueType *__restrict__ b,
+                                      const size_type b_stride,
+                                      ValueType *__restrict__ c,
+                                      const size_type c_stride, Closure scale)
+{
+    auto tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto row_step = thread::get_subwarp_num_flat<subwarp_size>();
+    const auto lane = tile.thread_rank();
+    const auto column = blockIdx.y;
+    for (auto row = thread::get_subwarp_id_flat<subwarp_size>();
+         row < num_rows; row += row_step) {
+        const auto row_end = row_ptrs[row + 1];
+        ValueType partial = zero<ValueType>();
+        auto nz = row_ptrs[row] + lane;
+        while (nz < row_end) {
+            partial += val[nz] * b[col_idxs[nz] * b_stride + column];
+            nz += subwarp_size;
+        }
+        auto row_sum =
+            reduce(tile, partial,
+                   [](const ValueType &x, const ValueType &y) { return x + y; });
+        // a single lane per subwarp writes the result
+        if (lane == 0) {
+            const auto out = row * c_stride + column;
+            c[out] = scale(row_sum, c[out]);
+        }
+    }
+}
+
+
+/**
+ * Kernel wrapper around device_classical_spmv for the unscaled case: the
+ * closure returns the row sum and ignores the old entry of c.
+ */
+template <size_type subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b,
+    const size_type b_stride, ValueType *__restrict__ c,
+    const size_type c_stride)
+{
+    auto overwrite = [](const ValueType &x, const ValueType &y) { return x; };
+    device_classical_spmv<subwarp_size>(num_rows, val, col_idxs, row_ptrs, b,
+                                        b_stride, c, c_stride, overwrite);
+}
+
+
+/**
+ * Kernel wrapper around device_classical_spmv for the scaled case: the
+ * closure returns alpha[0] * row_sum + beta[0] * old entry of c.
+ */
+template <size_type subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const ValueType *__restrict__ alpha,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
+    const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b,
+    const size_type b_stride, const ValueType *__restrict__ beta,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    const auto alpha_scalar = alpha[0];
+    const auto beta_scalar = beta[0];
+    auto axpby = [&alpha_scalar, &beta_scalar](const ValueType &x,
+                                               const ValueType &y) {
+        return alpha_scalar * x + beta_scalar * y;
+    };
+    device_classical_spmv<subwarp_size>(num_rows, val, col_idxs, row_ptrs, b,
+                                        b_stride, c, c_stride, axpby);
+}
+
+
+/**
+ * Writes one count per row to nnz, obtained by merging the column indices of
+ * row `row` of a and b with one subwarp per row.
+ *
+ * NOTE(review): presumably the count is the number of nonzeros in that row
+ * of the sum of both matrices - the exact meaning depends on what
+ * group_merge hands to the callback; confirm against its documentation.
+ *
+ * @param a_row_ptrs  row pointers of the first matrix
+ * @param a_col_idxs  column indices of the first matrix
+ * @param b_row_ptrs  row pointers of the second matrix
+ * @param b_col_idxs  column indices of the second matrix
+ * @param num_rows  number of rows
+ * @param nnz  receives the count of each row
+ */
+template <int subwarp_size, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spgeam_nnz(
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const IndexType *__restrict__ b_row_ptrs,
+    const IndexType *__restrict__ b_col_idxs, IndexType num_rows,
+    IndexType *__restrict__ nnz)
+{
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // the row depends only on the subwarp id, so a subwarp leaves as a whole
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto a_begin = a_row_ptrs[row];
+    const auto b_begin = b_row_ptrs[row];
+    const auto a_size = a_row_ptrs[row + 1] - a_begin;
+    const auto b_size = b_row_ptrs[row + 1] - b_begin;
+    IndexType count{};
+    group_merge<subwarp_size>(
+        a_col_idxs + a_begin, a_size, b_col_idxs + b_begin, b_size, subwarp,
+        [&](IndexType, IndexType a_col, IndexType, IndexType b_col, IndexType,
+            bool valid) {
+            // count the lanes holding a valid pair with differing columns;
+            // the ballot gives every lane the same increment
+            count += popcnt(subwarp.ballot(a_col != b_col && valid));
+            return true;
+        });
+
+    if (subwarp.thread_rank() == 0) {
+        nnz[row] = count;
+    }
+}
+
+
+/**
+ * Fills the column indices and values of c with alpha * a + beta * b, one
+ * subwarp per row, by merging row `row` of a and b with group_merge.
+ *
+ * c_row_ptrs must already describe where each row of c starts; entries are
+ * written starting at c_row_ptrs[row].
+ *
+ * @param palpha  pointer to the scalar multiplying a
+ * @param a_row_ptrs  row pointers of a
+ * @param a_col_idxs  column indices of a
+ * @param a_vals  values of a
+ * @param pbeta  pointer to the scalar multiplying b
+ * @param b_row_ptrs  row pointers of b
+ * @param b_col_idxs  column indices of b
+ * @param b_vals  values of b
+ * @param num_rows  number of rows
+ * @param c_row_ptrs  row pointers of c (input)
+ * @param c_col_idxs  column indices of c (output)
+ * @param c_vals  values of c (output)
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spgeam(
+    const ValueType *__restrict__ palpha,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals, const ValueType *__restrict__ pbeta,
+    const IndexType *__restrict__ b_row_ptrs,
+    const IndexType *__restrict__ b_col_idxs,
+    const ValueType *__restrict__ b_vals, IndexType num_rows,
+    const IndexType *__restrict__ c_row_ptrs,
+    IndexType *__restrict__ c_col_idxs, ValueType *__restrict__ c_vals)
+{
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto alpha = palpha[0];
+    const auto beta = pbeta[0];
+    const auto lane = static_cast<IndexType>(subwarp.thread_rank());
+    // all-ones mask shifted right by (warp_size - subwarp_size) bits
+    // NOTE(review): this yields the subwarp's lanes only if lane_mask_type
+    // is exactly warp_size bits wide - confirm in config
+    constexpr auto lanemask_full =
+        ~config::lane_mask_type{} >> (config::warp_size - subwarp_size);
+    // bit of this lane, and bits of all lanes with smaller rank
+    const auto lanemask_eq = config::lane_mask_type{1} << lane;
+    const auto lanemask_lt = lanemask_eq - 1;
+
+    const auto a_begin = a_row_ptrs[row];
+    const auto b_begin = b_row_ptrs[row];
+    const auto a_size = a_row_ptrs[row + 1] - a_begin;
+    const auto b_size = b_row_ptrs[row + 1] - b_begin;
+    // running output position within c, advanced after every merge step
+    auto c_begin = c_row_ptrs[row];
+    bool skip_first{};
+    group_merge<subwarp_size>(
+        a_col_idxs + a_begin, a_size, b_col_idxs + b_begin, b_size, subwarp,
+        [&](IndexType a_nz, IndexType a_col, IndexType b_nz, IndexType b_col,
+            IndexType, bool valid) {
+            auto c_col = min(a_col, b_col);
+            auto equal_mask = subwarp.ballot(a_col == b_col && valid);
+            // check if the elements in the previous merge step are
+            // equal
+            auto prev_equal_mask = equal_mask << 1 | skip_first;
+            // store the highest bit for the next group_merge_step
+            skip_first = bool(equal_mask >> (subwarp_size - 1));
+            auto prev_equal = bool(prev_equal_mask & lanemask_eq);
+            // only output an entry if the previous cols weren't equal.
+            // if they were equal, they were both handled in the
+            // previous step
+            if (valid && !prev_equal) {
+                // output offset = number of lower lanes that also write
+                auto c_ofs = popcnt(~prev_equal_mask & lanemask_lt);
+                c_col_idxs[c_begin + c_ofs] = c_col;
+                // a value only contributes if its column is the one written
+                auto a_val =
+                    a_col <= b_col ? a_vals[a_nz + a_begin] : zero<ValueType>();
+                auto b_val =
+                    b_col <= a_col ? b_vals[b_nz + b_begin] : zero<ValueType>();
+                c_vals[c_begin + c_ofs] = alpha * a_val + beta * b_val;
+            }
+            // advance by the number of merged elements
+            // in theory, we would need to mask by `valid`, but this
+            // would only be false somewhere in the last iteration, where
+            // we don't need the value of c_begin afterwards, anyways.
+            c_begin += popcnt(~prev_equal_mask & lanemask_full);
+            return true;
+        });
+}
+
+
+/**
+ * Expands row pointers into one row index per entry: every position in
+ * [ptrs[row], ptrs[row + 1]) of idxs is set to row, one row per thread.
+ *
+ * @param num_rows  number of rows
+ * @param ptrs  row pointers
+ * @param idxs  receives the row index of every entry
+ */
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void convert_row_ptrs_to_idxs(
+    size_type num_rows, const IndexType *__restrict__ ptrs,
+    IndexType *__restrict__ idxs)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+    const auto row_end = ptrs[row + 1];
+    auto entry = ptrs[row];
+    while (entry < row_end) {
+        idxs[entry] = row;
+        entry++;
+    }
+}
+
+
+/**
+ * Zeroes a num_rows x num_cols block of result using a 2D launch grid: the
+ * x dimension indexes columns, the y dimension rows.
+ *
+ * @param num_rows  number of rows to clear
+ * @param num_cols  number of columns to clear
+ * @param stride  distance between consecutive rows of result
+ * @param result  array that receives the zeros
+ */
+template <typename ValueType>
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto col = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto row = threadIdx.y + blockDim.y * blockIdx.y;
+    // skip threads that fall outside the block to clear
+    if (col >= num_cols || row >= num_rows) {
+        return;
+    }
+    result[row * stride + col] = zero<ValueType>();
+}
+
+
+/**
+ * Scatters the stored entries of each row into result, one row per thread:
+ * entry i of the row is written to result[stride * row + col_idxs[i]].
+ * Positions without a stored entry are left untouched.
+ *
+ * @param num_rows  number of rows
+ * @param row_ptrs  row pointers
+ * @param col_idxs  column index of each entry
+ * @param values  value of each entry
+ * @param stride  distance between consecutive rows of result
+ * @param result  array that receives the values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_dense(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+    const auto row_end = row_ptrs[row + 1];
+    auto entry = row_ptrs[row];
+    while (entry < row_end) {
+        result[stride * row + col_idxs[entry]] = values[entry];
+        entry++;
+    }
+}
+
+
+/**
+ * Stores the number of entries of each row, i.e. the difference of
+ * consecutive row pointers, one row per thread.
+ *
+ * @param num_rows  number of rows
+ * @param row_ptrs  row pointers
+ * @param nnz_per_row  receives the entry count of each row
+ */
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void calculate_nnz_per_row(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    size_type *__restrict__ nnz_per_row)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+    const auto row_begin = row_ptrs[row];
+    const auto row_end = row_ptrs[row + 1];
+    nnz_per_row[row] = row_end - row_begin;
+}
+
+
+/**
+ * Computes the padded length of one slice per thread block.
+ *
+ * Block blockIdx.x handles the rows [sliceid * slice_size, (sliceid + 1) *
+ * slice_size) that lie below num_rows, determines the largest nnz_per_row
+ * among them and rounds it up to a multiple of stride_factor. The result is
+ * stored in both slice_lengths[sliceid] and slice_sets[sliceid].
+ *
+ * All lanes take part in the warp-wide reduction; lanes that own no valid
+ * row contribute 0. Previously the reduction was executed only by lanes
+ * whose first row was valid, so in a partially filled last slice lane 0
+ * could combine values from lanes that never called the collective.
+ *
+ * @param num_rows  number of rows
+ * @param slice_size  number of rows per slice
+ * @param stride_factor  the slice length is rounded up to a multiple of this
+ * @param nnz_per_row  number of entries of each row
+ * @param slice_lengths  receives the padded length of each slice
+ * @param slice_sets  receives the same padded length of each slice
+ */
+__global__ __launch_bounds__(config::warp_size) void calculate_slice_lengths(
+    size_type num_rows, size_type slice_size, size_type stride_factor,
+    const size_type *__restrict__ nnz_per_row,
+    size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto sliceid = blockIdx.x;
+    const auto tid_in_warp = threadIdx.x;
+    const size_type slice_begin = sliceid * slice_size;
+
+    // per-lane maximum over the rows this lane strides over; stays 0 for
+    // lanes without a valid row
+    size_type thread_result = 0;
+    for (size_type i = tid_in_warp; i < slice_size; i += warp_size) {
+        if (slice_begin + i < num_rows) {
+            thread_result = max(thread_result, nnz_per_row[slice_begin + i]);
+        }
+    }
+
+    // collective operation: must be reached by every lane of the tile
+    auto warp_tile =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+    auto warp_result = reduce(
+        warp_tile, thread_result,
+        [](const size_type &a, const size_type &b) { return max(a, b); });
+
+    // as before, only write for slices that contain at least one row
+    if (tid_in_warp == 0 && slice_begin < num_rows) {
+        auto slice_length =
+            ceildiv(warp_result, stride_factor) * stride_factor;
+        slice_lengths[sliceid] = slice_length;
+        slice_sets[sliceid] = slice_length;
+    }
+}
+
+
+/**
+ * Copies the entries of each row into slice-wise storage, one row per
+ * thread, and pads the rest of the row with zero values and column index 0.
+ *
+ * A row's entries start at slice_sets[slice] * slice_size + local row and
+ * are spaced slice_size apart; padding continues up to
+ * (slice_sets[slice] + slice_lengths[slice]) * slice_size + local row.
+ *
+ * @param num_rows  number of rows
+ * @param slice_size  number of rows per slice
+ * @param source_values  values of the source entries
+ * @param source_row_ptrs  row pointers of the source
+ * @param source_col_idxs  column indices of the source entries
+ * @param slice_lengths  length of each slice
+ * @param slice_sets  starting offset of each slice
+ * @param result_col_idxs  receives the column indices
+ * @param result_values  receives the values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_sellp(
+    size_type num_rows, size_type slice_size,
+    const ValueType *__restrict__ source_values,
+    const IndexType *__restrict__ source_row_ptrs,
+    const IndexType *__restrict__ source_col_idxs,
+    size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    const auto global_row = thread::get_thread_id_flat();
+    const auto local_row = global_row % slice_size;
+    const auto slice = global_row / slice_size;
+
+    if (global_row >= num_rows) {
+        return;
+    }
+
+    size_type out = slice_sets[slice] * slice_size + local_row;
+    const size_type out_end =
+        (slice_sets[slice] + slice_lengths[slice]) * slice_size + local_row;
+
+    // copy the stored entries of the row
+    const size_type src_end = source_row_ptrs[global_row + 1];
+    size_type src = source_row_ptrs[global_row];
+    while (src < src_end) {
+        result_values[out] = source_values[src];
+        result_col_idxs[out] = source_col_idxs[src];
+        out += slice_size;
+        src++;
+    }
+    // pad the remainder of the row
+    while (out < out_end) {
+        result_col_idxs[out] = 0;
+        result_values[out] = zero<ValueType>();
+        out += slice_size;
+    }
+}
+
+
+/**
+ * Clears the first stride * max_nnz_per_row entries of values and col_idxs,
+ * one entry per thread.
+ *
+ * @param max_nnz_per_row  number of stored entries per row
+ * @param stride  distance between consecutive stored columns
+ * @param values  receives zeros
+ * @param col_idxs  receives column index 0
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void initialize_zero_ell(
+    size_type max_nnz_per_row, size_type stride, ValueType *__restrict__ values,
+    IndexType *__restrict__ col_idxs)
+{
+    const auto entry = thread::get_thread_id_flat();
+
+    if (entry >= stride * max_nnz_per_row) {
+        return;
+    }
+    values[entry] = zero<ValueType>();
+    col_idxs[entry] = 0;
+}
+
+
+/**
+ * Copies the entries of each row into column-major storage, one warp per
+ * row: the i-th entry of a row is written to position row + stride * i.
+ *
+ * @param num_rows  number of rows
+ * @param stride  distance between consecutive stored columns of the result
+ * @param source_values  values of the source entries
+ * @param source_row_ptrs  row pointers of the source
+ * @param source_col_idxs  column indices of the source entries
+ * @param result_values  receives the values
+ * @param result_col_idxs  receives the column indices
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_ell(
+    size_type num_rows, size_type stride,
+    const ValueType *__restrict__ source_values,
+    const IndexType *__restrict__ source_row_ptrs,
+    const IndexType *__restrict__ source_col_idxs,
+    ValueType *__restrict__ result_values,
+    IndexType *__restrict__ result_col_idxs)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto row = thread::get_subwarp_id_flat<warp_size>();
+    const auto lane = threadIdx.x % warp_size;
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto row_begin = source_row_ptrs[row];
+    const auto row_nnz = source_row_ptrs[row + 1] - row_begin;
+    // the lanes of the warp stride over the entries of the row
+    for (size_type k = lane; k < row_nnz; k += warp_size) {
+        const auto src = k + row_begin;
+        const auto dst = row + stride * k;
+        result_values[dst] = source_values[src];
+        result_col_idxs[dst] = source_col_idxs[src];
+    }
+}
+
+
+/**
+ * Computes, with one warp per slice, the largest nnz_per_row within the
+ * slice rounded up to a multiple of stride_factor.
+ *
+ * @param num_rows  number of rows
+ * @param slice_size  number of rows per slice
+ * @param stride_factor  the maximum is rounded up to a multiple of this
+ * @param nnz_per_row  number of entries of each row
+ * @param result  receives one value per slice
+ */
+__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice(
+    size_type num_rows, size_type slice_size, size_type stride_factor,
+    const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result)
+{
+    constexpr auto warp_size = config::warp_size;
+    auto tile = group::tiled_partition<warp_size>(group::this_thread_block());
+    const auto slice = thread::get_subwarp_id_flat<warp_size>();
+    const auto lane = tile.thread_rank();
+    const auto num_slices = ceildiv(num_rows, slice_size);
+
+    // per-lane maximum over the rows of the slice this lane strides over
+    size_type lane_max = 0;
+    for (auto k = lane; k < slice_size; k += warp_size) {
+        const auto row = slice * slice_size + k;
+        if (row < num_rows) {
+            lane_max = max(lane_max, nnz_per_row[row]);
+        }
+    }
+    auto slice_max =
+        reduce(tile, lane_max,
+               [](const size_type &x, const size_type &y) { return max(x, y); });
+
+    // one lane per warp writes, and only for slices that exist
+    if (lane != 0 || slice >= num_slices) {
+        return;
+    }
+    result[slice] = ceildiv(slice_max, stride_factor) * stride_factor;
+}
+
+
+/**
+ * Reduces max_nnz_per_slice with addition via reduce_array and stores the
+ * block's result in result[blockIdx.x].
+ *
+ * @param num_slices  number of input entries
+ * @param max_nnz_per_slice  input values
+ * @param result  receives one partial sum per block
+ */
+__global__ __launch_bounds__(default_block_size) void reduce_total_cols(
+    size_type num_slices, const size_type *__restrict__ max_nnz_per_slice,
+    size_type *__restrict__ result)
+{
+    __shared__ size_type block_sum[default_block_size];
+
+    auto add = [](const size_type &x, const size_type &y) { return x + y; };
+    reduce_array(num_slices, max_nnz_per_slice, block_sum, add);
+
+    // thread 0 publishes the block's result
+    if (threadIdx.x != 0) {
+        return;
+    }
+    result[blockIdx.x] = block_sum[0];
+}
+
+
+/**
+ * Reduces nnz_per_row with max via reduce_array and stores the block's
+ * result in result[blockIdx.x].
+ *
+ * @param size  number of input entries
+ * @param nnz_per_row  input values
+ * @param result  receives one partial maximum per block
+ */
+__global__ __launch_bounds__(default_block_size) void reduce_max_nnz(
+    size_type size, const size_type *__restrict__ nnz_per_row,
+    size_type *__restrict__ result)
+{
+    __shared__ size_type block_largest[default_block_size];
+
+    auto larger = [](const size_type &x, const size_type &y) {
+        return max(x, y);
+    };
+    reduce_array(size, nnz_per_row, block_largest, larger);
+
+    // thread 0 publishes the block's result
+    if (threadIdx.x != 0) {
+        return;
+    }
+    result[blockIdx.x] = block_largest[0];
+}
+
+
+/**
+ * Stores, for each row, by how many entries the row exceeds
+ * ell_max_nnz_per_row (0 if it does not), one row per thread.
+ *
+ * @param num_rows  number of rows
+ * @param ell_max_nnz_per_row  entry limit per row
+ * @param csr_row_idxs  row pointers of the source
+ * @param coo_row_nnz  receives the surplus of each row
+ */
+template <typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void calculate_hybrid_coo_row_nnz(
+        size_type num_rows, size_type ell_max_nnz_per_row,
+        IndexType *__restrict__ csr_row_idxs,
+        size_type *__restrict__ coo_row_nnz)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+    const size_type row_nnz = csr_row_idxs[row + 1] - csr_row_idxs[row];
+    if (row_nnz > ell_max_nnz_per_row) {
+        coo_row_nnz[row] = row_nnz - ell_max_nnz_per_row;
+    } else {
+        coo_row_nnz[row] = 0;
+    }
+}
+
+
+/**
+ * Splits the entries of each row between two outputs, one warp per row.
+ *
+ * The first ell_max_nnz_per_row entries of a row go to the ELL arrays at
+ * position row + stride * i; the remaining ones go to the COO arrays at
+ * position coo_offset[row] + i - ell_max_nnz_per_row together with their
+ * row index.
+ *
+ * @param num_rows  number of rows
+ * @param stride  distance between consecutive stored ELL columns
+ * @param ell_max_nnz_per_row  number of entries per row kept in ELL
+ * @param source_values  values of the source entries
+ * @param source_row_ptrs  row pointers of the source
+ * @param source_col_idxs  column indices of the source entries
+ * @param coo_offset  first COO position of each row
+ * @param result_ell_val  receives the ELL values
+ * @param result_ell_col  receives the ELL column indices
+ * @param result_coo_val  receives the COO values
+ * @param result_coo_col  receives the COO column indices
+ * @param result_coo_row  receives the COO row indices
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_hybrid(
+    size_type num_rows, size_type stride, size_type ell_max_nnz_per_row,
+    const ValueType *__restrict__ source_values,
+    const IndexType *__restrict__ source_row_ptrs,
+    const IndexType *__restrict__ source_col_idxs,
+    const size_type *__restrict__ coo_offset,
+    ValueType *__restrict__ result_ell_val,
+    IndexType *__restrict__ result_ell_col,
+    ValueType *__restrict__ result_coo_val,
+    IndexType *__restrict__ result_coo_col,
+    IndexType *__restrict__ result_coo_row)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto row = thread::get_subwarp_id_flat<warp_size>();
+    const auto lane = threadIdx.x % warp_size;
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto row_begin = source_row_ptrs[row];
+    const auto row_nnz = source_row_ptrs[row + 1] - row_begin;
+    for (size_type k = lane; k < row_nnz; k += warp_size) {
+        const auto src = k + row_begin;
+        if (k >= ell_max_nnz_per_row) {
+            // surplus entry: goes to the COO part
+            const auto dst = coo_offset[row] + k - ell_max_nnz_per_row;
+            result_coo_val[dst] = source_values[src];
+            result_coo_col[dst] = source_col_idxs[src];
+            result_coo_row[dst] = row;
+        } else {
+            // regular entry: goes to the ELL part
+            const auto dst = row + stride * k;
+            result_ell_val[dst] = source_values[src];
+            result_ell_col[dst] = source_col_idxs[src];
+        }
+    }
+}
+
+
+/**
+ * Clears *flag if any row contains two neighboring column indices in
+ * descending order, one row per thread.
+ *
+ * The value of *flag at kernel start is cached in shared memory; rows are
+ * only inspected while that cached copy is still true.
+ *
+ * @param row_ptrs  row pointers
+ * @param col_idxs  column index of each entry
+ * @param num_rows  number of rows
+ * @param flag  set to false when an unsorted pair is found
+ */
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void check_unsorted(
+    const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs, IndexType num_rows, bool *flag)
+{
+    __shared__ bool sh_flag;
+    auto block = group::this_thread_block();
+    if (block.thread_rank() == 0) {
+        sh_flag = *flag;
+    }
+    block.sync();
+
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    // nothing to do outside the matrix or once a violation is known
+    if (row >= num_rows || !sh_flag) {
+        return;
+    }
+
+    const auto last = row_ptrs[row + 1] - 1;
+    for (auto nz = row_ptrs[row]; nz < last; ++nz) {
+        if (col_idxs[nz] > col_idxs[nz + 1]) {
+            *flag = false;
+            sh_flag = false;
+            return;
+        }
+    }
+}
+
+
+}  // namespace kernel
+
+
+namespace {
+
+
+/**
+ * Replaces each of the first num_nonzeros entries of val by conj of itself,
+ * one entry per thread.
+ *
+ * @param num_nonzeros  number of entries to process
+ * @param val  values, modified in place
+ */
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void conjugate_kernel(
+    size_type num_nonzeros, ValueType *__restrict__ val)
+{
+    const auto entry = thread::get_thread_id_flat();
+
+    if (entry >= num_nonzeros) {
+        return;
+    }
+    val[entry] = conj(val[entry]);
+}
+
+
+}  //  namespace
diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc
new file mode 100644
index 00000000000..95e0c4ed7b2
--- /dev/null
+++ b/common/matrix/dense_kernels.hpp.inc
@@ -0,0 +1,484 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+/**
+ * Scales the entries of the strided matrix `x` by `alpha`, in place.
+ *
+ * The flat thread id is split into a (row, column) pair using `num_cols`.
+ * If `num_alpha_cols` is 1 the single value `alpha[0]` is used for all
+ * columns, otherwise column `c` is scaled by `alpha[c]`. When the scaling
+ * factor compares equal to zero, zero is stored directly instead of
+ * multiplying the old entry.
+ *
+ * @param num_rows  number of rows of `x`
+ * @param num_cols  number of columns of `x`
+ * @param num_alpha_cols  number of entries in `alpha`
+ * @param alpha  scaling factor(s)
+ * @param x  matrix that is scaled in place
+ * @param stride_x  distance between the starts of two consecutive rows of `x`
+ */
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void scale(
+    size_type num_rows, size_type num_cols, size_type num_alpha_cols,
+    const ValueType *__restrict__ alpha, ValueType *__restrict__ x,
+    size_type stride_x)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+    const auto flat_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto row = flat_idx / num_cols;
+    if (row >= num_rows) {
+        return;
+    }
+    const auto col = flat_idx % num_cols;
+    const auto factor = alpha[num_alpha_cols == 1 ? 0 : col];
+    const auto pos = row * stride_x + col;
+    if (factor == zero<ValueType>()) {
+        x[pos] = zero<ValueType>();
+    } else {
+        x[pos] = x[pos] * factor;
+    }
+}
+
+
+/**
+ * Adds the scaled matrix `alpha * x` onto `y`, entry by entry.
+ *
+ * The flat thread id is split into a (row, column) pair using `num_cols`.
+ * If `num_alpha_cols` is 1 the single value `alpha[0]` is used for all
+ * columns, otherwise column `c` uses `alpha[c]`. Entries whose scaling factor
+ * compares equal to zero are left untouched.
+ *
+ * @param num_rows  number of rows of `x` and `y`
+ * @param num_cols  number of columns of `x` and `y`
+ * @param num_alpha_cols  number of entries in `alpha`
+ * @param alpha  scaling factor(s)
+ * @param x  matrix that is scaled and added
+ * @param stride_x  distance between the starts of two consecutive rows of `x`
+ * @param y  matrix that is updated in place
+ * @param stride_y  distance between the starts of two consecutive rows of `y`
+ */
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void add_scaled(
+    size_type num_rows, size_type num_cols, size_type num_alpha_cols,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ x,
+    size_type stride_x, ValueType *__restrict__ y, size_type stride_y)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+    const auto flat_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto row = flat_idx / num_cols;
+    if (row >= num_rows) {
+        return;
+    }
+    const auto col = flat_idx % num_cols;
+    const auto factor = alpha[num_alpha_cols == 1 ? 0 : col];
+    if (factor != zero<ValueType>()) {
+        y[row * stride_y + col] += x[row * stride_x + col] * factor;
+    }
+}
+
+
+/**
+ * First stage of a two-stage reduction: every block folds its share of the
+ * `num_rows` input values into one partial result.
+ *
+ * Each thread starts at its global id and advances by
+ * `block_size * gridDim.x`, combining `get_value(i)` into a private
+ * accumulator that starts at `zero<OutType>()`. The per-thread accumulators
+ * are then combined within the block through shared memory, and the thread
+ * with local id 0 stores the block result in `work[block id]`.
+ *
+ * @param num_rows  number of input values
+ * @param work  output array receiving one partial result per block
+ * @param get_value  callable returning the input value for a given index
+ * @param reduce_op  binary callable combining two values
+ */
+template <size_type block_size, typename OutType, typename CallableGetValue,
+          typename CallableReduce>
+__device__ void compute_partial_reduce(size_type num_rows,
+                                       OutType *__restrict__ work,
+                                       CallableGetValue get_value,
+                                       CallableReduce reduce_op)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+
+    const auto id_in_block = thread::get_local_thread_id<config::warp_size>();
+    const auto first_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto step = block_size * gridDim.x;
+
+    __shared__ UninitializedArray<OutType, block_size> block_partials;
+
+    // thread-private accumulation over the strided index range
+    auto accumulated = zero<OutType>();
+    for (auto i = first_idx; i < num_rows; i += step) {
+        accumulated = reduce_op(accumulated, get_value(i));
+    }
+    block_partials[id_in_block] = accumulated;
+
+    // combine the per-thread values of this block
+    reduce(group::this_thread_block(),
+           static_cast<OutType *>(block_partials), reduce_op);
+
+    if (id_in_block == 0) {
+        work[thread::get_block_id()] = block_partials[0];
+    }
+}
+
+
+/**
+ * Second stage of a two-stage reduction: combines the `size` partial results
+ * stored in `work` into a single value.
+ *
+ * Each thread folds the entries `local_id, local_id + block_size, ...` of
+ * `work`, the per-thread values are combined within the block through shared
+ * memory, and the thread with local id 0 writes `finalize_op(total)` to
+ * `*result`. The index range is derived from the local thread id only.
+ *
+ * @param size  number of partial results in `work`
+ * @param work  partial results
+ * @param result  location of the final result
+ * @param reduce_op  binary callable combining two values
+ * @param finalize_op  unary callable applied to the combined value
+ */
+template <size_type block_size, typename ValueType, typename CallableReduce,
+          typename CallableFinalize>
+__device__ void finalize_reduce_computation(size_type size,
+                                            const ValueType *work,
+                                            ValueType *result,
+                                            CallableReduce reduce_op,
+                                            CallableFinalize finalize_op)
+{
+    const auto id_in_block = thread::get_local_thread_id<config::warp_size>();
+
+    __shared__ UninitializedArray<ValueType, block_size> block_partials;
+
+    ValueType accumulated = zero<ValueType>();
+    for (auto i = id_in_block; i < size; i += block_size) {
+        accumulated = reduce_op(accumulated, work[i]);
+    }
+    block_partials[id_in_block] = accumulated;
+
+    reduce(group::this_thread_block(),
+           static_cast<ValueType *>(block_partials), reduce_op);
+
+    if (id_in_block == 0) {
+        *result = finalize_op(block_partials[0]);
+    }
+}
+
+
+/**
+ * Computes per-block partial sums of `x[i * stride_x] * conj(y[i * stride_y])`
+ * for `i` in `[0, num_rows)` and stores one partial sum per block in `work`.
+ *
+ * @param num_rows  number of entries of the two strided vectors
+ * @param x  first vector
+ * @param stride_x  distance between consecutive entries of `x`
+ * @param y  second vector (conjugated in the product)
+ * @param stride_y  distance between consecutive entries of `y`
+ * @param work  output array receiving one partial sum per block
+ */
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void compute_partial_dot(
+    size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
+    const ValueType *__restrict__ y, size_type stride_y,
+    ValueType *__restrict__ work)
+{
+    const auto product_at = [x, stride_x, y, stride_y](size_type i) {
+        return x[i * stride_x] * conj(y[i * stride_y]);
+    };
+    const auto add = [](const ValueType &a, const ValueType &b) {
+        return a + b;
+    };
+    compute_partial_reduce<block_size>(num_rows, work, product_at, add);
+}
+
+
+/**
+ * Sums the `size` partial dot products in `work` and stores the total,
+ * unmodified, in `*result`.
+ *
+ * @param size  number of partial sums in `work`
+ * @param work  partial sums
+ * @param result  location of the final dot product
+ */
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void finalize_dot_computation(
+    size_type size, const ValueType *work, ValueType *result)
+{
+    const auto add = [](const ValueType &a, const ValueType &b) {
+        return a + b;
+    };
+    const auto identity = [](const ValueType &total) { return total; };
+    finalize_reduce_computation<block_size>(size, work, result, add,
+                                            identity);
+}
+
+
+/**
+ * Computes per-block partial sums of `squared_norm(x[i * stride_x])` for `i`
+ * in `[0, num_rows)` and stores one partial sum per block in `work`.
+ *
+ * @param num_rows  number of entries of the strided vector
+ * @param x  input vector
+ * @param stride_x  distance between consecutive entries of `x`
+ * @param work  output array (real-valued) receiving one partial sum per block
+ */
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void compute_partial_norm2(
+    size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
+    remove_complex<ValueType> *__restrict__ work)
+{
+    using real_type = remove_complex<ValueType>;
+    const auto squared_entry = [x, stride_x](size_type i) {
+        return squared_norm(x[i * stride_x]);
+    };
+    const auto add = [](const real_type &a, const real_type &b) {
+        return a + b;
+    };
+    compute_partial_reduce<block_size>(num_rows, work, squared_entry, add);
+}
+
+
+/**
+ * Sums the `size` partial squared norms in `work` and stores the square root
+ * of the total in `*result`.
+ *
+ * @param size  number of partial sums in `work`
+ * @param work  partial sums of squared norms
+ * @param result  location of the final 2-norm
+ */
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void finalize_norm2_computation(
+    size_type size, const ValueType *work, ValueType *result)
+{
+    const auto add = [](const ValueType &a, const ValueType &b) {
+        return a + b;
+    };
+    const auto root = [](const ValueType &total) { return sqrt(total); };
+    finalize_reduce_computation<block_size>(size, work, result, add, root);
+}
+
+
+/**
+ * Copies the nonzero entries of a strided dense matrix into COO arrays.
+ *
+ * One thread processes one row. It starts writing at position
+ * `row_ptrs[row]` and appends value, column index and row index for every
+ * entry of the row that does not compare equal to zero.
+ *
+ * @param num_rows  number of rows of the dense matrix
+ * @param num_cols  number of columns of the dense matrix
+ * @param stride  distance between the starts of two consecutive dense rows
+ * @param row_ptrs  first output position of each row
+ * @param source  dense matrix values
+ * @param row_idxs  output row indices
+ * @param col_idxs  output column indices
+ * @param values  output values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_coo(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const size_type *__restrict__ row_ptrs,
+    const ValueType *__restrict__ source, IndexType *__restrict__ row_idxs,
+    IndexType *__restrict__ col_idxs, ValueType *__restrict__ values)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto row_offset = stride * row;
+    auto out_pos = row_ptrs[row];
+    for (size_type col = 0; col < num_cols; ++col) {
+        const auto entry = source[row_offset + col];
+        if (entry != zero<ValueType>()) {
+            values[out_pos] = entry;
+            col_idxs[out_pos] = col;
+            row_idxs[out_pos] = row;
+            ++out_pos;
+        }
+    }
+}
+
+
+/**
+ * Counts, for every row of a strided dense matrix, the entries that do not
+ * compare equal to zero.
+ *
+ * One warp-sized tile handles one row: every lane counts the nonzeros in the
+ * columns `lane, lane + warp_size, ...`, and the per-lane counts are summed
+ * over the tile. The row index is the same for all lanes of a tile, so the
+ * whole tile either takes part in the reduction or skips it.
+ *
+ * @param num_rows  number of rows of the dense matrix
+ * @param num_cols  number of columns of the dense matrix
+ * @param stride  distance between the starts of two consecutive dense rows
+ * @param work  dense matrix values
+ * @param result  output array receiving the nonzero count of every row
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ work, IndexType *__restrict__ result)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto row_idx = thread::get_subwarp_id_flat<warp_size>();
+    auto warp_tile =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+
+    if (row_idx < num_rows) {
+        IndexType part_result{};
+        for (auto i = warp_tile.thread_rank(); i < num_cols; i += warp_size) {
+            if (work[stride * row_idx + i] != zero<ValueType>()) {
+                part_result += 1;
+            }
+        }
+        // The reduction operates on the accumulator type itself, so no
+        // implicit conversion to size_type and back happens in each step.
+        result[row_idx] = reduce(
+            warp_tile, part_result,
+            [](const IndexType &a, const IndexType &b) { return a + b; });
+    }
+}
+
+
+/**
+ * Copies the nonzero entries of a strided dense matrix into CSR arrays.
+ *
+ * One thread processes one row. It starts writing at position
+ * `row_ptrs[row]` and appends value and column index for every entry of the
+ * row that does not compare equal to zero. `row_ptrs` is only read here.
+ *
+ * @param num_rows  number of rows of the dense matrix
+ * @param num_cols  number of columns of the dense matrix
+ * @param stride  distance between the starts of two consecutive dense rows
+ * @param source  dense matrix values
+ * @param row_ptrs  first output position of each row
+ * @param col_idxs  output column indices
+ * @param values  output values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_csr(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs,
+    IndexType *__restrict__ col_idxs, ValueType *__restrict__ values)
+{
+    const auto tidx = thread::get_thread_id_flat();
+
+    if (tidx < num_rows) {
+        auto write_to = row_ptrs[tidx];
+        // The column counter has the type of its bound (size_type); a plain
+        // `auto i = 0` would be an int and lead to a signed/unsigned
+        // comparison that can overflow for very wide matrices.
+        for (size_type i = 0; i < num_cols; i++) {
+            if (source[stride * tidx + i] != zero<ValueType>()) {
+                values[write_to] = source[stride * tidx + i];
+                col_idxs[write_to] = i;
+                write_to++;
+            }
+        }
+    }
+}
+
+
+/**
+ * Converts a strided dense matrix into ELL storage.
+ *
+ * One thread processes one row. Entry number `k` of a row is stored at
+ * position `k * result_stride + row`. Nonzero entries of the dense row are
+ * copied first; the remaining slots up to `max_nnz_per_row` are filled with
+ * column index 0 and value zero. Threads whose id is at least `num_rows` but
+ * below `result_stride` fill all slots of their (padding) row the same way.
+ *
+ * @param num_rows  number of rows of the dense matrix
+ * @param num_cols  number of columns of the dense matrix
+ * @param source_stride  distance between the starts of two dense rows
+ * @param source  dense matrix values
+ * @param max_nnz_per_row  number of slots stored per row
+ * @param result_stride  distance between two consecutive slots of one row
+ * @param col_ptrs  output column indices
+ * @param values  output values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_ell(
+    size_type num_rows, size_type num_cols, size_type source_stride,
+    const ValueType *__restrict__ source, size_type max_nnz_per_row,
+    size_type result_stride, IndexType *__restrict__ col_ptrs,
+    ValueType *__restrict__ values)
+{
+    const auto row = thread::get_thread_id_flat();
+
+    // number of slots of this row that already hold real entries
+    IndexType stored = 0;
+    if (row < num_rows) {
+        for (size_type col = 0; col < num_cols; col++) {
+            if (source[row * source_stride + col] != zero<ValueType>()) {
+                col_ptrs[stored * result_stride + row] = col;
+                values[stored * result_stride + row] =
+                    source[row * source_stride + col];
+                stored++;
+            }
+        }
+    } else if (!(row < result_stride)) {
+        // neither a matrix row nor a padding row
+        return;
+    }
+
+    // fill the remaining slots with explicit zeros
+    for (size_type slot = stored; slot < max_nnz_per_row; slot++) {
+        col_ptrs[slot * result_stride + row] = 0;
+        values[slot * result_stride + row] = zero<ValueType>();
+    }
+}
+
+
+/**
+ * Computes, for every slice of `slice_size` rows, the largest number of
+ * nonzeros per row rounded up to a multiple of `stride_factor`.
+ *
+ * One block (launch bound: one warp) handles the slice `blockIdx.x`. Every
+ * lane takes the maximum over the rows `lane, lane + warp_size, ...` of the
+ * slice, skipping rows beyond `num_rows`; the lane maxima are combined over
+ * the warp and lane 0 stores the rounded value in both `slice_lengths` and
+ * `slice_sets`.
+ *
+ * All lanes take part in the warp reduction: rows beyond the matrix just
+ * contribute 0. Guarding the reduction per thread would let only a part of
+ * the warp enter the collective whenever the last slice has fewer remaining
+ * rows than the warp has lanes.
+ *
+ * NOTE(review): `slice_sets` receives the slice length, not an offset;
+ * presumably the caller turns it into offsets afterwards - confirm.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param slice_size  number of rows per slice
+ * @param slice_num  number of slices (not used by the kernel)
+ * @param stride_factor  the slice length is rounded up to a multiple of this
+ * @param nnz_per_row  number of nonzeros of every row
+ * @param slice_lengths  output: rounded maximum row length of every slice
+ * @param slice_sets  output: receives the same value as `slice_lengths`
+ */
+__global__ __launch_bounds__(config::warp_size) void calculate_slice_lengths(
+    size_type num_rows, size_type slice_size, int slice_num,
+    size_type stride_factor, const size_type *__restrict__ nnz_per_row,
+    size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto sliceid = blockIdx.x;
+    const auto tid_in_warp = threadIdx.x;
+    const auto slice_begin = sliceid * slice_size;
+
+    // Per-lane maximum; the bounds check keeps reads inside nnz_per_row.
+    size_type thread_result = 0;
+    for (size_type i = tid_in_warp; i < slice_size; i += warp_size) {
+        if (slice_begin + i < num_rows) {
+            thread_result = max(thread_result, nnz_per_row[slice_begin + i]);
+        }
+    }
+
+    // Executed unconditionally so that every lane of the warp participates.
+    auto warp_tile =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+    auto warp_result = reduce(
+        warp_tile, thread_result,
+        [](const size_type &a, const size_type &b) { return max(a, b); });
+
+    // Only slices that contain at least one row produce output.
+    if (tid_in_warp == 0 && slice_begin < num_rows) {
+        auto slice_length =
+            ceildiv(warp_result, stride_factor) * stride_factor;
+        slice_lengths[sliceid] = slice_length;
+        slice_sets[sliceid] = slice_length;
+    }
+}
+
+
+/**
+ * Converts a strided dense matrix into SELL-P storage.
+ *
+ * One thread processes one row. The row belongs to slice
+ * `global_row / slice_size` and has position `global_row % slice_size` inside
+ * that slice. Its entries are stored starting at
+ * `slice_sets[slice] * slice_size + position`, with consecutive entries of
+ * the row `slice_size` apart. Nonzero entries are copied first; the
+ * remaining positions up to the slice length are filled with column index 0
+ * and value zero.
+ *
+ * @param num_rows  number of rows of the dense matrix
+ * @param num_cols  number of columns of the dense matrix
+ * @param slice_size  number of rows per slice
+ * @param stride  distance between the starts of two consecutive dense rows
+ * @param source  dense matrix values
+ * @param slice_lengths  number of stored entries per row for every slice
+ * @param slice_sets  starting offset (in units of slice columns) per slice
+ * @param col_idxs  output column indices
+ * @param vals  output values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_sellp(
+    size_type num_rows, size_type num_cols, size_type slice_size,
+    size_type stride, const ValueType *__restrict__ source,
+    size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets,
+    IndexType *__restrict__ col_idxs, ValueType *__restrict__ vals)
+{
+    const auto global_row = thread::get_thread_id_flat();
+    const auto row_in_slice = global_row % slice_size;
+    const auto slice = global_row / slice_size;
+
+    if (global_row >= num_rows) {
+        return;
+    }
+
+    // copy the nonzero entries of this row
+    size_type out_pos = slice_sets[slice] * slice_size + row_in_slice;
+    for (size_type col = 0; col < num_cols; col++) {
+        const auto entry = source[global_row * stride + col];
+        if (entry != zero<ValueType>()) {
+            col_idxs[out_pos] = col;
+            vals[out_pos] = entry;
+            out_pos += slice_size;
+        }
+    }
+
+    // pad the rest of the row with explicit zeros
+    const size_type row_end =
+        (slice_sets[slice] + slice_lengths[slice]) * slice_size +
+        row_in_slice;
+    while (out_pos < row_end) {
+        col_idxs[out_pos] = 0;
+        vals[out_pos] = zero<ValueType>();
+        out_pos += slice_size;
+    }
+}
+
+
+/**
+ * Reduces `nnz_per_row` with the maximum operation and stores one result per
+ * block in `result[blockIdx.x]`.
+ *
+ * The reduction itself is delegated to `reduce_array`, which uses the
+ * dynamically sized shared array `block_max` as scratch space; thread 0 of
+ * the block then publishes `block_max[0]`.
+ *
+ * @param size  number of entries in `nnz_per_row`
+ * @param nnz_per_row  number of nonzeros of every row
+ * @param result  output array receiving one maximum per block
+ */
+__global__ __launch_bounds__(default_block_size) void reduce_max_nnz(
+    size_type size, const size_type *__restrict__ nnz_per_row,
+    size_type *__restrict__ result)
+{
+    extern __shared__ size_type block_max[];
+
+    const auto larger = [](const size_type &x, const size_type &y) {
+        return max(x, y);
+    };
+    reduce_array(size, nnz_per_row, block_max, larger);
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+    result[blockIdx.x] = block_max[0];
+}
+
+
+/**
+ * Computes, for every slice of `slice_size` rows, the largest number of
+ * nonzeros per row rounded up to a multiple of `stride_factor`.
+ *
+ * One warp-sized tile handles one slice (slice index = flat tile id). Every
+ * lane takes the maximum over the rows `lane, lane + warp_size, ...` of the
+ * slice, skipping rows beyond `num_rows`; all lanes take part in the tile
+ * reduction, and lane 0 of tiles that map to an existing slice stores the
+ * rounded result.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param slice_size  number of rows per slice
+ * @param stride_factor  the result is rounded up to a multiple of this
+ * @param nnz_per_row  number of nonzeros of every row
+ * @param result  output array receiving one value per slice
+ */
+__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice(
+    size_type num_rows, size_type slice_size, size_type stride_factor,
+    const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result)
+{
+    constexpr auto warp_size = config::warp_size;
+    auto tile = group::tiled_partition<warp_size>(group::this_thread_block());
+    const auto slice = thread::get_subwarp_id_flat<warp_size>();
+    const auto lane = tile.thread_rank();
+    const auto slice_num = ceildiv(num_rows, slice_size);
+
+    size_type lane_max = 0;
+    for (size_type offset = lane; offset < slice_size; offset += warp_size) {
+        const auto row = slice * slice_size + offset;
+        if (row < num_rows) {
+            lane_max = max(lane_max, nnz_per_row[row]);
+        }
+    }
+
+    const auto larger = [](const size_type &a, const size_type &b) {
+        return max(a, b);
+    };
+    const auto slice_max = reduce(tile, lane_max, larger);
+
+    if (lane == 0 && slice < slice_num) {
+        result[slice] = ceildiv(slice_max, stride_factor) * stride_factor;
+    }
+}
+
+
+/**
+ * Sums the entries of `max_nnz_per_slice` and stores one partial sum per
+ * block in `result[blockIdx.x]`.
+ *
+ * The reduction itself is delegated to `reduce_array`, which uses the
+ * dynamically sized shared array `block_result` as scratch space; thread 0
+ * of the block then publishes `block_result[0]`.
+ *
+ * @param num_slices  number of entries in `max_nnz_per_slice`
+ * @param max_nnz_per_slice  per-slice values to be added up
+ * @param result  output array receiving one sum per block
+ */
+__global__ __launch_bounds__(default_block_size) void reduce_total_cols(
+    size_type num_slices, const size_type *__restrict__ max_nnz_per_slice,
+    size_type *__restrict__ result)
+{
+    extern __shared__ size_type block_result[];
+
+    const auto add = [](const size_type &x, const size_type &y) {
+        return x + y;
+    };
+    reduce_array(num_slices, max_nnz_per_slice, block_result, add);
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+    result[blockIdx.x] = block_result[0];
+}
+
+
+/**
+ * Row permutation of a strided dense matrix:
+ * `result(row, col) = orig(perm_idxs[row], col)`.
+ *
+ * The flat thread id is split into a (row, column) pair using `num_cols`;
+ * every thread copies one entry.
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param perm_idxs  source row index for every result row
+ * @param orig  input matrix
+ * @param stride_orig  distance between the starts of two rows of `orig`
+ * @param result  output matrix
+ * @param stride_result  distance between the starts of two rows of `result`
+ */
+template <size_type block_size, typename IndexType, typename ValueType>
+__global__ __launch_bounds__(block_size) void row_permute(
+    size_type num_rows, size_type num_cols,
+    const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig,
+    size_type stride_orig, ValueType *__restrict__ result,
+    size_type stride_result)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+    const auto flat_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto row = flat_idx / num_cols;
+    const auto col = flat_idx % num_cols;
+    if (row >= num_rows) {
+        return;
+    }
+    const auto src_pos = perm_idxs[row] * stride_orig + col;
+    result[row * stride_result + col] = orig[src_pos];
+}
+
+
+/**
+ * Column permutation of a strided dense matrix:
+ * `result(row, col) = orig(row, perm_idxs[col])`.
+ *
+ * The flat thread id is split into a (row, column) pair using `num_cols`;
+ * every thread copies one entry.
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param perm_idxs  source column index for every result column
+ * @param orig  input matrix
+ * @param stride_orig  distance between the starts of two rows of `orig`
+ * @param result  output matrix
+ * @param stride_result  distance between the starts of two rows of `result`
+ */
+template <size_type block_size, typename IndexType, typename ValueType>
+__global__ __launch_bounds__(block_size) void column_permute(
+    size_type num_rows, size_type num_cols,
+    const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig,
+    size_type stride_orig, ValueType *__restrict__ result,
+    size_type stride_result)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+    const auto flat_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto row = flat_idx / num_cols;
+    const auto col = flat_idx % num_cols;
+    if (row >= num_rows) {
+        return;
+    }
+    const auto src_pos = row * stride_orig + perm_idxs[col];
+    result[row * stride_result + col] = orig[src_pos];
+}
+
+
+/**
+ * Inverse row permutation of a strided dense matrix:
+ * `result(perm_idxs[row], col) = orig(row, col)`.
+ *
+ * The flat thread id is split into a (row, column) pair using `num_cols`;
+ * every thread copies one entry.
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param perm_idxs  destination row index for every input row
+ * @param orig  input matrix
+ * @param stride_orig  distance between the starts of two rows of `orig`
+ * @param result  output matrix
+ * @param stride_result  distance between the starts of two rows of `result`
+ */
+template <size_type block_size, typename IndexType, typename ValueType>
+__global__ __launch_bounds__(block_size) void inverse_row_permute(
+    size_type num_rows, size_type num_cols,
+    const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig,
+    size_type stride_orig, ValueType *__restrict__ result,
+    size_type stride_result)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+    const auto flat_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto row = flat_idx / num_cols;
+    const auto col = flat_idx % num_cols;
+    if (row >= num_rows) {
+        return;
+    }
+    const auto dst_pos = perm_idxs[row] * stride_result + col;
+    result[dst_pos] = orig[row * stride_orig + col];
+}
+
+
+/**
+ * Inverse column permutation of a strided dense matrix:
+ * `result(row, perm_idxs[col]) = orig(row, col)`.
+ *
+ * The flat thread id is split into a (row, column) pair using `num_cols`;
+ * every thread copies one entry.
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param perm_idxs  destination column index for every input column
+ * @param orig  input matrix
+ * @param stride_orig  distance between the starts of two rows of `orig`
+ * @param result  output matrix
+ * @param stride_result  distance between the starts of two rows of `result`
+ */
+template <size_type block_size, typename IndexType, typename ValueType>
+__global__ __launch_bounds__(block_size) void inverse_column_permute(
+    size_type num_rows, size_type num_cols,
+    const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig,
+    size_type stride_orig, ValueType *__restrict__ result,
+    size_type stride_result)
+{
+    constexpr auto warps_per_block = block_size / config::warp_size;
+    const auto flat_idx =
+        thread::get_thread_id<config::warp_size, warps_per_block>();
+    const auto row = flat_idx / num_cols;
+    const auto col = flat_idx % num_cols;
+    if (row >= num_rows) {
+        return;
+    }
+    const auto dst_pos = row * stride_result + perm_idxs[col];
+    result[dst_pos] = orig[row * stride_orig + col];
+}
+
+
+}  // namespace kernel
diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
new file mode 100644
index 00000000000..8b569b650c9
--- /dev/null
+++ b/common/matrix/ell_kernels.hpp.inc
@@ -0,0 +1,240 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+namespace {
+
+
+/**
+ * Device routine shared by the ELL SpMV kernels: accumulates, for one row and
+ * one column of `b` (selected by `blockIdx.y`), the products
+ * `val[ind] * b[col[ind] * b_stride + column_id]` and hands the sum to `op`
+ * together with the current entry of `c`.
+ *
+ * Entry `idx` of row `r` is read from position `r + idx * stride`.
+ *
+ * With `num_thread_per_worker == 1` every thread handles a whole row on its
+ * own. Otherwise several threads (distinguished by `threadIdx.y` and by
+ * `worker_id`) share a row, combine their partial sums in shared memory and
+ * the thread with `threadIdx.y == 0` writes the result, atomically if
+ * `atomic` is set.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param num_worker_per_row  number of workers that share one row
+ * @param val  stored values
+ * @param col  stored column indices
+ * @param stride  distance between two consecutive entries of one row
+ * @param num_stored_elements_per_row  number of entries stored per row
+ * @param b  dense input
+ * @param b_stride  distance between the starts of two rows of `b`
+ * @param c  dense output
+ * @param c_stride  distance between the starts of two rows of `c`
+ * @param op  callable `op(sum, old_c)` producing the value written to `c`
+ */
+template <int num_thread_per_worker, bool atomic, typename ValueType,
+          typename IndexType, typename Closure>
+__device__ void spmv_kernel(
+    const size_type num_rows, const int num_worker_per_row,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride, Closure op)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto column_id = blockIdx.y;
+    if (num_thread_per_worker == 1) {
+        // Specialize the num_thread_per_worker = 1. It doesn't need the shared
+        // memory, __syncthreads, and atomic_add
+        if (tidx < num_rows) {
+            ValueType temp = zero<ValueType>();
+            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+                const auto ind = tidx + idx * stride;
+                const auto col_idx = col[ind];
+                // Stop at the first entry whose column index is smaller than
+                // its position in the row; presumably this marks the start of
+                // the padding (TODO confirm the storage invariant).
+                if (col_idx < idx) {
+                    break;
+                } else {
+                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                }
+            }
+            const auto c_ind = tidx * c_stride + column_id;
+            c[c_ind] = op(temp, c[c_ind]);
+        }
+    } else {
+        // NOTE(review): the __syncthreads() calls below sit inside this
+        // thread-dependent guard. A block-wide barrier has to be reached by
+        // all threads of the block, so confirm that the launch configuration
+        // guarantees that a block never straddles this bound.
+        if (tidx < num_worker_per_row * num_rows) {
+            const auto idx_in_worker = threadIdx.y;
+            // x is the row handled by this thread, worker_id selects which
+            // of the workers sharing that row this thread belongs to.
+            const auto x = tidx % num_rows;
+            const auto worker_id = tidx / num_rows;
+            const auto step_size = num_worker_per_row * num_thread_per_worker;
+            __shared__ UninitializedArray<ValueType, default_block_size /
+                                                         num_thread_per_worker>
+                storage;
+            // One shared accumulator per threadIdx.x, cleared by the thread
+            // with threadIdx.y == 0 before anybody adds to it.
+            if (idx_in_worker == 0) {
+                storage[threadIdx.x] = 0;
+            }
+            __syncthreads();
+            ValueType temp = zero<ValueType>();
+            // Each thread starts at its own entry and advances by the total
+            // number of threads that share the row.
+            for (size_type idx =
+                     worker_id * num_thread_per_worker + idx_in_worker;
+                 idx < num_stored_elements_per_row; idx += step_size) {
+                const auto ind = x + idx * stride;
+                const auto col_idx = col[ind];
+                // same early exit as in the single-thread branch above
+                if (col_idx < idx) {
+                    break;
+                } else {
+                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                }
+            }
+            // Combine the partial sums of the threads with equal threadIdx.x.
+            atomic_add(&storage[threadIdx.x], temp);
+            __syncthreads();
+            if (idx_in_worker == 0) {
+                const auto c_ind = x * c_stride + column_id;
+                // In atomic mode the value returned by op is added onto c
+                // instead of overwriting it.
+                if (atomic) {
+                    atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
+                } else {
+                    c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
+                }
+            }
+        }
+    }
+}
+
+
+/**
+ * ELL SpMV kernel without scaling: forwards to `spmv_kernel` with an
+ * operation that returns the accumulated row sum and ignores the previous
+ * value of `c`.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param num_worker_per_row  number of workers that share one row
+ * @param val  stored values
+ * @param col  stored column indices
+ * @param stride  distance between two consecutive entries of one row
+ * @param num_stored_elements_per_row  number of entries stored per row
+ * @param b  dense input
+ * @param b_stride  distance between the starts of two rows of `b`
+ * @param c  dense output
+ * @param c_stride  distance between the starts of two rows of `c`
+ */
+template <int num_thread_per_worker, bool atomic = false, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spmv(
+    const size_type num_rows, const int num_worker_per_row,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride)
+{
+    // keep the freshly computed sum, drop the old entry of c
+    const auto take_sum = [](const ValueType &x, const ValueType &y) {
+        return x;
+    };
+    spmv_kernel<num_thread_per_worker, atomic>(
+        num_rows, num_worker_per_row, val, col, stride,
+        num_stored_elements_per_row, b, b_stride, c, c_stride, take_sum);
+}
+
+
+/**
+ * ELL SpMV kernel with scaling factors `alpha[0]` and `beta[0]`.
+ *
+ * Without atomics the operation passed to `spmv_kernel` is
+ * `alpha * sum + beta * old_c`. With atomics the entries of `c` change while
+ * other threads still work on them, so `beta * old_c` cannot be evaluated
+ * consistently; in that mode only `alpha * sum` is produced (the scaling of
+ * `c` by `beta` presumably has to happen outside of this kernel - confirm
+ * in the host code).
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param num_worker_per_row  number of workers that share one row
+ * @param alpha  pointer to the scaling factor of the product
+ * @param val  stored values
+ * @param col  stored column indices
+ * @param stride  distance between two consecutive entries of one row
+ * @param num_stored_elements_per_row  number of entries stored per row
+ * @param b  dense input
+ * @param b_stride  distance between the starts of two rows of `b`
+ * @param beta  pointer to the scaling factor of `c`
+ * @param c  dense input/output
+ * @param c_stride  distance between the starts of two rows of `c`
+ */
+template <int num_thread_per_worker, bool atomic = false, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spmv(
+    const size_type num_rows, const int num_worker_per_row,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col, const size_type stride,
+    const size_type num_stored_elements_per_row,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
+    const size_type c_stride)
+{
+    const ValueType product_scale = alpha[0];
+    const ValueType c_scale = beta[0];
+
+    if (!atomic) {
+        // full update: alpha * A * b + beta * c
+        const auto scaled_update = [&product_scale, &c_scale](
+                                       const ValueType &x,
+                                       const ValueType &y) {
+            return product_scale * x + c_scale * y;
+        };
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
+            num_stored_elements_per_row, b, b_stride, c, c_stride,
+            scaled_update);
+    } else {
+        // atomic accumulation: only alpha * A * b is produced here
+        const auto scaled_product = [&product_scale](const ValueType &x,
+                                                     const ValueType &y) {
+            return product_scale * x;
+        };
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
+            num_stored_elements_per_row, b, b_stride, c, c_stride,
+            scaled_product);
+    }
+}
+
+
+}  // namespace
+
+
+/**
+ * Sets all entries of a strided dense matrix to zero.
+ *
+ * Uses a two-dimensional thread layout: the x-dimension enumerates columns,
+ * the y-dimension enumerates rows.
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param stride  distance between the starts of two consecutive rows
+ * @param result  matrix that is cleared
+ */
+template <typename ValueType>
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto col = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto row = threadIdx.y + blockDim.y * blockIdx.y;
+    if (col >= num_cols || row >= num_rows) {
+        return;
+    }
+    result[row * stride + col] = zero<ValueType>();
+}
+
+
+/**
+ * Adds the stored entries of an ELL matrix onto a strided dense matrix.
+ *
+ * One thread processes one row. Entry `col` of row `r` is read from position
+ * `r + col * source_stride` and added to
+ * `result[r * result_stride + col_idxs[...]]`. The kernel accumulates with
+ * `+=`, so `result` presumably has to be zeroed beforehand (confirm in the
+ * host code).
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param nnz  number of entries stored per row
+ * @param source_stride  distance between two consecutive entries of one row
+ * @param col_idxs  stored column indices
+ * @param values  stored values
+ * @param result_stride  distance between the starts of two rows of `result`
+ * @param result  dense output matrix
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_dense(
+    size_type num_rows, size_type nnz, size_type source_stride,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, size_type result_stride,
+    ValueType *__restrict__ result)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx < num_rows) {
+        // The counter has the type of its bound (size_type); a plain
+        // `auto col = 0` would be an int and lead to a signed/unsigned
+        // comparison.
+        for (size_type col = 0; col < nnz; col++) {
+            result[tidx * result_stride +
+                   col_idxs[tidx + col * source_stride]] +=
+                values[tidx + col * source_stride];
+        }
+    }
+}
+
+
+/**
+ * Counts, for every row of an ELL matrix, the stored values that do not
+ * compare equal to zero.
+ *
+ * One warp-sized tile handles one row: every lane counts the nonzeros among
+ * the entries `lane, lane + warp_size, ...` of the row (entry `i` of row `r`
+ * is read from `values[stride * i + r]`), and the per-lane counts are summed
+ * over the tile. The row index is the same for all lanes of a tile, so the
+ * whole tile either takes part in the reduction or skips it.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param max_nnz_per_row  number of entries stored per row
+ * @param stride  distance between two consecutive entries of one row
+ * @param values  stored values
+ * @param result  output array receiving the nonzero count of every row
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
+    size_type num_rows, size_type max_nnz_per_row, size_type stride,
+    const ValueType *__restrict__ values, IndexType *__restrict__ result)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto row_idx = thread::get_subwarp_id_flat<warp_size>();
+    auto warp_tile =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+
+    if (row_idx < num_rows) {
+        IndexType part_result{};
+        for (auto i = warp_tile.thread_rank(); i < max_nnz_per_row;
+             i += warp_size) {
+            if (values[stride * i + row_idx] != zero<ValueType>()) {
+                part_result += 1;
+            }
+        }
+        // The reduction operates on the accumulator type itself, so no
+        // implicit conversion to size_type and back happens in each step.
+        result[row_idx] = reduce(
+            warp_tile, part_result,
+            [](const IndexType &a, const IndexType &b) { return a + b; });
+    }
+}
+
+
+/**
+ * Copies the nonzero entries of an ELL matrix into CSR arrays.
+ *
+ * One thread processes one row. It starts writing at position
+ * `result_row_ptrs[row]` and appends value and column index for every stored
+ * entry of the row whose value does not compare equal to zero. Entry `i` of
+ * row `r` is read from position `r + stride * i`. `result_row_ptrs` is only
+ * read here.
+ *
+ * @param num_rows  number of rows of the matrix
+ * @param max_nnz_per_row  number of entries stored per row
+ * @param stride  distance between two consecutive entries of one row
+ * @param source_values  stored ELL values
+ * @param source_col_idxs  stored ELL column indices
+ * @param result_row_ptrs  first output position of each row
+ * @param result_col_idxs  output column indices
+ * @param result_values  output values
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_csr(
+    size_type num_rows, size_type max_nnz_per_row, size_type stride,
+    const ValueType *__restrict__ source_values,
+    const IndexType *__restrict__ source_col_idxs,
+    IndexType *__restrict__ result_row_ptrs,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    const auto tidx = thread::get_thread_id_flat();
+
+    if (tidx < num_rows) {
+        auto write_to = result_row_ptrs[tidx];
+        // The counter has the type of its bound (size_type); a plain
+        // `auto i = 0` would be an int and lead to a signed/unsigned
+        // comparison.
+        for (size_type i = 0; i < max_nnz_per_row; i++) {
+            const auto source_idx = tidx + stride * i;
+            if (source_values[source_idx] != zero<ValueType>()) {
+                result_values[write_to] = source_values[source_idx];
+                result_col_idxs[write_to] = source_col_idxs[source_idx];
+                write_to++;
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc
new file mode 100644
index 00000000000..a2c9d2c7ae4
--- /dev/null
+++ b/common/matrix/hybrid_kernels.hpp.inc
@@ -0,0 +1,142 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+/**
+ * The global function for counting the number of nonzeros per row of COO.
+ * It is almost like the COO spmv routine: it multiplies is_nonzero(Coo) with
+ * the vector whose values are all one.
+ *
+ * @param nnz  the number of nonzeros in the matrix
+ * @param num_lines  the maximum number of rounds of each warp
+ * @param val  the value array of the matrix
+ * @param row  the row index array of the matrix
+ * @param nnz_per_row  the output nonzeros per row
+ */
+template <int subwarp_size = config::warp_size, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_coo_row_nnz(
+    const size_type nnz, const size_type num_lines,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ row,
+    IndexType *__restrict__ nnz_per_row)
+{
+    IndexType temp_val = 0;
+    const auto start = static_cast<size_type>(blockDim.x) * blockIdx.x *
+                           blockDim.y * num_lines +
+                       threadIdx.y * blockDim.x * num_lines;
+    size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size);
+    num = min(num, num_lines);  // number of rounds of this subwarp
+    const IndexType ind_start = start + threadIdx.x;
+    const IndexType ind_end = ind_start + (num - 1) * subwarp_size;
+    IndexType ind = ind_start;
+    IndexType curr_row = (ind < nnz) ? row[ind] : 0;
+    const auto tile_block =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    for (; ind < ind_end; ind += subwarp_size) {
+        temp_val += ind < nnz && val[ind] != zero<ValueType>();
+        auto next_row =
+            (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1];
+        // segmented scan, only done once some lane moves on to another row
+        if (tile_block.any(curr_row != next_row)) {
+            bool is_first_in_segment =
+                segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
+            if (is_first_in_segment) {
+                atomic_add(&(nnz_per_row[curr_row]), temp_val);
+            }
+            temp_val = 0;
+        }
+        curr_row = next_row;
+    }
+    if (num > 0) {
+        ind = ind_end;
+        temp_val += ind < nnz && val[ind] != zero<ValueType>();
+        // final segmented scan for the counts that are still pending
+
+        bool is_first_in_segment =
+            segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
+        if (is_first_in_segment) {
+            atomic_add(&(nnz_per_row[curr_row]), temp_val);
+        }
+    }
+}
+
+
+// One thread per row: write the row's nonzero ELL entries, followed by its
+// nonzero COO entries, to the CSR arrays starting at result_row_ptrs[row].
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_csr(
+    size_type num_rows, size_type max_nnz_per_row, size_type stride,
+    const ValueType *__restrict__ ell_val,
+    const IndexType *__restrict__ ell_col,
+    const ValueType *__restrict__ coo_val,
+    const IndexType *__restrict__ coo_col,
+    const IndexType *__restrict__ coo_offset,
+    IndexType *__restrict__ result_row_ptrs,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+    auto csr_pos = result_row_ptrs[row];  // next free output slot of this row
+    for (auto k = 0; k < max_nnz_per_row; k++) {
+        const auto ell_pos = row + stride * k;  // k-th ELL entry of the row
+        if (ell_val[ell_pos] != zero<ValueType>()) {
+            result_values[csr_pos] = ell_val[ell_pos];
+            result_col_idxs[csr_pos++] = ell_col[ell_pos];
+        }
+    }
+    for (auto k = coo_offset[row]; k < coo_offset[row + 1]; k++) {
+        if (coo_val[k] != zero<ValueType>()) {
+            result_values[csr_pos] = coo_val[k];
+            result_col_idxs[csr_pos++] = coo_col[k];
+        }
+    }
+}
+
+
+template <typename ValueType1, typename ValueType2>
+__global__ __launch_bounds__(default_block_size) void add(
+    size_type num, ValueType1 *__restrict__ val1,
+    const ValueType2 *__restrict__ val2)
+{
+    const auto entry = thread::get_thread_id_flat();  // one entry per thread
+    if (entry < num) {
+        val1[entry] += val2[entry];  // accumulate val2 into val1 in place
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/matrix/sellp_kernels.hpp.inc b/common/matrix/sellp_kernels.hpp.inc
new file mode 100644
index 00000000000..d1a0bee9d12
--- /dev/null
+++ b/common/matrix/sellp_kernels.hpp.inc
@@ -0,0 +1,199 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace {
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(matrix::default_slice_size) void spmv_kernel(
+    size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
+    size_type c_stride, const size_type *__restrict__ slice_lengths,
+    const size_type *__restrict__ slice_sets, const ValueType *__restrict__ a,
+    const IndexType *__restrict__ col, const ValueType *__restrict__ b,
+    ValueType *__restrict__ c)
+{
+    const auto slice_id = blockIdx.x;  // one thread block per slice
+    const auto slice_size = blockDim.x;
+    const auto row_in_slice = threadIdx.x;  // one thread per row of the slice
+    const auto global_row =
+        static_cast<size_type>(slice_size) * slice_id + row_in_slice;
+    const auto column_id = blockIdx.y;  // right-hand side column
+    ValueType val = 0;  // accumulates (row of a) * (column of b)
+    size_type ind = 0;  // flat storage index; may not fit into IndexType
+    if (global_row < num_rows && column_id < num_right_hand_sides) {
+        for (size_type i = 0; i < slice_lengths[slice_id]; i++) {
+            ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size;
+            val += a[ind] * b[col[ind] * b_stride + column_id];
+        }
+        c[global_row * c_stride + column_id] = val;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(matrix::default_slice_size) void advanced_spmv_kernel(
+        size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
+        size_type c_stride, const size_type *__restrict__ slice_lengths,
+        const size_type *__restrict__ slice_sets,
+        const ValueType *__restrict__ alpha, const ValueType *__restrict__ a,
+        const IndexType *__restrict__ col, const ValueType *__restrict__ b,
+        const ValueType *__restrict__ beta, ValueType *__restrict__ c)
+{
+    const auto slice_id = blockIdx.x;  // one thread block per slice
+    const auto slice_size = blockDim.x;
+    const auto row_in_slice = threadIdx.x;  // one thread per row of the slice
+    const auto global_row =
+        static_cast<size_type>(slice_size) * slice_id + row_in_slice;
+    const auto column_id = blockIdx.y;  // right-hand side column
+    ValueType val = 0;  // accumulates alpha * (row of a) * (column of b)
+    size_type ind = 0;  // flat storage index; may not fit into IndexType
+    if (global_row < num_rows && column_id < num_right_hand_sides) {
+        for (size_type i = 0; i < slice_lengths[slice_id]; i++) {
+            ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size;
+            val += alpha[0] * a[ind] * b[col[ind] * b_stride + column_id];
+        }
+        c[global_row * c_stride + column_id] =
+            beta[0] * c[global_row * c_stride + column_id] + val;
+    }
+}
+
+
+}  // namespace
+
+
+namespace kernel {
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto col = blockIdx.x * blockDim.x + threadIdx.x;  // x -> columns
+    const auto row = blockIdx.y * blockDim.y + threadIdx.y;  // y -> rows
+    if (row < num_rows && col < num_cols) {
+        result[row * stride + col] = zero<ValueType>();
+    }
+}
+
+
+// Scatters the nonzero entries of each SELL-P row into the dense result.
+template <unsigned int threads_per_row, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type slice_size, const size_type *__restrict__ slice_lengths,
+    const size_type *__restrict__ slice_sets,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, ValueType *__restrict__ result)
+{
+    const auto global_row = thread::get_subwarp_id_flat<threads_per_row>();
+    if (global_row >= num_rows) {
+        return;
+    }
+    const auto slice = global_row / slice_size;
+    const auto row_in_slice = global_row % slice_size;
+    const auto lane = threadIdx.x % threads_per_row;  // first entry to visit
+    for (auto k = lane; k < slice_lengths[slice]; k += threads_per_row) {
+        const auto sellp_ind =
+            (slice_sets[slice] + k) * slice_size + row_in_slice;
+        if (values[sellp_ind] != zero<ValueType>()) {
+            result[global_row * stride + col_idxs[sellp_ind]] =
+                values[sellp_ind];
+        }
+    }
+}
+
+
+// One warp per row: every lane counts the nonzeros in its strided share of
+// the row's stored entries, the warp-wide sum is written to result[row_idx].
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
+    size_type num_rows, size_type slice_size,
+    const size_type *__restrict__ slice_sets,
+    const ValueType *__restrict__ values, IndexType *__restrict__ result)
+{
+    constexpr auto warp_size = config::warp_size;
+    const auto row_idx = thread::get_subwarp_id_flat<warp_size>();
+    auto warp_tile =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+    if (row_idx >= num_rows) {
+        return;
+    }
+    const auto slice_id = row_idx / slice_size;
+    const auto local_row = row_idx % slice_size;
+    const auto lane = warp_tile.thread_rank();
+    const auto first = (slice_sets[slice_id] + lane) * slice_size + local_row;
+    const auto row_end = slice_sets[slice_id + 1] * slice_size;
+    IndexType lane_count{};
+    for (size_type pos = first; pos < row_end; pos += warp_size * slice_size) {
+        if (values[pos] != zero<ValueType>()) {
+            lane_count += 1;
+        }
+    }
+    result[row_idx] = reduce(
+        warp_tile, lane_count,
+        [](const size_type &a, const size_type &b) { return a + b; });
+}
+
+
+// One thread per row: walks the row's SELL-P entries and copies the nonzero
+// ones to the CSR arrays, starting at the offset result_row_ptrs[row].
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_csr(
+    size_type num_rows, size_type slice_size,
+    const size_type *__restrict__ source_slice_sets,
+    const IndexType *__restrict__ source_col_idxs,
+    const ValueType *__restrict__ source_values,
+    IndexType *__restrict__ result_row_ptrs,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    const auto row = thread::get_thread_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+    const auto slice_id = row / slice_size;
+    const auto local_row = row % slice_size;
+    const auto first = source_slice_sets[slice_id] * slice_size + local_row;
+    const auto end = source_slice_sets[slice_id + 1] * slice_size;
+    size_type csr_pos = result_row_ptrs[row];
+    for (size_type pos = first; pos < end; pos += slice_size) {
+        if (source_values[pos] != zero<ValueType>()) {
+            result_values[csr_pos] = source_values[pos];
+            result_col_idxs[csr_pos++] = source_col_idxs[pos];
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/preconditioner/isai_kernels.hpp.inc b/common/preconditioner/isai_kernels.hpp.inc
new file mode 100644
index 00000000000..9eec6afaa04
--- /dev/null
+++ b/common/preconditioner/isai_kernels.hpp.inc
@@ -0,0 +1,336 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+/**
+ * @internal
+ *
+ * Rows with at most `subwarp_size` entries are handled directly: their dense
+ * system is assembled in shared memory and solved with `trs_solve`.
+ * Longer rows are deferred: only their size and their number of matches are
+ * stored in `excess_rhs_sizes` and `excess_nnz`; `i_values` is not written.
+ */
+template <int subwarp_size, int subwarps_per_block, typename ValueType,
+          typename IndexType, typename Callable>
+__forceinline__ __device__ void generic_generate(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
+    IndexType *__restrict__ excess_rhs_sizes,
+    IndexType *__restrict__ excess_nnz, Callable trs_solve)
+{
+    static_assert(subwarp_size >= row_size_limit, "incompatible subwarp_size");
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto i_row_begin = i_row_ptrs[row];
+    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;
+
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const int local_id = subwarp.thread_rank();
+
+    if (i_row_size > subwarp_size) {
+        // defer long rows: store their nnz and number of matches
+        IndexType count{};
+        for (IndexType nz = 0; nz < i_row_size; ++nz) {
+            auto col = i_col_idxs[i_row_begin + nz];
+            auto m_row_begin = m_row_ptrs[col];
+            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+            // extract the sparse submatrix consisting of the entries whose
+            // columns/rows match column indices from this row
+            group_match<subwarp_size>(
+                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+                i_row_size, subwarp,
+                [&](IndexType, IndexType, IndexType,
+                    config::lane_mask_type matchmask,
+                    bool) { count += popcnt(matchmask); });
+        }
+        // store the dim and nnz of this sparse block
+        if (local_id == 0) {
+            excess_rhs_sizes[row] = i_row_size;
+            excess_nnz[row] = count;
+        }
+    } else {
+        // handle short rows directly: no excess
+        if (local_id == 0) {
+            excess_rhs_sizes[row] = 0;
+            excess_nnz[row] = 0;
+        }
+
+        // subwarp_size^2 storage per subwarp
+        __shared__ UninitializedArray<ValueType, subwarp_size * subwarp_size *
+                                                     subwarps_per_block>
+            storage;
+
+        auto trisystem_ptr = storage + (threadIdx.x / subwarp_size) *
+                                           subwarp_size * subwarp_size;
+        // row-major accessor
+        auto trisystem = [&](IndexType row, IndexType col) -> ValueType & {
+            return trisystem_ptr[row * subwarp_size + col];
+        };
+
+#pragma unroll
+        for (int i = 0; i < subwarp_size; ++i) {
+            trisystem(i, local_id) = zero<ValueType>();
+        }
+
+        subwarp.sync();
+
+        for (IndexType nz = 0; nz < i_row_size; ++nz) {
+            auto col = i_col_idxs[i_row_begin + nz];
+            auto m_row_begin = m_row_ptrs[col];
+            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+            // extract the dense submatrix consisting of the entries whose
+            // columns/rows match column indices from this row
+            group_match<subwarp_size>(
+                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+                i_row_size, subwarp,
+                [&](IndexType, IndexType m_idx, IndexType i_idx,
+                    config::lane_mask_type, bool valid) {
+                    if (valid) {
+                        trisystem(nz, i_idx) = m_values[m_row_begin + m_idx];
+                    }
+                });
+        }
+
+        subwarp.sync();
+
+        // Now, read a full col of `trisystem` into local registers, which will
+        // be row elements after this (implicit) transpose
+        ValueType local_row[subwarp_size];
+#pragma unroll
+        for (int i = 0; i < subwarp_size; ++i) {
+            local_row[i] = trisystem(i, local_id);
+        }
+
+        const auto rhs = trs_solve(i_row_size, local_row, subwarp);
+
+        // Write back, replacing non-finite results by the identity entry
+        if (local_id < i_row_size) {
+            const auto idx = i_row_begin + local_id;
+            if (is_finite(rhs)) {
+                i_values[idx] = rhs;
+            } else {
+                i_values[idx] = i_col_idxs[idx] == row ? one<ValueType>()
+                                                       : zero<ValueType>();
+            }
+        }
+    }
+}
+
+
+template <int subwarp_size, int subwarps_per_block, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void generate_l_inverse(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
+    IndexType *__restrict__ excess_rhs_sizes,
+    IndexType *__restrict__ excess_nnz)
+{
+    auto trs_solve = [](IndexType num_elems,  // solver for generic_generate
+                        const ValueType *__restrict__ local_row,
+                        group::thread_block_tile<subwarp_size> &subwarp) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs =
+            local_id == num_elems - 1 ? one<ValueType>() : zero<ValueType>();
+        // Solve the triangular system, going from the last column to the first
+        for (int d_col = num_elems - 1; d_col >= 0; --d_col) {
+            const auto elem = local_row[d_col];
+            if (d_col == local_id) {
+                rhs /= elem;  // this lane's unknown is now final
+            }
+
+            const ValueType bot = subwarp.shfl(rhs, d_col);  // from lane d_col
+            if (local_id < d_col) {
+                rhs -= bot * elem;  // eliminate it from the remaining lanes
+            }
+        }
+
+        return rhs;
+    };
+
+    generic_generate<subwarp_size, subwarps_per_block>(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, trs_solve);
+}
+
+
+template <int subwarp_size, int subwarps_per_block, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void generate_u_inverse(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
+    IndexType *__restrict__ excess_rhs_sizes,
+    IndexType *__restrict__ excess_nnz)
+{
+    auto trs_solve = [](IndexType num_elems,  // solver for generic_generate
+                        const ValueType *__restrict__ local_row,
+                        group::thread_block_tile<subwarp_size> &subwarp) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs = local_id == 0 ? one<ValueType>() : zero<ValueType>();
+        // Solve the triangular system, going from the first column to the last
+        for (int d_col = 0; d_col < num_elems; ++d_col) {
+            const auto elem = local_row[d_col];
+            if (d_col == local_id) {
+                rhs /= elem;  // this lane's unknown is now final
+            }
+
+            const ValueType top = subwarp.shfl(rhs, d_col);  // from lane d_col
+            if (d_col < local_id) {
+                rhs -= top * elem;  // eliminate it from the remaining lanes
+            }
+        }
+
+        return rhs;
+    };
+
+    generic_generate<subwarp_size, subwarps_per_block>(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, trs_solve);
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void generate_excess_system(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs,
+    const IndexType *__restrict__ excess_rhs_ptrs,
+    const IndexType *__restrict__ excess_nz_ptrs,
+    IndexType *__restrict__ excess_row_ptrs,
+    IndexType *__restrict__ excess_col_idxs,
+    ValueType *__restrict__ excess_values, ValueType *__restrict__ excess_rhs)
+{
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto i_row_begin = i_row_ptrs[row];
+    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;
+
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const int local_id = subwarp.thread_rank();
+    const auto prefix_mask = (config::lane_mask_type{1} << local_id) - 1;
+
+    if (row == 0 && local_id == 0) {
+        excess_row_ptrs[0] = 0;  // leading row pointer of the excess system
+    }
+
+    if (i_row_size <= subwarp_size) {
+        return;  // short rows contribute nothing to the excess system
+    }
+
+    auto excess_rhs_begin = excess_rhs_ptrs[row];
+    auto excess_nz_begin = excess_nz_ptrs[row];
+
+    // long rows only: build their sparse excess system and right-hand side
+    for (IndexType nz = 0; nz < i_row_size; ++nz) {
+        auto col = i_col_idxs[i_row_begin + nz];
+        auto m_row_begin = m_row_ptrs[col];
+        auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+        // extract the sparse submatrix consisting of the entries whose
+        // columns/rows match column indices from this row
+        group_match<subwarp_size>(
+            m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+            i_row_size, subwarp,
+            [&](IndexType col, IndexType m_idx, IndexType i_idx,
+                config::lane_mask_type mask, bool valid) {
+                // sparse counterpart of the dense assignment
+                // trisystem(nz, i_idx) = m_values[m_row_begin + m_idx]
+                if (valid) {
+                    auto nz = excess_nz_begin + popcnt(mask & prefix_mask);
+                    excess_col_idxs[nz] = excess_rhs_begin + i_idx;
+                    excess_values[nz] = m_values[m_row_begin + m_idx];
+                }
+                excess_nz_begin += popcnt(mask);
+            });
+        if (local_id == 0) {
+            // build right-hand side: 1 for diagonal entry, 0 else
+            excess_rhs[excess_rhs_begin + nz] =
+                row == col ? one<ValueType>() : zero<ValueType>();
+            // store row pointers
+            excess_row_ptrs[excess_rhs_begin + nz + 1] = excess_nz_begin;
+        }
+    }
+}
+
+
+// Each subwarp copies the solution entries of its row (if the row was handled
+// separately through the excess system) from excess_solution into i_values.
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void copy_excess_solution(
+    IndexType num_rows, const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ excess_rhs_ptrs,
+    const ValueType *__restrict__ excess_solution,
+    ValueType *__restrict__ i_values)
+{
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    // position of this thread within its subwarp
+    auto lane = threadIdx.x % subwarp_size;
+    // target offset in i_values and source range in excess_solution
+    const auto dst_begin = i_row_ptrs[row];
+    const auto src_begin = excess_rhs_ptrs[row];
+    const auto src_size = excess_rhs_ptrs[row + 1] - src_begin;
+
+    // src_size is only positive for rows that were handled separately, for
+    // all other rows the loop body is never entered
+    for (IndexType k = lane; k < src_size; k += subwarp_size) {
+        i_values[k + dst_begin] = excess_solution[k + src_begin];
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc b/common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc
new file mode 100644
index 00000000000..2426728d402
--- /dev/null
+++ b/common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc
@@ -0,0 +1,109 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+// One subwarp per block: adds the multiply_vec result for alpha * b to x.
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block *config::warp_size)
+    advanced_apply(const ValueType *__restrict__ blocks,
+                   preconditioner::block_interleaved_storage_scheme<IndexType>
+                       storage_scheme,
+                   const IndexType *__restrict__ block_ptrs,
+                   size_type num_blocks, const ValueType *__restrict__ alpha,
+                   const ValueType *__restrict__ b, int32 b_stride,
+                   ValueType *__restrict__ x, int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_begin = block_ptrs[block_id];
+    const auto block_size = block_ptrs[block_id + 1] - block_begin;
+    const auto lane = subwarp.thread_rank();
+    ValueType v = zero<ValueType>();  // lanes past the block keep a zero
+    if (lane < block_size) {
+        v = alpha[0] * b[(block_begin + lane) * b_stride];
+    }
+    multiply_vec<max_block_size>(
+        subwarp, block_size, v,
+        blocks + storage_scheme.get_global_block_offset(block_id) + lane,
+        storage_scheme.get_stride(), x + block_begin * x_stride, x_stride,
+        [](ValueType &result, const ValueType &out) { result += out; });
+}
+
+
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void
+__launch_bounds__(warps_per_block *config::warp_size) advanced_adaptive_apply(
+    const ValueType *__restrict__ blocks,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    const precision_reduction *__restrict__ block_precisions,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ b,
+    int32 b_stride, ValueType *__restrict__ x, int32 x_stride)
+{
+    const auto block_id =  // one subwarp per block
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    auto alpha_val = alpha == nullptr ? one<ValueType>() : alpha[0];
+    ValueType v = zero<ValueType>();  // lanes past the block keep a zero
+    if (subwarp.thread_rank() < block_size) {
+        v = alpha_val *  // a null alpha acts as a factor of one
+            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[block_id],
+        multiply_vec<max_block_size>(
+            subwarp, block_size, v,
+            reinterpret_cast<const resolved_precision *>(
+                blocks + storage_scheme.get_group_offset(block_id)) +
+                storage_scheme.get_block_offset(block_id) +
+                subwarp.thread_rank(),
+            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+            x_stride,
+            [](ValueType &result, const ValueType &out) { result += out; }));
+}
+
+
+}  // namespace kernel
diff --git a/common/preconditioner/jacobi_generate_kernel.hpp.inc b/common/preconditioner/jacobi_generate_kernel.hpp.inc
new file mode 100644
index 00000000000..da8fe668aa0
--- /dev/null
+++ b/common/preconditioner/jacobi_generate_kernel.hpp.inc
@@ -0,0 +1,208 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+// Tests whether the block in `row` tolerates storage in ReducedType: rounds
+// it, checks the conditioning of the rounded block, then restores `row`.
+template <int max_block_size, typename ReducedType, typename Group,
+          typename ValueType, typename IndexType>
+__device__ __forceinline__ bool validate_precision_reduction_feasibility(
+    Group &__restrict__ group, IndexType block_size,
+    ValueType *__restrict__ row, ValueType *__restrict__ work, size_type stride)
+{
+    using gko::detail::float_traits;
+    const auto lane = group.thread_rank();
+    // back up the row in `work` and round it to the reduced precision
+    if (lane < block_size) {
+#pragma unroll
+        for (auto i = 0u; i < max_block_size; ++i) {
+            if (i < block_size) {
+                work[i * stride + lane] = row[i];
+                row[i] =
+                    static_cast<ValueType>(static_cast<ReducedType>(row[i]));
+            }
+        }
+    }
+    // condition estimate: norm of the block times norm of its inverse
+    auto perm = lane;
+    auto trans_perm = perm;
+    auto cond = compute_infinity_norm<max_block_size>(group, block_size,
+                                                      block_size, row);
+    const auto inverted =
+        invert_block<max_block_size>(group, block_size, row, perm, trans_perm);
+    cond *= compute_infinity_norm<max_block_size>(group, block_size,
+                                                  block_size, row);
+    // bring back the unrounded values saved in `work`
+    if (lane < block_size) {
+#pragma unroll
+        for (auto i = 0u; i < max_block_size; ++i) {
+            if (i < block_size) {
+                row[i] = work[i * stride + lane];
+            }
+        }
+    }
+    return inverted && cond >= 1.0 &&
+           cond * float_traits<remove_complex<ValueType>>::eps < 1e-3;
+}
+
+
+// Inverts each diagonal block of the CSR matrix and stores it in block_data.
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block *config::warp_size) generate(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, ValueType *__restrict__ block_data,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks)
+{
+    const auto block = group::this_thread_block();
+    const auto blk = thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    ValueType row[max_block_size];
+    __shared__ UninitializedArray<ValueType, max_block_size * warps_per_block>
+        workspace;
+    csr::extract_transposed_diag_blocks<max_block_size, warps_per_block>(
+        block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values,
+        block_ptrs, num_blocks, row, 1,
+        workspace + threadIdx.z * max_block_size);
+    const auto subwarp = group::tiled_partition<subwarp_size>(block);
+    if (blk >= num_blocks) {  // no block left for this subwarp
+        return;
+    }
+    const auto block_size = block_ptrs[blk + 1] - block_ptrs[blk];
+    auto perm = subwarp.thread_rank();
+    auto trans_perm = subwarp.thread_rank();
+    invert_block<max_block_size>(subwarp, block_size, row, perm, trans_perm);
+    copy_matrix<max_block_size, and_transpose>(
+        subwarp, block_size, row, 1, perm, trans_perm,
+        block_data + storage_scheme.get_global_block_offset(blk),
+        storage_scheme.get_stride());
+}
+
+
+// Adaptive variant of generate(): inverts each diagonal block, stores its
+// condition estimate in conditioning[] and its precision in block_precisions.
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void
+__launch_bounds__(warps_per_block *config::warp_size) adaptive_generate(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, remove_complex<ValueType> accuracy,
+    ValueType *__restrict__ block_data,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    remove_complex<ValueType> *__restrict__ conditioning,
+    precision_reduction *__restrict__ block_precisions,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks)
+{
+    // extract the diagonal blocks; this call is made by every thread
+    const auto block_id =
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto block = group::this_thread_block();
+    ValueType row[max_block_size];
+    __shared__ UninitializedArray<ValueType, max_block_size * warps_per_block>
+        workspace;
+    csr::extract_transposed_diag_blocks<max_block_size, warps_per_block>(
+        block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values,
+        block_ptrs, num_blocks, row, 1,
+        workspace + threadIdx.z * max_block_size);
+    // compute inverse and figure out the correct precision
+    const auto subwarp = group::tiled_partition<subwarp_size>(block);
+    const auto block_size =
+        block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id]
+                              : 0;
+    auto perm = subwarp.thread_rank();
+    auto trans_perm = subwarp.thread_rank();
+    auto prec_descriptor = ~uint32{};  // all bits set: neutral for the AND
+    if (block_id < num_blocks) {
+        auto block_cond = compute_infinity_norm<max_block_size>(
+            subwarp, block_size, block_size, row);
+        invert_block<max_block_size>(subwarp, block_size, row, perm,
+                                     trans_perm);
+        block_cond *= compute_infinity_norm<max_block_size>(subwarp, block_size,
+                                                            block_size, row);
+        conditioning[block_id] = block_cond;
+        const auto prec = block_precisions[block_id];
+        prec_descriptor =
+            preconditioner::detail::precision_reduction_descriptor::singleton(
+                prec);
+        if (prec == precision_reduction::autodetect()) {
+            using preconditioner::detail::get_supported_storage_reductions;
+            prec_descriptor = get_supported_storage_reductions<ValueType>(
+                accuracy, block_cond,
+                [&subwarp, &block_size, &row, &block_data, &storage_scheme,
+                 &block_id] {
+                    using target = reduce_precision<ValueType>;
+                    return validate_precision_reduction_feasibility<
+                        max_block_size, target>(
+                        subwarp, block_size, row,
+                        block_data +
+                            storage_scheme.get_global_block_offset(block_id),
+                        storage_scheme.get_stride());
+                },
+                [&subwarp, &block_size, &row, &block_data, &storage_scheme,
+                 &block_id] {
+                    using target =
+                        reduce_precision<reduce_precision<ValueType>>;
+                    return validate_precision_reduction_feasibility<
+                        max_block_size, target>(
+                        subwarp, block_size, row,
+                        block_data +
+                            storage_scheme.get_global_block_offset(block_id),
+                        storage_scheme.get_stride());
+                });
+        }
+    }
+    // make sure all blocks in the group have the same precision: the bitwise
+    // AND over the warp intersects the descriptors of all its blocks
+    const auto warp = group::tiled_partition<config::warp_size>(block);
+    const auto prec =
+        preconditioner::detail::get_optimal_storage_reduction(reduce(
+            warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; }));
+    // store the block back into memory, using the precision chosen above
+    if (block_id < num_blocks) {
+        block_precisions[block_id] = prec;
+        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+            ValueType, prec,
+            copy_matrix<max_block_size, and_transpose>(
+                subwarp, block_size, row, 1, perm, trans_perm,
+                reinterpret_cast<resolved_precision *>(
+                    block_data + storage_scheme.get_group_offset(block_id)) +
+                    storage_scheme.get_block_offset(block_id),
+                storage_scheme.get_stride()));
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/preconditioner/jacobi_kernels.hpp.inc b/common/preconditioner/jacobi_kernels.hpp.inc
new file mode 100644
index 00000000000..d480a0a154a
--- /dev/null
+++ b/common/preconditioner/jacobi_kernels.hpp.inc
@@ -0,0 +1,215 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// Fills dest with a cyclic copy of source, dest[i] = source[i % source_size];
+// each thread handles indices rank, rank + grid size, ... below dest_size.
+template <int warps_per_block>
+__global__
+__launch_bounds__(warps_per_block *config::warp_size) void duplicate_array(
+    const precision_reduction *__restrict__ source, size_type source_size,
+    precision_reduction *__restrict__ dest, size_type dest_size)
+{
+    auto grid = group::this_grid();
+    // threads whose rank is not below dest_size skip the loop entirely
+    for (auto pos = grid.thread_rank(); pos < dest_size; pos += grid.size()) {
+        dest[pos] = source[pos % source_size];
+    }
+}
+
+
+// One warp per row r < num_rows - 1: sets matching_next_row[r] to whether rows
+// r and r + 1 have the same number of nonzeros and the same column indices.
+// max_block_size is not used here.
+template <typename IndexType>
+__global__ void compare_adjacent_rows(size_type num_rows, int32 max_block_size,
+                                      const IndexType *__restrict__ row_ptrs,
+                                      const IndexType *__restrict__ col_idx,
+                                      bool *__restrict__ matching_next_row)
+{
+    const auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    const auto local_tid = warp.thread_rank();
+    const auto warp_id = thread::get_subwarp_id_flat<config::warp_size>();
+    // `warp_id + 1 >= num_rows` instead of `warp_id >= num_rows - 1`: the
+    // unsigned num_rows - 1 wraps around for num_rows == 0
+    if (warp_id + 1 >= num_rows) {
+        return;
+    }
+    const auto curr_row_start = row_ptrs[warp_id];
+    const auto next_row_start = row_ptrs[warp_id + 1];
+    const auto next_row_end = row_ptrs[warp_id + 2];
+    const auto nz_this_row = next_row_end - next_row_start;
+    const auto nz_prev_row = next_row_start - curr_row_start;
+    if (nz_this_row != nz_prev_row) {
+        matching_next_row[warp_id] = false;
+        return;
+    }
+    // all lanes run every step so that the whole warp reaches warp.any();
+    // lanes beyond the end of the row compare 0 against 0
+    size_type steps = ceildiv(nz_this_row, config::warp_size);
+    for (size_type i = 0; i < steps; i++) {
+        auto j = local_tid + i * config::warp_size;
+        const bool in_row = curr_row_start + j < next_row_start;
+        auto prev_col = in_row ? col_idx[curr_row_start + j] : 0;
+        auto this_col = in_row ? col_idx[next_row_start + j] : 0;
+        if (warp.any(prev_col != this_col)) {
+            matching_next_row[warp_id] = false;
+            return;
+        }
+    }
+    matching_next_row[warp_id] = true;
+}
+
+
+// Builds natural block pointers: row i + 1 joins the block of row i while
+// matching_next_row[i] is set and the block is below max_block_size. Writes
+// the block count to num_blocks_arr[0], now also (as 0) when num_rows == 0.
+template <typename IndexType>
+__global__ void generate_natural_block_pointer(
+    size_type num_rows, int32 max_block_size,
+    const bool *__restrict__ matching_next_row,
+    IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr)
+{
+    block_ptrs[0] = 0;
+    size_type num_blocks = 0;
+    int32 cur_size = 1;
+    for (size_type i = 0; i < num_rows; ++i) {
+        // the final row has no successor, so it always closes its block
+        const bool last_row = i + 1 == num_rows;
+        if (!last_row && matching_next_row[i] && cur_size < max_block_size) {
+            ++cur_size;
+        } else {
+            block_ptrs[num_blocks + 1] = block_ptrs[num_blocks] + cur_size;
+            ++num_blocks;
+            cur_size = 1;
+        }
+    }
+    num_blocks_arr[0] = num_blocks;
+}
+
+
+// Greedily merges consecutive natural blocks in place while the merged size
+// stays within max_block_size; the resulting count goes to num_blocks_arr[0].
+template <typename IndexType>
+__global__ void agglomerate_supervariables_kernel(
+    int32 max_block_size, size_type num_natural_blocks,
+    IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr)
+{
+    num_blocks_arr[0] = 0;
+    if (num_natural_blocks == 0) {
+        return;
+    }
+    size_type merged = 1;
+    int32 merged_size = block_ptrs[1] - block_ptrs[0];
+    for (size_type i = 1; i < num_natural_blocks; ++i) {
+        const int32 next_size = block_ptrs[i + 1] - block_ptrs[i];
+        if (merged_size + next_size > max_block_size) {
+            block_ptrs[merged++] = block_ptrs[i];
+            merged_size = 0;
+        }
+        merged_size += next_size;
+    }
+    block_ptrs[merged] = block_ptrs[num_natural_blocks];
+    num_blocks_arr[0] = merged;
+}
+
+
+// Transposes (and conjugates, if `conjugate` is set) every Jacobi block:
+// entry i * stride + lane of a block is written to i + lane * stride.
+template <bool conjugate, int max_block_size, int subwarp_size,
+          int warps_per_block, typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block *config::warp_size)
+    transpose_jacobi(const ValueType *__restrict__ blocks,
+                     preconditioner::block_interleaved_storage_scheme<IndexType>
+                         storage_scheme,
+                     const IndexType *__restrict__ block_ptrs,
+                     size_type num_blocks, ValueType *__restrict__ out_blocks)
+{
+    const auto tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto blk = thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    if (blk >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[blk + 1] - block_ptrs[blk];
+    const auto lane = tile.thread_rank();
+    if (lane >= block_size) {
+        return;
+    }
+    const auto base = storage_scheme.get_global_block_offset(blk);
+    const auto ld = storage_scheme.get_stride();
+    for (IndexType i = 0; i < block_size; ++i) {
+        auto val = blocks[base + i * ld + lane];
+        out_blocks[base + i + lane * ld] = conjugate ? conj(val) : val;
+    }
+}
+
+
+// Adaptive variant of transpose_jacobi(): both block arrays are accessed in
+// the storage precision selected by the block's entry in block_precisions.
+template <bool conjugate, int max_block_size, int subwarp_size,
+          int warps_per_block, typename ValueType, typename IndexType>
+__global__ void
+__launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
+    const ValueType *__restrict__ blocks,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    const precision_reduction *__restrict__ block_precisions,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks,
+    ValueType *__restrict__ out_blocks)
+{
+    const auto tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto blk = thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    if (blk >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[blk + 1] - block_ptrs[blk];
+    const auto lane = tile.thread_rank();
+    if (lane >= block_size) {
+        return;
+    }
+    const auto ld = storage_scheme.get_stride();
+    const auto group_ofs = storage_scheme.get_group_offset(blk);
+    const auto block_ofs = storage_scheme.get_block_offset(blk);
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[blk],
+        const auto src = reinterpret_cast<const resolved_precision *>(
+                             blocks + group_ofs) +
+                         block_ofs;
+        auto dst =
+            reinterpret_cast<resolved_precision *>(out_blocks + group_ofs) +
+            block_ofs;
+        for (IndexType i = 0; i < block_size; ++i) {
+            auto val = src[i * ld + lane];
+            dst[i + lane * ld] = conjugate ? conj(val) : val;
+        });
+}
diff --git a/common/preconditioner/jacobi_simple_apply_kernel.hpp.inc b/common/preconditioner/jacobi_simple_apply_kernel.hpp.inc
new file mode 100644
index 00000000000..c7a472bd409
--- /dev/null
+++ b/common/preconditioner/jacobi_simple_apply_kernel.hpp.inc
@@ -0,0 +1,104 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+// Applies the Jacobi blocks to b: each subwarp multiplies its block with the
+// matching rows of b and overwrites the corresponding rows of x.
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block *config::warp_size) apply(
+    const ValueType *__restrict__ blocks,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks,
+    const ValueType *__restrict__ b, int32 b_stride, ValueType *__restrict__ x,
+    int32 x_stride)
+{
+    const auto tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto blk = thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    if (blk >= num_blocks) {
+        return;
+    }
+    const auto lane = tile.thread_rank();
+    const auto start = block_ptrs[blk];
+    const auto block_size = block_ptrs[blk + 1] - start;
+    // lanes beyond the block contribute a zero entry
+    ValueType v =
+        lane < block_size ? b[(start + lane) * b_stride] : zero<ValueType>();
+    multiply_vec<max_block_size>(
+        tile, block_size, v,
+        blocks + storage_scheme.get_global_block_offset(blk) + lane,
+        storage_scheme.get_stride(), x + start * x_stride, x_stride,
+        [](ValueType &result, const ValueType &out) { result = out; });
+}
+
+
+// Adaptive variant of apply(): the block is read in the storage precision
+// selected by its entry in block_precisions; x is overwritten.
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block *config::warp_size)
+    adaptive_apply(const ValueType *__restrict__ blocks,
+                   preconditioner::block_interleaved_storage_scheme<IndexType>
+                       storage_scheme,
+                   const precision_reduction *__restrict__ block_precisions,
+                   const IndexType *__restrict__ block_ptrs,
+                   size_type num_blocks, const ValueType *__restrict__ b,
+                   int32 b_stride, ValueType *__restrict__ x, int32 x_stride)
+{
+    const auto tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto blk = thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    if (blk >= num_blocks) {
+        return;
+    }
+    const auto lane = tile.thread_rank();
+    const auto start = block_ptrs[blk];
+    const auto block_size = block_ptrs[blk + 1] - start;
+    // lanes beyond the block contribute a zero entry
+    ValueType v =
+        lane < block_size ? b[(start + lane) * b_stride] : zero<ValueType>();
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[blk],
+        multiply_vec<max_block_size>(
+            tile, block_size, v,
+            reinterpret_cast<const resolved_precision *>(
+                blocks + storage_scheme.get_group_offset(blk)) +
+                storage_scheme.get_block_offset(blk) + lane,
+            storage_scheme.get_stride(), x + start * x_stride, x_stride,
+            [](ValueType &result, const ValueType &out) { result = out; }));
+}
+
+
+}  // namespace kernel
diff --git a/common/solver/bicg_kernels.hpp.inc b/common/solver/bicg_kernels.hpp.inc
new file mode 100644
index 00000000000..fdb8ee8f3f9
--- /dev/null
+++ b/common/solver/bicg_kernels.hpp.inc
@@ -0,0 +1,111 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// Sets up the BiCG state: per column rho = 0, prev_rho = 1 and a reset
+// stopping status; per entry r = r2 = b and z, p, q, z2, p2, q2 = 0.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ b, ValueType *__restrict__ r,
+    ValueType *__restrict__ z, ValueType *__restrict__ p,
+    ValueType *__restrict__ q, ValueType *__restrict__ r2,
+    ValueType *__restrict__ z2, ValueType *__restrict__ p2,
+    ValueType *__restrict__ q2, ValueType *__restrict__ prev_rho,
+    ValueType *__restrict__ rho, stopping_status *__restrict__ stop_status)
+{
+    const auto idx = thread::get_thread_id_flat();
+    // the first num_cols threads also initialize the per-column data
+    if (idx < num_cols) {
+        stop_status[idx].reset();
+        prev_rho[idx] = one<ValueType>();
+        rho[idx] = zero<ValueType>();
+    }
+    if (idx >= num_rows * stride) {
+        return;
+    }
+    const auto rhs = b[idx];
+    r[idx] = rhs;
+    r2[idx] = rhs;
+    z[idx] = z2[idx] = zero<ValueType>();
+    p[idx] = p2[idx] = zero<ValueType>();
+    q[idx] = q2[idx] = zero<ValueType>();
+}
+
+
+// BiCG step 1: updates p = z + tmp * p and p2 = z2 + tmp * p2 per entry, with
+// tmp = rho / prev_rho; a zero prev_rho resets them to p = z and p2 = z2.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ p, const ValueType *__restrict__ z,
+    ValueType *__restrict__ p2, const ValueType *__restrict__ z2,
+    const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto idx = thread::get_thread_id_flat();
+    const auto column = idx % stride;
+    const bool in_range = column < num_cols && idx < num_rows * stride;
+    if (!in_range || stop_status[column].has_stopped()) {
+        return;
+    }
+    const auto old_rho = prev_rho[column];
+    const auto scale = rho[column] / old_rho;
+    const bool restart = old_rho == zero<ValueType>();
+    p[idx] = restart ? z[idx] : z[idx] + scale * p[idx];
+    p2[idx] = restart ? z2[idx] : z2[idx] + scale * p2[idx];
+}
+
+
+// BiCG step 2: for each entry of a column that has not stopped, applies
+// x += (rho / beta) * p, r -= (rho / beta) * q and r2 -= (rho / beta) * q2.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
+    ValueType *__restrict__ r2, const ValueType *__restrict__ p,
+    const ValueType *__restrict__ q, const ValueType *__restrict__ q2,
+    const ValueType *__restrict__ beta, const ValueType *__restrict__ rho,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / stride;
+    const auto col = tidx % stride;
+    // tidx walks the stride-padded layout, so the bound is num_rows * stride
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].has_stopped() || beta[col] == zero<ValueType>()) {
+        return;
+    }
+    const auto tmp = rho[col] / beta[col];
+    x[row * x_stride + col] += tmp * p[tidx];
+    r[tidx] -= tmp * q[tidx];
+    r2[tidx] -= tmp * q2[tidx];
+}
diff --git a/common/solver/bicgstab_kernels.hpp.inc b/common/solver/bicgstab_kernels.hpp.inc
new file mode 100644
index 00000000000..03071970fcc
--- /dev/null
+++ b/common/solver/bicgstab_kernels.hpp.inc
@@ -0,0 +1,168 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// BiCGSTAB initialization.
+//
+// The first `num_cols` threads set the per-column scalars `prev_rho`, `rho`,
+// `alpha`, `beta`, `gamma` and `omega` to one and reset the stopping status.
+// The first `num_rows * stride` threads copy `b` into `r` and zero the work
+// vectors `rr`, `y`, `s`, `t`, `z`, `v` and `p`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ b, ValueType *__restrict__ r,
+    ValueType *__restrict__ rr, ValueType *__restrict__ y,
+    ValueType *__restrict__ s, ValueType *__restrict__ t,
+    ValueType *__restrict__ z, ValueType *__restrict__ v,
+    ValueType *__restrict__ p, ValueType *__restrict__ prev_rho,
+    ValueType *__restrict__ rho, ValueType *__restrict__ alpha,
+    ValueType *__restrict__ beta, ValueType *__restrict__ gamma,
+    ValueType *__restrict__ omega, stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto unit = one<ValueType>();
+    const auto nil = zero<ValueType>();
+
+    // Per-column scalars and stopping status.
+    if (gid < num_cols) {
+        prev_rho[gid] = unit;
+        rho[gid] = unit;
+        alpha[gid] = unit;
+        beta[gid] = unit;
+        gamma[gid] = unit;
+        omega[gid] = unit;
+        stop_status[gid].reset();
+    }
+
+    // Per-entry vectors, including the padding up to `stride`.
+    if (gid < num_rows * stride) {
+        r[gid] = b[gid];
+        rr[gid] = nil;
+        y[gid] = nil;
+        s[gid] = nil;
+        t[gid] = nil;
+        z[gid] = nil;
+        v[gid] = nil;
+        p[gid] = nil;
+    }
+}
+
+
+// BiCGSTAB step 1: updates the search direction `p` in place.
+//
+// For every entry of an active column:
+//   p = r + (rho / prev_rho) * (alpha / omega) * (p - omega * v)
+// If `prev_rho * omega` is zero the correction is dropped and `p = r`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ r, ValueType *__restrict__ p,
+    const ValueType *__restrict__ v, const ValueType *__restrict__ rho,
+    const ValueType *__restrict__ prev_rho, const ValueType *__restrict__ alpha,
+    const ValueType *__restrict__ omega,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+
+    // Breakdown of the scalar recurrence: fall back to the plain residual.
+    if (prev_rho[col] * omega[col] == zero<ValueType>()) {
+        p[gid] = r[gid];
+        return;
+    }
+
+    const auto coeff = (rho[col] / prev_rho[col]) * (alpha[col] / omega[col]);
+    p[gid] = r[gid] + coeff * (p[gid] - omega[col] * v[gid]);
+}
+
+
+// BiCGSTAB step 2: computes `alpha = rho / beta` and `s = r - alpha * v`.
+//
+// When `beta` is zero for a column, `alpha` is set to zero and `s` becomes a
+// copy of `r`. Every thread of a column stores the same value into
+// `alpha[col]`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ r, ValueType *__restrict__ s,
+    const ValueType *__restrict__ v, const ValueType *__restrict__ rho,
+    ValueType *__restrict__ alpha, const ValueType *__restrict__ beta,
+    const stopping_status *__restrict__ stop_status)
+{
+    const size_type gid = thread::get_thread_id_flat();
+    const size_type col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+
+    const bool has_beta = beta[col] != zero<ValueType>();
+    const auto new_alpha =
+        has_beta ? rho[col] / beta[col] : zero<ValueType>();
+
+    alpha[col] = new_alpha;
+    s[gid] = has_beta ? r[gid] - new_alpha * v[gid] : r[gid];
+}
+
+
+// BiCGSTAB step 3: computes `omega` and updates the solution and residual.
+//
+// For every entry of an active column:
+//   omega = gamma / beta
+//   x    += alpha * y + omega * z
+//   r     = s - omega * t
+// When `beta` is zero, `omega` is set to zero, `x += alpha * y` and `r = s`.
+// `x` uses the row stride `x_stride`; all other vectors use `stride`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_3_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
+    const ValueType *__restrict__ s, const ValueType *__restrict__ t,
+    const ValueType *__restrict__ y, const ValueType *__restrict__ z,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ beta,
+    const ValueType *__restrict__ gamma, ValueType *__restrict__ omega,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto row = gid / stride;
+    const auto col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+
+    const auto x_idx = row * x_stride + col;
+    const auto half_step_x = x[x_idx] + alpha[col] * y[gid];
+
+    // Breakdown: only the `alpha` half-step is applied.
+    if (beta[col] == zero<ValueType>()) {
+        omega[col] = zero<ValueType>();
+        x[x_idx] = half_step_x;
+        r[gid] = s[gid];
+        return;
+    }
+
+    const auto new_omega = gamma[col] / beta[col];
+    omega[col] = new_omega;
+    x[x_idx] = half_step_x + new_omega * z[gid];
+    r[gid] = s[gid] - new_omega * t[gid];
+}
+
+
+// BiCGSTAB finalization: applies the pending half-step `x += alpha * y` to
+// every column that has stopped but has not been finalized yet, then marks
+// that column as finalized.
+//
+// NOTE(review): each thread of a column both tests and sets
+// `stop_status[col]`. Whether a thread can observe the flag already finalized
+// by another row of the same column is not visible from here - confirm.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void finalize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x,
+    const ValueType *__restrict__ y, const ValueType *__restrict__ alpha,
+    stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto row = gid / stride;
+    const auto col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range) {
+        return;
+    }
+
+    const bool pending =
+        !stop_status[col].is_finalized() && stop_status[col].has_stopped();
+    if (!pending) {
+        return;
+    }
+
+    const auto x_idx = row * x_stride + col;
+    x[x_idx] = x[x_idx] + alpha[col] * y[gid];
+    stop_status[col].finalize();
+}
diff --git a/common/solver/cg_kernels.hpp.inc b/common/solver/cg_kernels.hpp.inc
new file mode 100644
index 00000000000..d318c30f338
--- /dev/null
+++ b/common/solver/cg_kernels.hpp.inc
@@ -0,0 +1,98 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// CG initialization.
+//
+// The first `num_cols` threads set `rho` to zero, `prev_rho` to one and reset
+// the stopping status. The first `num_rows * stride` threads copy `b` into
+// `r` and zero `z`, `p` and `q`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ b, ValueType *__restrict__ r,
+    ValueType *__restrict__ z, ValueType *__restrict__ p,
+    ValueType *__restrict__ q, ValueType *__restrict__ prev_rho,
+    ValueType *__restrict__ rho, stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto nil = zero<ValueType>();
+
+    // Per-column scalars and stopping status.
+    if (gid < num_cols) {
+        rho[gid] = nil;
+        prev_rho[gid] = one<ValueType>();
+        stop_status[gid].reset();
+    }
+
+    // Per-entry vectors, including the padding up to `stride`.
+    if (gid < num_rows * stride) {
+        r[gid] = b[gid];
+        z[gid] = nil;
+        p[gid] = nil;
+        q[gid] = nil;
+    }
+}
+
+
+// CG step 1: updates the search direction in place,
+//   p = z + (rho / prev_rho) * p
+// for every entry of an active column. If `prev_rho` is zero, `p = z`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ p, const ValueType *__restrict__ z,
+    const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+
+    // The quotient is only used when `prev_rho` is non-zero.
+    const auto ratio = rho[col] / prev_rho[col];
+    if (prev_rho[col] == zero<ValueType>()) {
+        p[gid] = z[gid];
+    } else {
+        p[gid] = z[gid] + ratio * p[gid];
+    }
+}
+
+
+// CG step 2: updates the solution `x` and the residual `r`.
+//
+// Each thread handles the entry `tidx` of `r`, `p` and `q` (row stride
+// `stride`) and the matching entry of `x` (row stride `x_stride`). Columns
+// whose stopping status reports `has_stopped()` and columns with a zero `beta`
+// are left untouched. Otherwise, with `tmp = rho / beta`:
+//   x += tmp * p,   r -= tmp * q.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
+    const ValueType *__restrict__ p, const ValueType *__restrict__ q,
+    const ValueType *__restrict__ beta, const ValueType *__restrict__ rho,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / stride;
+    const auto col = tidx % stride;
+
+    // The flat index is split with `stride`, so valid indices run up to
+    // `num_rows * stride`. Bounding it by `num_rows * num_cols` would skip the
+    // trailing rows whenever `stride > num_cols`.
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+    // A zero `beta` leaves the step length undefined; skip the update then.
+    if (beta[col] != zero<ValueType>()) {
+        const auto tmp = rho[col] / beta[col];
+        x[row * x_stride + col] += tmp * p[tidx];
+        r[tidx] -= tmp * q[tidx];
+    }
+}
diff --git a/common/solver/cgs_kernels.hpp.inc b/common/solver/cgs_kernels.hpp.inc
new file mode 100644
index 00000000000..d6c3e64cd4c
--- /dev/null
+++ b/common/solver/cgs_kernels.hpp.inc
@@ -0,0 +1,137 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// CGS initialization.
+//
+// The first `num_cols` threads set `rho` to zero, set `alpha`, `beta`,
+// `gamma` and `rho_prev` to one, and reset the stopping status. The first
+// `num_rows * stride` threads copy `b` into both `r` and `r_tld` and zero
+// `u`, `p`, `q`, `u_hat`, `v_hat` and `t`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ b, ValueType *__restrict__ r,
+    ValueType *__restrict__ r_tld, ValueType *__restrict__ p,
+    ValueType *__restrict__ q, ValueType *__restrict__ u,
+    ValueType *__restrict__ u_hat, ValueType *__restrict__ v_hat,
+    ValueType *__restrict__ t, ValueType *__restrict__ alpha,
+    ValueType *__restrict__ beta, ValueType *__restrict__ gamma,
+    ValueType *__restrict__ rho_prev, ValueType *__restrict__ rho,
+    stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto unit = one<ValueType>();
+    const auto nil = zero<ValueType>();
+
+    // Per-column scalars and stopping status.
+    if (gid < num_cols) {
+        rho[gid] = nil;
+        alpha[gid] = unit;
+        beta[gid] = unit;
+        gamma[gid] = unit;
+        rho_prev[gid] = unit;
+        stop_status[gid].reset();
+    }
+
+    // Per-entry vectors, including the padding up to `stride`.
+    if (gid < num_rows * stride) {
+        r[gid] = b[gid];
+        r_tld[gid] = b[gid];
+        u[gid] = nil;
+        p[gid] = nil;
+        q[gid] = nil;
+        u_hat[gid] = nil;
+        v_hat[gid] = nil;
+        t[gid] = nil;
+    }
+}
+
+
+// CGS step 1. For every entry of an active column with non-zero `rho_prev`:
+//   beta = rho / rho_prev
+//   u    = r + beta * q
+//   p    = u + beta * (q + beta * p)
+// Columns with a zero `rho_prev` are left untouched. Every thread of a column
+// stores the same value into `beta[col]`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ r, ValueType *__restrict__ u,
+    ValueType *__restrict__ p, const ValueType *__restrict__ q,
+    ValueType *__restrict__ beta, const ValueType *__restrict__ rho,
+    const ValueType *__restrict__ rho_prev,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto col = gid % stride;
+
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+    if (rho_prev[col] == zero<ValueType>()) {
+        return;
+    }
+
+    const auto new_beta = rho[col] / rho_prev[col];
+    const auto new_u = r[gid] + new_beta * q[gid];
+
+    beta[col] = new_beta;
+    u[gid] = new_u;
+    p[gid] = new_u + new_beta * (q[gid] + new_beta * p[gid]);
+}
+
+
+// CGS step 2. For every entry of an active column with non-zero `gamma`:
+//   alpha = rho / gamma
+//   q     = u - alpha * v_hat
+//   t     = u + q
+// Columns with a zero `gamma` are left untouched. Every thread of a column
+// stores the same value into `alpha[col]`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ u, const ValueType *__restrict__ v_hat,
+    ValueType *__restrict__ q, ValueType *__restrict__ t,
+    ValueType *__restrict__ alpha, const ValueType *__restrict__ rho,
+    const ValueType *__restrict__ gamma,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto col = gid % stride;
+
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+    if (gamma[col] == zero<ValueType>()) {
+        return;
+    }
+
+    const auto new_alpha = rho[col] / gamma[col];
+    const auto new_q = u[gid] - new_alpha * v_hat[gid];
+
+    alpha[col] = new_alpha;
+    q[gid] = new_q;
+    t[gid] = u[gid] + new_q;
+}
+
+
+// CGS step 3: for every entry of an active column,
+//   x += alpha * v_hat,   r -= alpha * t.
+// `x` uses the row stride `x_stride`; `r`, `t` and `v_hat` use `stride`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_3_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, const ValueType *__restrict__ t,
+    const ValueType *__restrict__ v_hat, ValueType *__restrict__ r,
+    ValueType *__restrict__ x, const ValueType *__restrict__ alpha,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto row = gid / stride;
+    const auto col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+
+    const auto x_idx = row * x_stride + col;
+    const auto new_x = x[x_idx] + alpha[col] * v_hat[gid];
+    const auto new_r = r[gid] - alpha[col] * t[gid];
+
+    x[x_idx] = new_x;
+    r[gid] = new_r;
+}
\ No newline at end of file
diff --git a/common/solver/fcg_kernels.hpp.inc b/common/solver/fcg_kernels.hpp.inc
new file mode 100644
index 00000000000..2b5b72029a2
--- /dev/null
+++ b/common/solver/fcg_kernels.hpp.inc
@@ -0,0 +1,104 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// FCG initialization.
+//
+// The first `num_cols` threads set `rho` to zero, `prev_rho` and `rho_t` to
+// one, and reset the stopping status. The first `num_rows * stride` threads
+// copy `b` into both `r` and `t` and zero `z`, `p` and `q`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ b, ValueType *__restrict__ r,
+    ValueType *__restrict__ z, ValueType *__restrict__ p,
+    ValueType *__restrict__ q, ValueType *__restrict__ t,
+    ValueType *__restrict__ prev_rho, ValueType *__restrict__ rho,
+    ValueType *__restrict__ rho_t, stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto unit = one<ValueType>();
+    const auto nil = zero<ValueType>();
+
+    // Per-column scalars and stopping status.
+    if (gid < num_cols) {
+        rho[gid] = nil;
+        prev_rho[gid] = unit;
+        rho_t[gid] = unit;
+        stop_status[gid].reset();
+    }
+
+    // Per-entry vectors, including the padding up to `stride`.
+    if (gid < num_rows * stride) {
+        r[gid] = b[gid];
+        z[gid] = nil;
+        p[gid] = nil;
+        q[gid] = nil;
+        t[gid] = b[gid];
+    }
+}
+
+
+// FCG step 1: updates the search direction in place,
+//   p = z + (rho / prev_rho) * p
+// for every entry of an active column. If `prev_rho` is zero, `p = z`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ p, const ValueType *__restrict__ z,
+    const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto col = gid % stride;
+    const bool in_range = col < num_cols && gid < num_rows * stride;
+    if (!in_range || stop_status[col].has_stopped()) {
+        return;
+    }
+
+    // The quotient is only used when `prev_rho` is non-zero.
+    const auto ratio = rho[col] / prev_rho[col];
+    if (prev_rho[col] == zero<ValueType>()) {
+        p[gid] = z[gid];
+    } else {
+        p[gid] = z[gid] + ratio * p[gid];
+    }
+}
+
+
+// FCG step 2: updates the solution `x`, the residual `r`, and stores the
+// residual change in `t`.
+//
+// Each thread handles the entry `tidx` of `r`, `t`, `p` and `q` (row stride
+// `stride`) and the matching entry of `x` (row stride `x_stride`). Columns
+// whose stopping status reports `has_stopped()` and columns with a zero `beta`
+// are left untouched. Otherwise, with `tmp = rho / beta`:
+//   x += tmp * p,   r_new = r - tmp * q,   t = r_new - r.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
+    ValueType *__restrict__ t, const ValueType *__restrict__ p,
+    const ValueType *__restrict__ q, const ValueType *__restrict__ beta,
+    const ValueType *__restrict__ rho,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / stride;
+    const auto col = tidx % stride;
+
+    // The flat index is split with `stride`, so valid indices run up to
+    // `num_rows * stride`. Bounding it by `num_rows * num_cols` would skip the
+    // trailing rows whenever `stride > num_cols`.
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+    // A zero `beta` leaves the step length undefined; skip the update then.
+    if (beta[col] != zero<ValueType>()) {
+        const auto tmp = rho[col] / beta[col];
+        // Keep the old residual so the difference can be stored in `t`.
+        const auto prev_r = r[tidx];
+        x[row * x_stride + col] += tmp * p[tidx];
+        r[tidx] -= tmp * q[tidx];
+        t[tidx] = r[tidx] - prev_r;
+    }
+}
\ No newline at end of file
diff --git a/common/solver/gmres_kernels.hpp.inc b/common/solver/gmres_kernels.hpp.inc
new file mode 100644
index 00000000000..7b991879571
--- /dev/null
+++ b/common/solver/gmres_kernels.hpp.inc
@@ -0,0 +1,405 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// GMRES initialization, part 1: resets the stopping status of every column,
+// copies `b` into `residual` and zeroes the first `krylov_dim` rows of the
+// Givens sine and cosine tables.
+//
+// Must be called with at least `max(stride_b * num_rows, krylov_dim *
+// num_cols)` threads in total.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void initialize_1_kernel(
+    size_type num_rows, size_type num_cols, size_type krylov_dim,
+    const ValueType *__restrict__ b, size_type stride_b,
+    ValueType *__restrict__ residual, size_type stride_residual,
+    ValueType *__restrict__ givens_sin, size_type stride_sin,
+    ValueType *__restrict__ givens_cos, size_type stride_cos,
+    stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+
+    // Position of this thread inside `b`, which is laid out with `stride_b`.
+    const auto b_row = gid / stride_b;
+    const auto b_col = gid % stride_b;
+
+    if (gid < num_cols) {
+        stop_status[gid].reset();
+    }
+
+    const bool inside_b = b_row < num_rows && b_col < num_cols;
+    if (inside_b) {
+        residual[b_row * stride_residual + b_col] =
+            b[b_row * stride_b + b_col];
+    }
+
+    // The Givens tables are addressed densely: `num_cols` entries per row.
+    if (gid < krylov_dim * num_cols) {
+        const auto g_row = gid / num_cols;
+        const auto g_col = gid % num_cols;
+
+        givens_sin[g_row * stride_sin + g_col] = zero<ValueType>();
+        givens_cos[g_row * stride_cos + g_col] = zero<ValueType>();
+    }
+}
+
+
+// GMRES initialization, part 2: stores the residual norms in
+// `residual_norm_collection`, zeroes `final_iter_nums`, and writes the
+// residual divided by its column norm into `krylov_bases`.
+//
+// Must be called with at least `num_rows * num_rhs` threads in total.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void initialize_2_2_kernel(
+    size_type num_rows, size_type num_rhs,
+    const ValueType *__restrict__ residual, size_type stride_residual,
+    const remove_complex<ValueType> *__restrict__ residual_norm,
+    ValueType *__restrict__ residual_norm_collection,
+    ValueType *__restrict__ krylov_bases, size_type stride_krylov,
+    size_type *__restrict__ final_iter_nums)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto row = gid / num_rhs;
+    const auto col = gid % num_rhs;
+
+    // One thread per right-hand side handles the per-column bookkeeping.
+    if (gid < num_rhs) {
+        residual_norm_collection[gid] = residual_norm[gid];
+        final_iter_nums[gid] = 0;
+    }
+
+    // One thread per matrix entry writes the scaled residual.
+    if (row < num_rows && col < num_rhs) {
+        krylov_bases[row * stride_krylov + col] =
+            residual[row * stride_residual + col] / residual_norm[col];
+    }
+}
+
+
+// Adds one to `final_iter_nums[i]` for each of the first `total_number`
+// entries whose stopping status does not report `has_stopped()`.
+__global__
+    __launch_bounds__(default_block_size) void increase_final_iteration_numbers_kernel(
+        size_type *__restrict__ final_iter_nums,
+        const stopping_status *__restrict__ stop_status, size_type total_number)
+{
+    const auto gid = thread::get_thread_id_flat();
+    if (gid >= total_number) {
+        return;
+    }
+    const size_type increment = !stop_status[gid].has_stopped();
+    final_iter_nums[gid] += increment;
+}
+
+
+// Accumulates, for every active column, the dot product
+//   sum_i conj(krylov_bases[i, col]) * next_krylov_basis[i, col]
+// into `hessenberg_iter[k * stride_hessenberg + col]` using `atomic_add`.
+//
+// `blockIdx.x` selects a group of `default_dot_dim` columns and `blockIdx.y`
+// selects a contiguous chunk of rows, so several blocks may contribute to the
+// same output entry.
+//
+// NOTE(review): the indexing assumes a thread block of
+// `default_dot_dim x default_dot_dim` threads - confirm against the launcher.
+// NOTE(review): since the result is only ever added to, the target entries
+// presumably have to be zeroed by the caller beforehand - confirm.
+// NOTE(review): `krylov_bases` is indexed exactly like `next_krylov_basis`,
+// so the caller presumably passes a pointer to the k-th basis - confirm.
+template <typename ValueType>
+__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
+    size_type k, size_type num_rows, size_type num_cols,
+    const ValueType *__restrict__ krylov_bases,
+    const ValueType *__restrict__ next_krylov_basis, size_type stride_krylov,
+    ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = threadIdx.x;
+    const auto tidy = threadIdx.y;
+    // Column read by this thread during the accumulation phase.
+    const auto col_idx = blockIdx.x * default_dot_dim + tidx;
+    // Rows are split evenly over `gridDim.y`; the last chunk is clamped to
+    // `num_rows`.
+    const auto num = ceildiv(num_rows, gridDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
+    // Used that way to get around dynamic initialization warning and
+    // template error when using `reduction_helper_array` directly in `reduce`
+    __shared__
+        UninitializedArray<ValueType, default_dot_dim *(default_dot_dim + 1)>
+            reduction_helper_array;
+    ValueType *__restrict__ reduction_helper = reduction_helper_array;
+
+    // Partial dot product over the rows `start_row + tidy`, stepping by
+    // `default_dot_dim`. Stopped or out-of-range columns contribute zero.
+    ValueType local_res = zero<ValueType>();
+    if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
+        for (size_type i = start_row + tidy; i < end_row;
+             i += default_dot_dim) {
+            const auto krylov_idx = i * stride_krylov + col_idx;
+            local_res +=
+                conj(krylov_bases[krylov_idx]) * next_krylov_basis[krylov_idx];
+        }
+    }
+    // Store at (tidx, tidy) and read back at (tidy, tidx): a transpose through
+    // shared memory. Each row of the buffer is `default_dot_dim + 1` wide
+    // (presumably padding against shared-memory bank conflicts).
+    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
+    __syncthreads();
+    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
+    // After the transpose, this thread holds a partial result that belongs to
+    // column `blockIdx.x * default_dot_dim + tidy`; tiles of
+    // `default_dot_dim` threads sum their partial results.
+    const auto tile_block =
+        group::tiled_partition<default_dot_dim>(group::this_thread_block());
+    const auto sum =
+        reduce(tile_block, local_res,
+               [](const ValueType &a, const ValueType &b) { return a + b; });
+    const auto new_col_idx = blockIdx.x * default_dot_dim + tidy;
+    // Only the thread with `tidx == 0` publishes the sum, and only for
+    // columns that are in range and have not stopped.
+    if (tidx == 0 && new_col_idx < num_cols &&
+        !stop_status[new_col_idx].has_stopped()) {
+        const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
+        atomic_add(hessenberg_iter + hessenberg_idx, sum);
+    }
+}
+
+
+// Subtracts from every entry of `next_krylov_basis` in an active column the
+// matching entry of `krylov_bases` scaled by
+// `hessenberg_iter[k * stride_hessenberg + col]`.
+//
+// Must be called with at least `num_rows * stride_krylov` threads in total.
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_next_krylov_kernel(
+    size_type k, size_type num_rows, size_type num_cols,
+    const ValueType *__restrict__ krylov_bases,
+    ValueType *__restrict__ next_krylov_basis, size_type stride_krylov,
+    const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto gid = thread::get_thread_id_flat();
+    const auto row = gid / stride_krylov;
+    const auto col = gid % stride_krylov;
+
+    if (row >= num_rows || col >= num_cols ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+
+    // Both vectors share the same layout, so one index addresses both.
+    const auto entry = row * stride_krylov + col;
+    const auto coeff = hessenberg_iter[k * stride_hessenberg + col];
+
+    next_krylov_basis[entry] -= coeff * krylov_bases[entry];
+}
+
+
+// Computes the 2-norm of every active column of `next_krylov_basis` and stores
+// it in `hessenberg_iter[(iter + 1) * stride_hessenberg + col]`.
+//
+// One thread block handles one column (`blockIdx.x`); its threads stride over
+// the rows and combine their partial sums with a block-wide reduction.
+//
+// Must be called with at least `num_cols` blocks, each with `block_size`
+// threads. `block_size` must be a power of 2.
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_hessenberg_2_kernel(
+    size_type iter, size_type num_rows, size_type num_cols,
+    const ValueType *__restrict__ next_krylov_basis,
+    size_type stride_next_krylov, ValueType *__restrict__ hessenberg_iter,
+    size_type stride_hessenberg,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = threadIdx.x;
+    const auto col_idx = blockIdx.x;
+
+    // Used that way to get around dynamic initialization warning and
+    // template error when using `reduction_helper_array` directly in `reduce`
+    __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
+    ValueType *__restrict__ reduction_helper = reduction_helper_array;
+
+    // The condition only depends on `blockIdx.x`, so all threads of a block
+    // take the same branch and reach the block-wide reduction together.
+    if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
+        ValueType local_res{};
+        for (size_type i = tidx; i < num_rows; i += block_size) {
+            const auto next_krylov_idx = i * stride_next_krylov + col_idx;
+            const auto next_krylov_value = next_krylov_basis[next_krylov_idx];
+
+            // Squared magnitude: the conjugate is required for complex value
+            // types, where `v * v` is not `|v|^2`.
+            local_res += conj(next_krylov_value) * next_krylov_value;
+        }
+
+        reduction_helper[tidx] = local_res;
+
+        // Perform thread block reduction. Result is in reduction_helper[0]
+        reduce(group::this_thread_block(), reduction_helper,
+               [](const ValueType &a, const ValueType &b) { return a + b; });
+
+        if (tidx == 0) {
+            hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] =
+                sqrt(reduction_helper[0]);
+        }
+    }
+}
+
+
+// Divides every entry of `krylov_bases` in an active column by
+// `hessenberg_iter[(iter + 1) * stride_hessenberg + col]`.
+//
+// Must be called with at least `num_rows * stride_krylov` threads in
+// total.
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_krylov_kernel(
+    size_type iter, size_type num_rows, size_type num_cols,
+    ValueType *__restrict__ krylov_bases, size_type stride_krylov,
+    const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row_idx = global_id / stride_krylov;
+    const auto col_idx = global_id % stride_krylov;
+
+    if (row_idx < num_rows && col_idx < num_cols &&
+        !stop_status[col_idx].has_stopped()) {
+        // Load the divisor only after the bounds check, so that padding
+        // threads (`col_idx >= num_cols`) never read `hessenberg_iter`.
+        const auto hessenberg =
+            hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx];
+        const auto krylov_idx = row_idx * stride_krylov + col_idx;
+
+        krylov_bases[krylov_idx] /= hessenberg;
+    }
+}
+
+
+// Computes the Givens rotation coefficients for the pair
+// (`this_hess`, `next_hess`), returns them through `register_cos` and
+// `register_sin`, and stores them in row `iter`, column `col_idx` of
+// `givens_cos` and `givens_sin`.
+//
+// If `this_hess` is zero, the coefficients are cos = 0 and sin = 1. Otherwise
+// both inputs are divided by `abs(this_hess) + abs(next_hess)` before they
+// are squared, and the square root is multiplied by that sum again.
+// `num_cols` is not used by this function.
+template <typename ValueType>
+__device__ void calculate_sin_and_cos_kernel(
+    size_type col_idx, size_type num_cols, size_type iter,
+    const ValueType &this_hess, const ValueType &next_hess,
+    ValueType *givens_sin, size_type stride_sin, ValueType *givens_cos,
+    size_type stride_cos, ValueType &register_sin, ValueType &register_cos)
+{
+    const bool this_is_zero = (this_hess == zero<ValueType>());
+
+    if (!this_is_zero) {
+        const auto norm_scale = abs(this_hess) + abs(next_hess);
+        const auto scaled_this = abs(this_hess / norm_scale);
+        const auto scaled_next = abs(next_hess / norm_scale);
+        const auto hypot_len =
+            norm_scale *
+            sqrt(scaled_this * scaled_this + scaled_next * scaled_next);
+        register_cos = conj(this_hess) / hypot_len;
+        register_sin = conj(next_hess) / hypot_len;
+    } else {
+        register_cos = zero<ValueType>();
+        register_sin = one<ValueType>();
+    }
+
+    // publish the coefficients of this iteration
+    givens_cos[iter * stride_cos + col_idx] = register_cos;
+    givens_sin[iter * stride_sin + col_idx] = register_sin;
+}
+
+
+// Applies the rotation (`register_cos`, `register_sin`) to entry `iter` of
+// column `col_idx` in `residual_norm_collection`:
+//     next                = -conj(sin) * collection(iter)
+//     collection(iter)    =  cos * collection(iter)
+//     collection(iter+1)  =  next
+// and stores `abs(next)` in `residual_norm[col_idx]`.
+// `num_cols` is not used by this function.
+template <typename ValueType>
+__device__ void calculate_residual_norm_kernel(
+    size_type col_idx, size_type num_cols, size_type iter,
+    const ValueType &register_sin, const ValueType &register_cos,
+    remove_complex<ValueType> *residual_norm,
+    ValueType *residual_norm_collection,
+    size_type stride_residual_norm_collection)
+{
+    const auto current_idx = iter * stride_residual_norm_collection + col_idx;
+    const auto following_idx =
+        (iter + 1) * stride_residual_norm_collection + col_idx;
+
+    const auto current_value = residual_norm_collection[current_idx];
+    const auto following_value = -conj(register_sin) * current_value;
+
+    residual_norm_collection[current_idx] = register_cos * current_value;
+    residual_norm[col_idx] = abs(following_value);
+    residual_norm_collection[following_idx] = following_value;
+}
+
+
+// Updates column `col_idx` of `hessenberg_iter` for iteration `iter`:
+//   1. applies the rotations stored in `givens_cos` / `givens_sin` for the
+//      iterations `0 .. iter - 1` to the entries `0 .. iter` of the column,
+//   2. computes and stores the rotation for iteration `iter`,
+//   3. applies it to entries `iter` and `iter + 1` (the latter is set to
+//      zero),
+//   4. updates `residual_norm` and `residual_norm_collection`.
+// One thread handles one column; columns whose `stop_status` entry reports
+// `has_stopped()` are left untouched.
+//
+// Must be called with at least `num_cols` threads in total.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void givens_rotation_kernel(
+    size_type num_rows, size_type num_cols, size_type iter,
+    ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
+    ValueType *__restrict__ givens_sin, size_type stride_sin,
+    ValueType *__restrict__ givens_cos, size_type stride_cos,
+    remove_complex<ValueType> *__restrict__ residual_norm,
+    ValueType *__restrict__ residual_norm_collection,
+    size_type stride_residual_norm_collection,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto col_idx = thread::get_thread_id_flat();
+
+    // Threads outside of the matrix and threads of stopped columns exit early.
+    if (col_idx >= num_cols || stop_status[col_idx].has_stopped()) {
+        return;
+    }
+
+    // `this_hess` / `next_hess` carry the (partially rotated) entries i and
+    // i + 1 of the column through the loop below.
+    auto this_hess = hessenberg_iter[col_idx];
+    auto next_hess = hessenberg_iter[stride_hessenberg + col_idx];
+    for (size_type i = 0; i < iter; ++i) {
+        const auto cos = givens_cos[i * stride_cos + col_idx];
+        const auto sin = givens_sin[i * stride_sin + col_idx];
+        hessenberg_iter[i * stride_hessenberg + col_idx] =
+            cos * this_hess + sin * next_hess;
+        this_hess = conj(cos) * next_hess - conj(sin) * this_hess;
+        // Prefetch entry i + 2 for the next pass; in the last pass
+        // (i == iter - 1) this reads entry iter + 1.
+        next_hess = hessenberg_iter[(i + 2) * stride_hessenberg + col_idx];
+    }
+    // for j in 0:iter - 1
+    //     temp             =  cos(j)*hessenberg(j) +
+    //                         sin(j)*hessenberg(j+1)
+    //     hessenberg(j+1)  = -sin(j)*hessenberg(j) +
+    //                         cos(j)*hessenberg(j+1)
+    //     hessenberg(j)    =  temp;
+    // end
+
+    ValueType register_sin;
+    ValueType register_cos;
+    calculate_sin_and_cos_kernel(col_idx, num_cols, iter, this_hess, next_hess,
+                                 givens_sin, stride_sin, givens_cos, stride_cos,
+                                 register_sin, register_cos);
+    // Calculate sin and cos on hessenberg(iter) and hessenberg(iter+1)
+
+    hessenberg_iter[iter * stride_hessenberg + col_idx] =
+        register_cos * this_hess + register_sin * next_hess;
+    hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] =
+        zero<ValueType>();
+    // hessenberg(iter)   = cos(iter)*hessenberg(iter) +
+    //                      sin(iter)*hessenberg(iter+1)
+    // hessenberg(iter+1) = 0
+
+    calculate_residual_norm_kernel(
+        col_idx, num_cols, iter, register_sin, register_cos, residual_norm,
+        residual_norm_collection, stride_residual_norm_collection);
+    // Calculate residual norm
+}
+
+
+// Solves the upper triangular system
+//     y = hessenberg \ residual_norm_collection
+// by back substitution, independently for every right-hand side `col_idx`.
+// Only the leading `final_iter_nums[col_idx]` rows/columns take part. One
+// thread handles one right-hand side. `num_cols` is not used by this kernel.
+//
+// Must be called with at least `num_rhs` threads in total.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void solve_upper_triangular_kernel(
+    size_type num_cols, size_type num_rhs,
+    const ValueType *__restrict__ residual_norm_collection,
+    size_type stride_residual_norm_collection,
+    const ValueType *__restrict__ hessenberg, size_type stride_hessenberg,
+    ValueType *__restrict__ y, size_type stride_y,
+    const size_type *__restrict__ final_iter_nums)
+{
+    const auto col_idx = thread::get_thread_id_flat();
+
+    if (col_idx >= num_rhs) {
+        return;
+    }
+
+    // The system size of this right-hand side is loop-invariant: read it once
+    // instead of in every loop test.
+    const auto num_iters = final_iter_nums[col_idx];
+
+    // Walk the rows bottom-up. The counter stays a `size_type` (no narrowing
+    // to `int`, no signed/unsigned comparison); `i-- > 0` yields
+    // i = num_iters - 1, ..., 0 and runs zero times for num_iters == 0.
+    for (size_type i = num_iters; i-- > 0;) {
+        auto temp =
+            residual_norm_collection[i * stride_residual_norm_collection +
+                                     col_idx];
+        // subtract the contributions of the already computed entries of y
+        for (size_type j = i + 1; j < num_iters; ++j) {
+            temp -= hessenberg[i * stride_hessenberg + j * num_rhs + col_idx] *
+                    y[j * stride_y + col_idx];
+        }
+
+        // divide by the diagonal entry
+        y[i * stride_y + col_idx] =
+            temp / hessenberg[i * stride_hessenberg + i * num_rhs + col_idx];
+    }
+}
+
+
+// Computes, for every entry (`row_id`, `col_id`) with `row_id < num_rows` and
+// `col_id < num_cols`, the sum over `j < final_iter_nums[col_id]` of
+//     krylov_bases[(row_id + j * num_rows) * stride_krylov + col_id] *
+//     y[j * stride_y + col_id]
+// and stores it in `before_preconditioner`. Each thread handles one entry;
+// the flat thread id is split into row and column using
+// `stride_preconditioner`. `num_rhs` is not used by this kernel.
+//
+// Must be called with at least `stride_preconditioner * num_rows` threads in
+// total.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void calculate_Qy_kernel(
+    size_type num_rows, size_type num_cols, size_type num_rhs,
+    const ValueType *__restrict__ krylov_bases, size_type stride_krylov,
+    const ValueType *__restrict__ y, size_type stride_y,
+    ValueType *__restrict__ before_preconditioner,
+    size_type stride_preconditioner,
+    const size_type *__restrict__ final_iter_nums)
+{
+    const auto flat_id = thread::get_thread_id_flat();
+    const auto row = flat_id / stride_preconditioner;
+    const auto col = flat_id % stride_preconditioner;
+
+    // nothing to do for threads outside of the result matrix
+    if (row >= num_rows || col >= num_cols) {
+        return;
+    }
+
+    ValueType accumulated = zero<ValueType>();
+    for (size_type j = 0; j < final_iter_nums[col]; ++j) {
+        const auto basis_idx = (row + j * num_rows) * stride_krylov + col;
+        accumulated += krylov_bases[basis_idx] * y[j * stride_y + col];
+    }
+    before_preconditioner[flat_id] = accumulated;
+}
diff --git a/common/solver/ir_kernels.hpp.inc b/common/solver/ir_kernels.hpp.inc
new file mode 100644
index 00000000000..24a66f2795b
--- /dev/null
+++ b/common/solver/ir_kernels.hpp.inc
@@ -0,0 +1,41 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// Calls `reset()` on each of the first `num_cols` entries of `stop_status`.
+// One thread handles one entry; surplus threads return immediately.
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_cols, stopping_status *stop_status)
+{
+    const auto col = thread::get_thread_id_flat();
+
+    if (col >= num_cols) {
+        return;
+    }
+
+    stop_status[col].reset();
+}
\ No newline at end of file
diff --git a/contributors.txt b/contributors.txt
index 22856fbdb9f..fd97439ad7f 100644
--- a/contributors.txt
+++ b/contributors.txt
@@ -8,10 +8,12 @@ Cojean Terry <terry.cojean@kit.edu> Karlsruhe Institute of Technology
 Drzaic Jelena <jelena.drzaic1@gmail.com> University of Zagreb
 Flegar Goran <flegar@uji.es> Universitat Jaume I
 Göbel Fritz <goebel.fritz@gmail.com> Karlsruhe Institute of Technology
+Grötzinger Dennis <dennis.groetzinger@web.de> Karlsruhe Institute of Technology
 Grützmacher Thomas <thogru.kit@gmx.de> Karlsruhe Institute of Technology
 Heroux Mike <maherou@sandia.gov> Sandia National Laboratories
 Hoemmen Mark <mhoemme@sandia.gov> Sandia National Laboratories
 Holeksa Claudius <mail@keldu.de> Karlsruhe Institute of Technology
+Maier Matthias <matthias@43-1.org> Texas A&M University
 Nayak Pratik <pratik.nayak@kit.edu> Karlsruhe Institute of Technology
 Ribizel Tobias <mail@upsj.de> Karlsruhe Institute of Technology
 Tsai Yuhsiang <yhmtsai@gmail.com> National Taiwan University
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index a976e362d97..036f6f1fe19 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -4,39 +4,46 @@ add_subdirectory(device_hooks)  # placeholders for disabled modules
 add_library(ginkgo "")
 target_sources(ginkgo
     PRIVATE
-        base/combination.cpp
-        base/composition.cpp
-        base/executor.cpp
-        base/mtx_io.cpp
-        base/perturbation.cpp
-        base/version.cpp
-        factorization/par_ilu.cpp
-        log/convergence.cpp
-        log/logger.cpp
-        log/record.cpp
-        log/stream.cpp
-        matrix/coo.cpp
-        matrix/csr.cpp
-        matrix/dense.cpp
-        matrix/ell.cpp
-        matrix/hybrid.cpp
-        matrix/identity.cpp
-        matrix/sellp.cpp
-        matrix/sparsity_csr.cpp
-        preconditioner/jacobi.cpp
-        solver/bicgstab.cpp
-        solver/cg.cpp
-        solver/cgs.cpp
-        solver/fcg.cpp
-        solver/gmres.cpp
-        solver/ir.cpp
-        solver/lower_trs.cpp
-        solver/upper_trs.cpp
-        stop/combined.cpp
-        stop/criterion.cpp
-        stop/iteration.cpp
-        stop/residual_norm_reduction.cpp
-        stop/time.cpp)
+    base/array.cpp
+    base/combination.cpp
+    base/composition.cpp
+    base/executor.cpp
+    base/mtx_io.cpp
+    base/perturbation.cpp
+    base/version.cpp
+    factorization/ilu.cpp
+    factorization/par_ict.cpp
+    factorization/par_ilu.cpp
+    factorization/par_ilut.cpp
+    log/convergence.cpp
+    log/logger.cpp
+    log/record.cpp
+    log/stream.cpp
+    matrix/coo.cpp
+    matrix/csr.cpp
+    matrix/dense.cpp
+    matrix/ell.cpp
+    matrix/hybrid.cpp
+    matrix/identity.cpp
+    matrix/permutation.cpp
+    matrix/sellp.cpp
+    matrix/sparsity_csr.cpp
+    preconditioner/isai.cpp
+    preconditioner/jacobi.cpp
+    solver/bicg.cpp
+    solver/bicgstab.cpp
+    solver/cg.cpp
+    solver/cgs.cpp
+    solver/fcg.cpp
+    solver/gmres.cpp
+    solver/ir.cpp
+    solver/lower_trs.cpp
+    solver/upper_trs.cpp
+    stop/combined.cpp
+    stop/criterion.cpp
+    stop/iteration.cpp
+    stop/residual_norm.cpp
+    stop/time.cpp)
 
 if(GINKGO_HAVE_PAPI_SDE)
     target_sources(ginkgo PRIVATE log/papi.cpp)
@@ -49,14 +56,18 @@ target_compile_options(ginkgo PRIVATE "${GINKGO_COMPILER_FLAGS}")
 # regardless of whether it is installed or added as a subdirectory
 add_library(Ginkgo::ginkgo ALIAS ginkgo)
 target_link_libraries(ginkgo
-    PUBLIC ginkgo_omp ginkgo_cuda ginkgo_reference)
+    PUBLIC ginkgo_omp ginkgo_cuda ginkgo_reference ginkgo_hip)
+# The PAPI dependency needs to be exposed to the user.
 if (GINKGO_HAVE_PAPI_SDE)
-    target_link_libraries(ginkgo PRIVATE PAPI::PAPI)
+    target_link_libraries(ginkgo PUBLIC PAPI::PAPI)
 endif()
 ginkgo_default_includes(ginkgo)
 ginkgo_install_library(ginkgo core)
 
+# Run the header check on the `ginkgo` target only when the
+# GINKGO_CHECK_CIRCULAR_DEPS option is enabled.
+if (GINKGO_CHECK_CIRCULAR_DEPS)
+    ginkgo_check_headers(ginkgo)
+endif()
+
 if(GINKGO_BUILD_TESTS)
     add_subdirectory(test)
 endif()
-
diff --git a/core/base/allocator.hpp b/core/base/allocator.hpp
new file mode 100644
index 00000000000..0c62f5deccb
--- /dev/null
+++ b/core/base/allocator.hpp
@@ -0,0 +1,175 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_ALLOCATOR_HPP_
+#define GKO_CORE_BASE_ALLOCATOR_HPP_
+
+
+#include <map>
+#include <memory>
+#include <set>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace gko {
+
+/**
+ * @internal
+ *
+ * C++ standard library-compatible allocator that uses an executor for
+ * allocations.
+ *
+ * @tparam T  the type of the allocated elements.
+ */
+template <typename T>
+class ExecutorAllocator {
+public:
+    using value_type = T;
+    // The executor travels with the container on copy assignment, move
+    // assignment and swap.
+    using propagate_on_container_copy_assignment = std::true_type;
+    using propagate_on_container_move_assignment = std::true_type;
+    using propagate_on_container_swap = std::true_type;
+
+    /**
+     * Creates an allocator that allocates through the given executor.
+     *
+     * Accepts a shared pointer to any (const or non-const) type that is
+     * convertible to `const gko::Executor`.
+     *
+     * @param exec  the executor used for all allocations
+     * @tparam ExecType  the static type of the executor
+     */
+    template <typename ExecType>
+    ExecutorAllocator(std::shared_ptr<ExecType> exec) : exec_(std::move(exec))
+    {}
+
+    /**
+     * Creates an allocator from an allocator for a different element type.
+     *
+     * The new allocator shares the executor of `other`. This is what
+     * `std::allocator_traits::template rebind<U>` relies on in node-based
+     * containers.
+     *
+     * @param other  the allocator whose executor is taken over
+     * @tparam U  the element type of the other allocator
+     */
+    template <typename U>
+    explicit ExecutorAllocator(const ExecutorAllocator<U> &other)
+        : exec_(other.get_executor())
+    {}
+
+    /** Returns the executor this allocator allocates with.  */
+    std::shared_ptr<const Executor> get_executor() const { return exec_; }
+
+    /**
+     * Allocates memory for `n` elements of type `T` on the executor.
+     *
+     * @param n  the number of elements to allocate
+     * @return  the pointer to the newly allocated memory area
+     */
+    T *allocate(std::size_t n) const
+    {
+        return exec_->alloc<T>(n);
+    }
+
+    /**
+     * Releases memory that was obtained from `allocate`.
+     *
+     * @param ptr  the memory area to free
+     *
+     * @note  The element count (second parameter) is ignored.
+     */
+    void deallocate(T *ptr, std::size_t) const
+    {
+        exec_->free(ptr);
+    }
+
+    /**
+     * Equality comparison of two ExecutorAllocators, possibly for different
+     * element types.
+     *
+     * @param l  the first allocator
+     * @param r  the second allocator
+     * @return true iff both allocators hold the same executor pointer
+     */
+    template <typename T2>
+    friend bool operator==(const ExecutorAllocator<T> &l,
+                           const ExecutorAllocator<T2> &r)
+    {
+        return l.get_executor() == r.get_executor();
+    }
+
+    /**
+     * Inequality comparison of two ExecutorAllocators, possibly for different
+     * element types.
+     *
+     * @param l  the first allocator
+     * @param r  the second allocator
+     * @return true iff the allocators hold different executor pointers
+     */
+    template <typename T2>
+    friend bool operator!=(const ExecutorAllocator<T> &l,
+                           const ExecutorAllocator<T2> &r)
+    {
+        return l.get_executor() != r.get_executor();
+    }
+
+private:
+    std::shared_ptr<const Executor> exec_;
+};
+
+
+// Standard containers that allocate through an ExecutorAllocator
+/** std::vector whose storage is allocated by an ExecutorAllocator. */
+template <typename T>
+using vector = std::vector<T, ExecutorAllocator<T>>;
+
+/** std::set whose nodes are allocated by an ExecutorAllocator. */
+template <typename Key>
+using set = std::set<Key, std::less<Key>, ExecutorAllocator<Key>>;
+
+/** std::map whose nodes are allocated by an ExecutorAllocator. */
+template <typename Key, typename Value>
+using map = std::map<Key, Value, std::less<Key>,
+                     ExecutorAllocator<std::pair<const Key, Value>>>;
+
+/** std::unordered_set whose storage is allocated by an ExecutorAllocator. */
+template <typename Key>
+using unordered_set =
+    std::unordered_set<Key, std::hash<Key>, std::equal_to<Key>,
+                       ExecutorAllocator<Key>>;
+
+/** std::unordered_map whose storage is allocated by an ExecutorAllocator. */
+template <typename Key, typename Value>
+using unordered_map =
+    std::unordered_map<Key, Value, std::hash<Key>, std::equal_to<Key>,
+                       ExecutorAllocator<std::pair<const Key, Value>>>;
+
+
+}  // namespace gko
+
+#endif  // GKO_CORE_BASE_ALLOCATOR_HPP_
\ No newline at end of file
diff --git a/core/base/array.cpp b/core/base/array.cpp
new file mode 100644
index 00000000000..21d8b5f3326
--- /dev/null
+++ b/core/base/array.cpp
@@ -0,0 +1,71 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/components/precision_conversion.hpp"
+
+
+namespace gko {
+namespace conversion {
+
+
+GKO_REGISTER_OPERATION(convert, components::convert_precision);
+
+
+}  // namespace conversion
+
+
+namespace detail {
+
+
+// Converts `size` elements from `src` to `dst` by running the `convert`
+// operation (registered above from `components::convert_precision`) on the
+// given executor.
+template <typename SourceType, typename TargetType>
+void convert_data(std::shared_ptr<const Executor> exec, size_type size,
+                  const SourceType *src, TargetType *dst)
+{
+    exec->run(conversion::make_convert(size, src, dst));
+}
+
+
+// Declaration of one `convert_data<From, To>` specialization; passed to the
+// instantiation macro below.
+// NOTE(review): presumably GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION emits an
+// explicit instantiation per supported (From, To) pair -- confirm against the
+// macro definition.
+#define GKO_DECLARE_ARRAY_CONVERSION(From, To)                              \
+    void convert_data<From, To>(std::shared_ptr<const Executor>, size_type, \
+                                const From *, To *)
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_ARRAY_CONVERSION);
+
+
+}  // namespace detail
+}  // namespace gko
diff --git a/core/base/combination.cpp b/core/base/combination.cpp
index 567d8d9778b..dd95298858e 100644
--- a/core/base/combination.cpp
+++ b/core/base/combination.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,6 +59,45 @@ inline void initialize_scalars(std::shared_ptr<const Executor> exec,
 }  // namespace
 
 
+// Builds a new Combination on the same executor with transposed size, cloned
+// coefficients and transposed operators. Every operator has to be castable
+// to Transposable.
+template <typename ValueType>
+std::unique_ptr<LinOp> Combination<ValueType>::transpose() const
+{
+    auto result = Combination<ValueType>::create(this->get_executor());
+    result->set_size(gko::transpose(this->get_size()));
+
+    // the coefficients are taken over as clones
+    for (const auto &coefficient : get_coefficients()) {
+        auto cloned = share(coefficient->clone());
+        result->coefficients_.push_back(std::move(cloned));
+    }
+
+    // each operator is replaced by its transpose
+    for (const auto &linop : get_operators()) {
+        auto transposed_op = share(as<Transposable>(linop)->transpose());
+        result->operators_.push_back(std::move(transposed_op));
+    }
+
+    return std::move(result);
+}
+
+
+// Builds a new Combination on the same executor with transposed size, where
+// both the coefficients and the operators are conjugate-transposed. Every
+// coefficient and operator has to be castable to Transposable.
+template <typename ValueType>
+std::unique_ptr<LinOp> Combination<ValueType>::conj_transpose() const
+{
+    auto result = Combination<ValueType>::create(this->get_executor());
+    result->set_size(gko::transpose(this->get_size()));
+
+    // unlike in transpose(), the coefficients are conjugated as well
+    for (const auto &coefficient : get_coefficients()) {
+        auto conjugated =
+            share(as<Transposable>(coefficient)->conj_transpose());
+        result->coefficients_.push_back(std::move(conjugated));
+    }
+
+    // each operator is replaced by its conjugate transpose
+    for (const auto &linop : get_operators()) {
+        auto transposed_op = share(as<Transposable>(linop)->conj_transpose());
+        result->operators_.push_back(std::move(transposed_op));
+    }
+
+    return std::move(result);
+}
+
+
 template <typename ValueType>
 void Combination<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
diff --git a/core/base/composition.cpp b/core/base/composition.cpp
index ea15b5087c4..6fb0171b56e 100644
--- a/core/base/composition.cpp
+++ b/core/base/composition.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,50 +33,143 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/composition.hpp>
 
 
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+
+
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/components/fill_array.hpp"
+
+
 namespace gko {
-namespace {
+namespace composition {
+
+
+GKO_REGISTER_OPERATION(fill_array, components::fill_array);
 
 
-template <typename ValueType, typename OpIterator, typename VecIterator>
-inline void allocate_vectors(OpIterator begin, OpIterator end, VecIterator res)
+}  // namespace composition
+
+
+// Applies the operators `operators[size - 1], ..., operators[1]` (in that
+// order) to `rhs` and returns the last intermediate result. `operators[0]` is
+// not applied here; that is left to the caller.
+//
+// The intermediate vectors are Dense views into `storage`, which is resized
+// to hold the largest pair of subsequent intermediate vectors. The views
+// alternate between the beginning and the end of `storage`, so that the input
+// and output of one operator application never overlap. The returned vector
+// is such a view as well.
+//
+// Precondition: `operators.size() >= 2`. The iterator range
+// `begin + 1 .. end - 1` and the index `size() - 2` used below are not valid
+// for fewer operators.
+template <typename ValueType>
+std::unique_ptr<LinOp> apply_inner_operators(
+    const std::vector<std::shared_ptr<const LinOp>> &operators,
+    Array<ValueType> &storage, const LinOp *rhs)
 {
-    for (auto it = begin; it != end; ++it, ++res) {
-        if (*res != nullptr && (*res)->get_size()[0] == (*it)->get_size()[0]) {
-            continue;
+    using Dense = matrix::Dense<ValueType>;
+    // determine amount of necessary storage:
+    // maximum sum of two subsequent intermediate vectors
+    // (and the out dimension of the last op if there is only one inner
+    // operator)
+    auto num_rhs = rhs->get_size()[1];
+    // take the shared_ptr by const reference: a by-value parameter would copy
+    // it (and modify its reference count) once per operator
+    auto max_intermediate_size = std::accumulate(
+        begin(operators) + 1, end(operators) - 1,
+        operators.back()->get_size()[0],
+        [](size_type acc, const std::shared_ptr<const LinOp> &op) {
+            return std::max(acc, op->get_size()[0] + op->get_size()[1]);
+        });
+    auto storage_size = max_intermediate_size * num_rhs;
+    storage.resize_and_reset(storage_size);
+
+    // apply inner vectors
+    auto exec = rhs->get_executor();
+    auto data = storage.get_data();
+    // apply last operator
+    auto op_size = operators.back()->get_size();
+    auto out_dim = gko::dim<2>{op_size[0], num_rhs};
+    auto out_size = out_dim[0] * num_rhs;
+    auto out = Dense::create(
+        exec, out_dim, Array<ValueType>::view(exec, out_size, data), num_rhs);
+    // for operators with initial guess: set initial guess
+    if (operators.back()->apply_uses_initial_guess()) {
+        if (op_size[0] == op_size[1]) {
+            // square matrix: we can use the previous output
+            // NOTE(review): this raw copy of `out_size` values assumes that
+            // `rhs` is a Dense matrix stored with stride `num_rhs` -- confirm
+            exec->copy(out_size, as<Dense>(rhs)->get_const_values(),
+                       out->get_values());
+        } else {
+            // rectangular matrix: we can't do better than zeros
+            exec->run(composition::make_fill_array(out->get_values(), out_size,
+                                                   zero<ValueType>()));
         }
-        *res = matrix::Dense<ValueType>::create(
-            (*it)->get_executor(), gko::dim<2>{(*it)->get_size()[0], 1});
     }
+    operators.back()->apply(rhs, lend(out));
+    // apply following operators
+    // alternate intermediate vectors between beginning/end of storage
+    auto reversed_storage = true;
+    for (auto i = operators.size() - 2; i > 0; --i) {
+        // swap in and out
+        auto in = std::move(out);
+        // build new intermediate vector
+        op_size = operators[i]->get_size();
+        out_dim[0] = op_size[0];
+        out_size = out_dim[0] * num_rhs;
+        auto out_data =
+            data + (reversed_storage ? storage_size - out_size : size_type{});
+        reversed_storage = !reversed_storage;
+        out = Dense::create(exec, out_dim,
+                            Array<ValueType>::view(exec, out_size, out_data),
+                            num_rhs);
+        // for operators with initial guess: set initial guess
+        if (operators[i]->apply_uses_initial_guess()) {
+            if (op_size[0] == op_size[1]) {
+                // square matrix: we can use the previous output
+                exec->copy(out_size, in->get_const_values(), out->get_values());
+            } else {
+                // rectangular matrix: we can't do better than zeros
+                exec->run(composition::make_fill_array(
+                    out->get_values(), out_size, zero<ValueType>()));
+            }
+        }
+        // apply operator
+        operators[i]->apply(lend(in), lend(out));
+    }
+
+    return std::move(out);
 }
 
 
-inline const LinOp *apply_inner_operators(
-    const std::vector<std::shared_ptr<const LinOp>> &operators,
-    const std::vector<std::unique_ptr<LinOp>> &intermediate, const LinOp *rhs)
+// Builds a new Composition on the same executor with transposed size. Its
+// operators are the transposes of this composition's operators in reverse
+// order (the operators are traversed with rbegin()/rend()). Every operator
+// has to be castable to Transposable.
+template <typename ValueType>
+std::unique_ptr<LinOp> Composition<ValueType>::transpose() const
 {
-    for (auto i = operators.size() - 1; i > 0u; --i) {
-        auto solution = lend(intermediate[i - 1]);
-        operators[i]->apply(rhs, solution);
-        rhs = solution;
-    }
-    return rhs;
+    auto transposed = Composition<ValueType>::create(this->get_executor());
+    transposed->set_size(gko::transpose(this->get_size()));
+    // transpose and reverse operators
+    std::transform(this->get_operators().rbegin(), this->get_operators().rend(),
+                   std::back_inserter(transposed->operators_),
+                   [](const std::shared_ptr<const LinOp> &op) {
+                       return share(as<Transposable>(op)->transpose());
+                   });
+
+    return std::move(transposed);
 }
 
 
-}  // namespace
+// Builds a new Composition on the same executor with transposed size. Its
+// operators are the conjugate transposes of this composition's operators in
+// reverse order. Every operator has to be castable to Transposable.
+template <typename ValueType>
+std::unique_ptr<LinOp> Composition<ValueType>::conj_transpose() const
+{
+    auto result = Composition<ValueType>::create(this->get_executor());
+    result->set_size(gko::transpose(this->get_size()));
+
+    // walk the operators back to front and append their conjugate transposes
+    const auto &ops = this->get_operators();
+    for (auto it = ops.rbegin(); it != ops.rend(); ++it) {
+        result->operators_.push_back(
+            share(as<Transposable>(*it)->conj_transpose()));
+    }
+
+    return std::move(result);
+}
 
 
 template <typename ValueType>
 void Composition<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
-    cache_.intermediate.resize(operators_.size() - 1);
-    allocate_vectors<ValueType>(begin(operators_) + 1, end(operators_),
-                                begin(cache_.intermediate));
-    operators_[0]->apply(
-        apply_inner_operators(operators_, cache_.intermediate, b), x);
+    // With more than one operator, the operators 1 .. size - 1 are applied
+    // to b first (using storage_ for the intermediate vectors), and
+    // operators_[0] is then applied to that result. A single operator is
+    // applied to b directly.
+    if (operators_.size() > 1) {
+        operators_[0]->apply(
+            lend(apply_inner_operators(operators_, storage_, b)), x);
+    } else {
+        operators_[0]->apply(b, x);
+    }
 }
 
 
@@ -84,12 +177,13 @@ template <typename ValueType>
 void Composition<ValueType>::apply_impl(const LinOp *alpha, const LinOp *b,
                                         const LinOp *beta, LinOp *x) const
 {
-    cache_.intermediate.resize(operators_.size() - 1);
-    allocate_vectors<ValueType>(begin(operators_) + 1, end(operators_),
-                                begin(cache_.intermediate));
-    operators_[0]->apply(
-        alpha, apply_inner_operators(operators_, cache_.intermediate, b), beta,
-        x);
+    if (operators_.size() > 1) {  // alpha/beta are used by operators_[0] only
+        operators_[0]->apply(
+            alpha, lend(apply_inner_operators(operators_, storage_, b)), beta,
+            x);
+    } else {  // no inner operators: forward all four arguments unchanged
+        operators_[0]->apply(alpha, b, beta, x);
+    }
 }
 
 
diff --git a/core/base/executor.cpp b/core/base/executor.cpp
index 4c2d6828ee6..9d80ad818f0 100644
--- a/core/base/executor.cpp
+++ b/core/base/executor.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -49,6 +49,10 @@ void Operation::run(std::shared_ptr<const CudaExecutor> executor) const
     GKO_NOT_IMPLEMENTED;
 
 
+void Operation::run(std::shared_ptr<const HipExecutor> executor) const
+    GKO_NOT_IMPLEMENTED;
+
+
 void Operation::run(std::shared_ptr<const ReferenceExecutor> executor) const
 {
     this->run(static_cast<std::shared_ptr<const OmpExecutor>>(executor));
diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp
index 84be68eca66..1d35b09f8e7 100644
--- a/core/base/extended_float.hpp
+++ b/core/base/extended_float.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cuda_fp16.h>
 
 
+#elif defined(__HIP_DEVICE_COMPILE__)
+
+
+#include <hip/hip_fp16.h>
+
+
 #endif  // __CUDA_ARCH__
 
 
@@ -301,16 +307,16 @@ struct precision_converter<SourceType, ResultType, false> {
  */
 class half {
 public:
-    GKO_ATTRIBUTES half() noexcept = default;
+    half() noexcept = default;
 
     GKO_ATTRIBUTES half(float32 val) noexcept
     {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         const auto tmp = __float2half_rn(val);
         data_ = reinterpret_cast<const uint16 &>(tmp);
-#else   // __CUDA_ARCH__
+#else   // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         data_ = float2half(reinterpret_cast<const uint32 &>(val));
-#endif  // __CUDA_ARCH__
+#endif  // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     }
 
     GKO_ATTRIBUTES half(float64 val) noexcept : half(static_cast<float32>(val))
@@ -318,12 +324,12 @@ class half {
 
     GKO_ATTRIBUTES operator float32() const noexcept
     {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         return __half2float(reinterpret_cast<const __half &>(data_));
-#else   // __CUDA_ARCH__
+#else   // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         const auto bits = half2float(data_);
         return reinterpret_cast<const float32 &>(bits);
-#endif  // __CUDA_ARCH__
+#endif  // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     }
 
     GKO_ATTRIBUTES operator float64() const noexcept
@@ -331,6 +337,14 @@ class half {
         return static_cast<float64>(static_cast<float32>(*this));
     }
 
+    GKO_ATTRIBUTES half operator-() const noexcept
+    {
+        // negate by toggling the sign bit on a copy; *this is left untouched
+        half negated(*this);
+        negated.data_ ^= f16_traits::sign_mask;
+        return negated;
+    }
+
 private:
     using f16_traits = detail::float_traits<float16>;
     using f32_traits = detail::float_traits<float32>;
@@ -434,7 +448,7 @@ class truncated {
     static_assert(component_id < num_components,
                   "This type doesn't have that many components");
 
-    GKO_ATTRIBUTES truncated() noexcept = default;
+    truncated() noexcept = default;
 
     GKO_ATTRIBUTES explicit truncated(const float_type &val) noexcept
     {
@@ -450,6 +464,16 @@ class truncated {
         return reinterpret_cast<const float_type &>(bits);
     }
 
+    GKO_ATTRIBUTES truncated operator-() const noexcept
+    {
+        // sign flip = top bit of component 0; other components stay as-is
+        truncated negated(*this);
+        if (ComponentId == 0) {
+            negated.data_ ^= bits_type{1} << (8 * sizeof(bits_type) - 1);
+        }
+        return negated;
+    }
+
 private:
     bits_type data_;
 };
diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index a9d03603f4c..b7efd21dfe0 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -292,7 +292,7 @@ class IteratorFactory {
 
         Reference operator*() const { return {parent_, arr_index_}; }
 
-        Reference operator[](size_t idx) const
+        Reference operator[](difference_type idx) const
         {
             return {parent_, arr_index_ + idx};
         }
diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index 26995be0b4d..ab2b96cce29 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -139,6 +139,7 @@ class mtx_io {
     struct : entry_format {
         /**
          * reads entry from the input stream
+         *
          * @param  is the input stream
          *
          * @return the matrix entry.
@@ -152,6 +153,7 @@ class mtx_io {
 
         /**
          * writes entry to the output stream
+         *
          * @param  os the output stream
          * @param  value the matrix entry to be written
          */
@@ -186,6 +188,7 @@ class mtx_io {
     struct : entry_format {
         /**
          * reads entry from the input stream
+         *
          * @param  is the input stream
          *
          * @return the matrix entry.
@@ -197,6 +200,7 @@ class mtx_io {
 
         /**
          * writes entry to the output stream
+         *
          * @param  os the output stream
          * @param  value the matrix entry to be written
          */
@@ -237,6 +241,7 @@ class mtx_io {
     struct : entry_format {
         /**
          * reads entry from the input stream
+         *
          * @param  dummy input stream
          *
          * @return the matrix entry(one).
@@ -248,6 +253,7 @@ class mtx_io {
 
         /**
          * writes entry to the output stream
+         *
          * @param  dummy output stream
          * @param  dummy matrix entry to be written
          */
@@ -284,6 +290,7 @@ class mtx_io {
     struct : storage_modifier {
         /**
          * get the reservation size
+         *
          * @param num_rows  the number of rows
          * @param num_cols  the number of columns
          * @param num_nonzeros  the number of non-zeros
@@ -298,6 +305,7 @@ class mtx_io {
 
         /**
          * Insert an entry
+         *
          * @param row  The row where the entry is to be inserted.
          * @param col  The column where the entry is to be inserted.
          * @param entry  the entry to be inserted.
@@ -337,6 +345,7 @@ class mtx_io {
 
         /**
          * Insert an entry
+         *
          * @param row  The row where the entry is to be inserted.
          * @param col  The column where the entry is to be inserted.
          * @param entry  the entry to be inserted.
@@ -366,6 +375,7 @@ class mtx_io {
     struct : storage_modifier {
         /**
          * get the reservation size
+         *
          * @param num_rows
          * @param num_cols
          * @param num_nonzeros  the number of non-zeros
@@ -380,6 +390,7 @@ class mtx_io {
 
         /**
          * Insert an entry
+         *
          * @param row  The row where the entry is to be inserted.
          * @param col  The column where the entry is to be inserted.
          * @param entry  the entry to be inserted.
@@ -409,6 +420,7 @@ class mtx_io {
     struct : storage_modifier {
         /**
          * get the reservation size
+         *
          * @param num_rows
          * @param num_cols
          * @param num_nonzeros  the number of non-zeros
@@ -423,6 +435,7 @@ class mtx_io {
 
         /**
          * Insert an entry
+         *
          * @param row  The row where the entry is to be inserted.
          * @param col  The column where the entry is to be inserted.
          * @param entry  the entry to be inserted.
@@ -667,6 +680,7 @@ class mtx_io {
 
     /**
      * reads and parses the first line of the header
+     *
      * @param is  the input stream
      *
      * @return the data containing the description
@@ -711,6 +725,7 @@ class mtx_io {
 
     /**
      * reads and parses the header
+     *
      * @param is  The input stream to read the header from.
      *
      * @return the header data
diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp
index f2cbaeb6587..a7a6a0b004b 100644
--- a/core/base/perturbation.cpp
+++ b/core/base/perturbation.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/base/utils.hpp b/core/base/utils.hpp
new file mode 100644
index 00000000000..4e6fbc1dfce
--- /dev/null
+++ b/core/base/utils.hpp
@@ -0,0 +1,56 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_INTERNAL_CORE_BASE_UTILS_HPP_
+#define GKO_INTERNAL_CORE_BASE_UTILS_HPP_
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+// Bounds-checked read: yields p[i] when i < size, otherwise sentinel.
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ValueType checked_load(
+    const ValueType *p, IndexType i, IndexType size, ValueType sentinel)
+{
+    return !(i < size) ? sentinel : p[i];
+}
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_INTERNAL_CORE_BASE_UTILS_HPP_
diff --git a/core/base/version.cpp b/core/base/version.cpp
index 16846760594..7993cee5cae 100644
--- a/core/base/version.cpp
+++ b/core/base/version.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -64,6 +64,8 @@ std::ostream &operator<<(std::ostream &os, const version_info &ver_info)
     print_version(os, ver_info.omp_version);
     os << "\n    the CUDA      module is  ";
     print_version(os, ver_info.cuda_version);
+    os << "\n    the HIP       module is  ";  // padded to match rows above
+    print_version(os, ver_info.hip_version);
     return os;
 }
 
diff --git a/core/components/fill_array.hpp b/core/components/fill_array.hpp
new file mode 100644
index 00000000000..7bafb8aecb4
--- /dev/null
+++ b/core/components/fill_array.hpp
@@ -0,0 +1,101 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_COMPONENTS_FILL_ARRAY_HPP_
+#define GKO_CORE_COMPONENTS_FILL_ARRAY_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_FILL_ARRAY_KERNEL(ValueType)                 \
+    void fill_array(std::shared_ptr<const DefaultExecutor> exec, \
+                    ValueType *data, size_type num_entries, ValueType val)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES \
+    template <typename ValueType>    \
+    GKO_DECLARE_FILL_ARRAY_KERNEL(ValueType)
+
+
+namespace omp {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace omp
+
+
+namespace cuda {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace cuda
+
+
+namespace reference {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace reference
+
+
+namespace hip {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_COMPONENTS_FILL_ARRAY_HPP_
diff --git a/core/components/precision_conversion.hpp b/core/components/precision_conversion.hpp
new file mode 100644
index 00000000000..719c596c34e
--- /dev/null
+++ b/core/components/precision_conversion.hpp
@@ -0,0 +1,102 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_COMPONENTS_PRECISION_CONVERSION_HPP_
+#define GKO_CORE_COMPONENTS_PRECISION_CONVERSION_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_CONVERT_PRECISION_KERNEL(SourceType, TargetType)    \
+    void convert_precision(std::shared_ptr<const DefaultExecutor> exec, \
+                           size_type size, const SourceType *in,        \
+                           TargetType *out)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                    \
+    template <typename SourceType, typename TargetType> \
+    GKO_DECLARE_CONVERT_PRECISION_KERNEL(SourceType, TargetType)
+
+
+namespace omp {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace omp
+
+
+namespace cuda {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace cuda
+
+
+namespace reference {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace reference
+
+
+namespace hip {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CORE_COMPONENTS_PRECISION_CONVERSION_HPP_
diff --git a/core/components/prefix_sum.hpp b/core/components/prefix_sum.hpp
new file mode 100644
index 00000000000..d171be831aa
--- /dev/null
+++ b/core/components/prefix_sum.hpp
@@ -0,0 +1,100 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
+#define GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)                 \
+    void prefix_sum(std::shared_ptr<const DefaultExecutor> exec, \
+                    IndexType *counts, size_type num_entries)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES \
+    template <typename IndexType>    \
+    GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)
+
+
+namespace omp {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace omp
+
+
+namespace cuda {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace cuda
+
+
+namespace reference {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace reference
+
+
+namespace hip {
+namespace components {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace components
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt
index 4ca355a3d51..94dfc8ab9f0 100644
--- a/core/device_hooks/CMakeLists.txt
+++ b/core/device_hooks/CMakeLists.txt
@@ -3,6 +3,7 @@ if(NOT GINKGO_BUILD_CUDA)
         $<TARGET_OBJECTS:ginkgo_cuda_device>
         cuda_hooks.cpp)
     ginkgo_compile_features(ginkgo_cuda)
+    target_link_libraries(ginkgo_cuda PUBLIC ginkgo_hip)
     ginkgo_default_includes(ginkgo_cuda)
     ginkgo_install_library(ginkgo_cuda cuda)
 endif()
@@ -13,6 +14,7 @@ if (NOT GINKGO_BUILD_OMP)
         omp_hooks.cpp)
     ginkgo_compile_features(ginkgo_omp)
     target_link_libraries(ginkgo_omp PUBLIC ginkgo_cuda)
+    target_link_libraries(ginkgo_omp PUBLIC ginkgo_hip)
     ginkgo_default_includes(ginkgo_omp)
     ginkgo_install_library(ginkgo_omp omp)
 endif()
@@ -25,3 +27,12 @@ if (NOT GINKGO_BUILD_REFERENCE)
     ginkgo_default_includes(ginkgo_reference)
     ginkgo_install_library(ginkgo_reference reference)
 endif()
+
+if(NOT GINKGO_BUILD_HIP)  # HIP module disabled: build ginkgo_hip from hooks
+    add_library(ginkgo_hip
+        $<TARGET_OBJECTS:ginkgo_hip_device>
+        hip_hooks.cpp)
+    ginkgo_compile_features(ginkgo_hip)
+    ginkgo_default_includes(ginkgo_hip)
+    ginkgo_install_library(ginkgo_hip hip)
+endif()
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 05a7f8bc136..53798c2b596 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,7 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "core/components/precision_conversion.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/factorization/ilu_kernels.hpp"
+#include "core/factorization/par_ict_kernels.hpp"
 #include "core/factorization/par_ilu_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -41,7 +48,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/hybrid_kernels.hpp"
 #include "core/matrix/sellp_kernels.hpp"
 #include "core/matrix/sparsity_csr_kernels.hpp"
+#include "core/preconditioner/isai_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
+#include "core/solver/bicg_kernels.hpp"
 #include "core/solver/bicgstab_kernels.hpp"
 #include "core/solver/cg_kernels.hpp"
 #include "core/solver/cgs_kernels.hpp"
@@ -51,7 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/solver/lower_trs_kernels.hpp"
 #include "core/solver/upper_trs_kernels.hpp"
 #include "core/stop/criterion_kernels.hpp"
-#include "core/stop/residual_norm_reduction_kernels.hpp"
+#include "core/stop/residual_norm_kernels.hpp"
 
 
 #ifndef GKO_HOOK_MODULE
@@ -62,6 +71,32 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 namespace kernels {
 namespace GKO_HOOK_MODULE {
+namespace components {
+
+
+template <typename SourceType, typename TargetType>
+GKO_DECLARE_CONVERT_PRECISION_KERNEL(SourceType, TargetType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+
+template <typename IndexType>
+GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+// explicitly instantiate for size_type, as this is used in the SellP format
+template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type);
+
+template <typename IndexType>
+GKO_DECLARE_FILL_ARRAY_KERNEL(IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type);
+
+
+}  // namespace components
+
+
 namespace dense {
 
 
@@ -164,6 +199,29 @@ GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(ValueType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_ROW_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL);
+
 
 }  // namespace dense
 
@@ -190,6 +248,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
 }  // namespace cg
 
 
+namespace bicg {
+
+
+template <typename ValueType>
+GKO_DECLARE_BICG_INITIALIZE_KERNEL(ValueType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+
+template <typename ValueType>
+GKO_DECLARE_BICG_STEP_1_KERNEL(ValueType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+
+template <typename ValueType>
+GKO_DECLARE_BICG_STEP_2_KERNEL(ValueType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+
+
+}  // namespace bicg
+
+
 namespace lower_trs {
 
 
@@ -422,6 +502,22 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
@@ -469,6 +565,30 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
@@ -667,6 +787,18 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
@@ -680,20 +812,102 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 }  // namespace jacobi
 
 
-namespace par_ilu_factorization {
+namespace isai {
 
 
 template <typename ValueType, typename IndexType>
-GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType)
+GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL);
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
-GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL(ValueType, IndexType)
+GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL);
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+
+
+namespace factorization {
+
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+
+
+namespace ilu_factorization {
+
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+
+
+}  // namespace ilu_factorization
+
+
+namespace par_ict_factorization {
+
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+
+
+namespace par_ilu_factorization {
+
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL(ValueType, IndexType)
@@ -705,6 +919,43 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace par_ilu_factorization
 
 
+namespace par_ilut_factorization {
+
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+
+
 namespace set_all_statuses {
 
 
@@ -715,16 +966,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 }  // namespace set_all_statuses
 
 
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 
 template <typename ValueType>
-GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(ValueType)
+GKO_DECLARE_RESIDUAL_NORM_KERNEL(ValueType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+    GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace GKO_HOOK_MODULE
 }  // namespace kernels
 }  // namespace gko
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index 884b85425b3..d41d77d24d9 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <memory>
+#include <string>
+
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
 
 
@@ -50,10 +55,10 @@ version version_info::get_cuda_version() noexcept
 
 
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
-    int device_id, std::shared_ptr<Executor> master)
+    int device_id, std::shared_ptr<Executor> master, bool device_reset)
 {
     return std::shared_ptr<CudaExecutor>(
-        new CudaExecutor(device_id, std::move(master)));
+        new CudaExecutor(device_id, std::move(master), device_reset));
 }
 
 
@@ -70,8 +75,7 @@ void CudaExecutor::raw_free(void *ptr) const noexcept
 }
 
 
-void *CudaExecutor::raw_alloc(size_type num_bytes) const
-    GKO_NOT_COMPILED(nvidia);
+void *CudaExecutor::raw_alloc(size_type num_bytes) const GKO_NOT_COMPILED(cuda);
 
 
 void CudaExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes,
@@ -84,6 +88,11 @@ void CudaExecutor::raw_copy_to(const CudaExecutor *, size_type num_bytes,
     GKO_NOT_COMPILED(cuda);
 
 
+void CudaExecutor::raw_copy_to(const HipExecutor *, size_type num_bytes,
+                               const void *src_ptr, void *dest_ptr) const
+    GKO_NOT_COMPILED(cuda);
+
+
 void CudaExecutor::synchronize() const GKO_NOT_COMPILED(cuda);
 
 
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
new file mode 100644
index 00000000000..a2e288b4157
--- /dev/null
+++ b/core/device_hooks/hip_hooks.cpp
@@ -0,0 +1,135 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <memory>
+#include <string>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/version.hpp>
+
+
+namespace gko {
+
+
+version version_info::get_hip_version() noexcept
+{
+    // Placeholder used when the HIP module is not compiled: report version
+    // 1.1.0 carrying the special "not compiled" tag.
+    return {1, 1, 0, "not compiled"};
+}
+
+
+std::shared_ptr<HipExecutor> HipExecutor::create(
+    int device_id, std::shared_ptr<Executor> master, bool device_reset)
+{
+    return std::shared_ptr<HipExecutor>(
+        new HipExecutor(device_id, std::move(master), device_reset));
+}
+
+
+void OmpExecutor::raw_copy_to(const HipExecutor *, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+    GKO_NOT_COMPILED(hip);
+
+
+void HipExecutor::raw_free(void *ptr) const noexcept
+{
+    // Free must never fail, as it can be called in destructors.
+    // If the HIP module was not compiled, the library couldn't have
+    // allocated the memory, so there is no need to deallocate it.
+}
+
+
+void *HipExecutor::raw_alloc(size_type num_bytes) const GKO_NOT_COMPILED(hip);
+
+
+void HipExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+    GKO_NOT_COMPILED(hip);
+
+
+void HipExecutor::raw_copy_to(const CudaExecutor *, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+    GKO_NOT_COMPILED(hip);
+
+
+void HipExecutor::raw_copy_to(const HipExecutor *, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+    GKO_NOT_COMPILED(hip);
+
+
+void HipExecutor::synchronize() const GKO_NOT_COMPILED(hip);
+
+
+void HipExecutor::run(const Operation &op) const
+{
+    op.run(
+        std::static_pointer_cast<const HipExecutor>(this->shared_from_this()));
+}
+
+
+std::string HipError::get_error(int64)
+{
+    return "ginkgo HIP module is not compiled";
+}
+
+
+std::string HipblasError::get_error(int64)
+{
+    return "ginkgo HIP module is not compiled";
+}
+
+
+std::string HipsparseError::get_error(int64)
+{
+    return "ginkgo HIP module is not compiled";
+}
+
+
+int HipExecutor::get_num_devices() { return 0; }
+
+
+void HipExecutor::set_gpu_property() {}
+
+
+void HipExecutor::init_handles() {}
+
+
+}  // namespace gko
+
+
+#define GKO_HOOK_MODULE hip
+#include "core/device_hooks/common_kernels.inc.cpp"
+#undef GKO_HOOK_MODULE
diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp
index 4fb251758a8..131fa51a4d8 100644
--- a/core/device_hooks/omp_hooks.cpp
+++ b/core/device_hooks/omp_hooks.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/device_hooks/reference_hooks.cpp b/core/device_hooks/reference_hooks.cpp
index 7e7ab287ca5..ea7742776c8 100644
--- a/core/device_hooks/reference_hooks.cpp
+++ b/core/device_hooks/reference_hooks.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/base/version.hpp>
 
 
diff --git a/core/devices/CMakeLists.txt b/core/devices/CMakeLists.txt
index 67e8a6fab58..2a5626c0018 100644
--- a/core/devices/CMakeLists.txt
+++ b/core/devices/CMakeLists.txt
@@ -8,4 +8,5 @@ endfunction()
 
 add_subdirectory(omp)
 add_subdirectory(cuda)
+add_subdirectory(hip)
 add_subdirectory(reference)
diff --git a/core/devices/cuda/executor.cpp b/core/devices/cuda/executor.cpp
index b377b2afa94..3566578a681 100644
--- a/core/devices/cuda/executor.cpp
+++ b/core/devices/cuda/executor.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/devices/hip/CMakeLists.txt b/core/devices/hip/CMakeLists.txt
new file mode 100644
index 00000000000..7f855b3e2e9
--- /dev/null
+++ b/core/devices/hip/CMakeLists.txt
@@ -0,0 +1,3 @@
+ginkgo_add_object_library(ginkgo_hip_device
+    executor.cpp)
+
diff --git a/core/devices/hip/executor.cpp b/core/devices/hip/executor.cpp
new file mode 100644
index 00000000000..f4787523290
--- /dev/null
+++ b/core/devices/hip/executor.cpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace gko {
+
+
+std::shared_ptr<Executor> HipExecutor::get_master() noexcept { return master_; }
+
+
+std::shared_ptr<const Executor> HipExecutor::get_master() const noexcept
+{
+    return master_;
+}
+
+
+int HipExecutor::num_execs[max_devices];
+
+
+std::mutex HipExecutor::mutex[max_devices];
+
+
+}  // namespace gko
diff --git a/core/devices/omp/executor.cpp b/core/devices/omp/executor.cpp
index 193672ef229..e53a1b53c43 100644
--- a/core/devices/omp/executor.cpp
+++ b/core/devices/omp/executor.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -68,7 +68,9 @@ void *OmpExecutor::raw_alloc(size_type num_bytes) const
 void OmpExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes,
                               const void *src_ptr, void *dest_ptr) const
 {
-    std::memcpy(dest_ptr, src_ptr, num_bytes);
+    if (num_bytes > 0) {
+        std::memcpy(dest_ptr, src_ptr, num_bytes);
+    }
 }
 
 
diff --git a/core/devices/reference/dummy.cpp b/core/devices/reference/dummy.cpp
index 971afc395e3..a2f3f380cbe 100644
--- a/core/devices/reference/dummy.cpp
+++ b/core/devices/reference/dummy.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp
new file mode 100644
index 00000000000..f7c25964dde
--- /dev/null
+++ b/core/factorization/factorization_kernels.hpp
@@ -0,0 +1,142 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_FACTORIZATION_KERNELS_HPP_
+#define GKO_CORE_FACTORIZATION_FACTORIZATION_KERNELS_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType,   \
+                                                               IndexType)   \
+    void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec, \
+                               matrix::Csr<ValueType, IndexType> *mtx,      \
+                               bool is_sorted)
+
+#define GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \
+                                                                 IndexType) \
+    void initialize_row_ptrs_l_u(                                           \
+        std::shared_ptr<const DefaultExecutor> exec,                        \
+        const matrix::Csr<ValueType, IndexType> *system_matrix,             \
+        IndexType *l_row_ptrs, IndexType *u_row_ptrs)
+
+#define GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL(ValueType, IndexType) \
+    void initialize_l_u(                                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const matrix::Csr<ValueType, IndexType> *system_matrix,               \
+        matrix::Csr<ValueType, IndexType> *l_factor,                          \
+        matrix::Csr<ValueType, IndexType> *u_factor)
+
+#define GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL(ValueType, \
+                                                               IndexType) \
+    void initialize_row_ptrs_l(                                           \
+        std::shared_ptr<const DefaultExecutor> exec,                      \
+        const matrix::Csr<ValueType, IndexType> *system_matrix,           \
+        IndexType *l_row_ptrs)
+
+#define GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL(ValueType, IndexType)   \
+    void initialize_l(std::shared_ptr<const DefaultExecutor> exec,            \
+                      const matrix::Csr<ValueType, IndexType> *system_matrix, \
+                      matrix::Csr<ValueType, IndexType> *l_factor,            \
+                      bool diag_sqrt)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                       \
+    template <typename ValueType, typename IndexType>                      \
+    GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType,      \
+                                                           IndexType);     \
+    template <typename ValueType, typename IndexType>                      \
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType,    \
+                                                             IndexType);   \
+    template <typename ValueType, typename IndexType>                      \
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                      \
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL(ValueType,      \
+                                                           IndexType);     \
+    template <typename ValueType, typename IndexType>                      \
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL(ValueType, IndexType)
+
+
+namespace omp {
+namespace factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace factorization
+}  // namespace omp
+
+
+namespace cuda {
+namespace factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace factorization
+}  // namespace cuda
+
+
+namespace reference {
+namespace factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace factorization
+}  // namespace reference
+
+
+namespace hip {
+namespace factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace factorization
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_FACTORIZATION_KERNELS_HPP_
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
new file mode 100644
index 00000000000..c2f397151d3
--- /dev/null
+++ b/core/factorization/ilu.cpp
@@ -0,0 +1,126 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/ilu.hpp>
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/factorization/ilu_kernels.hpp"
+#include "core/factorization/par_ilu_kernels.hpp"
+
+
+namespace gko {
+namespace factorization {
+namespace ilu_factorization {
+
+
+GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::compute_lu);
+GKO_REGISTER_OPERATION(add_diagonal_elements,
+                       factorization::add_diagonal_elements);
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
+                       factorization::initialize_row_ptrs_l_u);
+GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
+
+
+}  // namespace ilu_factorization
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
+    const std::shared_ptr<const LinOp> &system_matrix) const
+{
+    GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
+
+    const auto exec = this->get_executor();
+
+    // Converts the system matrix to CSR.
+    // Throws an exception if it is not convertible.
+    auto local_system_matrix = matrix_type::create(exec);
+    as<ConvertibleTo<matrix_type>>(system_matrix.get())
+        ->convert_to(local_system_matrix.get());
+
+    // Add missing diagonal entries as explicit zeros (is_sorted = false)
+    exec->run(ilu_factorization::make_add_diagonal_elements(
+        local_system_matrix.get(), false));
+
+    // Compute the ILU factors, stored combined in local_system_matrix
+    exec->run(ilu_factorization::make_compute_ilu(local_system_matrix.get()));
+
+    // Separate L and U factors, step 1: row pointers (they yield the nnz)
+    const auto matrix_size = local_system_matrix->get_size();
+    const auto num_rows = matrix_size[0];
+    Array<IndexType> l_row_ptrs{exec, num_rows + 1};
+    Array<IndexType> u_row_ptrs{exec, num_rows + 1};
+    exec->run(ilu_factorization::make_initialize_row_ptrs_l_u(
+        local_system_matrix.get(), l_row_ptrs.get_data(),
+        u_row_ptrs.get_data()));
+
+    // Read each factor's nnz (last row-pointer entry) from device memory
+    auto l_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(l_row_ptrs.get_data() + num_rows));
+    auto u_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(u_row_ptrs.get_data() + num_rows));
+
+    // Allocate column/value arrays and assemble the L and U CSR matrices
+    Array<IndexType> l_col_idxs{exec, l_nnz};
+    Array<ValueType> l_vals{exec, l_nnz};
+    std::shared_ptr<matrix_type> l_factor = matrix_type::create(
+        exec, matrix_size, std::move(l_vals), std::move(l_col_idxs),
+        std::move(l_row_ptrs), parameters_.l_strategy);
+    Array<IndexType> u_col_idxs{exec, u_nnz};
+    Array<ValueType> u_vals{exec, u_nnz};
+    std::shared_ptr<matrix_type> u_factor = matrix_type::create(
+        exec, matrix_size, std::move(u_vals), std::move(u_col_idxs),
+        std::move(u_row_ptrs), parameters_.u_strategy);
+
+    // Separate L and U factors, step 2: fill column indices and values
+    exec->run(ilu_factorization::make_initialize_l_u(
+        local_system_matrix.get(), l_factor.get(), u_factor.get()));
+
+    return Composition<ValueType>::create(std::move(l_factor),
+                                          std::move(u_factor));
+}
+
+
+#define GKO_DECLARE_ILU(ValueType, IndexType) class Ilu<ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU);
+
+
+}  // namespace factorization
+}  // namespace gko
diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp
new file mode 100644
index 00000000000..17602ac4ab4
--- /dev/null
+++ b/core/factorization/ilu_kernels.hpp
@@ -0,0 +1,105 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_
+#define GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_
+
+
+#include <ginkgo/core/factorization/ilu.hpp>
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+// Signature of the ILU kernel compute_lu, which receives a mutable CSR matrix.
+#define GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType)  \
+    void compute_lu(std::shared_ptr<const DefaultExecutor> exec, \
+                    matrix::Csr<ValueType, IndexType> *system_matrix)
+
+// Helper (undefined again below) that declares every kernel as a template.
+#define GKO_DECLARE_ALL_AS_TEMPLATES                  \
+    template <typename ValueType, typename IndexType> \
+    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType)
+
+// Declare the kernel in each of the omp, cuda, reference and hip namespaces.
+namespace omp {
+namespace ilu_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace ilu_factorization
+}  // namespace omp
+
+
+namespace cuda {
+namespace ilu_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace ilu_factorization
+}  // namespace cuda
+
+
+namespace reference {
+namespace ilu_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace ilu_factorization
+}  // namespace reference
+
+
+namespace hip {
+namespace ilu_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace ilu_factorization
+}  // namespace hip
+
+// Keep the helper macro local to this header.
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_
diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp
new file mode 100644
index 00000000000..46e9f8cebb6
--- /dev/null
+++ b/core/factorization/par_ict.cpp
@@ -0,0 +1,305 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/par_ict.hpp>
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/factorization/par_ict_kernels.hpp"
+#include "core/factorization/par_ilu_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+
+
+namespace gko {
+namespace factorization {
+namespace par_ict_factorization {
+
+// Each registration provides the make_<name>() helper passed to exec->run().
+GKO_REGISTER_OPERATION(threshold_select,
+                       par_ilut_factorization::threshold_select);
+GKO_REGISTER_OPERATION(threshold_filter,
+                       par_ilut_factorization::threshold_filter);
+GKO_REGISTER_OPERATION(threshold_filter_approx,
+                       par_ilut_factorization::threshold_filter_approx);
+GKO_REGISTER_OPERATION(add_candidates, par_ict_factorization::add_candidates);
+GKO_REGISTER_OPERATION(compute_factor, par_ict_factorization::compute_factor);
+// shared factorization kernels; generate_l_lt uses them to set up L
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l,
+                       factorization::initialize_row_ptrs_l);
+GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);
+// CSR matrix kernels (conjugate transpose, COO conversion, sparse product)
+GKO_REGISTER_OPERATION(csr_conj_transpose, csr::conj_transpose);
+GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo);
+GKO_REGISTER_OPERATION(spgemm, csr::spgemm);
+
+
+}  // namespace par_ict_factorization
+
+// Allow unqualified use of the make_* helpers in the rest of this namespace.
+using par_ict_factorization::make_add_candidates;
+using par_ict_factorization::make_compute_factor;
+using par_ict_factorization::make_convert_to_coo;
+using par_ict_factorization::make_csr_conj_transpose;
+using par_ict_factorization::make_initialize_l;
+using par_ict_factorization::make_initialize_row_ptrs_l;
+using par_ict_factorization::make_spgemm;
+using par_ict_factorization::make_threshold_filter;
+using par_ict_factorization::make_threshold_filter_approx;
+using par_ict_factorization::make_threshold_select;
+
+// Working data of one ParIct factorization: factors, temporaries, settings.
+template <typename ValueType, typename IndexType>
+struct ParIctState {
+    using CsrMatrix = matrix::Csr<ValueType, IndexType>;
+    using CooMatrix = matrix::Coo<ValueType, IndexType>;
+    using CsrBuilder = matrix::CsrBuilder<ValueType, IndexType>;
+    using CooBuilder = matrix::CooBuilder<ValueType, IndexType>;
+    using Scalar = matrix::Dense<ValueType>;
+    // the executor on which the kernels are being executed
+    std::shared_ptr<const Executor> exec;
+    // max number of non-zeros L is supposed to have
+    IndexType l_nnz_limit;
+    // use the approximate selection/filter kernels?
+    bool use_approx_select;
+    // system matrix A (non-owning pointer)
+    const CsrMatrix *system_matrix;
+    // current lower factor L
+    std::unique_ptr<CsrMatrix> l;
+    // current upper factor L^H (conjugate transpose of l)
+    std::unique_ptr<CsrMatrix> lt;
+    // current product L * L^H
+    std::unique_ptr<CsrMatrix> llt;
+    // temporary lower factor L' before filtering
+    std::unique_ptr<CsrMatrix> l_new;
+    // factor currently being updated with asynchronous iterations, as COO
+    std::unique_ptr<CooMatrix> l_coo;
+    // scratch array passed to threshold_select / threshold_filter_approx
+    Array<ValueType> selection_tmp;
+    // second scratch array, passed to threshold_select only
+    Array<remove_complex<ValueType>> selection_tmp2;
+    // strategy to be used by the lower factor
+    std::shared_ptr<typename CsrMatrix::strategy_type> l_strategy;
+    // strategy to be used by the upper factor
+    std::shared_ptr<typename CsrMatrix::strategy_type> lt_strategy;
+    // Stores the arguments, allocates lt/llt/l_new/l_coo and sets lt = L^H.
+    ParIctState(std::shared_ptr<const Executor> exec_in,
+                const CsrMatrix *system_matrix_in,
+                std::unique_ptr<CsrMatrix> l_in, IndexType l_nnz_limit,
+                bool use_approx_select,
+                std::shared_ptr<typename CsrMatrix::strategy_type> l_strategy_,
+                std::shared_ptr<typename CsrMatrix::strategy_type> lt_strategy_)
+        : exec{std::move(exec_in)},
+          l_nnz_limit{l_nnz_limit},
+          use_approx_select{use_approx_select},
+          system_matrix{system_matrix_in},
+          l{std::move(l_in)},
+          selection_tmp{exec},
+          selection_tmp2{exec},
+          l_strategy{std::move(l_strategy_)},
+          lt_strategy{std::move(lt_strategy_)}
+    {
+        auto mtx_size = system_matrix->get_size();
+        auto l_nnz = l->get_num_stored_elements();
+        lt = CsrMatrix::create(exec, mtx_size, l_nnz);
+        llt = CsrMatrix::create(exec, mtx_size);
+        l_new = CsrMatrix::create(exec, mtx_size);
+        l_coo = CooMatrix::create(exec, mtx_size);
+        exec->run(make_csr_conj_transpose(l.get(), lt.get()));
+    }
+    // Sets the strategies and moves l and lt into the returned Composition.
+    std::unique_ptr<Composition<ValueType>> to_factors() &&
+    {
+        l->set_strategy(l_strategy);
+        lt->set_strategy(lt_strategy);
+        return Composition<ValueType>::create(std::move(l), std::move(lt));
+    }
+    // Runs one iteration of the factorization (defined further below).
+    void iterate();
+};
+
+// Builds the initial L from the system matrix, iterates, and returns {L, L^H}.
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Composition<ValueType>>
+ParIct<ValueType, IndexType>::generate_l_lt(
+    const std::shared_ptr<const LinOp> &system_matrix) const
+{
+    using CsrMatrix = matrix::Csr<ValueType, IndexType>;
+
+    GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
+    // reject non-positive fill-in limits before they reach the kernels
+    GKO_ASSERT_EQ(parameters_.fill_in_limit > 0.0, true);
+
+    const auto exec = this->get_executor();
+
+    // convert to CSR on exec and/or sort a private copy if necessary
+    std::unique_ptr<CsrMatrix> csr_system_matrix_unique_ptr{};
+    auto csr_system_matrix =
+        dynamic_cast<const CsrMatrix *>(system_matrix.get());
+    if (csr_system_matrix == nullptr ||
+        csr_system_matrix->get_executor() != exec) {
+        csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
+        as<ConvertibleTo<CsrMatrix>>(system_matrix.get())
+            ->convert_to(csr_system_matrix_unique_ptr.get());
+        csr_system_matrix = csr_system_matrix_unique_ptr.get();
+    }
+    if (!parameters_.skip_sorting) {
+        if (csr_system_matrix_unique_ptr == nullptr) {
+            csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
+            csr_system_matrix_unique_ptr->copy_from(csr_system_matrix);
+        }
+        csr_system_matrix_unique_ptr->sort_by_column_index();
+        csr_system_matrix = csr_system_matrix_unique_ptr.get();
+    }
+
+    // initialize the L matrix data structures
+    const auto num_rows = csr_system_matrix->get_size()[0];
+    Array<IndexType> l_row_ptrs_array{exec, num_rows + 1};
+    auto l_row_ptrs = l_row_ptrs_array.get_data();
+    exec->run(make_initialize_row_ptrs_l(csr_system_matrix, l_row_ptrs));
+    // read the nnz of L (last row pointer) back to the host
+    auto l_nnz =
+        static_cast<size_type>(exec->copy_val_to_host(l_row_ptrs + num_rows));
+
+    auto mtx_size = csr_system_matrix->get_size();
+    auto l = CsrMatrix::create(exec, mtx_size, Array<ValueType>{exec, l_nnz},
+                               Array<IndexType>{exec, l_nnz},
+                               std::move(l_row_ptrs_array));
+
+    // initialize L from the system matrix; the `true` flag is kernel-defined
+    exec->run(make_initialize_l(csr_system_matrix, l.get(), true));
+
+    // nnz limit for L: initial nnz scaled by fill_in_limit, then truncated
+    auto l_nnz_limit =
+        static_cast<IndexType>(l_nnz * parameters_.fill_in_limit);
+
+    ParIctState<ValueType, IndexType> state{exec,
+                                            csr_system_matrix,
+                                            std::move(l),
+                                            l_nnz_limit,
+                                            parameters_.approximate_select,
+                                            parameters_.l_strategy,
+                                            parameters_.lt_strategy};
+
+    for (size_type it = 0; it < parameters_.iterations; ++it) {
+        state.iterate();
+    }
+
+    return std::move(state).to_factors();
+}
+
+// One ParIct iteration: add candidates, sweep, filter, sweep, refresh L^H.
+template <typename ValueType, typename IndexType>
+void ParIctState<ValueType, IndexType>::iterate()
+{
+    // compute L * L^H
+    exec->run(make_spgemm(l.get(), lt.get(), llt.get()));
+
+    // add new candidates to L' factor
+    exec->run(
+        make_add_candidates(llt.get(), system_matrix, l.get(), l_new.get()));
+
+    // update L'(COO) sizes and pointers
+    {
+        auto l_nnz = l_new->get_num_stored_elements();
+        CooBuilder l_builder{l_coo.get()};
+        // resize arrays that will be filled
+        l_builder.get_row_idx_array().resize_and_reset(l_nnz);
+        // alias the column indices and values of L' instead of copying them
+        l_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, l_nnz, l_new->get_col_idxs());
+        l_builder.get_value_array() =
+            Array<ValueType>::view(exec, l_nnz, l_new->get_values());
+    }
+
+    // convert L' into COO format
+    exec->run(make_convert_to_coo(l_new.get(), l_coo.get()));
+
+    // execute asynchronous iteration on L'
+    exec->run(make_compute_factor(system_matrix, l_new.get(), l_coo.get()));
+
+    // determine ranks for selection/filtering
+    IndexType l_nnz = l_new->get_num_stored_elements();
+    // make sure that the rank is in [0, l_nnz)
+    auto l_filter_rank = std::max<IndexType>(0, l_nnz - l_nnz_limit - 1);
+    if (use_approx_select) {
+        remove_complex<ValueType> tmp{};
+        // remove approximately smallest candidates
+        exec->run(make_threshold_filter_approx(l_new.get(), l_filter_rank,
+                                               selection_tmp, tmp, l.get(),
+                                               l_coo.get()));
+    } else {
+        // select threshold to remove smallest candidates
+        remove_complex<ValueType> l_threshold{};
+        exec->run(make_threshold_select(l_new.get(), l_filter_rank,
+                                        selection_tmp, selection_tmp2,
+                                        l_threshold));
+
+        // remove smallest candidates
+        exec->run(make_threshold_filter(l_new.get(), l_threshold, l.get(),
+                                        l_coo.get(), true));
+    }
+
+    // execute asynchronous iteration on the filtered L
+    exec->run(make_compute_factor(system_matrix, l.get(), l_coo.get()));
+
+    // resize L^H to the new nnz of L, then convert L to L^H
+    {
+        auto l_nnz = l->get_num_stored_elements();
+        CsrBuilder lt_builder{lt.get()};
+        lt_builder.get_col_idx_array().resize_and_reset(l_nnz);
+        lt_builder.get_value_array().resize_and_reset(l_nnz);
+    }
+    exec->run(make_csr_conj_transpose(l.get(), lt.get()));
+}
+
+
+#define GKO_DECLARE_PAR_ICT(ValueType, IndexType) \
+    class ParIct<ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT);
+
+
+}  // namespace factorization
+}  // namespace gko
diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp
new file mode 100644
index 00000000000..f02b6ac7bb6
--- /dev/null
+++ b/core/factorization/par_ict_kernels.hpp
@@ -0,0 +1,116 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_
+#define GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_
+
+
+#include <ginkgo/core/factorization/par_ict.hpp>
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+// add_candidates: reads llt, a and l; writes the candidate-extended l_new.
+#define GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL(ValueType, IndexType) \
+    void add_candidates(std::shared_ptr<const DefaultExecutor> exec,    \
+                        const matrix::Csr<ValueType, IndexType> *llt,   \
+                        const matrix::Csr<ValueType, IndexType> *a,     \
+                        const matrix::Csr<ValueType, IndexType> *l,     \
+                        matrix::Csr<ValueType, IndexType> *l_new)
+// compute_factor: updates l in place given a and the COO matrix l_coo.
+#define GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL(ValueType, IndexType) \
+    void compute_factor(std::shared_ptr<const DefaultExecutor> exec,    \
+                        const matrix::Csr<ValueType, IndexType> *a,     \
+                        matrix::Csr<ValueType, IndexType> *l,           \
+                        const matrix::Coo<ValueType, IndexType> *l_coo)
+// Helper (undefined again below) declaring both kernels as templates.
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                 \
+    template <typename ValueType, typename IndexType>                \
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                \
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL(ValueType, IndexType)
+
+// Declare both kernels in the omp, cuda, reference and hip namespaces.
+namespace omp {
+namespace par_ict_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ict_factorization
+}  // namespace omp
+
+
+namespace cuda {
+namespace par_ict_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ict_factorization
+}  // namespace cuda
+
+
+namespace reference {
+namespace par_ict_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ict_factorization
+}  // namespace reference
+
+
+namespace hip {
+namespace par_ict_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ict_factorization
+}  // namespace hip
+
+// Keep the helper macro local to this header.
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_
diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
index 3d6763f7926..d61a27747af 100644
--- a/core/factorization/par_ilu.cpp
+++ b/core/factorization/par_ilu.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "core/factorization/factorization_kernels.hpp"
 #include "core/factorization/par_ilu_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
@@ -53,9 +54,11 @@ namespace factorization {
 namespace par_ilu_factorization {
 
 
+GKO_REGISTER_OPERATION(add_diagonal_elements,
+                       factorization::add_diagonal_elements);
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
-                       par_ilu_factorization::initialize_row_ptrs_l_u);
-GKO_REGISTER_OPERATION(initialize_l_u, par_ilu_factorization::initialize_l_u);
+                       factorization::initialize_row_ptrs_l_u);
+GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
 GKO_REGISTER_OPERATION(compute_l_u_factors,
                        par_ilu_factorization::compute_l_u_factors);
 GKO_REGISTER_OPERATION(csr_transpose, csr::transpose);
@@ -67,7 +70,9 @@ GKO_REGISTER_OPERATION(csr_transpose, csr::transpose);
 template <typename ValueType, typename IndexType>
 std::unique_ptr<Composition<ValueType>>
 ParIlu<ValueType, IndexType>::generate_l_u(
-    const std::shared_ptr<const LinOp> &system_matrix, bool skip_sorting) const
+    const std::shared_ptr<const LinOp> &system_matrix, bool skip_sorting,
+    std::shared_ptr<typename l_matrix_type::strategy_type> l_strategy,
+    std::shared_ptr<typename u_matrix_type::strategy_type> u_strategy) const
 {
     using CsrMatrix = matrix::Csr<ValueType, IndexType>;
     using CooMatrix = matrix::Coo<ValueType, IndexType>;
@@ -75,33 +80,22 @@ ParIlu<ValueType, IndexType>::generate_l_u(
     GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
 
     const auto exec = this->get_executor();
-    const auto host_exec = exec->get_master();
-
-    // If required, it is also possible to make this a Factory parameter
-    auto csr_strategy = std::make_shared<typename CsrMatrix::cusparse>();
-
-    // Only copies the matrix if it is not on the same executor or was not in
-    // the right format. Throws an exception if it is not convertable.
-    std::unique_ptr<CsrMatrix> csr_system_matrix_unique_ptr{};
-    auto csr_system_matrix =
-        dynamic_cast<const CsrMatrix *>(system_matrix.get());
-    if (csr_system_matrix == nullptr ||
-        csr_system_matrix->get_executor() != exec) {
-        csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
-        as<ConvertibleTo<CsrMatrix>>(system_matrix.get())
-            ->convert_to(csr_system_matrix_unique_ptr.get());
-        csr_system_matrix = csr_system_matrix_unique_ptr.get();
-    }
-    // If it needs to be sorted, copy it if necessary and sort it
+
+    // Converts the system matrix to CSR.
+    // Throws an exception if it is not convertible.
+    auto csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
+    as<ConvertibleTo<CsrMatrix>>(system_matrix.get())
+        ->convert_to(csr_system_matrix_unique_ptr.get());
+    auto csr_system_matrix = csr_system_matrix_unique_ptr.get();
+    // If necessary, sort it
     if (!skip_sorting) {
-        if (csr_system_matrix_unique_ptr == nullptr) {
-            csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
-            csr_system_matrix_unique_ptr->copy_from(csr_system_matrix);
-        }
-        csr_system_matrix_unique_ptr->sort_by_column_index();
-        csr_system_matrix = csr_system_matrix_unique_ptr.get();
+        csr_system_matrix->sort_by_column_index();
     }
 
+    // Add explicit diagonal zero elements if they are missing
+    exec->run(par_ilu_factorization::make_add_diagonal_elements(
+        csr_system_matrix, true));
+
     const auto matrix_size = csr_system_matrix->get_size();
     const auto number_rows = matrix_size[0];
     Array<IndexType> l_row_ptrs{exec, number_rows + 1};
@@ -109,15 +103,11 @@ ParIlu<ValueType, IndexType>::generate_l_u(
     exec->run(par_ilu_factorization::make_initialize_row_ptrs_l_u(
         csr_system_matrix, l_row_ptrs.get_data(), u_row_ptrs.get_data()));
 
-    IndexType l_nnz_it;
-    IndexType u_nnz_it;
-    // Since nnz is always at row_ptrs[m], it can be extracted easily
-    host_exec->copy_from(exec.get(), 1, l_row_ptrs.get_data() + number_rows,
-                         &l_nnz_it);
-    host_exec->copy_from(exec.get(), 1, u_row_ptrs.get_data() + number_rows,
-                         &u_nnz_it);
-    auto l_nnz = static_cast<size_type>(l_nnz_it);
-    auto u_nnz = static_cast<size_type>(u_nnz_it);
+    // Get nnz from device memory
+    auto l_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(l_row_ptrs.get_data() + number_rows));
+    auto u_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(u_row_ptrs.get_data() + number_rows));
 
     // Since `row_ptrs` of L and U is already created, the matrix can be
     // directly created with it
@@ -125,12 +115,12 @@ ParIlu<ValueType, IndexType>::generate_l_u(
     Array<ValueType> l_vals{exec, l_nnz};
     std::shared_ptr<CsrMatrix> l_factor = l_matrix_type::create(
         exec, matrix_size, std::move(l_vals), std::move(l_col_idxs),
-        std::move(l_row_ptrs), csr_strategy);
+        std::move(l_row_ptrs), l_strategy);
     Array<IndexType> u_col_idxs{exec, u_nnz};
     Array<ValueType> u_vals{exec, u_nnz};
     std::shared_ptr<CsrMatrix> u_factor = u_matrix_type::create(
         exec, matrix_size, std::move(u_vals), std::move(u_col_idxs),
-        std::move(u_row_ptrs), csr_strategy);
+        std::move(u_row_ptrs), u_strategy);
 
     exec->run(par_ilu_factorization::make_initialize_l_u(
         csr_system_matrix, l_factor.get(), u_factor.get()));
@@ -173,8 +163,8 @@ ParIlu<ValueType, IndexType>::generate_l_u(
     // Since the transposed version has the exact same non-zero positions
     // as `u_factor`, we can both skip the allocation and the `make_srow()`
     // call from CSR, leaving just the `transpose()` kernel call
-    exec->run(par_ilu_factorization::make_csr_transpose(u_factor.get(),
-                                                        u_factor_transpose));
+    exec->run(par_ilu_factorization::make_csr_transpose(u_factor_transpose,
+                                                        u_factor.get()));
 
     return Composition<ValueType>::create(std::move(l_factor),
                                           std::move(u_factor));
diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp
index df96ff0389f..09bc1dd2596 100644
--- a/core/factorization/par_ilu_kernels.hpp
+++ b/core/factorization/par_ilu_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -48,19 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 namespace kernels {
 
-
-#define GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \
-                                                           IndexType) \
-    void initialize_row_ptrs_l_u(                                     \
-        std::shared_ptr<const DefaultExecutor> exec,                  \
-        const matrix::Csr<ValueType, IndexType> *system_matrix,       \
-        IndexType *l_row_ptrs, IndexType *u_row_ptrs)
-#define GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL(ValueType, IndexType) \
-    void initialize_l_u(                                                \
-        std::shared_ptr<const DefaultExecutor> exec,                    \
-        const matrix::Csr<ValueType, IndexType> *system_matrix,         \
-        matrix::Csr<ValueType, IndexType> *l_factor,                    \
-        matrix::Csr<ValueType, IndexType> *u_factor)
 #define GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL(ValueType, IndexType) \
     void compute_l_u_factors(                                                \
         std::shared_ptr<const DefaultExecutor> exec, size_type iterations,   \
@@ -69,12 +56,8 @@ namespace kernels {
         matrix::Csr<ValueType, IndexType> *u_factor)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                                          \
-    template <typename ValueType, typename IndexType>                         \
-    GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType); \
-    template <typename ValueType, typename IndexType>                         \
-    GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL(ValueType, IndexType);          \
-    template <typename ValueType, typename IndexType>                         \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                  \
+    template <typename ValueType, typename IndexType> \
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL(ValueType, IndexType)
 
 
@@ -105,6 +88,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace par_ilu_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ilu_factorization
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
new file mode 100644
index 00000000000..1eb3dfeb950
--- /dev/null
+++ b/core/factorization/par_ilut.cpp
@@ -0,0 +1,355 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/par_ilut.hpp>
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/factorization/par_ilu_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+
+
+namespace gko {
+namespace factorization {
+namespace par_ilut_factorization {
+
+
+GKO_REGISTER_OPERATION(threshold_select,
+                       par_ilut_factorization::threshold_select);
+GKO_REGISTER_OPERATION(threshold_filter,
+                       par_ilut_factorization::threshold_filter);
+GKO_REGISTER_OPERATION(threshold_filter_approx,
+                       par_ilut_factorization::threshold_filter_approx);
+GKO_REGISTER_OPERATION(add_candidates, par_ilut_factorization::add_candidates);
+GKO_REGISTER_OPERATION(compute_l_u_factors,
+                       par_ilut_factorization::compute_l_u_factors);
+
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
+                       factorization::initialize_row_ptrs_l_u);
+GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
+
+GKO_REGISTER_OPERATION(csr_transpose, csr::transpose);
+GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo);
+GKO_REGISTER_OPERATION(spgemm, csr::spgemm);
+
+
+}  // namespace par_ilut_factorization
+
+
+using par_ilut_factorization::make_add_candidates;
+using par_ilut_factorization::make_compute_l_u_factors;
+using par_ilut_factorization::make_convert_to_coo;
+using par_ilut_factorization::make_csr_transpose;
+using par_ilut_factorization::make_initialize_l_u;
+using par_ilut_factorization::make_initialize_row_ptrs_l_u;
+using par_ilut_factorization::make_spgemm;
+using par_ilut_factorization::make_threshold_filter;
+using par_ilut_factorization::make_threshold_filter_approx;
+using par_ilut_factorization::make_threshold_select;
+
+
+template <typename ValueType, typename IndexType>
+struct ParIlutState {  // working set shared by all iterate() calls of one run
+    using CsrMatrix = matrix::Csr<ValueType, IndexType>;
+    using CooMatrix = matrix::Coo<ValueType, IndexType>;
+    using CsrBuilder = matrix::CsrBuilder<ValueType, IndexType>;
+    using CooBuilder = matrix::CooBuilder<ValueType, IndexType>;
+    using Scalar = matrix::Dense<ValueType>;  // NOTE(review): unused here
+    // executor used to run all kernels and to allocate the temporaries
+    std::shared_ptr<const Executor> exec;
+    // nnz limit for L; iterate() derives the L filter rank from it
+    IndexType l_nnz_limit;
+    // nnz limit for U; iterate() derives the U filter rank from it
+    IndexType u_nnz_limit;
+    // true: iterate() uses the approximate threshold filter kernel
+    bool use_approx_select;
+    // system matrix A (non-owning raw pointer)
+    const CsrMatrix *system_matrix;
+    // current lower factor L
+    std::unique_ptr<CsrMatrix> l;
+    // current upper factor U
+    std::unique_ptr<CsrMatrix> u;
+    // current upper factor U in CSC format (filled by csr_transpose of U)
+    std::unique_ptr<CsrMatrix> u_csc;
+    // current product L * U
+    std::unique_ptr<CsrMatrix> lu;
+    // temporary lower factor L' before filtering
+    std::unique_ptr<CsrMatrix> l_new;
+    // temporary upper factor U' before filtering
+    std::unique_ptr<CsrMatrix> u_new;
+    // temporary upper factor U' in CSC format before filtering
+    std::unique_ptr<CsrMatrix> u_new_csc;
+    // COO form of the lower factor handed to compute_l_u_factors
+    std::unique_ptr<CooMatrix> l_coo;
+    // COO form of the upper factor handed to compute_l_u_factors
+    std::unique_ptr<CooMatrix> u_coo;
+    // scratch array passed to the threshold select / approx. filter kernels
+    Array<ValueType> selection_tmp;
+    // real-valued scratch array passed to threshold_select only
+    Array<remove_complex<ValueType>> selection_tmp2;
+    // strategy set on the lower factor in to_factors()
+    std::shared_ptr<typename CsrMatrix::strategy_type> l_strategy;
+    // strategy set on the upper factor in to_factors()
+    std::shared_ptr<typename CsrMatrix::strategy_type> u_strategy;
+
+    ParIlutState(std::shared_ptr<const Executor> exec_in,
+                 const CsrMatrix *system_matrix_in,
+                 std::unique_ptr<CsrMatrix> l_in,
+                 std::unique_ptr<CsrMatrix> u_in, IndexType l_nnz_limit,
+                 IndexType u_nnz_limit, bool use_approx_select,
+                 std::shared_ptr<typename CsrMatrix::strategy_type> l_strategy_,
+                 std::shared_ptr<typename CsrMatrix::strategy_type> u_strategy_)
+        : exec{std::move(exec_in)},
+          l_nnz_limit{l_nnz_limit},
+          u_nnz_limit{u_nnz_limit},
+          use_approx_select{use_approx_select},
+          system_matrix{system_matrix_in},
+          l{std::move(l_in)},
+          u{std::move(u_in)},
+          selection_tmp{exec},  // member exec: declared (so built) first
+          selection_tmp2{exec},
+          l_strategy{std::move(l_strategy_)},
+          u_strategy{std::move(u_strategy_)}
+    {
+        auto mtx_size = system_matrix->get_size();
+        auto u_nnz = u->get_num_stored_elements();
+        u_csc = CsrMatrix::create(exec, mtx_size, u_nnz);
+        lu = CsrMatrix::create(exec, mtx_size);  // the rest starts out empty
+        l_new = CsrMatrix::create(exec, mtx_size);
+        u_new = CsrMatrix::create(exec, mtx_size);
+        u_new_csc = CsrMatrix::create(exec, mtx_size);
+        l_coo = CooMatrix::create(exec, mtx_size);
+        u_coo = CooMatrix::create(exec, mtx_size);
+        exec->run(make_csr_transpose(u.get(), u_csc.get()));  // u_csc = U^T
+    }
+
+    std::unique_ptr<Composition<ValueType>> to_factors() &&
+    {
+        l->set_strategy(l_strategy);  // strategies are applied only here
+        u->set_strategy(u_strategy);
+        return Composition<ValueType>::create(std::move(l), std::move(u));
+    }
+
+    void iterate();
+};
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Composition<ValueType>>
+ParIlut<ValueType, IndexType>::generate_l_u(
+    const std::shared_ptr<const LinOp> &system_matrix) const
+{
+    using csr_type = matrix::Csr<ValueType, IndexType>;
+
+    GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
+    // the nnz limits below are scaled by fill_in_limit, so it must be positive
+    GKO_ASSERT_EQ(parameters_.fill_in_limit > 0.0, true);
+
+    const auto exec = this->get_executor();
+
+    // get a CSR matrix on our executor; owned_csr stays empty while the
+    // caller's matrix can be used directly
+    std::unique_ptr<csr_type> owned_csr{};
+    auto csr_mtx = dynamic_cast<const csr_type *>(system_matrix.get());
+    const bool usable_as_is =
+        csr_mtx != nullptr && csr_mtx->get_executor() == exec;
+    if (!usable_as_is) {
+        owned_csr = csr_type::create(exec);
+        as<ConvertibleTo<csr_type>>(system_matrix.get())
+            ->convert_to(owned_csr.get());
+        csr_mtx = owned_csr.get();
+    }
+    if (!parameters_.skip_sorting) {
+        // never sort the caller's matrix in place: work on a private copy
+        if (!owned_csr) {
+            owned_csr = csr_type::create(exec);
+            owned_csr->copy_from(csr_mtx);
+        }
+        owned_csr->sort_by_column_index();
+        csr_mtx = owned_csr.get();
+    }
+
+    // set up the row pointers of L and U, which also yields their nnz counts
+    const auto mtx_size = csr_mtx->get_size();
+    const auto num_rows = mtx_size[0];
+    Array<IndexType> l_ptrs{exec, num_rows + 1};
+    Array<IndexType> u_ptrs{exec, num_rows + 1};
+    exec->run(make_initialize_row_ptrs_l_u(csr_mtx, l_ptrs.get_data(),
+                                           u_ptrs.get_data()));
+    const auto l_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(l_ptrs.get_data() + num_rows));
+    const auto u_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(u_ptrs.get_data() + num_rows));
+
+    // allocate L and U around those row pointers and fill them from A
+    auto l_factor = csr_type::create(
+        exec, mtx_size, Array<ValueType>{exec, l_nnz},
+        Array<IndexType>{exec, l_nnz}, std::move(l_ptrs));
+    auto u_factor = csr_type::create(
+        exec, mtx_size, Array<ValueType>{exec, u_nnz},
+        Array<IndexType>{exec, u_nnz}, std::move(u_ptrs));
+    exec->run(
+        make_initialize_l_u(csr_mtx, l_factor.get(), u_factor.get()));
+
+    // nnz limits: initial nnz scaled by the fill-in limit (truncated)
+    const auto l_nnz_limit =
+        static_cast<IndexType>(l_nnz * parameters_.fill_in_limit);
+    const auto u_nnz_limit =
+        static_cast<IndexType>(u_nnz * parameters_.fill_in_limit);
+
+    ParIlutState<ValueType, IndexType> state{exec,
+                                             csr_mtx,
+                                             std::move(l_factor),
+                                             std::move(u_factor),
+                                             l_nnz_limit,
+                                             u_nnz_limit,
+                                             parameters_.approximate_select,
+                                             parameters_.l_strategy,
+                                             parameters_.u_strategy};
+
+    for (auto todo = parameters_.iterations; todo > 0; --todo) {
+        state.iterate();
+    }
+
+    // hand L and U over to the returned composition
+    return std::move(state).to_factors();
+}
+
+
+template <typename ValueType, typename IndexType>
+void ParIlutState<ValueType, IndexType>::iterate()
+{  // one sweep: add candidates, iterate, filter by threshold, iterate again
+    // compute the product L * U of the current factors into lu
+    exec->run(make_spgemm(l.get(), u.get(), lu.get()));
+
+    // build L' and U' (l_new, u_new) from L * U, A and the current L, U
+    exec->run(make_add_candidates(lu.get(), system_matrix, l.get(), u.get(),
+                                  l_new.get(), u_new.get()));
+
+    // update U'(CSC), L'(COO), U'(COO) sizes and pointers to match L', U'
+    {
+        auto l_nnz = l_new->get_num_stored_elements();
+        auto u_nnz = u_new->get_num_stored_elements();
+        CooBuilder l_builder{l_coo.get()};
+        CooBuilder u_builder{u_coo.get()};
+        CsrBuilder u_csc_builder{u_new_csc.get()};
+        // resize arrays that will be filled
+        l_builder.get_row_idx_array().resize_and_reset(l_nnz);
+        u_builder.get_row_idx_array().resize_and_reset(u_nnz);
+        u_csc_builder.get_col_idx_array().resize_and_reset(u_nnz);
+        u_csc_builder.get_value_array().resize_and_reset(u_nnz);
+        // COO column/value arrays become views into the L'/U' CSR arrays
+        l_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, l_nnz, l_new->get_col_idxs());
+        u_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, u_nnz, u_new->get_col_idxs());
+        l_builder.get_value_array() =
+            Array<ValueType>::view(exec, l_nnz, l_new->get_values());
+        u_builder.get_value_array() =
+            Array<ValueType>::view(exec, u_nnz, u_new->get_values());
+    }
+
+    // convert U' into CSC format
+    exec->run(make_csr_transpose(u_new.get(), u_new_csc.get()));
+
+    // convert L' and U' into COO format
+    exec->run(make_convert_to_coo(l_new.get(), l_coo.get()));
+    exec->run(make_convert_to_coo(u_new.get(), u_coo.get()));
+
+    // execute asynchronous iteration on the unfiltered L' and U'
+    exec->run(make_compute_l_u_factors(system_matrix, l_new.get(), l_coo.get(),
+                                       u_new.get(), u_coo.get(),
+                                       u_new_csc.get()));
+
+    // determine ranks for selection/filtering
+    IndexType l_nnz = l_new->get_num_stored_elements();
+    IndexType u_nnz = u_new->get_num_stored_elements();
+    // clamp at 0 in case a factor is already within its nnz limit
+    auto l_filter_rank = std::max<IndexType>(0, l_nnz - l_nnz_limit - 1);
+    auto u_filter_rank = std::max<IndexType>(0, u_nnz - u_nnz_limit - 1);
+    remove_complex<ValueType> l_threshold{};
+    remove_complex<ValueType> u_threshold{};
+    CooMatrix *null_coo = nullptr;  // passed as m_out_coo for the U'^T filters
+    if (use_approx_select) {
+        // remove approximately smallest candidates from L' and U'^T
+        exec->run(make_threshold_filter_approx(l_new.get(), l_filter_rank,
+                                               selection_tmp, l_threshold,
+                                               l.get(), l_coo.get()));
+        exec->run(make_threshold_filter_approx(u_new_csc.get(), u_filter_rank,
+                                               selection_tmp, u_threshold,
+                                               u_csc.get(), null_coo));
+    } else {
+        // select threshold to remove smallest candidates
+        exec->run(make_threshold_select(l_new.get(), l_filter_rank,
+                                        selection_tmp, selection_tmp2,
+                                        l_threshold));
+        exec->run(make_threshold_select(u_new_csc.get(), u_filter_rank,
+                                        selection_tmp, selection_tmp2,
+                                        u_threshold));
+
+        // remove smallest candidates from L' and U'^T
+        exec->run(make_threshold_filter(l_new.get(), l_threshold, l.get(),
+                                        l_coo.get(), true));
+        exec->run(make_threshold_filter(u_new_csc.get(), u_threshold,
+                                        u_csc.get(), null_coo, true));
+    }
+    // remove smallest candidates from U' using the u_threshold found above
+    exec->run(make_threshold_filter(u_new.get(), u_threshold, u.get(),
+                                    u_coo.get(), false));
+
+    // execute asynchronous iteration on the filtered L and U
+    exec->run(make_compute_l_u_factors(system_matrix, l.get(), l_coo.get(),
+                                       u.get(), u_coo.get(), u_csc.get()));
+}
+
+
+#define GKO_DECLARE_PAR_ILUT(ValueType, IndexType) \
+    class ParIlut<ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT);
+
+
+}  // namespace factorization
+}  // namespace gko
diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp
new file mode 100644
index 00000000000..9bb19596c3f
--- /dev/null
+++ b/core/factorization/par_ilut_kernels.hpp
@@ -0,0 +1,153 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_
+#define GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_
+
+
+#include <ginkgo/core/factorization/par_ilut.hpp>
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL(ValueType, IndexType) \
+    void add_candidates(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Csr<ValueType, IndexType> *lu,     \
+                        const matrix::Csr<ValueType, IndexType> *a,      \
+                        const matrix::Csr<ValueType, IndexType> *l,      \
+                        const matrix::Csr<ValueType, IndexType> *u,      \
+                        matrix::Csr<ValueType, IndexType> *l_new,        \
+                        matrix::Csr<ValueType, IndexType> *u_new)
+
+#define GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL(ValueType, IndexType) \
+    void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,    \
+                             const matrix::Csr<ValueType, IndexType> *a,     \
+                             matrix::Csr<ValueType, IndexType> *l,           \
+                             const matrix::Coo<ValueType, IndexType> *l_coo, \
+                             matrix::Csr<ValueType, IndexType> *u,           \
+                             const matrix::Coo<ValueType, IndexType> *u_coo, \
+                             matrix::Csr<ValueType, IndexType> *u_csc)
+
+#define GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType) \
+    void threshold_select(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Csr<ValueType, IndexType> *m,      \
+                          IndexType rank, Array<ValueType> &tmp,           \
+                          Array<remove_complex<ValueType>> &tmp2,          \
+                          remove_complex<ValueType> &threshold)
+
+#define GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType) \
+    void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Csr<ValueType, IndexType> *m,      \
+                          remove_complex<ValueType> threshold,             \
+                          matrix::Csr<ValueType, IndexType> *m_out,        \
+                          matrix::Coo<ValueType, IndexType> *m_out_coo,    \
+                          bool lower)
+
+#define GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL(ValueType,        \
+                                                            IndexType)        \
+    void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec, \
+                                 const matrix::Csr<ValueType, IndexType> *m,  \
+                                 IndexType rank, Array<ValueType> &tmp,       \
+                                 remove_complex<ValueType> &threshold,        \
+                                 matrix::Csr<ValueType, IndexType> *m_out,    \
+                                 matrix::Coo<ValueType, IndexType> *m_out_coo)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                      \
+    constexpr auto sampleselect_searchtree_height = 8;                    \
+    constexpr auto sampleselect_oversampling = 4;                         \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL(ValueType, IndexType)
+
+
+namespace omp {
+namespace par_ilut_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ilut_factorization
+}  // namespace omp
+
+
+namespace cuda {
+namespace par_ilut_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+
+
+namespace reference {
+namespace par_ilut_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ilut_factorization
+}  // namespace reference
+
+
+namespace hip {
+namespace par_ilut_factorization {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_
diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp
index 4cf6ed742d6..9947e40fc60 100644
--- a/core/log/convergence.cpp
+++ b/core/log/convergence.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,11 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/convergence.hpp>
 
 
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
@@ -60,7 +60,8 @@ void Convergence<ValueType>::on_criterion_check_completed(
             this->residual_norm_.reset(residual_norm->clone().release());
         } else if (residual != nullptr) {
             using Vector = matrix::Dense<ValueType>;
-            this->residual_norm_ = Vector::create(
+            using NormVector = matrix::Dense<remove_complex<ValueType>>;
+            this->residual_norm_ = NormVector::create(
                 residual->get_executor(), dim<2>{1, residual->get_size()[1]});
             auto dense_r = as<Vector>(residual);
             dense_r->compute_norm2(this->residual_norm_.get());
diff --git a/core/log/logger.cpp b/core/log/logger.cpp
index 75f48036937..46ee98b2895 100644
--- a/core/log/logger.cpp
+++ b/core/log/logger.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/logger.hpp>
 
 
@@ -75,5 +74,6 @@ constexpr Logger::mask_type Logger::criterion_check_completed_mask;
 
 constexpr Logger::mask_type Logger::iteration_complete_mask;
 
+
 }  // namespace log
 }  // namespace gko
diff --git a/core/log/papi.cpp b/core/log/papi.cpp
index 50da5bd11c9..1c8a17419fa 100644
--- a/core/log/papi.cpp
+++ b/core/log/papi.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/papi.hpp>
 
 
@@ -42,10 +41,6 @@ namespace gko {
 namespace log {
 
 
-template <typename ValueType>
-size_type Papi<ValueType>::logger_count = 0;
-
-
 template <typename ValueType>
 void Papi<ValueType>::on_allocation_started(const Executor *exec,
                                             const size_type &num_bytes) const
diff --git a/core/log/record.cpp b/core/log/record.cpp
index 19c0992e2a6..48026c1563b 100644
--- a/core/log/record.cpp
+++ b/core/log/record.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/record.hpp>
 
 
diff --git a/core/log/stream.cpp b/core/log/stream.cpp
index 3ffd3c11e25..3cad7421aee 100644
--- a/core/log/stream.cpp
+++ b/core/log/stream.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/stream.hpp>
 
 
@@ -39,14 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
-#include <ginkgo/core/base/name_demangling.hpp>
-
-
 namespace gko {
 namespace log {
 
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index 73d001d6dba..2cd8f34982f 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,23 +31,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 #include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include <algorithm>
+#include <numeric>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
 #include "core/matrix/coo_kernels.hpp"
 
 
-#include <algorithm>
-#include <numeric>
-
-
 namespace gko {
 namespace matrix {
 
@@ -103,6 +103,25 @@ void Coo<ValueType, IndexType>::apply2_impl(const LinOp *alpha, const LinOp *b,
 }
 
 
+template <typename ValueType, typename IndexType>
+void Coo<ValueType, IndexType>::convert_to(
+    Coo<next_precision<ValueType>, IndexType> *result) const
+{
+    result->values_ = this->values_;
+    result->row_idxs_ = this->row_idxs_;
+    result->col_idxs_ = this->col_idxs_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Coo<ValueType, IndexType>::move_to(
+    Coo<next_precision<ValueType>, IndexType> *result)
+{
+    this->convert_to(result);
+}
+
+
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::convert_to(
     Csr<ValueType, IndexType> *result) const
@@ -113,7 +132,7 @@ void Coo<ValueType, IndexType>::convert_to(
         result->get_strategy());
     tmp->values_ = this->values_;
     tmp->col_idxs_ = this->col_idxs_;
-    exec->run(coo::make_convert_to_csr(tmp.get(), this));
+    exec->run(coo::make_convert_to_csr(this, tmp.get()));
     tmp->make_srow();
     tmp->move_to(result);
 }
@@ -128,7 +147,7 @@ void Coo<ValueType, IndexType>::move_to(Csr<ValueType, IndexType> *result)
         result->get_strategy());
     tmp->values_ = std::move(this->values_);
     tmp->col_idxs_ = std::move(this->col_idxs_);
-    exec->run(coo::make_convert_to_csr(tmp.get(), this));
+    exec->run(coo::make_convert_to_csr(this, tmp.get()));
     tmp->make_srow();
     tmp->move_to(result);
 }
@@ -139,7 +158,7 @@ void Coo<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(coo::make_convert_to_dense(tmp.get(), this));
+    exec->run(coo::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
diff --git a/core/matrix/coo_builder.hpp b/core/matrix/coo_builder.hpp
new file mode 100644
index 00000000000..de323ad42b6
--- /dev/null
+++ b/core/matrix/coo_builder.hpp
@@ -0,0 +1,89 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_COO_BUILDER_HPP_
+#define GKO_CORE_MATRIX_COO_BUILDER_HPP_
+
+
+#include <ginkgo/core/matrix/coo.hpp>
+
+
+namespace gko {
+namespace matrix {
+
+
+/**
+ * @internal
+ *
+ * Gives intrusive, mutable access to the arrays stored in a @ref Coo matrix.
+ *
+ * @tparam ValueType  the value type of the wrapped matrix
+ * @tparam IndexType  the index type of the wrapped matrix
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class CooBuilder {
+public:
+    /**
+     * Returns a mutable reference to the row index array (row_idxs_).
+     */
+    Array<IndexType> &get_row_idx_array() { return matrix_->row_idxs_; }
+
+    /**
+     * Returns a mutable reference to the column index array (col_idxs_).
+     */
+    Array<IndexType> &get_col_idx_array() { return matrix_->col_idxs_; }
+
+    /**
+     * Returns a mutable reference to the value array (values_).
+     */
+    Array<ValueType> &get_value_array() { return matrix_->values_; }
+
+    /**
+     * Wraps an existing COO matrix; only the pointer is stored, not a copy.
+     */
+    explicit CooBuilder(Coo<ValueType, IndexType> *matrix) : matrix_{matrix} {}
+
+    // make this type non-copyable and non-movable
+    CooBuilder(const CooBuilder &) = delete;
+    CooBuilder(CooBuilder &&) = delete;
+    CooBuilder &operator=(const CooBuilder &) = delete;
+    CooBuilder &operator=(CooBuilder &&) = delete;
+
+private:
+    Coo<ValueType, IndexType> *matrix_;  // wrapped matrix, never deleted here
+};
+
+
+}  // namespace matrix
+}  // namespace gko
+
+#endif  // GKO_CORE_MATRIX_COO_BUILDER_HPP_
diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp
index e8508ac1c8c..48a83f8f9b1 100644
--- a/core/matrix/coo_kernels.hpp
+++ b/core/matrix/coo_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_COO_KERNELS_HPP_
 
 
-#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
+
+
+#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
@@ -69,15 +71,15 @@ namespace kernels {
                         const matrix::Dense<ValueType> *b,           \
                         matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)  \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Dense<ValueType> *result,            \
-                          const matrix::Coo<ValueType, IndexType> *source)
+#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Coo<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
 
-#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)  \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<ValueType, IndexType> *result,   \
-                        const matrix::Coo<ValueType, IndexType> *source)
+#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Coo<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                             \
     template <typename ValueType, typename IndexType>            \
@@ -121,6 +123,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace coo {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // declares the COO kernel templates here
+
+}  // namespace coo
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 53dd2f02b12..2d0012c7cb6 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -40,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -54,6 +56,9 @@ namespace csr {
 
 GKO_REGISTER_OPERATION(spmv, csr::spmv);
 GKO_REGISTER_OPERATION(advanced_spmv, csr::advanced_spmv);
+GKO_REGISTER_OPERATION(spgemm, csr::spgemm);
+GKO_REGISTER_OPERATION(advanced_spgemm, csr::advanced_spgemm);
+GKO_REGISTER_OPERATION(spgeam, csr::spgeam);
 GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo);
 GKO_REGISTER_OPERATION(convert_to_dense, csr::convert_to_dense);
 GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp);
@@ -62,6 +67,10 @@ GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell);
 GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid);
 GKO_REGISTER_OPERATION(transpose, csr::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
+GKO_REGISTER_OPERATION(row_permute, csr::row_permute);
+GKO_REGISTER_OPERATION(column_permute, csr::column_permute);
+GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute);
+GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute);
 GKO_REGISTER_OPERATION(calculate_max_nnz_per_row,
                        csr::calculate_max_nnz_per_row);
 GKO_REGISTER_OPERATION(calculate_nonzeros_per_row,
@@ -78,7 +87,16 @@ template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
 {
     using Dense = Dense<ValueType>;
-    this->get_executor()->run(csr::make_spmv(this, as<Dense>(b), as<Dense>(x)));
+    using TCsr = Csr<ValueType, IndexType>;
+    if (auto b_csr = dynamic_cast<const TCsr *>(b)) {
+        // if b is a CSR matrix, we compute a SpGeMM
+        auto x_csr = as<TCsr>(x);
+        this->get_executor()->run(csr::make_spgemm(this, b_csr, x_csr));
+    } else {
+        // otherwise we assume that b is dense and compute a SpMV/SpMM
+        this->get_executor()->run(
+            csr::make_spmv(this, as<Dense>(b), as<Dense>(x)));
+    }
 }
 
 
@@ -87,8 +105,46 @@ void Csr<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
                                            const LinOp *beta, LinOp *x) const
 {
     using Dense = Dense<ValueType>;
-    this->get_executor()->run(csr::make_advanced_spmv(
-        as<Dense>(alpha), this, as<Dense>(b), as<Dense>(beta), as<Dense>(x)));
+    using TCsr = Csr<ValueType, IndexType>;
+    if (auto b_csr = dynamic_cast<const TCsr *>(b)) {
+        // if b is a CSR matrix, we compute a SpGeMM
+        auto x_csr = as<TCsr>(x);
+        auto x_copy = x_csr->clone();
+        this->get_executor()->run(
+            csr::make_advanced_spgemm(as<Dense>(alpha), this, b_csr,
+                                      as<Dense>(beta), x_copy.get(), x_csr));
+    } else if (dynamic_cast<const Identity<ValueType> *>(b)) {
+        // if b is an identity matrix, we compute an SpGEAM
+        auto x_csr = as<TCsr>(x);
+        auto x_copy = x_csr->clone();
+        this->get_executor()->run(csr::make_spgeam(
+            as<Dense>(alpha), this, as<Dense>(beta), lend(x_copy), x_csr));
+    } else {
+        // otherwise we assume that b is dense and compute a SpMV/SpMM
+        this->get_executor()->run(
+            csr::make_advanced_spmv(as<Dense>(alpha), this, as<Dense>(b),
+                                    as<Dense>(beta), as<Dense>(x)));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void Csr<ValueType, IndexType>::convert_to(
+    Csr<next_precision<ValueType>, IndexType> *result) const
+{
+    result->values_ = this->values_;  // stored as next_precision<ValueType>
+    result->col_idxs_ = this->col_idxs_;  // IndexType is unchanged
+    result->row_ptrs_ = this->row_ptrs_;
+    result->set_size(this->get_size());  // dimensions are carried over
+    convert_strategy_helper(result);  // presumably sets the strategy; confirm
+}
+
+
+template <typename ValueType, typename IndexType>
+void Csr<ValueType, IndexType>::move_to(
+    Csr<next_precision<ValueType>, IndexType> *result)
+{
+    this->convert_to(result);  // forwards to const convert_to(); *this is kept
 }
 
 
@@ -101,7 +157,7 @@ void Csr<ValueType, IndexType>::convert_to(
         exec, this->get_size(), this->get_num_stored_elements());
     tmp->values_ = this->values_;
     tmp->col_idxs_ = this->col_idxs_;
-    exec->run(csr::make_convert_to_coo(tmp.get(), this));
+    exec->run(csr::make_convert_to_coo(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -118,7 +174,7 @@ void Csr<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(csr::make_convert_to_dense(tmp.get(), this));
+    exec->run(csr::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -148,7 +204,7 @@ void Csr<ValueType, IndexType>::convert_to(
     auto tmp = Hybrid<ValueType, IndexType>::create(
         exec, this->get_size(), max_nnz_per_row, stride, coo_nnz,
         result->get_strategy());
-    exec->run(csr::make_convert_to_hybrid(tmp.get(), this));
+    exec->run(csr::make_convert_to_hybrid(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -176,7 +232,7 @@ void Csr<ValueType, IndexType>::convert_to(
                                              slice_size));
     auto tmp = Sellp<ValueType, IndexType>::create(
         exec, this->get_size(), slice_size, stride_factor, total_cols);
-    exec->run(csr::make_convert_to_sellp(tmp.get(), this));
+    exec->run(csr::make_convert_to_sellp(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -223,7 +279,7 @@ void Csr<ValueType, IndexType>::convert_to(
     exec->run(csr::make_calculate_max_nnz_per_row(this, &max_nnz_per_row));
     auto tmp = Ell<ValueType, IndexType>::create(exec, this->get_size(),
                                                  max_nnz_per_row);
-    exec->run(csr::make_convert_to_ell(tmp.get(), this));
+    exec->run(csr::make_convert_to_ell(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -300,7 +356,7 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::transpose() const
         Csr::create(exec, gko::transpose(this->get_size()),
                     this->get_num_stored_elements(), this->get_strategy());
 
-    exec->run(csr::make_transpose(trans_cpy.get(), this));
+    exec->run(csr::make_transpose(this, trans_cpy.get()));
     trans_cpy->make_srow();
     return std::move(trans_cpy);
 }
@@ -314,12 +370,82 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::conj_transpose() const
         Csr::create(exec, gko::transpose(this->get_size()),
                     this->get_num_stored_elements(), this->get_strategy());
 
-    exec->run(csr::make_conj_transpose(trans_cpy.get(), this));
+    exec->run(csr::make_conj_transpose(this, trans_cpy.get()));
     trans_cpy->make_srow();
     return std::move(trans_cpy);
 }
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Csr<ValueType, IndexType>::row_permute(
+    const Array<IndexType> *permutation_indices) const
+{
+    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
+    auto exec = this->get_executor();  // result is built on this executor
+    auto permute_cpy =
+        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
+                    this->get_strategy());
+    // permute_cpy mirrors the size, nnz count and strategy of *this
+    exec->run(
+        csr::make_row_permute(permutation_indices, this, permute_cpy.get()));
+    permute_cpy->make_srow();
+    return std::move(permute_cpy);  // handed back as unique_ptr<LinOp>
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Csr<ValueType, IndexType>::column_permute(
+    const Array<IndexType> *permutation_indices) const
+{
+    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
+    auto exec = this->get_executor();  // result is built on this executor
+    auto permute_cpy =
+        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
+                    this->get_strategy());
+    // permute_cpy mirrors the size, nnz count and strategy of *this
+    exec->run(
+        csr::make_column_permute(permutation_indices, this, permute_cpy.get()));
+    permute_cpy->make_srow();
+    return std::move(permute_cpy);  // handed back as unique_ptr<LinOp>
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_row_permute(
+    const Array<IndexType> *inverse_permutation_indices) const
+{
+    GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(),
+                  this->get_size()[0]);  // presumably one index per row
+    auto exec = this->get_executor();  // result is built on this executor
+    auto inverse_permute_cpy =
+        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
+                    this->get_strategy());
+    // inverse_permute_cpy mirrors the size, nnz count and strategy of *this
+    exec->run(csr::make_inverse_row_permute(inverse_permutation_indices, this,
+                                            inverse_permute_cpy.get()));
+    inverse_permute_cpy->make_srow();
+    return std::move(inverse_permute_cpy);  // handed back as unique_ptr<LinOp>
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_column_permute(
+    const Array<IndexType> *inverse_permutation_indices) const
+{
+    GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(),
+                  this->get_size()[1]);  // presumably one index per column
+    auto exec = this->get_executor();  // result is built on this executor
+    auto inverse_permute_cpy =
+        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
+                    this->get_strategy());
+    // inverse_permute_cpy mirrors the size, nnz count and strategy of *this
+    exec->run(csr::make_inverse_column_permute(
+        inverse_permutation_indices, this, inverse_permute_cpy.get()));
+    inverse_permute_cpy->make_srow();
+    return std::move(inverse_permute_cpy);  // handed back as unique_ptr<LinOp>
+}
+
+
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::sort_by_column_index()
 {
diff --git a/core/matrix/csr_builder.hpp b/core/matrix/csr_builder.hpp
new file mode 100644
index 00000000000..73f892dc3a8
--- /dev/null
+++ b/core/matrix/csr_builder.hpp
@@ -0,0 +1,89 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_CSR_BUILDER_HPP_
+#define GKO_CORE_MATRIX_CSR_BUILDER_HPP_
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+namespace matrix {
+
+
+/**
+ * @internal
+ *
+ * Gives intrusive, mutable access to the arrays stored in a @ref Csr matrix.
+ *
+ * @tparam ValueType  the value type of the wrapped matrix
+ * @tparam IndexType  the index type of the wrapped matrix
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class CsrBuilder {
+public:
+    /**
+     * Returns a mutable reference to the column index array (col_idxs_).
+     */
+    Array<IndexType> &get_col_idx_array() { return matrix_->col_idxs_; }
+
+    /**
+     * Returns a mutable reference to the value array (values_).
+     */
+    Array<ValueType> &get_value_array() { return matrix_->values_; }
+
+    /**
+     * Wraps an existing CSR matrix; only the pointer is stored, not a copy.
+     */
+    explicit CsrBuilder(Csr<ValueType, IndexType> *matrix) : matrix_{matrix} {}
+
+    /**
+     * Calls make_srow() on the wrapped matrix when the builder is destroyed.
+     */
+    ~CsrBuilder() { matrix_->make_srow(); }
+
+    // make this type non-copyable and non-movable
+    CsrBuilder(const CsrBuilder &) = delete;
+    CsrBuilder(CsrBuilder &&) = delete;
+    CsrBuilder &operator=(const CsrBuilder &) = delete;
+    CsrBuilder &operator=(CsrBuilder &&) = delete;
+
+private:
+    Csr<ValueType, IndexType> *matrix_;  // wrapped matrix, never deleted here
+};
+
+
+}  // namespace matrix
+}  // namespace gko
+
+#endif  // GKO_CORE_MATRIX_CSR_BUILDER_HPP_
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index 2e099594e70..f901c0f0952 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,9 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_CSR_KERNELS_HPP_
 
 
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
@@ -61,30 +64,53 @@ namespace kernels {
                        const matrix::Dense<ValueType> *beta,        \
                        matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)  \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Dense<ValueType> *result,            \
-                          const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType)  \
-    void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Coo<ValueType, IndexType> *result,   \
-                        const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType)  \
-    void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Ell<ValueType, IndexType> *result,   \
-                        const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType)   \
-    void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,  \
-                           matrix::Hybrid<ValueType, IndexType> *result, \
-                           const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType)  \
-    void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Sellp<ValueType, IndexType> *result, \
-                          const matrix::Csr<ValueType, IndexType> *source)
+#define GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType)  \
+    void spgemm(std::shared_ptr<const DefaultExecutor> exec, \
+                const matrix::Csr<ValueType, IndexType> *a,  \
+                const matrix::Csr<ValueType, IndexType> *b,  \
+                matrix::Csr<ValueType, IndexType> *c)
+
+#define GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType)  \
+    void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec, \
+                         const matrix::Dense<ValueType> *alpha,       \
+                         const matrix::Csr<ValueType, IndexType> *a,  \
+                         const matrix::Csr<ValueType, IndexType> *b,  \
+                         const matrix::Dense<ValueType> *beta,        \
+                         const matrix::Csr<ValueType, IndexType> *d,  \
+                         matrix::Csr<ValueType, IndexType> *c)
+
+#define GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType)  \
+    void spgeam(std::shared_ptr<const DefaultExecutor> exec, \
+                const matrix::Dense<ValueType> *alpha,       \
+                const matrix::Csr<ValueType, IndexType> *a,  \
+                const matrix::Dense<ValueType> *beta,        \
+                const matrix::Csr<ValueType, IndexType> *b,  \
+                matrix::Csr<ValueType, IndexType> *c)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Csr<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType)      \
+    void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Csr<ValueType, IndexType> *source, \
+                        matrix::Coo<ValueType, IndexType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType)      \
+    void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Csr<ValueType, IndexType> *source, \
+                        matrix::Ell<ValueType, IndexType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType)      \
+    void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,     \
+                           const matrix::Csr<ValueType, IndexType> *source, \
+                           matrix::Hybrid<ValueType, IndexType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType)      \
+    void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Csr<ValueType, IndexType> *source, \
+                          matrix::Sellp<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL(ValueType, IndexType)      \
     void calculate_total_cols(std::shared_ptr<const DefaultExecutor> exec,     \
@@ -92,15 +118,40 @@ namespace kernels {
                               size_type *result, size_type stride_factor,      \
                               size_type slice_size)
 
-#define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType)  \
-    void transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                   matrix::Csr<ValueType, IndexType> *trans,    \
-                   const matrix::Csr<ValueType, IndexType> *orig)
-
-#define GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)  \
-    void conj_transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<ValueType, IndexType> *trans,    \
-                        const matrix::Csr<ValueType, IndexType> *orig)
+#define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType)    \
+    void transpose(std::shared_ptr<const DefaultExecutor> exec,   \
+                   const matrix::Csr<ValueType, IndexType> *orig, \
+                   matrix::Csr<ValueType, IndexType> *trans)
+
+#define GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)    \
+    void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,   \
+                        const matrix::Csr<ValueType, IndexType> *orig, \
+                        matrix::Csr<ValueType, IndexType> *trans)
+
+#define GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                     const Array<IndexType> *permutation_indices,   \
+                     const matrix::Csr<ValueType, IndexType> *orig, \
+                     matrix::Csr<ValueType, IndexType> *row_permuted)
+
+#define GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void column_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                        const Array<IndexType> *permutation_indices,   \
+                        const matrix::Csr<ValueType, IndexType> *orig, \
+                        matrix::Csr<ValueType, IndexType> *column_permuted)
+
+#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                             const Array<IndexType> *permutation_indices,   \
+                             const matrix::Csr<ValueType, IndexType> *orig, \
+                             matrix::Csr<ValueType, IndexType> *row_permuted)
+
+#define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \
+    void inverse_column_permute(                                            \
+        std::shared_ptr<const DefaultExecutor> exec,                        \
+        const Array<IndexType> *permutation_indices,                        \
+        const matrix::Csr<ValueType, IndexType> *orig,                      \
+        matrix::Csr<ValueType, IndexType> *column_permuted)
 
 #define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \
     void calculate_max_nnz_per_row(                                            \
@@ -113,6 +164,7 @@ namespace kernels {
         std::shared_ptr<const DefaultExecutor> exec,                 \
         const matrix::Csr<ValueType, IndexType> *source,             \
         Array<size_type> *result)
+
 #define GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType)         \
     void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec, \
                               matrix::Csr<ValueType, IndexType> *to_sort)
@@ -128,6 +180,12 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType);              \
     template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType);                     \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType);                     \
+    template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType);           \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType);             \
@@ -144,6 +202,14 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType);             \
     template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType);                \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);             \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType);        \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType);  \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \
@@ -180,6 +246,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace csr {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // declares the CSR kernel templates here
+
+}  // namespace csr
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index b284593d21a..d7ba31b77ad 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -49,9 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/dense_kernels.hpp"
 
 
-#include <algorithm>
-
-
 namespace gko {
 namespace matrix {
 namespace dense {
@@ -71,6 +72,10 @@ GKO_REGISTER_OPERATION(calculate_nonzeros_per_row,
 GKO_REGISTER_OPERATION(calculate_total_cols, dense::calculate_total_cols);
 GKO_REGISTER_OPERATION(transpose, dense::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, dense::conj_transpose);
+GKO_REGISTER_OPERATION(row_permute, dense::row_permute);
+GKO_REGISTER_OPERATION(column_permute, dense::column_permute);
+GKO_REGISTER_OPERATION(inverse_row_permute, dense::inverse_row_permute);
+GKO_REGISTER_OPERATION(inverse_column_permute, dense::inverse_column_permute);
 GKO_REGISTER_OPERATION(convert_to_coo, dense::convert_to_coo);
 GKO_REGISTER_OPERATION(convert_to_csr, dense::convert_to_csr);
 GKO_REGISTER_OPERATION(convert_to_ell, dense::convert_to_ell);
@@ -96,7 +101,7 @@ inline void conversion_helper(Coo<ValueType, IndexType> *result,
     exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
     auto tmp = Coo<ValueType, IndexType>::create(exec, source->get_size(),
                                                  num_stored_nonzeros);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -108,21 +113,12 @@ inline void conversion_helper(Csr<ValueType, IndexType> *result,
 {
     auto exec = source->get_executor();
 
-    if (source->get_size()) {
-        size_type num_stored_nonzeros = 0;
-        exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
-        auto tmp = Csr<ValueType, IndexType>::create(exec, source->get_size(),
-                                                     num_stored_nonzeros,
-                                                     result->get_strategy());
-        exec->run(op(tmp.get(), source));
-        tmp->move_to(result);
-    }
-    // If source is empty, there is no need to copy data or to call kernels
-    else {
-        auto tmp =
-            Csr<ValueType, IndexType>::create(exec, result->get_strategy());
-        tmp->move_to(result);
-    }
+    size_type num_stored_nonzeros = 0;
+    exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
+    auto tmp = Csr<ValueType, IndexType>::create(
+        exec, source->get_size(), num_stored_nonzeros, result->get_strategy());
+    exec->run(op(source, tmp.get()));
+    tmp->move_to(result);
 }
 
 
@@ -140,7 +136,7 @@ inline void conversion_helper(Ell<ValueType, IndexType> *result,
     const auto stride = std::max(result->get_stride(), source->get_size()[0]);
     auto tmp = Ell<ValueType, IndexType>::create(exec, source->get_size(),
                                                  max_nnz_per_row, stride);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -165,7 +161,7 @@ inline void conversion_helper(Hybrid<ValueType, IndexType> *result,
     auto tmp = Hybrid<ValueType, IndexType>::create(
         exec, source->get_size(), max_nnz_per_row, stride, coo_nnz,
         result->get_strategy());
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -187,7 +183,7 @@ inline void conversion_helper(Sellp<ValueType, IndexType> *result,
                                                stride_factor, slice_size));
     auto tmp = Sellp<ValueType, IndexType>::create(
         exec, source->get_size(), slice_size, stride_factor, total_cols);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -203,7 +199,7 @@ inline void conversion_helper(SparsityCsr<ValueType, IndexType> *result,
     exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
     auto tmp = SparsityCsr<ValueType, IndexType>::create(
         exec, source->get_size(), num_stored_nonzeros);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -271,10 +267,28 @@ void Dense<ValueType>::compute_dot_impl(const LinOp *b, LinOp *result) const
 template <typename ValueType>
 void Dense<ValueType>::compute_norm2_impl(LinOp *result) const
 {
+    using NormVector = Dense<remove_complex<ValueType>>;
     GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1]));
     auto exec = this->get_executor();
     exec->run(dense::make_compute_norm2(as<Dense<ValueType>>(this),
-                                        as<Dense<ValueType>>(result)));
+                                        as<NormVector>(result)));
+}
+
+
+template <typename ValueType>  // fills a Dense<next_precision<ValueType>>
+void Dense<ValueType>::convert_to(
+    Dense<next_precision<ValueType>> *result) const
+{  // member-wise copy: values, stride and size are taken from *this
+    result->values_ = this->values_;  // NOTE(review): presumably converts element precision in the Array assignment - confirm
+    result->stride_ = this->stride_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType>  // move_to for the next_precision Dense type
+void Dense<ValueType>::move_to(Dense<next_precision<ValueType>> *result)
+{
+    this->convert_to(result);  // plain delegation; this function clears nothing in *this
 }
 
 
@@ -283,8 +297,8 @@ void Dense<ValueType>::convert_to(Coo<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_coo<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_coo<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -300,8 +314,8 @@ void Dense<ValueType>::convert_to(Coo<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_coo<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_coo<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -317,8 +331,8 @@ void Dense<ValueType>::convert_to(Csr<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_csr<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_csr<const Dense<ValueType> *&,
+                                            decltype(result)>);
     result->make_srow();
 }
 
@@ -335,8 +349,8 @@ void Dense<ValueType>::convert_to(Csr<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_csr<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_csr<const Dense<ValueType> *&,
+                                            decltype(result)>);
     result->make_srow();
 }
 
@@ -353,8 +367,8 @@ void Dense<ValueType>::convert_to(Ell<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_ell<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_ell<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -370,8 +384,8 @@ void Dense<ValueType>::convert_to(Ell<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_ell<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_ell<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -387,8 +401,8 @@ void Dense<ValueType>::convert_to(Hybrid<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_hybrid<decltype(result),
-                                               const Dense<ValueType> *&>);
+        dense::template make_convert_to_hybrid<const Dense<ValueType> *&,
+                                               decltype(result)>);
 }
 
 
@@ -404,8 +418,8 @@ void Dense<ValueType>::convert_to(Hybrid<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_hybrid<decltype(result),
-                                               const Dense<ValueType> *&>);
+        dense::template make_convert_to_hybrid<const Dense<ValueType> *&,
+                                               decltype(result)>);
 }
 
 
@@ -421,8 +435,8 @@ void Dense<ValueType>::convert_to(Sellp<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_sellp<decltype(result),
-                                              const Dense<ValueType> *&>);
+        dense::template make_convert_to_sellp<const Dense<ValueType> *&,
+                                              decltype(result)>);
 }
 
 
@@ -438,8 +452,8 @@ void Dense<ValueType>::convert_to(Sellp<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_sellp<decltype(result),
-                                              const Dense<ValueType> *&>);
+        dense::template make_convert_to_sellp<const Dense<ValueType> *&,
+                                              decltype(result)>);
 }
 
 
@@ -453,9 +467,10 @@ void Dense<ValueType>::move_to(Sellp<ValueType, int64> *result)
 template <typename ValueType>
 void Dense<ValueType>::convert_to(SparsityCsr<ValueType, int32> *result) const
 {
-    conversion_helper(result, this,
-                      dense::template make_convert_to_sparsity_csr<
-                          decltype(result), const Dense<ValueType> *&>);
+    conversion_helper(
+        result, this,
+        dense::template make_convert_to_sparsity_csr<const Dense<ValueType> *&,
+                                                     decltype(result)>);
 }
 
 
@@ -469,9 +484,10 @@ void Dense<ValueType>::move_to(SparsityCsr<ValueType, int32> *result)
 template <typename ValueType>
 void Dense<ValueType>::convert_to(SparsityCsr<ValueType, int64> *result) const
 {
-    conversion_helper(result, this,
-                      dense::template make_convert_to_sparsity_csr<
-                          decltype(result), const Dense<ValueType> *&>);
+    conversion_helper(
+        result, this,
+        dense::template make_convert_to_sparsity_csr<const Dense<ValueType> *&,
+                                                     decltype(result)>);
 }
 
 
@@ -572,7 +588,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::transpose() const
     auto exec = this->get_executor();
     auto trans_cpy = Dense::create(exec, gko::transpose(this->get_size()));
 
-    exec->run(dense::make_transpose(trans_cpy.get(), this));
+    exec->run(dense::make_transpose(this, trans_cpy.get()));
 
     return std::move(trans_cpy);
 }
@@ -584,11 +600,135 @@ std::unique_ptr<LinOp> Dense<ValueType>::conj_transpose() const
     auto exec = this->get_executor();
     auto trans_cpy = Dense::create(exec, gko::transpose(this->get_size()));
 
-    exec->run(dense::make_conj_transpose(trans_cpy.get(), this));
+    exec->run(dense::make_conj_transpose(this, trans_cpy.get()));
     return std::move(trans_cpy);
 }
 
 
+template <typename ValueType>  // overload taking Array<int32> indices
+std::unique_ptr<LinOp> Dense<ValueType>::row_permute(
+    const Array<int32> *permutation_indices) const
+{  // expects exactly get_size()[0] indices; checked by the assertion below
+    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
+    auto exec = this->get_executor();
+    auto permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into permute_cpy.
+    exec->run(
+        dense::make_row_permute(permutation_indices, this, permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int32> indices
+std::unique_ptr<LinOp> Dense<ValueType>::column_permute(
+    const Array<int32> *permutation_indices) const
+{  // expects exactly get_size()[1] indices; checked by the assertion below
+    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
+    auto exec = this->get_executor();
+    auto permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into permute_cpy.
+    exec->run(dense::make_column_permute(permutation_indices, this,
+                                         permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int64> indices
+std::unique_ptr<LinOp> Dense<ValueType>::row_permute(
+    const Array<int64> *permutation_indices) const
+{  // expects exactly get_size()[0] indices; checked by the assertion below
+    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
+    auto exec = this->get_executor();
+    auto permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into permute_cpy.
+    exec->run(
+        dense::make_row_permute(permutation_indices, this, permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int64> indices
+std::unique_ptr<LinOp> Dense<ValueType>::column_permute(
+    const Array<int64> *permutation_indices) const
+{  // expects exactly get_size()[1] indices; checked by the assertion below
+    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
+    auto exec = this->get_executor();
+    auto permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into permute_cpy.
+    exec->run(dense::make_column_permute(permutation_indices, this,
+                                         permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int32> indices
+std::unique_ptr<LinOp> Dense<ValueType>::inverse_row_permute(
+    const Array<int32> *inverse_permutation_indices) const
+{  // expects exactly get_size()[0] indices; checked by the assertion below
+    GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(),
+                  this->get_size()[0]);
+    auto exec = this->get_executor();
+    auto inverse_permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into the new copy.
+    exec->run(dense::make_inverse_row_permute(inverse_permutation_indices, this,
+                                              inverse_permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(inverse_permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int32> indices
+std::unique_ptr<LinOp> Dense<ValueType>::inverse_column_permute(
+    const Array<int32> *inverse_permutation_indices) const
+{  // expects exactly get_size()[1] indices; checked by the assertion below
+    GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(),
+                  this->get_size()[1]);
+    auto exec = this->get_executor();
+    auto inverse_permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into the new copy.
+    exec->run(dense::make_inverse_column_permute(
+        inverse_permutation_indices, this, inverse_permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(inverse_permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int64> indices
+std::unique_ptr<LinOp> Dense<ValueType>::inverse_row_permute(
+    const Array<int64> *inverse_permutation_indices) const
+{  // expects exactly get_size()[0] indices; checked by the assertion below
+    GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(),
+                  this->get_size()[0]);
+    auto exec = this->get_executor();
+    auto inverse_permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into the new copy.
+    exec->run(dense::make_inverse_row_permute(inverse_permutation_indices, this,
+                                              inverse_permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(inverse_permute_cpy);
+}
+
+
+template <typename ValueType>  // overload taking Array<int64> indices
+std::unique_ptr<LinOp> Dense<ValueType>::inverse_column_permute(
+    const Array<int64> *inverse_permutation_indices) const
+{  // expects exactly get_size()[1] indices; checked by the assertion below
+    GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(),
+                  this->get_size()[1]);
+    auto exec = this->get_executor();
+    auto inverse_permute_cpy = Dense::create(exec, this->get_size());
+    // The kernel takes `this` as the const input and writes into the new copy.
+    exec->run(dense::make_inverse_column_permute(
+        inverse_permutation_indices, this, inverse_permute_cpy.get()));
+    // *this is not modified; the newly created matrix is handed to the caller.
+    return std::move(inverse_permute_cpy);
+}
+
+
 #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type>
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX);
 
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 4857fb81db9..6c362eeaeb4 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,10 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_DENSE_KERNELS_HPP_
 
 
-#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 namespace kernels {
 
@@ -71,37 +74,37 @@ namespace kernels {
 #define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type)               \
     void compute_norm2(std::shared_ptr<const DefaultExecutor> exec, \
                        const matrix::Dense<_type> *x,               \
-                       matrix::Dense<_type> *result)
+                       matrix::Dense<remove_complex<_type>> *result)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec)        \
     void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Coo<_type, _prec> *other,            \
-                        const matrix::Dense<_type> *source)
+                        const matrix::Dense<_type> *source,          \
+                        matrix::Coo<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(_type, _prec)        \
     void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<_type, _prec> *other,            \
-                        const matrix::Dense<_type> *source)
+                        const matrix::Dense<_type> *source,          \
+                        matrix::Csr<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(_type, _prec)        \
     void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Ell<_type, _prec> *other,            \
-                        const matrix::Dense<_type> *source)
+                        const matrix::Dense<_type> *source,          \
+                        matrix::Ell<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(_type, _prec)        \
     void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec, \
-                           matrix::Hybrid<_type, _prec> *other,         \
-                           const matrix::Dense<_type> *source)
+                           const matrix::Dense<_type> *source,          \
+                           matrix::Hybrid<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(_type, _prec)        \
     void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Sellp<_type, _prec> *other,          \
-                          const matrix::Dense<_type> *source)
+                          const matrix::Dense<_type> *source,          \
+                          matrix::Sellp<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(_type, _prec)        \
     void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                                 matrix::SparsityCsr<_type, _prec> *other,    \
-                                 const matrix::Dense<_type> *source)
+                                 const matrix::Dense<_type> *source,          \
+                                 matrix::SparsityCsr<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL(_type)               \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec, \
@@ -125,13 +128,37 @@ namespace kernels {
 
 #define GKO_DECLARE_TRANSPOSE_KERNEL(_type)                     \
     void transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                   matrix::Dense<_type> *trans,                 \
-                   const matrix::Dense<_type> *orig)
+                   const matrix::Dense<_type> *orig,            \
+                   matrix::Dense<_type> *trans)
 
 #define GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(_type)                     \
     void conj_transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Dense<_type> *trans,                 \
-                        const matrix::Dense<_type> *orig)
+                        const matrix::Dense<_type> *orig,            \
+                        matrix::Dense<_type> *trans)
+
+#define GKO_DECLARE_ROW_PERMUTE_KERNEL(_vtype, _itype)            \
+    void row_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                     const Array<_itype> *permutation_indices,    \
+                     const matrix::Dense<_vtype> *orig,           \
+                     matrix::Dense<_vtype> *row_permuted)
+
+#define GKO_DECLARE_COLUMN_PERMUTE_KERNEL(_vtype, _itype)            \
+    void column_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                        const Array<_itype> *permutation_indices,    \
+                        const matrix::Dense<_vtype> *orig,           \
+                        matrix::Dense<_vtype> *column_permuted)
+
+#define GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(_vtype, _itype)            \
+    void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                             const Array<_itype> *permutation_indices,    \
+                             const matrix::Dense<_vtype> *orig,           \
+                             matrix::Dense<_vtype> *row_permuted)
+
+#define GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(_vtype, _itype)            \
+    void inverse_column_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                                const Array<_itype> *permutation_indices,    \
+                                const matrix::Dense<_vtype> *orig,           \
+                                matrix::Dense<_vtype> *column_permuted)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                        \
     template <typename ValueType>                                           \
@@ -169,7 +196,15 @@ namespace kernels {
     template <typename ValueType>                                           \
     GKO_DECLARE_TRANSPOSE_KERNEL(ValueType);                                \
     template <typename ValueType>                                           \
-    GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(ValueType)
+    GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(ValueType);                           \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_ROW_PERMUTE_KERNEL(ValueType, IndexType);                   \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);                \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
 
 
 namespace omp {
@@ -199,6 +234,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace dense {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // expands the kernel declarations into hip::dense
+
+}  // namespace dense
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index 1d4bed57288..e12ad5ff83f 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -108,12 +108,32 @@ void Ell<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
 }
 
 
+template <typename ValueType, typename IndexType>  // same IndexType on both sides
+void Ell<ValueType, IndexType>::convert_to(
+    Ell<next_precision<ValueType>, IndexType> *result) const
+{  // member-wise copy of every field set below, then the size
+    result->values_ = this->values_;  // NOTE(review): presumably converts element precision in the Array assignment - confirm
+    result->col_idxs_ = this->col_idxs_;
+    result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_;
+    result->stride_ = this->stride_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>  // move_to for the next_precision Ell type
+void Ell<ValueType, IndexType>::move_to(
+    Ell<next_precision<ValueType>, IndexType> *result)
+{
+    this->convert_to(result);  // plain delegation; this function clears nothing in *this
+}
+
+
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(ell::make_convert_to_dense(tmp.get(), this));
+    exec->run(ell::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -136,7 +156,7 @@ void Ell<ValueType, IndexType>::convert_to(
 
     auto tmp = Csr<ValueType, IndexType>::create(
         exec, this->get_size(), num_stored_elements, result->get_strategy());
-    exec->run(ell::make_convert_to_csr(tmp.get(), this));
+    exec->run(ell::make_convert_to_csr(this, tmp.get()));
 
     tmp->make_srow();
     tmp->move_to(result);
diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp
index 42728a5ade5..42331c9e53b 100644
--- a/core/matrix/ell_kernels.hpp
+++ b/core/matrix/ell_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,9 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_ELL_KERNELS_HPP_
 
 
+#include <ginkgo/core/matrix/ell.hpp>
+
+
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
 
 
 namespace gko {
@@ -56,15 +58,15 @@ namespace kernels {
                        const matrix::Dense<ValueType> *beta,        \
                        matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)  \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Dense<ValueType> *result,            \
-                          const matrix::Ell<ValueType, IndexType> *source)
+#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Ell<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
 
-#define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)  \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<ValueType, IndexType> *result,   \
-                        const matrix::Ell<ValueType, IndexType> *source)
+#define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Ell<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL(ValueType, IndexType)      \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec,     \
@@ -120,6 +122,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace ell {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // expands the kernel declarations into hip::ell
+
+}  // namespace ell
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index 40a50a377ea..adbb48bd1aa 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -109,12 +109,32 @@ void Hybrid<ValueType, IndexType>::apply_impl(const LinOp *alpha,
 }
 
 
+template <typename ValueType, typename IndexType>  // same IndexType on both sides
+void Hybrid<ValueType, IndexType>::convert_to(
+    Hybrid<next_precision<ValueType>, IndexType> *result) const
+{  // NOTE(review): dereferences result->ell_ / result->coo_; assumes both are already allocated - confirm
+    this->ell_->convert_to(result->ell_.get());  // ell_ member converted into result's ell_
+    this->coo_->convert_to(result->coo_.get());  // coo_ member converted into result's coo_
+    // TODO: set the strategy of result correctly; this function does not touch it.
+    // There is currently no way to clone the strategy as Csr::convert_to does.
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>  // move_to for the next_precision Hybrid type
+void Hybrid<ValueType, IndexType>::move_to(
+    Hybrid<next_precision<ValueType>, IndexType> *result)
+{
+    this->convert_to(result);  // plain delegation; this function clears nothing in *this
+}
+
+
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(hybrid::make_convert_to_dense(tmp.get(), this));
+    exec->run(hybrid::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -137,7 +157,7 @@ void Hybrid<ValueType, IndexType>::convert_to(
 
     auto tmp = Csr<ValueType, IndexType>::create(
         exec, this->get_size(), num_stored_elements, result->get_strategy());
-    exec->run(hybrid::make_convert_to_csr(tmp.get(), this));
+    exec->run(hybrid::make_convert_to_csr(this, tmp.get()));
 
     tmp->make_srow();
     tmp->move_to(result);
diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp
index 84230986ada..788fe66e15b 100644
--- a/core/matrix/hybrid_kernels.hpp
+++ b/core/matrix/hybrid_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,23 +34,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_HYBRID_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 
 
+#include <ginkgo/core/matrix/dense.hpp>
+
+
 namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,   \
-                          matrix::Dense<ValueType> *result,              \
-                          const matrix::Hybrid<ValueType, IndexType> *source)
+#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,        \
+                          const matrix::Hybrid<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
 
-#define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,   \
-                        matrix::Csr<ValueType, IndexType> *result,     \
-                        const matrix::Hybrid<ValueType, IndexType> *source)
+#define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,        \
+                        const matrix::Hybrid<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL(ValueType, IndexType)      \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec,        \
@@ -93,6 +95,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace hybrid {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // expands the kernel declarations into hip::hybrid
+
+}  // namespace hybrid
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp
index e5f0ef3a669..884e5781ee8 100644
--- a/core/matrix/identity.cpp
+++ b/core/matrix/identity.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -69,6 +69,20 @@ std::unique_ptr<LinOp> IdentityFactory<ValueType>::generate_impl(
 }
 
 
+template <typename ValueType>
+std::unique_ptr<LinOp> Identity<ValueType>::transpose() const
+{
+    return this->clone();  // I^T == I, so a copy of this operator suffices
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Identity<ValueType>::conj_transpose() const
+{
+    return this->clone();  // I^H == I, so a copy of this operator suffices
+}
+
+
 #define GKO_DECLARE_IDENTITY_MATRIX(_type) class Identity<_type>
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_MATRIX);
 #define GKO_DECLARE_IDENTITY_FACTORY(_type) class IdentityFactory<_type>
diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp
new file mode 100644
index 00000000000..a8b6e5ff139
--- /dev/null
+++ b/core/matrix/permutation.cpp
@@ -0,0 +1,45 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+namespace gko {
+namespace matrix {
+
+
+#define GKO_DECLARE_PERMUTATION_MATRIX(_type) class Permutation<_type>
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_MATRIX);
+
+
+}  // namespace matrix
+}  // namespace gko
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index 880678b3738..5d282fbc495 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -41,12 +41,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/allocator.hpp"
 #include "core/matrix/sellp_kernels.hpp"
 
 
-#include <vector>
-
-
 namespace gko {
 namespace matrix {
 namespace sellp {
@@ -69,7 +67,7 @@ template <typename ValueType, typename IndexType>
 size_type calculate_total_cols(const matrix_data<ValueType, IndexType> &data,
                                const size_type slice_size,
                                const size_type stride_factor,
-                               std::vector<size_type> &slice_lengths)
+                               vector<size_type> &slice_lengths)
 {
     size_type nonzeros_per_row = 0;
     IndexType current_row = 0;
@@ -122,12 +120,35 @@ void Sellp<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
 }
 
 
+template <typename ValueType, typename IndexType>
+void Sellp<ValueType, IndexType>::convert_to(
+    Sellp<next_precision<ValueType>, IndexType> *result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->slice_lengths_ = this->slice_lengths_;
+    result->slice_sets_ = this->slice_sets_;
+    result->slice_size_ = this->slice_size_;
+    result->stride_factor_ = this->stride_factor_;
+    result->total_cols_ = this->total_cols_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Sellp<ValueType, IndexType>::move_to(
+    Sellp<next_precision<ValueType>, IndexType> *result)
+{
+    this->convert_to(result);  // no dedicated move path: reuses convert_to
+}
+
+
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(sellp::make_convert_to_dense(tmp.get(), this));
+    exec->run(sellp::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -149,7 +170,7 @@ void Sellp<ValueType, IndexType>::convert_to(
     exec->run(sellp::make_count_nonzeros(this, &num_stored_nonzeros));
     auto tmp = Csr<ValueType, IndexType>::create(
         exec, this->get_size(), num_stored_nonzeros, result->get_strategy());
-    exec->run(sellp::make_convert_to_csr(tmp.get(), this));
+    exec->run(sellp::make_convert_to_csr(this, tmp.get()));
     tmp->make_srow();
     tmp->move_to(result);
 }
@@ -175,7 +196,7 @@ void Sellp<ValueType, IndexType>::read(const mat_data &data)
     // Allocate space for slice_cols.
     size_type slice_num =
         static_cast<index_type>((data.size[0] + slice_size - 1) / slice_size);
-    std::vector<size_type> slice_lengths(slice_num, 0);
+    vector<size_type> slice_lengths(slice_num, 0, {this->get_executor()});
 
     // Get the number of maximum columns for every slice.
     auto total_cols =
diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp
index 0c45b6d7b82..fcd0114e25b 100644
--- a/core/matrix/sellp_kernels.hpp
+++ b/core/matrix/sellp_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,9 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_SELLP_KERNELS_HPP_
 
 
+#include <ginkgo/core/matrix/sellp.hpp>
+
+
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
 
 
 namespace gko {
@@ -56,15 +58,15 @@ namespace kernels {
                        const matrix::Dense<ValueType> *beta,         \
                        matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,  \
-                          matrix::Dense<ValueType> *result,             \
-                          const matrix::Sellp<ValueType, IndexType> *source)
+#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,       \
+                          const matrix::Sellp<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
 
-#define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,  \
-                        matrix::Csr<ValueType, IndexType> *result,    \
-                        const matrix::Sellp<ValueType, IndexType> *source)
+#define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,       \
+                        const matrix::Sellp<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL(ValueType, IndexType)      \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec,       \
@@ -111,6 +113,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace sellp {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace sellp
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index 3c7cfa1363f..851dcd946a5 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include <memory>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -152,7 +149,7 @@ std::unique_ptr<LinOp> SparsityCsr<ValueType, IndexType>::transpose() const
     auto trans_cpy = SparsityCsr::create(exec, gko::transpose(this->get_size()),
                                          this->get_num_nonzeros());
 
-    exec->run(sparsity_csr::make_transpose(trans_cpy.get(), this));
+    exec->run(sparsity_csr::make_transpose(this, trans_cpy.get()));
     return std::move(trans_cpy);
 }
 
@@ -177,7 +174,7 @@ SparsityCsr<ValueType, IndexType>::to_adjacency_matrix() const
                             this->get_num_nonzeros() - num_diagonal_elements);
 
     exec->run(sparsity_csr::make_remove_diagonal_elements(
-        adj_mat.get(), this->get_const_row_ptrs(), this->get_const_col_idxs()));
+        this->get_const_row_ptrs(), this->get_const_col_idxs(), adj_mat.get()));
     return std::move(adj_mat);
 }
 
diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp
index f9af3dcdffa..58ec58e789f 100644
--- a/core/matrix/sparsity_csr_kernels.hpp
+++ b/core/matrix/sparsity_csr_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,9 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_SPARSITY_CSR_KERNELS_HPP_
 
 
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
 namespace gko {
@@ -60,8 +62,8 @@ namespace kernels {
                                                                  IndexType) \
     void remove_diagonal_elements(                                          \
         std::shared_ptr<const DefaultExecutor> exec,                        \
-        matrix::SparsityCsr<ValueType, IndexType> *matrix,                  \
-        const IndexType *row_ptrs, const IndexType *col_idxs)
+        const IndexType *row_ptrs, const IndexType *col_idxs,               \
+        matrix::SparsityCsr<ValueType, IndexType> *matrix)
 
 #define GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL(ValueType, \
                                                                     IndexType) \
@@ -70,10 +72,10 @@ namespace kernels {
         const matrix::SparsityCsr<ValueType, IndexType> *matrix,               \
         size_type *num_diagonal_elements)
 
-#define GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \
-    void transpose(std::shared_ptr<const DefaultExecutor> exec,         \
-                   matrix::SparsityCsr<ValueType, IndexType> *trans,    \
-                   const matrix::SparsityCsr<ValueType, IndexType> *orig)
+#define GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL(ValueType, IndexType)   \
+    void transpose(std::shared_ptr<const DefaultExecutor> exec,           \
+                   const matrix::SparsityCsr<ValueType, IndexType> *orig, \
+                   matrix::SparsityCsr<ValueType, IndexType> *trans)
 
 #define GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType) \
     void sort_by_column_index(                                              \
@@ -133,6 +135,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace sparsity_csr {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace sparsity_csr
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp
new file mode 100644
index 00000000000..0b8738c5594
--- /dev/null
+++ b/core/preconditioner/isai.cpp
@@ -0,0 +1,246 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/solver/lower_trs.hpp>
+#include <ginkgo/core/solver/upper_trs.hpp>
+
+
+#include "core/preconditioner/isai_kernels.hpp"
+
+
+namespace gko {
+namespace preconditioner {
+namespace isai {
+
+
+GKO_REGISTER_OPERATION(generate_tri_inverse, isai::generate_tri_inverse);
+GKO_REGISTER_OPERATION(generate_excess_system, isai::generate_excess_system);
+GKO_REGISTER_OPERATION(scatter_excess_solution, isai::scatter_excess_solution);
+
+
+}  // namespace isai
+
+
+/**
+ * @internal
+ *
+ * Helper function that converts the given matrix to the (const) CSR format with
+ * additional sorting.
+ *
+ * If `skip_sorting` is true, the matrix is on the same executor and has a
+ * dynamic type of `const Csr`, the same pointer is returned, sharing
+ * ownership with the pointer passed in by the caller.
+ * In all other cases, a new matrix is created, which stores the converted Csr
+ * matrix.
+ * If `skip_sorting` is false, the matrix will be sorted by column index,
+ * otherwise, it will not be sorted.
+ */
+template <typename Csr>
+std::shared_ptr<const Csr> convert_to_csr_and_sort(
+    const std::shared_ptr<const Executor> &exec,
+    std::shared_ptr<const LinOp> mtx, bool skip_sorting)
+{
+    static_assert(
+        std::is_same<Csr, matrix::Csr<typename Csr::value_type,
+                                      typename Csr::index_type>>::value,
+        "The given `Csr` type must be of type `matrix::Csr`!");
+    if (skip_sorting && exec == mtx->get_executor()) {
+        auto csr_mtx = std::dynamic_pointer_cast<const Csr>(mtx);
+        if (csr_mtx) {
+            // Here, we can just forward the pointer (sharing ownership),
+            // since the caller asked to skip sorting and the format matches
+            return csr_mtx;
+        }
+    }
+    auto copy = Csr::create(exec);
+    as<ConvertibleTo<Csr>>(mtx)->convert_to(lend(copy));
+    // Here, we assume that a sorted matrix converted to CSR will also be
+    // sorted
+    if (!skip_sorting) {
+        copy->sort_by_column_index();
+    }
+    return {std::move(copy)};
+}
+
+
+/**
+ * @internal
+ *
+ * Helper function that extends the sparsity pattern of the matrix M to M^n
+ * without changing its values.
+ *
+ * The input matrix must be sorted and on the correct executor for this to work.
+ * If `power` is 1, an unmodified copy of the matrix is returned.
+ */
+template <typename Csr>
+std::shared_ptr<Csr> extend_sparsity(
+    const std::shared_ptr<const Executor> &exec,
+    std::shared_ptr<const Csr> mtx, int power)
+{
+    GKO_ASSERT_EQ(power >= 1, true);
+    if (power == 1) {
+        // copy the matrix, as it will be used to store the inverse
+        return {mtx->clone()};
+    }
+    auto id_power = mtx->clone();
+    auto tmp = Csr::create(exec, mtx->get_size());
+    // accumulates mtx * the remainder from odd powers
+    auto acc = mtx->clone();
+    // compute mtx^(power-1) using square-and-multiply
+    int i = power - 1;
+    while (i > 1) {
+        if (i % 2 != 0) {
+            // odd exponent, store one power in acc: x^(2n+1) -> x * x^(2n)
+            id_power->apply(lend(acc), lend(tmp));
+            std::swap(acc, tmp);
+            i--;
+        }
+        // square id_power: x^(2n) -> (x^2)^n
+        id_power->apply(lend(id_power), lend(tmp));
+        std::swap(id_power, tmp);
+        i /= 2;
+    }
+    // combine acc and id_power again
+    id_power->apply(lend(acc), lend(tmp));
+    return {std::move(tmp)};
+}
+
+
+template <isai_type IsaiType, typename ValueType, typename IndexType>
+void Isai<IsaiType, ValueType, IndexType>::generate_inverse(
+    std::shared_ptr<const LinOp> input, bool skip_sorting, int power)
+{
+    using Dense = matrix::Dense<ValueType>;
+    using LowerTrs = solver::LowerTrs<ValueType, IndexType>;
+    using UpperTrs = solver::UpperTrs<ValueType, IndexType>;
+    GKO_ASSERT_IS_SQUARE_MATRIX(input);
+    auto exec = this->get_executor();
+    auto to_invert = convert_to_csr_and_sort<Csr>(exec, input, skip_sorting);
+    auto inverted = extend_sparsity(exec, to_invert, power);
+    auto num_rows = inverted->get_size()[0];
+    auto is_lower = IsaiType == isai_type::lower;
+
+    // This stores the beginning of the RHS for the sparse block associated with
+    // each row of `inverted`
+    Array<IndexType> excess_block_ptrs{exec, num_rows + 1};
+    // This stores the beginning of the non-zeros belonging to each row in the
+    // system of excess blocks
+    Array<IndexType> excess_row_ptrs_full{exec, num_rows + 1};
+
+    exec->run(isai::make_generate_tri_inverse(
+        lend(to_invert), lend(inverted), excess_block_ptrs.get_data(),
+        excess_row_ptrs_full.get_data(), is_lower));
+
+    auto excess_dim =
+        exec->copy_val_to_host(excess_block_ptrs.get_const_data() + num_rows);
+    // if we had long rows (i.e. the excess system is non-empty):
+    if (excess_dim > 0) {
+        // build the excess sparse triangular system
+        auto excess_nnz = exec->copy_val_to_host(
+            excess_row_ptrs_full.get_const_data() + num_rows);
+        auto excess_system =
+            Csr::create(exec, dim<2>(excess_dim, excess_dim), excess_nnz);
+        auto excess_rhs = Dense::create(exec, dim<2>(excess_dim, 1));
+        auto excess_solution = Dense::create(exec, dim<2>(excess_dim, 1));
+        exec->run(isai::make_generate_excess_system(
+            lend(to_invert), lend(inverted), excess_block_ptrs.get_const_data(),
+            excess_row_ptrs_full.get_const_data(), lend(excess_system),
+            lend(excess_rhs)));
+        // solve the transposed system (hence the opposite triangular solver)
+        std::unique_ptr<LinOpFactory> trs_factory;
+        if (is_lower) {
+            trs_factory = UpperTrs::build().on(exec);
+        } else {
+            trs_factory = LowerTrs::build().on(exec);
+        }
+        trs_factory->generate(share(excess_system->transpose()))
+            ->apply(lend(excess_rhs), lend(excess_solution));
+        // and copy the results back into the approximate inverse `inverted`
+        exec->run(isai::make_scatter_excess_solution(
+            excess_block_ptrs.get_const_data(), lend(excess_solution),
+            lend(inverted)));
+    }
+
+    approximate_inverse_ = std::move(inverted);
+}
+
+
+template <isai_type IsaiType, typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Isai<IsaiType, ValueType, IndexType>::transpose() const
+{
+    std::unique_ptr<transposed_type> transp{
+        new transposed_type{this->get_executor()}};
+    transp->set_size(gko::transpose(this->get_size()));
+    transp->approximate_inverse_ =
+        share(as<Csr>(this->get_approximate_inverse()->transpose()));
+
+    return std::move(transp);
+}
+
+
+template <isai_type IsaiType, typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Isai<IsaiType, ValueType, IndexType>::conj_transpose()
+    const
+{
+    std::unique_ptr<transposed_type> transp{
+        new transposed_type{this->get_executor()}};
+    transp->set_size(gko::transpose(this->get_size()));
+    transp->approximate_inverse_ =
+        share(as<Csr>(this->get_approximate_inverse()->conj_transpose()));
+
+    return std::move(transp);
+}
+
+
+#define GKO_DECLARE_LOWER_ISAI(ValueType, IndexType) \
+    class Isai<isai_type::lower, ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_ISAI);
+
+#define GKO_DECLARE_UPPER_ISAI(ValueType, IndexType) \
+    class Isai<isai_type::upper, ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_ISAI);
+
+
+}  // namespace preconditioner
+}  // namespace gko
diff --git a/core/preconditioner/isai_kernels.hpp b/core/preconditioner/isai_kernels.hpp
new file mode 100644
index 00000000000..ce53d51cd3c
--- /dev/null
+++ b/core/preconditioner/isai_kernels.hpp
@@ -0,0 +1,121 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_
+#define GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL(ValueType, IndexType)    \
+    void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,    \
+                              const matrix::Csr<ValueType, IndexType> *input, \
+                              matrix::Csr<ValueType, IndexType> *inverse,     \
+                              IndexType *excess_rhs_ptrs,                     \
+                              IndexType *excess_nz_ptrs, bool lower)
+
+#define GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL(ValueType, IndexType) \
+    void generate_excess_system(                                             \
+        std::shared_ptr<const DefaultExecutor> exec,                         \
+        const matrix::Csr<ValueType, IndexType> *input,                      \
+        const matrix::Csr<ValueType, IndexType> *inverse,                    \
+        const IndexType *excess_rhs_ptrs, const IndexType *excess_nz_ptrs,   \
+        matrix::Csr<ValueType, IndexType> *excess_system,                    \
+        matrix::Dense<ValueType> *excess_rhs)
+
+#define GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL(ValueType, IndexType) \
+    void scatter_excess_solution(                                             \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const IndexType *excess_rhs_ptrs,                                     \
+        const matrix::Dense<ValueType> *excess_solution,                      \
+        matrix::Csr<ValueType, IndexType> *inverse)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                      \
+    constexpr auto row_size_limit = 32;                                   \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                     \
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL(ValueType, IndexType)
+
+
+namespace omp {
+namespace isai {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace isai
+}  // namespace omp
+
+
+namespace cuda {
+namespace isai {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace isai
+}  // namespace cuda
+
+
+namespace reference {
+namespace isai {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace isai
+}  // namespace reference
+
+
+namespace hip {
+namespace isai {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace isai
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index 954ad6e480d..f7351cd779c 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -55,6 +55,8 @@ GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
 GKO_REGISTER_OPERATION(apply, jacobi::apply);
 GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
 GKO_REGISTER_OPERATION(generate, jacobi::generate);
+GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
+GKO_REGISTER_OPERATION(conj_transpose_jacobi, jacobi::conj_transpose_jacobi);
 GKO_REGISTER_OPERATION(convert_to_dense, jacobi::convert_to_dense);
 GKO_REGISTER_OPERATION(initialize_precisions, jacobi::initialize_precisions);
 
@@ -142,6 +144,48 @@ void Jacobi<ValueType, IndexType>::write(mat_data &data) const
 }
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Jacobi<ValueType, IndexType>::transpose() const
+{
+    auto res = std::unique_ptr<Jacobi<ValueType, IndexType>>(
+        new Jacobi<ValueType, IndexType>(this->get_executor()));
+    // Jacobi enforces square matrices, so no dim transposition necessary
+    res->set_size(this->get_size());
+    res->storage_scheme_ = storage_scheme_;
+    res->num_blocks_ = num_blocks_;
+    res->blocks_.resize_and_reset(blocks_.get_num_elems());
+    res->conditioning_ = conditioning_;
+    res->parameters_ = parameters_;
+    this->get_executor()->run(jacobi::make_transpose_jacobi(
+        num_blocks_, parameters_.max_block_size,
+        parameters_.storage_optimization.block_wise, parameters_.block_pointers,
+        blocks_, storage_scheme_, res->blocks_));
+
+    return std::move(res);
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Jacobi<ValueType, IndexType>::conj_transpose() const
+{
+    auto res = std::unique_ptr<Jacobi<ValueType, IndexType>>(
+        new Jacobi<ValueType, IndexType>(this->get_executor()));
+    // Jacobi enforces square matrices, so no dim transposition necessary
+    res->set_size(this->get_size());
+    res->storage_scheme_ = storage_scheme_;
+    res->num_blocks_ = num_blocks_;
+    res->blocks_.resize_and_reset(blocks_.get_num_elems());
+    res->conditioning_ = conditioning_;
+    res->parameters_ = parameters_;
+    this->get_executor()->run(jacobi::make_conj_transpose_jacobi(
+        num_blocks_, parameters_.max_block_size,
+        parameters_.storage_optimization.block_wise, parameters_.block_pointers,
+        blocks_, storage_scheme_, res->blocks_));
+
+    return std::move(res);
+}
+
+
 template <typename ValueType, typename IndexType>
 void Jacobi<ValueType, IndexType>::detect_blocks(
     const matrix::Csr<ValueType, IndexType> *system_matrix)
@@ -159,8 +203,7 @@ void Jacobi<ValueType, IndexType>::detect_blocks(
 template <typename ValueType, typename IndexType>
 void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix)
 {
-    GKO_ASSERT_EQUAL_DIMENSIONS(system_matrix,
-                                transpose(system_matrix->get_size()));
+    GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
     const auto exec = this->get_executor();
     const auto csr_mtx = copy_and_convert_to<matrix::Csr<ValueType, IndexType>>(
         exec, system_matrix);
diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp
index 9c839cf556d..12d232c26f8 100644
--- a/core/preconditioner/jacobi_kernels.hpp
+++ b/core/preconditioner/jacobi_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,10 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_PRECONDITIONER_JACOBI_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
 
+#include <ginkgo/core/matrix/csr.hpp>
+
+
 namespace gko {
 namespace kernels {
 
@@ -83,6 +85,28 @@ namespace kernels {
         const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
         matrix::Dense<ValueType> *x)
 
+#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)          \
+    void transpose_jacobi(                                                 \
+        std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
+        uint32 max_block_size,                                             \
+        const Array<precision_reduction> &block_precisions,                \
+        const Array<IndexType> &block_pointers,                            \
+        const Array<ValueType> &blocks,                                    \
+        const preconditioner::block_interleaved_storage_scheme<IndexType>  \
+            &storage_scheme,                                               \
+        Array<ValueType> &out_blocks)  // the only non-const array argument
+
+#define GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)     \
+    void conj_transpose_jacobi(                                            \
+        std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
+        uint32 max_block_size,                                             \
+        const Array<precision_reduction> &block_precisions,                \
+        const Array<IndexType> &block_pointers,                            \
+        const Array<ValueType> &blocks,                                    \
+        const preconditioner::block_interleaved_storage_scheme<IndexType>  \
+            &storage_scheme,                                               \
+        Array<ValueType> &out_blocks)  // the only non-const array argument
+
 #define GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)   \
     void convert_to_dense(                                                 \
         std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -108,6 +132,10 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                 \
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType);     \
     template <typename ValueType, typename IndexType>                 \
+    GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType);        \
+    template <typename ValueType, typename IndexType>                 \
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType, typename IndexType>                 \
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \
     GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL()
 
@@ -139,6 +167,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace jacobi {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // HIP declarations of the jacobi kernels
+
+}  // namespace jacobi
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp
index 7d46d7187ea..904820cbce2 100644
--- a/core/preconditioner/jacobi_utils.hpp
+++ b/core/preconditioner/jacobi_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp
new file mode 100644
index 00000000000..a7519f48a33
--- /dev/null
+++ b/core/solver/bicg.cpp
@@ -0,0 +1,240 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicg.hpp>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
+#include <ginkgo/core/base/utils.hpp>
+
+
+#include "core/solver/bicg_kernels.hpp"
+
+
+namespace gko {
+namespace solver {
+
+
+namespace bicg {
+
+// Creates the bicg::make_<op>() wrappers that apply_impl hands to exec->run().
+GKO_REGISTER_OPERATION(initialize, bicg::initialize);
+GKO_REGISTER_OPERATION(step_1, bicg::step_1);
+GKO_REGISTER_OPERATION(step_2, bicg::step_2);
+
+
+}  // namespace bicg
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Bicg<ValueType>::transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, transposed operators
+        .with_generated_preconditioner(share(precond->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->transpose()));
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Bicg<ValueType>::conj_transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, conjugate-transposed operators
+        .with_generated_preconditioner(share(precond->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->conj_transpose()));
+}
+
+
+/**
+ * @internal
+ * Transposes an operator by first obtaining it as a CSR matrix of type
+ * CsrType and then calling transpose() on that matrix.
+ *
+ * @tparam CsrType  CSR matrix type used as the intermediate format
+ * @param mtx  operator to transpose
+ *
+ * @return the result of CsrType::transpose() on the converted matrix
+ */
+template <typename CsrType>
+std::unique_ptr<LinOp> transpose_with_csr(const LinOp *mtx)
+{
+    // presumably cast so that set_strategy() can be called on the result
+    auto mutable_mtx = const_cast<LinOp *>(mtx);
+    auto csr = copy_and_convert_to<CsrType>(mtx->get_executor(), mutable_mtx);
+    csr->set_strategy(std::make_shared<typename CsrType::classical>());
+
+    return csr->transpose();
+}
+
+// Solves system_matrix_ * x = b with BiCG, starting from the x passed in.
+template <typename ValueType>
+void Bicg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
+{
+    using std::swap;
+    using Vector = matrix::Dense<ValueType>;
+    constexpr uint8 RelativeStoppingId{1};
+
+    auto exec = this->get_executor();
+
+    auto one_op = initialize<Vector>({one<ValueType>()}, exec);
+    auto neg_one_op = initialize<Vector>({-one<ValueType>()}, exec);
+
+    auto dense_b = as<const Vector>(b);
+    auto dense_x = as<Vector>(x);
+    auto r = Vector::create_with_config_of(dense_b);
+    auto r2 = Vector::create_with_config_of(dense_b);
+    auto z = Vector::create_with_config_of(dense_b);
+    auto z2 = Vector::create_with_config_of(dense_b);
+    auto p = Vector::create_with_config_of(dense_b);
+    auto p2 = Vector::create_with_config_of(dense_b);
+    auto q = Vector::create_with_config_of(dense_b);
+    auto q2 = Vector::create_with_config_of(dense_b);
+
+    auto alpha = Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
+    auto beta = Vector::create_with_config_of(alpha.get());
+    auto prev_rho = Vector::create_with_config_of(alpha.get());
+    auto rho = Vector::create_with_config_of(alpha.get());
+
+    bool one_changed{};
+    Array<stopping_status> stop_status(alpha->get_executor(),
+                                       dense_b->get_size()[1]);
+
+    // TODO: replace this with automatic merged kernel generator
+    exec->run(bicg::make_initialize(
+        dense_b, r.get(), z.get(), p.get(), q.get(), prev_rho.get(), rho.get(),
+        r2.get(), z2.get(), p2.get(), q2.get(), &stop_status));
+    // rho = 0.0
+    // prev_rho = 1.0
+    // z = p = q = 0
+    // r = r2 = dense_b
+    // z2 = p2 = q2 = 0
+
+    std::unique_ptr<LinOp> trans_A;
+    auto transposable_system_matrix =
+        dynamic_cast<const Transposable *>(system_matrix_.get());
+    // Fallback for non-Transposable operators: convert to CSR, then transpose.
+    if (transposable_system_matrix) {
+        trans_A = transposable_system_matrix->transpose();
+    } else {
+        // TODO Extend when adding more IndexTypes
+        // Try to figure out the IndexType that can be used for the CSR matrix
+        using Csr32 = matrix::Csr<ValueType, int32>;
+        using Csr64 = matrix::Csr<ValueType, int64>;
+        auto supports_int64 =
+            dynamic_cast<const ConvertibleTo<Csr64> *>(system_matrix_.get());
+        if (supports_int64) {
+            trans_A = transpose_with_csr<Csr64>(system_matrix_.get());
+        } else {
+            trans_A = transpose_with_csr<Csr32>(system_matrix_.get());
+        }
+    }
+    // The preconditioner has no CSR fallback and is cast to Transposable as is.
+    auto trans_preconditioner_tmp =
+        as<const Transposable>(get_preconditioner().get());
+    auto trans_preconditioner = trans_preconditioner_tmp->transpose();
+
+    system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), r.get());
+    // r = r - Ax =  -1.0 * A*dense_x + 1.0*r
+    r2->copy_from(r.get());
+    // r2 = r
+    auto stop_criterion = stop_criterion_factory_->generate(
+        system_matrix_, std::shared_ptr<const LinOp>(b, [](const LinOp *) {}),
+        x, r.get());
+
+    int iter = -1;  // becomes 0 before the first convergence check
+
+    while (true) {
+        get_preconditioner()->apply(r.get(), z.get());
+        trans_preconditioner->apply(r2.get(), z2.get());
+        z->compute_dot(r2.get(), rho.get());  // rho = dot(z, r2)
+
+        ++iter;
+        this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
+                                                            dense_x);
+        if (stop_criterion->update()
+                .num_iterations(iter)
+                .residual(r.get())
+                .solution(dense_x)
+                .check(RelativeStoppingId, true, &stop_status, &one_changed)) {
+            break;
+        }
+
+        exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(),
+                                    rho.get(), prev_rho.get(), &stop_status));
+        // tmp = rho / prev_rho
+        // p = z + tmp * p
+        // p2 = z2 + tmp * p2
+        system_matrix_->apply(p.get(), q.get());
+        trans_A->apply(p2.get(), q2.get());
+        p2->compute_dot(q.get(), beta.get());  // beta = dot(p2, q)
+        exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(),
+                                    q.get(), q2.get(), beta.get(), rho.get(),
+                                    &stop_status));
+        // tmp = rho / beta
+        // x = x + tmp * p
+        // r = r - tmp * q
+        // r2 = r2 - tmp * q2
+        swap(prev_rho, rho);  // rho of this iteration is prev_rho of the next
+    }
+}
+
+
+template <typename ValueType>
+void Bicg<ValueType>::apply_impl(const LinOp *alpha, const LinOp *b,
+                                 const LinOp *beta, LinOp *x) const
+{
+    // x = alpha * solve(b) + beta * x; the solve starts from a copy of x
+    auto target = as<matrix::Dense<ValueType>>(x);
+    auto solved = target->clone();
+    this->apply(b, solved.get());
+    target->scale(beta);
+    target->add_scaled(alpha, solved.get());
+}
+
+// Explicit instantiation of Bicg for every value type covered by the macro.
+#define GKO_DECLARE_BICG(_type) class Bicg<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG);
+
+
+}  // namespace solver
+}  // namespace gko
diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp
new file mode 100644
index 00000000000..9ef21b3a243
--- /dev/null
+++ b/core/solver/bicg_kernels.hpp
@@ -0,0 +1,134 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_SOLVER_BICG_KERNELS_HPP_
+#define GKO_CORE_SOLVER_BICG_KERNELS_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/stopping_status.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace bicg {
+
+// initialize, per bicg.cpp: r = r2 = b, rho = 0, prev_rho = 1, the rest 0
+#define GKO_DECLARE_BICG_INITIALIZE_KERNEL(_type)                            \
+    void initialize(std::shared_ptr<const DefaultExecutor> exec,             \
+                    const matrix::Dense<_type> *b, matrix::Dense<_type> *r,  \
+                    matrix::Dense<_type> *z, matrix::Dense<_type> *p,        \
+                    matrix::Dense<_type> *q, matrix::Dense<_type> *prev_rho, \
+                    matrix::Dense<_type> *rho, matrix::Dense<_type> *r2,     \
+                    matrix::Dense<_type> *z2, matrix::Dense<_type> *p2,      \
+                    matrix::Dense<_type> *q2,                                \
+                    Array<stopping_status> *stop_status)
+
+// step_1, per bicg.cpp: p = z + t * p, p2 = z2 + t * p2, t = rho / prev_rho
+#define GKO_DECLARE_BICG_STEP_1_KERNEL(_type)                             \
+    void step_1(std::shared_ptr<const DefaultExecutor> exec,              \
+                matrix::Dense<_type> *p, const matrix::Dense<_type> *z,   \
+                matrix::Dense<_type> *p2, const matrix::Dense<_type> *z2, \
+                const matrix::Dense<_type> *rho,                          \
+                const matrix::Dense<_type> *prev_rho,                     \
+                const Array<stopping_status> *stop_status)
+
+// step_2, per bicg.cpp: x += t * p, r -= t * q, r2 -= t * q2, t = rho / beta
+#define GKO_DECLARE_BICG_STEP_2_KERNEL(_type)                                  \
+    void step_2(std::shared_ptr<const DefaultExecutor> exec,                   \
+                matrix::Dense<_type> *x, matrix::Dense<_type> *r,              \
+                matrix::Dense<_type> *r2, const matrix::Dense<_type> *p,       \
+                const matrix::Dense<_type> *q, const matrix::Dense<_type> *q2, \
+                const matrix::Dense<_type> *beta,                              \
+                const matrix::Dense<_type> *rho,                               \
+                const Array<stopping_status> *stop_status)
+
+// Declares all three kernels above as function templates on ValueType.
+#define GKO_DECLARE_ALL_AS_TEMPLATES               \
+    template <typename ValueType>                  \
+    GKO_DECLARE_BICG_INITIALIZE_KERNEL(ValueType); \
+    template <typename ValueType>                  \
+    GKO_DECLARE_BICG_STEP_1_KERNEL(ValueType);     \
+    template <typename ValueType>                  \
+    GKO_DECLARE_BICG_STEP_2_KERNEL(ValueType)
+
+
+}  // namespace bicg
+
+// Each backend namespace below repeats the same three declarations.
+namespace omp {
+namespace bicg {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace bicg
+}  // namespace omp
+
+
+namespace cuda {
+namespace bicg {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace bicg
+}  // namespace cuda
+
+
+namespace reference {
+namespace bicg {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace bicg
+}  // namespace reference
+
+
+namespace hip {
+namespace bicg {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace bicg
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_SOLVER_BICG_KERNELS_HPP_
diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp
index 3d1b55346af..570c9daee6f 100644
--- a/core/solver/bicgstab.cpp
+++ b/core/solver/bicgstab.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -58,6 +58,32 @@ GKO_REGISTER_OPERATION(finalize, bicgstab::finalize);
 }  // namespace bicgstab
 
 
+template <typename ValueType>
+std::unique_ptr<LinOp> Bicgstab<ValueType>::transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, transposed operators
+        .with_generated_preconditioner(share(precond->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->transpose()));
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Bicgstab<ValueType>::conj_transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, conjugate-transposed operators
+        .with_generated_preconditioner(share(precond->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->conj_transpose()));
+}
+
+
 template <typename ValueType>
 void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp
index 0080cdb73ef..8b48151a50f 100644
--- a/core/solver/bicgstab_kernels.hpp
+++ b/core/solver/bicgstab_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -139,6 +139,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace bicgstab {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // kernel declarations for the HIP backend
+
+}  // namespace bicgstab
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp
index aa1a59111b1..838ede4a882 100644
--- a/core/solver/cg.cpp
+++ b/core/solver/cg.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,6 +59,32 @@ GKO_REGISTER_OPERATION(step_2, cg::step_2);
 }  // namespace cg
 
 
+template <typename ValueType>
+std::unique_ptr<LinOp> Cg<ValueType>::transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, transposed operators
+        .with_generated_preconditioner(share(precond->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->transpose()));
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Cg<ValueType>::conj_transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, conjugate-transposed operators
+        .with_generated_preconditioner(share(precond->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->conj_transpose()));
+}
+
+
 template <typename ValueType>
 void Cg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp
index 59e79e60c0a..3a52974033a 100644
--- a/core/solver/cg_kernels.hpp
+++ b/core/solver/cg_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -110,6 +110,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace cg {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // kernel declarations for the HIP backend
+
+}  // namespace cg
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
@@ -117,4 +126,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_CG_KERNELS_HPP
+#endif  // GKO_CORE_SOLVER_CG_KERNELS_HPP_
diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp
index 7ac786745be..f92f9afc30f 100644
--- a/core/solver/cgs.cpp
+++ b/core/solver/cgs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,6 +59,32 @@ GKO_REGISTER_OPERATION(step_3, cgs::step_3);
 }  // namespace cgs
 
 
+template <typename ValueType>
+std::unique_ptr<LinOp> Cgs<ValueType>::transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, transposed operators
+        .with_generated_preconditioner(share(precond->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->transpose()));
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Cgs<ValueType>::conj_transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, conjugate-transposed operators
+        .with_generated_preconditioner(share(precond->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->conj_transpose()));
+}
+
+
 template <typename ValueType>
 void Cgs<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp
index 11899f374ea..1404303b2ce 100644
--- a/core/solver/cgs_kernels.hpp
+++ b/core/solver/cgs_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
+
 namespace gko {
 namespace kernels {
 namespace cgs {
@@ -125,6 +126,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace cgs {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // kernel declarations for the HIP backend
+
+}  // namespace cgs
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
@@ -132,4 +142,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_CGS_KERNELS_HPP
+#endif  // GKO_CORE_SOLVER_CGS_KERNELS_HPP_
diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
index 11e86aa4d3a..595476f1637 100644
--- a/core/solver/fcg.cpp
+++ b/core/solver/fcg.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -56,6 +56,32 @@ GKO_REGISTER_OPERATION(step_2, fcg::step_2);
 }  // namespace fcg
 
 
+template <typename ValueType>
+std::unique_ptr<LinOp> Fcg<ValueType>::transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, transposed operators
+        .with_generated_preconditioner(share(precond->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->transpose()));
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Fcg<ValueType>::conj_transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same stopping criteria, conjugate-transposed operators
+        .with_generated_preconditioner(share(precond->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .on(this->get_executor())
+        ->generate(share(system_mtx->conj_transpose()));
+}
+
+
 template <typename ValueType>
 void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
@@ -67,7 +93,6 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
     constexpr uint8 RelativeStoppingId{1};
 
     auto exec = this->get_executor();
-    size_type num_vectors = dense_b->get_size()[1];
 
     auto one_op = initialize<Vector>({one<ValueType>()}, exec);
     auto neg_one_op = initialize<Vector>({-one<ValueType>()}, exec);
diff --git a/core/solver/fcg_kernels.hpp b/core/solver/fcg_kernels.hpp
index 28be5ade514..dc269f2fa19 100644
--- a/core/solver/fcg_kernels.hpp
+++ b/core/solver/fcg_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -111,6 +111,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace fcg {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;  // kernel declarations for the HIP backend
+
+}  // namespace fcg
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index b56ef394d7b..9e9c39c3848 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -63,33 +63,32 @@ GKO_REGISTER_OPERATION(step_2, gmres::step_2);
 }  // namespace gmres
 
 
-namespace {
-
-
 template <typename ValueType>
-void apply_preconditioner(
-    const LinOp *preconditioner, matrix::Dense<ValueType> *krylov_bases,
-    std::shared_ptr<matrix::Dense<ValueType>> &preconditioned_vector,
-    const size_type iter)
+std::unique_ptr<LinOp> Gmres<ValueType>::transpose() const
 {
-    std::shared_ptr<matrix::Dense<ValueType>> target_basis =
-        krylov_bases->create_submatrix(
-            span{0, krylov_bases->get_size()[0]},
-            span{iter * preconditioned_vector->get_size()[1],
-                 (iter + 1) * preconditioned_vector->get_size()[1]});
-
-    // Apply preconditioner
-    auto identity_pointer =
-        dynamic_cast<const matrix::Identity<ValueType> *>(preconditioner);
-    if (identity_pointer) {
-        preconditioned_vector = target_basis;
-    } else {
-        preconditioner->apply(target_basis.get(), preconditioned_vector.get());
-    }
+    return build()
+        .with_generated_preconditioner(
+            share(as<Transposable>(this->get_preconditioner())->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .with_krylov_dim(this->get_krylov_dim())
+        .on(this->get_executor())
+        ->generate(
+            share(as<Transposable>(this->get_system_matrix())->transpose()));
 }
 
 
-}  // namespace
+template <typename ValueType>
+std::unique_ptr<LinOp> Gmres<ValueType>::conj_transpose() const
+{
+    auto precond = as<Transposable>(this->get_preconditioner());
+    auto system_mtx = as<Transposable>(this->get_system_matrix());
+    return build()  // same criteria and Krylov dimension, new operators
+        .with_generated_preconditioner(share(precond->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .with_krylov_dim(this->get_krylov_dim())
+        .on(this->get_executor())
+        ->generate(share(system_mtx->conj_transpose()));
+}
 
 
 template <typename ValueType>
@@ -98,6 +97,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
     GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix_);
 
     using Vector = matrix::Dense<ValueType>;
+    using NormVector = matrix::Dense<remove_complex<ValueType>>;
 
     constexpr uint8 RelativeStoppingId{1};
 
@@ -110,9 +110,8 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
     auto dense_x = as<Vector>(x);
     auto residual = Vector::create_with_config_of(dense_b);
     auto krylov_bases = Vector::create(
-        exec, dim<2>{system_matrix_->get_size()[1],
-                     (krylov_dim_ + 1) * dense_b->get_size()[1]});
-    auto next_krylov_basis = Vector::create_with_config_of(dense_b);
+        exec, dim<2>{system_matrix_->get_size()[1] * (krylov_dim_ + 1),
+                     dense_b->get_size()[1]});
     std::shared_ptr<matrix::Dense<ValueType>> preconditioned_vector =
         Vector::create_with_config_of(dense_b);
     auto hessenberg = Vector::create(
@@ -124,8 +123,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
     auto residual_norm_collection =
         Vector::create(exec, dim<2>{krylov_dim_ + 1, dense_b->get_size()[1]});
     auto residual_norm =
-        Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
-    auto b_norm = Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
+        NormVector::create(exec, dim<2>{1, dense_b->get_size()[1]});
     Array<size_type> final_iter_nums(this->get_executor(),
                                      dense_b->get_size()[1]);
     auto y = Vector::create(exec, dim<2>{krylov_dim_, dense_b->get_size()[1]});
@@ -135,21 +133,19 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
                                        dense_b->get_size()[1]);
 
     // Initialization
-    exec->run(gmres::make_initialize_1(dense_b, b_norm.get(), residual.get(),
+    exec->run(gmres::make_initialize_1(dense_b, residual.get(),
                                        givens_sin.get(), givens_cos.get(),
                                        &stop_status, krylov_dim_));
-    // b_norm = norm(b)
     // residual = dense_b
     // givens_sin = givens_cos = 0
     system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(),
                           residual.get());
     // residual = residual - Ax
-
     exec->run(gmres::make_initialize_2(
         residual.get(), residual_norm.get(), residual_norm_collection.get(),
         krylov_bases.get(), &final_iter_nums, krylov_dim_));
     // residual_norm = norm(residual)
-    // residual_norm_collection = {residual_norm, 0, ..., 0}
+    // residual_norm_collection = {residual_norm, unchanged}
     // krylov_bases(:, 1) = residual / residual_norm
     // final_iter_nums = {0, ..., 0}
 
@@ -178,6 +174,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
             break;
         }
 
+
         if (restart_iter == krylov_dim_) {
             // Restart
             exec->run(gmres::make_step_2(residual_norm_collection.get(),
@@ -186,12 +183,13 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
                                          &final_iter_nums));
             // Solve upper triangular.
             // y = hessenberg \ residual_norm_collection
+            // before_preconditioner = krylov_bases * y
 
             get_preconditioner()->apply(before_preconditioner.get(),
                                         after_preconditioner.get());
             dense_x->add_scaled(one_op.get(), after_preconditioner.get());
             // Solve x
-            // x = x + get_preconditioner() * krylov_bases * y
+            // x = x + get_preconditioner() * before_preconditioner
             residual->copy_from(dense_b);
             // residual = dense_b
             system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(),
@@ -202,16 +200,23 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
                 residual_norm_collection.get(), krylov_bases.get(),
                 &final_iter_nums, krylov_dim_));
             // residual_norm = norm(residual)
-            // residual_norm_collection = {residual_norm, 0, ..., 0}
+            // residual_norm_collection = {residual_norm, unchanged}
             // krylov_bases(:, 1) = residual / residual_norm
             // final_iter_nums = {0, ..., 0}
             restart_iter = 0;
         }
-
-        apply_preconditioner(get_preconditioner().get(), krylov_bases.get(),
-                             preconditioned_vector, restart_iter);
-        // preconditioned_vector = get_preconditioner() *
-        //                         krylov_bases(:, restart_iter)
+        auto this_krylov = krylov_bases->create_submatrix(
+            span{system_matrix_->get_size()[0] * restart_iter,
+                 system_matrix_->get_size()[0] * (restart_iter + 1)},
+            span{0, dense_b->get_size()[1]});
+
+        auto next_krylov = krylov_bases->create_submatrix(
+            span{system_matrix_->get_size()[0] * (restart_iter + 1),
+                 system_matrix_->get_size()[0] * (restart_iter + 2)},
+            span{0, dense_b->get_size()[1]});
+        get_preconditioner()->apply(this_krylov.get(),
+                                    preconditioned_vector.get());
+        // preconditioned_vector = get_preconditioner() * this_krylov
 
         // Do Arnoldi and givens rotation
         auto hessenberg_iter = hessenberg->create_submatrix(
@@ -220,46 +225,58 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
                  dense_b->get_size()[1] * (restart_iter + 1)});
 
         // Start of arnoldi
-        system_matrix_->apply(preconditioned_vector.get(),
-                              next_krylov_basis.get());
-        // next_krylov_basis = A * preconditioned_vector
+        system_matrix_->apply(preconditioned_vector.get(), next_krylov.get());
+        // next_krylov = A * preconditioned_vector
 
         exec->run(gmres::make_step_1(
-            next_krylov_basis.get(), givens_sin.get(), givens_cos.get(),
+            dense_b->get_size()[0], givens_sin.get(), givens_cos.get(),
             residual_norm.get(), residual_norm_collection.get(),
-            krylov_bases.get(), hessenberg_iter.get(), b_norm.get(),
-            restart_iter, &final_iter_nums, &stop_status));
-        // for i in 0:restart_iter
+            krylov_bases.get(), hessenberg_iter.get(), restart_iter,
+            &final_iter_nums, &stop_status));
+        // final_iter_nums += 1 (unconverged)
+        // next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases
+        // for i in 0:restart_iter (inclusive)
         //     hessenberg(restart_iter, i) = next_krylov_basis' *
-        //     krylov_bases(:, i) next_krylov_basis  -= hessenberg(restart_iter,
-        //     i) * krylov_bases(:, i)
+        //         krylov_bases(:, i)
+        //     next_krylov_basis  -= hessenberg(restart_iter, i) *
+        //         krylov_bases(:, i)
         // end
-        // hessenberg(restart_iter, restart_iter + 1) = norm(next_krylov_basis)
-        // next_krylov_basis /= hessenberg(restart_iter, restart_iter + 1)
+        // hessenberg(restart_iter+1, restart_iter) = norm(next_krylov_basis)
+        // next_krylov_basis /= hessenberg(restart_iter + 1, restart_iter)
         // End of arnoldi
         // Start apply givens rotation
-        // for j in 0:restart_iter
+        // for j in 0:restart_iter (exclusive)
         //     temp             =  cos(j)*hessenberg(j) +
         //                         sin(j)*hessenberg(j+1)
-        //     hessenberg(j+1)  = -sin(j)*hessenberg(j) +
-        //                         cos(j)*hessenberg(j+1)
+        //     hessenberg(j+1)  = -conj(sin(j))*hessenberg(j) +
+        //                         conj(cos(j))*hessenberg(j+1)
         //     hessenberg(j)    =  temp;
         // end
         // Calculate sin and cos
+        // this_hess = hessenberg(restart_iter)
+        // next_hess = hessenberg(restart_iter+1)
+        // hypotenuse = sqrt(this_hess * this_hess + next_hess * next_hess);
+        // cos(restart_iter) = conj(this_hess) / hypotenuse;
+        // sin(restart_iter) = conj(next_hess) / hypotenuse
         // hessenberg(restart_iter)   =
-        // cos(restart_iter)*hessenberg(restart_iter) +
-        //                      sin(restart_iter)*hessenberg(restart_iter)
+        //      cos(restart_iter)*hessenberg(restart_iter) +
+        //      sin(restart_iter)*hessenberg(restart_iter+1)
         // hessenberg(restart_iter+1) = 0
         // End apply givens rotation
         // Calculate residual norm
+        // this_rnc = residual_norm_collection(restart_iter)
+        // next_rnc = -conj(sin(restart_iter)) * this_rnc
+        // residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc
+        // residual_norm = abs(next_rnc)
+        // residual_norm_collection(restart_iter + 1) = next_rnc
 
         restart_iter++;
     }
 
     // Solve x
     auto krylov_bases_small = krylov_bases->create_submatrix(
-        span{0, system_matrix_->get_size()[0]},
-        span{0, dense_b->get_size()[1] * (restart_iter + 1)});
+        span{0, system_matrix_->get_size()[0] * (restart_iter + 1)},
+        span{0, dense_b->get_size()[1]});
     auto hessenberg_small = hessenberg->create_submatrix(
         span{0, restart_iter},
         span{0, dense_b->get_size()[1] * (restart_iter)});
@@ -270,12 +287,12 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         &final_iter_nums));
     // Solve upper triangular.
     // y = hessenberg \ residual_norm_collection
-
+    // before_preconditioner = krylov_bases * y
     get_preconditioner()->apply(before_preconditioner.get(),
                                 after_preconditioner.get());
     dense_x->add_scaled(one_op.get(), after_preconditioner.get());
     // Solve x
-    // x = x + get_preconditioner() * krylov_bases * y
+    // x = x + get_preconditioner() * before_preconditioner
 }
 
 
diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp
index ecb448e3e20..644a8cf708e 100644
--- a/core/solver/gmres_kernels.hpp
+++ b/core/solver/gmres_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,40 +40,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
+
 namespace gko {
 namespace kernels {
 namespace gmres {
 
 
-#define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type)                           \
-    void initialize_1(                                                         \
-        std::shared_ptr<const DefaultExecutor> exec,                           \
-        const matrix::Dense<_type> *b, matrix::Dense<_type> *b_norm,           \
-        matrix::Dense<_type> *residual, matrix::Dense<_type> *givens_sin,      \
-        matrix::Dense<_type> *givens_cos, Array<stopping_status> *stop_status, \
-        size_type krylov_dim)
+#define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type)                        \
+    void initialize_1(                                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                        \
+        const matrix::Dense<_type> *b, matrix::Dense<_type> *residual,      \
+        matrix::Dense<_type> *givens_sin, matrix::Dense<_type> *givens_cos, \
+        Array<stopping_status> *stop_status, size_type krylov_dim)
 
 
-#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type)                  \
-    void initialize_2(std::shared_ptr<const DefaultExecutor> exec,    \
-                      const matrix::Dense<_type> *residual,           \
-                      matrix::Dense<_type> *residual_norm,            \
-                      matrix::Dense<_type> *residual_norm_collection, \
-                      matrix::Dense<_type> *krylov_bases,             \
+#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type)                       \
+    void initialize_2(std::shared_ptr<const DefaultExecutor> exec,         \
+                      const matrix::Dense<_type> *residual,                \
+                      matrix::Dense<remove_complex<_type>> *residual_norm, \
+                      matrix::Dense<_type> *residual_norm_collection,      \
+                      matrix::Dense<_type> *krylov_bases,                  \
                       Array<size_type> *final_iter_nums, size_type krylov_dim)
 
 
-#define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type)                      \
-    void step_1(std::shared_ptr<const DefaultExecutor> exec,        \
-                matrix::Dense<_type> *next_krylov_basis,            \
-                matrix::Dense<_type> *givens_sin,                   \
-                matrix::Dense<_type> *givens_cos,                   \
-                matrix::Dense<_type> *residual_norm,                \
-                matrix::Dense<_type> *residual_norm_collection,     \
-                matrix::Dense<_type> *krylov_bases,                 \
-                matrix::Dense<_type> *hessenberg_iter,              \
-                const matrix::Dense<_type> *b_norm, size_type iter, \
-                Array<size_type> *final_iter_nums,                  \
+#define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type)                         \
+    void step_1(std::shared_ptr<const DefaultExecutor> exec,           \
+                size_type num_rows, matrix::Dense<_type> *givens_sin,  \
+                matrix::Dense<_type> *givens_cos,                      \
+                matrix::Dense<remove_complex<_type>> *residual_norm,   \
+                matrix::Dense<_type> *residual_norm_collection,        \
+                matrix::Dense<_type> *krylov_bases,                    \
+                matrix::Dense<_type> *hessenberg_iter, size_type iter, \
+                Array<size_type> *final_iter_nums,                     \
                 const Array<stopping_status> *stop_status)
 
 
@@ -128,6 +126,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace gmres {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace gmres
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
@@ -135,4 +142,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_GMRES_KERNELS_HPP
+#endif  // GKO_CORE_SOLVER_GMRES_KERNELS_HPP_
diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp
index f1ea9bd51e4..63e80f86c04 100644
--- a/core/solver/ir.cpp
+++ b/core/solver/ir.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -50,6 +50,34 @@ GKO_REGISTER_OPERATION(initialize, ir::initialize);
 }  // namespace ir
 
 
+template <typename ValueType>
+std::unique_ptr<LinOp> Ir<ValueType>::transpose() const
+{
+    return build()
+        .with_generated_solver(
+            share(as<Transposable>(this->get_solver())->transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .with_relaxation_factor(parameters_.relaxation_factor)
+        .on(this->get_executor())
+        ->generate(
+            share(as<Transposable>(this->get_system_matrix())->transpose()));
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> Ir<ValueType>::conj_transpose() const
+{
+    return build()
+        .with_generated_solver(
+            share(as<Transposable>(this->get_solver())->conj_transpose()))
+        .with_criteria(this->stop_criterion_factory_)
+        .with_relaxation_factor(conj(parameters_.relaxation_factor))
+        .on(this->get_executor())
+        ->generate(share(
+            as<Transposable>(this->get_system_matrix())->conj_transpose()));
+}
+
+
 template <typename ValueType>
 void Ir<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
 {
@@ -63,6 +91,7 @@ void Ir<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
     auto dense_b = as<const Vector>(b);
     auto dense_x = as<Vector>(x);
     auto residual = Vector::create_with_config_of(dense_b);
+    auto inner_solution = Vector::create_with_config_of(dense_b);
 
     bool one_changed{};
     Array<stopping_status> stop_status(exec, dense_b->get_size()[1]);
@@ -91,10 +120,30 @@ void Ir<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
             break;
         }
 
-        solver_->apply(lend(one_op), lend(residual), lend(one_op), dense_x);
-        residual->copy_from(dense_b);
-        system_matrix_->apply(lend(neg_one_op), dense_x, lend(one_op),
-                              lend(residual));
+        if (solver_->apply_uses_initial_guess()) {
+            // Use the inner solver to solve
+            // A * inner_solution = residual
+            // with residual as initial guess.
+            inner_solution->copy_from(lend(residual));
+            solver_->apply(lend(residual), lend(inner_solution));
+
+            // x = x + relaxation_factor * inner_solution
+            dense_x->add_scaled(lend(relaxation_factor_), lend(inner_solution));
+
+            // residual = b - A * x
+            residual->copy_from(dense_b);
+            system_matrix_->apply(lend(neg_one_op), dense_x, lend(one_op),
+                                  lend(residual));
+        } else {
+            // x = x + relaxation_factor * A \ residual
+            solver_->apply(lend(relaxation_factor_), lend(residual),
+                           lend(one_op), dense_x);
+
+            // residual = b - A * x
+            residual->copy_from(dense_b);
+            system_matrix_->apply(lend(neg_one_op), dense_x, lend(one_op),
+                                  lend(residual));
+        }
     }
 }
 
diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp
index 56c78e9e853..9fe59ba4a6c 100644
--- a/core/solver/ir_kernels.hpp
+++ b/core/solver/ir_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -83,6 +83,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace ir {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace ir
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
@@ -90,4 +99,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_IR_KERNELS_HPP
+#endif  // GKO_CORE_SOLVER_IR_KERNELS_HPP_
diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp
index 987e8609573..bb4bb19c25b 100644
--- a/core/solver/lower_trs.cpp
+++ b/core/solver/lower_trs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/upper_trs.hpp>
 
 
 #include "core/solver/lower_trs_kernels.hpp"
@@ -61,6 +62,26 @@ GKO_REGISTER_OPERATION(solve, lower_trs::solve);
 }  // namespace lower_trs
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> LowerTrs<ValueType, IndexType>::transpose() const
+{
+    return transposed_type::build()
+        .with_num_rhs(this->parameters_.num_rhs)
+        .on(this->get_executor())
+        ->generate(share(this->get_system_matrix()->transpose()));
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> LowerTrs<ValueType, IndexType>::conj_transpose() const
+{
+    return transposed_type::build()
+        .with_num_rhs(this->parameters_.num_rhs)
+        .on(this->get_executor())
+        ->generate(share(this->get_system_matrix()->conj_transpose()));
+}
+
+
 template <typename ValueType, typename IndexType>
 void LowerTrs<ValueType, IndexType>::init_trs_solve_struct()
 {
diff --git a/core/solver/lower_trs_kernels.hpp b/core/solver/lower_trs_kernels.hpp
index b2c931d76cf..799c50129e0 100644
--- a/core/solver/lower_trs_kernels.hpp
+++ b/core/solver/lower_trs_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,13 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_SOLVER_LOWER_TRS_KERNELS_HPP_
 
 
+#include <ginkgo/core/solver/lower_trs.hpp>
+
+
 #include <memory>
 
 
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/solver/lower_trs.hpp>
 
 
 namespace gko {
@@ -112,6 +114,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace lower_trs {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace lower_trs
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
@@ -119,4 +130,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_LOWER_TRS_KERNELS_HPP
+#endif  // GKO_CORE_SOLVER_LOWER_TRS_KERNELS_HPP_
diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp
index 081ce3aac01..236de82a27b 100644
--- a/core/solver/upper_trs.cpp
+++ b/core/solver/upper_trs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/lower_trs.hpp>
 
 
 #include "core/solver/upper_trs_kernels.hpp"
@@ -61,6 +62,26 @@ GKO_REGISTER_OPERATION(solve, upper_trs::solve);
 }  // namespace upper_trs
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> UpperTrs<ValueType, IndexType>::transpose() const
+{
+    return transposed_type::build()
+        .with_num_rhs(this->parameters_.num_rhs)
+        .on(this->get_executor())
+        ->generate(share(this->get_system_matrix()->transpose()));
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> UpperTrs<ValueType, IndexType>::conj_transpose() const
+{
+    return transposed_type::build()
+        .with_num_rhs(this->parameters_.num_rhs)
+        .on(this->get_executor())
+        ->generate(share(this->get_system_matrix()->conj_transpose()));
+}
+
+
 template <typename ValueType, typename IndexType>
 void UpperTrs<ValueType, IndexType>::init_trs_solve_struct()
 {
diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp
index 34e4426ff68..cce48ea2812 100644
--- a/core/solver/upper_trs_kernels.hpp
+++ b/core/solver/upper_trs_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,13 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_SOLVER_UPPER_TRS_KERNELS_HPP_
 
 
+#include <ginkgo/core/solver/upper_trs.hpp>
+
+
 #include <memory>
 
 
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/solver/upper_trs.hpp>
 
 
 namespace gko {
@@ -112,6 +114,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace reference
 
 
+namespace hip {
+namespace upper_trs {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace upper_trs
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 
@@ -119,4 +130,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_UPPER_TRS_KERNELS_HPP
+#endif  // GKO_CORE_SOLVER_UPPER_TRS_KERNELS_HPP_
diff --git a/core/stop/combined.cpp b/core/stop/combined.cpp
index 502d868a78e..f80df54b90b 100644
--- a/core/stop/combined.cpp
+++ b/core/stop/combined.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/stop/combined.hpp>
 
 
diff --git a/core/stop/criterion.cpp b/core/stop/criterion.cpp
index db225012a20..25019d7d0d7 100644
--- a/core/stop/criterion.cpp
+++ b/core/stop/criterion.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/stop/criterion.hpp>
 
 
diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp
index b666b9ef8dd..07eb8f2798c 100644
--- a/core/stop/criterion_kernels.hpp
+++ b/core/stop/criterion_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -78,6 +78,15 @@ GKO_DECLARE_SET_ALL_STATUSES_KERNEL();
 
 }  // namespace set_all_statuses
 }  // namespace reference
+
+
+namespace hip {
+namespace set_all_statuses {
+
+GKO_DECLARE_SET_ALL_STATUSES_KERNEL();
+
+}  // namespace set_all_statuses
+}  // namespace hip
 }  // namespace kernels
 }  // namespace gko
 
diff --git a/core/stop/iteration.cpp b/core/stop/iteration.cpp
index 684ed00ec1e..8c1a6bc5a7d 100644
--- a/core/stop/iteration.cpp
+++ b/core/stop/iteration.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/stop/iteration.hpp>
 
 
diff --git a/core/stop/residual_norm_reduction.cpp b/core/stop/residual_norm.cpp
similarity index 57%
rename from core/stop/residual_norm_reduction.cpp
rename to core/stop/residual_norm.cpp
index 35285a37dc3..5c928bbf48d 100644
--- a/core/stop/residual_norm_reduction.cpp
+++ b/core/stop/residual_norm.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,34 +30,34 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <ginkgo/core/stop/residual_norm.hpp>
 
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
 
-
-#include "core/stop/residual_norm_reduction_kernels.hpp"
+#include "core/components/fill_array.hpp"
+#include "core/stop/residual_norm_kernels.hpp"
 
 
 namespace gko {
 namespace stop {
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 
-GKO_REGISTER_OPERATION(residual_norm_reduction,
-                       residual_norm_reduction::residual_norm_reduction);
+GKO_REGISTER_OPERATION(residual_norm, residual_norm::residual_norm);
+GKO_REGISTER_OPERATION(fill_array, components::fill_array);
 
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 
 
 template <typename ValueType>
-bool ResidualNormReduction<ValueType>::check_impl(
-    uint8 stoppingId, bool setFinalized, Array<stopping_status> *stop_status,
-    bool *one_changed, const Criterion::Updater &updater)
+bool ResidualNorm<ValueType>::check_impl(uint8 stoppingId, bool setFinalized,
+                                         Array<stopping_status> *stop_status,
+                                         bool *one_changed,
+                                         const Criterion::Updater &updater)
 {
-    std::unique_ptr<Vector> u_dense_tau;
-    const Vector *dense_tau;
+    const NormVector *dense_tau;
     if (updater.residual_norm_ != nullptr) {
-        dense_tau = as<Vector>(updater.residual_norm_);
+        dense_tau = as<NormVector>(updater.residual_norm_);
     } else if (updater.residual_ != nullptr) {
         auto *dense_r = as<Vector>(updater.residual_);
         dense_r->compute_norm2(u_dense_tau_.get());
@@ -67,18 +67,29 @@ bool ResidualNormReduction<ValueType>::check_impl(
     }
     bool all_converged = true;
 
-    this->get_executor()->run(
-        residual_norm_reduction::make_residual_norm_reduction(
-            dense_tau, starting_tau_.get(), parameters_.reduction_factor,
-            stoppingId, setFinalized, stop_status, &this->device_storage_,
-            &all_converged, one_changed));
+    this->get_executor()->run(residual_norm::make_residual_norm(
+        dense_tau, starting_tau_.get(), tolerance_, stoppingId, setFinalized,
+        stop_status, &device_storage_, &all_converged, one_changed));
+
     return all_converged;
 }
 
+template <typename ValueType>
+void AbsoluteResidualNorm<ValueType>::initialize_starting_tau()
+{
+    this->get_executor()->run(residual_norm::make_fill_array(
+        this->starting_tau_->get_values(), this->starting_tau_->get_size()[1],
+        gko::one<remove_complex<ValueType>>()));
+}
+
+
+#define GKO_DECLARE_RESIDUAL_NORM(_type) class ResidualNorm<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM);
+
 
-#define GKO_DECLARE_RESIDUAL_NORM_REDUCTION(_type) \
-    class ResidualNormReduction<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION);
+#define GKO_DECLARE_ABSOLUTE_RESIDUAL_NORM(_type) \
+    class AbsoluteResidualNorm<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_ABSOLUTE_RESIDUAL_NORM);
 
 
 }  // namespace stop
diff --git a/core/stop/residual_norm_reduction_kernels.hpp b/core/stop/residual_norm_kernels.hpp
similarity index 71%
rename from core/stop/residual_norm_reduction_kernels.hpp
rename to core/stop/residual_norm_kernels.hpp
index e56be461448..30407cf9b9f 100644
--- a/core/stop/residual_norm_reduction_kernels.hpp
+++ b/core/stop/residual_norm_kernels.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_KERNELS_HPP_
-#define GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_KERNELS_HPP_
+#ifndef GKO_CORE_STOP_RESIDUAL_NORM_KERNELS_HPP_
+#define GKO_CORE_STOP_RESIDUAL_NORM_KERNELS_HPP_
 
 
 #include <ginkgo/core/base/array.hpp>
@@ -43,56 +43,65 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace gko {
 namespace kernels {
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 
-#define GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(_type)                      \
-    void residual_norm_reduction(                                              \
+#define GKO_DECLARE_RESIDUAL_NORM_KERNEL(_type)                                \
+    void residual_norm(                                                        \
         std::shared_ptr<const DefaultExecutor> exec,                           \
         const matrix::Dense<_type> *tau, const matrix::Dense<_type> *orig_tau, \
-        remove_complex<_type> rel_residual_goal, uint8 stoppingId,             \
-        bool setFinalized, Array<stopping_status> *stop_status,                \
-        Array<bool> *device_storage, bool *all_converged, bool *one_changed)
+        _type rel_residual_goal, uint8 stoppingId, bool setFinalized,          \
+        Array<stopping_status> *stop_status, Array<bool> *device_storage,      \
+        bool *all_converged, bool *one_changed)
 
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES \
     template <typename ValueType>    \
-    GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(ValueType)
+    GKO_DECLARE_RESIDUAL_NORM_KERNEL(ValueType)
 
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 
 
 namespace omp {
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 GKO_DECLARE_ALL_AS_TEMPLATES;
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace omp
 
 
 namespace cuda {
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 GKO_DECLARE_ALL_AS_TEMPLATES;
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace cuda
 
 
 namespace reference {
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 GKO_DECLARE_ALL_AS_TEMPLATES;
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace reference
 
 
+namespace hip {
+namespace residual_norm {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace residual_norm
+}  // namespace hip
+
+
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
 
 }  // namespace kernels
 }  // namespace gko
 
-#endif  // GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_KERNELS_HPP_
+#endif  // GKO_CORE_STOP_RESIDUAL_NORM_KERNELS_HPP_
diff --git a/core/stop/time.cpp b/core/stop/time.cpp
index aea2e6c3952..8ec4ad4948a 100644
--- a/core/stop/time.cpp
+++ b/core/stop/time.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/stop/time.hpp>
 
 
diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp
index 26497b7346f..c757d4dcd1d 100644
--- a/core/synthesizer/implementation_selection.hpp
+++ b/core/synthesizer/implementation_selection.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_
-#define GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_
+#ifndef GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_HPP_
+#define GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_HPP_
+
+
+#include <utility>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -39,9 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/synthesizer/containers.hpp>
 
 
-#include <utility>
-
-
 namespace gko {
 namespace syn {
 
@@ -76,4 +76,4 @@ namespace syn {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_
+#endif  // GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_HPP_
diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt
index 322bf38e6a7..f0e39d5c568 100644
--- a/core/test/CMakeLists.txt
+++ b/core/test/CMakeLists.txt
@@ -1,3 +1,5 @@
+include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake)
+
 add_subdirectory(base)
 add_subdirectory(factorization)
 add_subdirectory(log)
diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt
index 914b58ab1d3..1183b339e07 100644
--- a/core/test/base/CMakeLists.txt
+++ b/core/test/base/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(abstract_factory)
+ginkgo_create_test(allocator)
 ginkgo_create_test(array)
 ginkgo_create_test(combination)
 ginkgo_create_test(composition)
@@ -16,6 +17,7 @@ ginkgo_create_test(perturbation)
 ginkgo_create_test(polymorphic_object)
 ginkgo_create_test(range)
 ginkgo_create_test(range_accessors)
+ginkgo_create_thread_test(sanitizers)
 ginkgo_create_test(types)
 ginkgo_create_test(utils)
 ginkgo_create_test(version)
diff --git a/core/test/base/abstract_factory.cpp b/core/test/base/abstract_factory.cpp
index 46591eab5b9..cf57531e7fe 100644
--- a/core/test/base/abstract_factory.cpp
+++ b/core/test/base/abstract_factory.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/base/allocator.cpp b/core/test/base/allocator.cpp
new file mode 100644
index 00000000000..32e4c9db85d
--- /dev/null
+++ b/core/test/base/allocator.cpp
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/base/allocator.hpp"
+
+
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+
+TEST(ExecutorAllocator, Works)
+{
+    auto ref = gko::ReferenceExecutor::create();
+    gko::ExecutorAllocator<int> allocator(ref);
+
+    int *buffer = nullptr;
+    ASSERT_NO_THROW(buffer = allocator.allocate(10));
+    // Touch both ends of the block: only a sanitizer can flag a bad access
+    buffer[0] = 0;
+    buffer[9] = 0;
+
+    ASSERT_NO_THROW(allocator.deallocate(buffer, 10));
+}
+
+
+TEST(ExecutorAllocator, WorksWithStdlib)
+{
+    auto exec = gko::ReferenceExecutor::create();
+    // exec converts implicitly to the vector's ExecutorAllocator<int>
+    auto vec = std::vector<int, gko::ExecutorAllocator<int>>(10, 0, exec);
+
+    // This test can only fail with sanitizers
+    vec[0] = 0;
+    vec[9] = 0;
+}
+
+
+TEST(ExecutorAllocator, ComparesEqual)
+{
+    auto ref = gko::ReferenceExecutor::create();
+    gko::ExecutorAllocator<int> int_alloc(ref);
+    gko::ExecutorAllocator<float> float_alloc(ref);
+
+    ASSERT_TRUE(int_alloc == float_alloc);  // same executor, other value type
+}
+
+
+TEST(ExecutorAllocator, ComparesNotEqual)
+{
+    auto ref = gko::ReferenceExecutor::create();
+    auto omp = gko::OmpExecutor::create();
+    gko::ExecutorAllocator<int> ref_alloc(ref);
+    gko::ExecutorAllocator<float> omp_alloc(omp);
+
+    ASSERT_TRUE(ref_alloc != omp_alloc);  // distinct executors
+}
+
+
+}  // namespace
diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index 5b02b151c9a..b1e04dd2f39 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,15 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/array.hpp>
 
 
+#include <algorithm>
+
+
 #include <gtest/gtest.h>
 
 
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Array : public ::testing::Test {
 protected:
     Array() : exec(gko::ReferenceExecutor::create()), x(exec, 2)
@@ -50,259 +57,472 @@ class Array : public ::testing::Test {
         x.get_data()[1] = 2;
     }
 
-    static void assert_equal_to_original_x(gko::Array<int> &a)
+    static void assert_equal_to_original_x(gko::Array<T> &a)
     {
         ASSERT_EQ(a.get_num_elems(), 2);
-        EXPECT_EQ(a.get_data()[0], 5);
-        EXPECT_EQ(a.get_data()[1], 2);
-        EXPECT_EQ(a.get_const_data()[0], 5);
-        EXPECT_EQ(a.get_const_data()[1], 2);
+        EXPECT_EQ(a.get_data()[0], T{5});
+        EXPECT_EQ(a.get_data()[1], T{2});
+        EXPECT_EQ(a.get_const_data()[0], T{5});
+        EXPECT_EQ(a.get_const_data()[1], T{2});
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    gko::Array<int> x;
+    gko::Array<T> x;
 };
 
+TYPED_TEST_CASE(Array, gko::test::ValueAndIndexTypes);
 
-TEST_F(Array, CanBeCreatedWithoutAnExecutor)
+
+TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor)
 {
-    gko::Array<int> a;
+    gko::Array<TypeParam> a;
 
     ASSERT_EQ(a.get_executor(), nullptr);
     ASSERT_EQ(a.get_num_elems(), 0);
 }
 
 
-TEST_F(Array, CanBeEmpty)
+TYPED_TEST(Array, CanBeEmpty)
 {
-    gko::Array<int> a(exec);
+    gko::Array<TypeParam> a(this->exec);
 
     ASSERT_EQ(a.get_num_elems(), 0);
 }
 
 
-TEST_F(Array, ReturnsNullWhenEmpty)
+TYPED_TEST(Array, ReturnsNullWhenEmpty)
 {
-    gko::Array<int> a(exec);
+    gko::Array<TypeParam> a(this->exec);
 
     EXPECT_EQ(a.get_const_data(), nullptr);
     ASSERT_EQ(a.get_data(), nullptr);
 }
 
 
-TEST_F(Array, CanBeCreatedFromExistingData)
+TYPED_TEST(Array, CanBeCreatedFromExistingData)
 {
-    gko::Array<int> a{exec, 3, new int[3], std::default_delete<int[]>{}};
+    gko::Array<TypeParam> a{this->exec, 3, new TypeParam[3],
+                            std::default_delete<TypeParam[]>{}};
 
     EXPECT_EQ(a.get_num_elems(), 3);
 }
 
 
-TEST_F(Array, CanBeCreatedFromDataOnExecutor)
+TYPED_TEST(Array, CanBeCreatedFromDataOnExecutor)
 {
-    gko::Array<int> a{exec, 3, exec->alloc<int>(3)};
+    gko::Array<TypeParam> a{this->exec, 3,
+                            this->exec->template alloc<TypeParam>(3)};
 
     EXPECT_EQ(a.get_num_elems(), 3);
 }
 
 
-TEST_F(Array, CanBeCreatedFromRange)
+TYPED_TEST(Array, CanBeCreatedFromRange)
 {
     using std::begin;
     auto data = {1, 2, 3};
 
-    gko::Array<int> a{exec, begin(data), end(data)};
+    gko::Array<TypeParam> a{this->exec, begin(data), end(data)};
 
-    EXPECT_EQ(a.get_const_data()[0], 1);
-    EXPECT_EQ(a.get_const_data()[1], 2);
-    ASSERT_EQ(a.get_const_data()[2], 3);
+    EXPECT_EQ(a.get_const_data()[0], TypeParam{1});
+    EXPECT_EQ(a.get_const_data()[1], TypeParam{2});
+    ASSERT_EQ(a.get_const_data()[2], TypeParam{3});
 }
 
 
-TEST_F(Array, CanBeCreatedFromInitializerList)
+TYPED_TEST(Array, CanBeCreatedFromInitializerList)
 {
-    gko::Array<int> a{exec, {1, 2, 3}};
+    gko::Array<TypeParam> a{this->exec, {1, 2, 3}};
 
-    EXPECT_EQ(a.get_const_data()[0], 1);
-    EXPECT_EQ(a.get_const_data()[1], 2);
-    ASSERT_EQ(a.get_const_data()[2], 3);
+    EXPECT_EQ(a.get_const_data()[0], TypeParam{1});
+    EXPECT_EQ(a.get_const_data()[1], TypeParam{2});
+    ASSERT_EQ(a.get_const_data()[2], TypeParam{3});
 }
 
 
-TEST_F(Array, KnowsItsSize) { ASSERT_EQ(x.get_num_elems(), 2); }
+TYPED_TEST(Array, KnowsItsSize) { ASSERT_EQ(this->x.get_num_elems(), 2); }
 
 
-TEST_F(Array, ReturnsValidDataPtr)
+TYPED_TEST(Array, ReturnsValidDataPtr)
 {
-    EXPECT_EQ(x.get_data()[0], 5);
-    EXPECT_EQ(x.get_data()[1], 2);
+    EXPECT_EQ(this->x.get_data()[0], TypeParam{5});
+    EXPECT_EQ(this->x.get_data()[1], TypeParam{2});
 }
 
 
-TEST_F(Array, ReturnsValidConstDataPtr)
+TYPED_TEST(Array, ReturnsValidConstDataPtr)
 {
-    EXPECT_EQ(x.get_const_data()[0], 5);
-    EXPECT_EQ(x.get_const_data()[1], 2);
+    EXPECT_EQ(this->x.get_const_data()[0], TypeParam{5});
+    EXPECT_EQ(this->x.get_const_data()[1], TypeParam{2});
 }
 
 
-TEST_F(Array, KnowsItsExecutor) { ASSERT_EQ(x.get_executor(), exec); }
+TYPED_TEST(Array, KnowsItsExecutor)
+{
+    ASSERT_EQ(this->x.get_executor(), this->exec);
+}
 
 
-TEST_F(Array, CanBeCopyConstructed)
+TYPED_TEST(Array, CanBeCopyConstructed)
 {
-    gko::Array<int> a(x);
-    x.get_data()[0] = 7;
+    gko::Array<TypeParam> a(this->x);
+    this->x.get_data()[0] = 7;
 
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeMoveConstructed)
+TYPED_TEST(Array, CanBeMoveConstructed)
 {
-    gko::Array<int> a(std::move(x));
+    gko::Array<TypeParam> a(std::move(this->x));
 
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeCopyConstructedToADifferentExecutor)
+TYPED_TEST(Array, CanBeCopyConstructedToADifferentExecutor)
 {
-    gko::Array<int> a{exec, x};
+    gko::Array<TypeParam> a{this->exec, this->x};
 
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeMoveConstructedToADifferentExecutor)
+TYPED_TEST(Array, CanBeMoveConstructedToADifferentExecutor)
 {
-    gko::Array<int> a{exec, std::move(x)};
+    gko::Array<TypeParam> a{this->exec, std::move(this->x)};
 
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeCopied)
+TYPED_TEST(Array, CanBeCopied)
 {
     auto omp = gko::OmpExecutor::create();
-    gko::Array<int> a(omp, 3);
+    gko::Array<TypeParam> a(omp, 3);
 
-    a = x;
-    x.get_data()[0] = 7;
+    a = this->x;
+    this->x.get_data()[0] = 7;
 
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeCopiedToExecutorlessArray)
+TYPED_TEST(Array, CanBeCopiedToExecutorlessArray)
 {
-    gko::Array<int> a;
+    gko::Array<TypeParam> a;
 
-    a = x;
+    a = this->x;
 
-    ASSERT_EQ(a.get_executor(), x.get_executor());
-    assert_equal_to_original_x(a);
+    ASSERT_EQ(a.get_executor(), this->x.get_executor());
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeCopiedFromExecutorlessArray)
+TYPED_TEST(Array, CanBeCopiedFromExecutorlessArray)
 {
-    gko::Array<int> a;
+    gko::Array<TypeParam> a;
 
-    x = a;
+    this->x = a;
 
-    ASSERT_NE(x.get_executor(), nullptr);
-    ASSERT_EQ(x.get_num_elems(), 0);
+    ASSERT_NE(this->x.get_executor(), nullptr);
+    ASSERT_EQ(this->x.get_num_elems(), 0);
 }
 
 
-TEST_F(Array, CanBeMoved)
+TYPED_TEST(Array, CanBeMoved)
 {
     auto omp = gko::OmpExecutor::create();
-    gko::Array<int> a(omp, 3);
+    gko::Array<TypeParam> a(omp, 3);
 
-    a = std::move(x);
+    a = std::move(this->x);
 
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeMovedToExecutorlessArray)
+TYPED_TEST(Array, CanBeMovedToExecutorlessArray)
 {
-    gko::Array<int> a;
+    gko::Array<TypeParam> a;
 
-    a = std::move(x);
+    a = std::move(this->x);
 
     ASSERT_NE(a.get_executor(), nullptr);
-    assert_equal_to_original_x(a);
+    this->assert_equal_to_original_x(a);
 }
 
 
-TEST_F(Array, CanBeMovedFromExecutorlessArray)
+TYPED_TEST(Array, CanBeMovedFromExecutorlessArray)
 {
-    gko::Array<int> a;
+    gko::Array<TypeParam> a;
 
-    x = std::move(a);
+    this->x = std::move(a);
 
-    ASSERT_NE(x.get_executor(), nullptr);
-    ASSERT_EQ(x.get_num_elems(), 0);
+    ASSERT_NE(this->x.get_executor(), nullptr);
+    ASSERT_EQ(this->x.get_num_elems(), 0);
 }
 
 
-TEST_F(Array, CanBeCleared)
+TYPED_TEST(Array, CanBeCleared)
 {
-    x.clear();
+    this->x.clear();
 
-    ASSERT_EQ(x.get_num_elems(), 0);
-    ASSERT_EQ(x.get_data(), nullptr);
-    ASSERT_EQ(x.get_const_data(), nullptr);
+    ASSERT_EQ(this->x.get_num_elems(), 0);
+    ASSERT_EQ(this->x.get_data(), nullptr);
+    ASSERT_EQ(this->x.get_const_data(), nullptr);
 }
 
 
-TEST_F(Array, CanBeResized)
+TYPED_TEST(Array, CanBeResized)
 {
-    x.resize_and_reset(3);
+    this->x.resize_and_reset(3);
+
+    this->x.get_data()[0] = 1;
+    this->x.get_data()[1] = 8;
+    this->x.get_data()[2] = 7;
+
+    EXPECT_EQ(this->x.get_const_data()[0], TypeParam{1});
+    EXPECT_EQ(this->x.get_const_data()[1], TypeParam{8});
+    EXPECT_EQ(this->x.get_const_data()[2], TypeParam{7});
+}
+
+
+TYPED_TEST(Array, ViewCannotBeResized)
+{
+    TypeParam storage[] = {1, 2, 3};
+    auto alias = gko::Array<TypeParam>::view(this->exec, 3, storage);
+
+    EXPECT_THROW(alias.resize_and_reset(1), gko::NotSupported);
+    EXPECT_EQ(alias.get_num_elems(), 3);  // size unchanged after the throw
+    ASSERT_EQ(alias.get_data()[0], TypeParam{1});
+}
+
+
+template <typename T>
+struct my_null_deleter {
+    // Deleter that deliberately does nothing with the pointer it is handed.
+    using pointer = T *;
+
+    void operator()(pointer) const noexcept {}
+};
+
+template <typename T>
+struct my_null_deleter<T[]> {
+    // Array form of the no-op deleter; the call operator body is empty too.
+    using pointer = T[];
+
+    void operator()(pointer) const noexcept {}
+};
+
 
-    x.get_data()[0] = 1;
-    x.get_data()[1] = 8;
-    x.get_data()[2] = 7;
+TYPED_TEST(Array, CustomDeleterCannotBeResized)
+{
+    TypeParam data[] = {1, 2, 3};
+    auto view_custom_deleter = gko::Array<TypeParam>(
+        this->exec, 3, data, my_null_deleter<TypeParam[]>{});
 
-    EXPECT_EQ(x.get_const_data()[0], 1);
-    EXPECT_EQ(x.get_const_data()[1], 8);
-    EXPECT_EQ(x.get_const_data()[2], 7);
+    EXPECT_THROW(view_custom_deleter.resize_and_reset(1), gko::NotSupported);
+    EXPECT_EQ(view_custom_deleter.get_num_elems(), 3);
+    ASSERT_EQ(view_custom_deleter.get_data()[0], TypeParam{1});
 }
 
 
-TEST_F(Array, CanBeAssignedAnExecutor)
+TYPED_TEST(Array, CanBeAssignedAnExecutor)
 {
-    gko::Array<int> a;
+    gko::Array<TypeParam> a;
 
-    a.set_executor(exec);
+    a.set_executor(this->exec);
 
-    ASSERT_EQ(a.get_executor(), exec);
+    ASSERT_EQ(a.get_executor(), this->exec);
 }
 
 
-TEST_F(Array, ChangesExecutors)
+TYPED_TEST(Array, ChangesExecutors)
 {
     auto omp = gko::OmpExecutor::create();
-    x.set_executor(omp);
+    this->x.set_executor(omp);
+
+    ASSERT_EQ(this->x.get_executor(), omp);
+    this->assert_equal_to_original_x(this->x);
+}
+
+
+TYPED_TEST(Array, ViewModifiesOriginalData)
+{
+    TypeParam backing[] = {1, 2, 3};
+    auto alias = gko::Array<TypeParam>::view(this->exec, 3, backing);
+
+    const TypeParam replacement[] = {5, 4, 2};
+    std::copy_n(replacement, 3, alias.get_data());  // write through the view
+
+    EXPECT_EQ(backing[0], TypeParam{5});
+    EXPECT_EQ(backing[1], TypeParam{4});
+    EXPECT_EQ(backing[2], TypeParam{2});
+    ASSERT_EQ(alias.get_num_elems(), 3);
+}
+
+
+TYPED_TEST(Array, CopyArrayToArray)
+{
+    gko::Array<TypeParam> dst(this->exec, {1, 2, 3});
+    gko::Array<TypeParam> src(this->exec, {5, 4, 2, 1});
+
+    dst = src;
+
+    EXPECT_EQ(dst.get_data()[0], TypeParam{5});
+    EXPECT_EQ(dst.get_data()[1], TypeParam{4});
+    EXPECT_EQ(dst.get_data()[2], TypeParam{2});
+    EXPECT_EQ(dst.get_data()[3], TypeParam{1});
+    EXPECT_EQ(dst.get_num_elems(), 4);
+    EXPECT_NE(dst.get_data(), src.get_data());  // deep copy, separate buffers
+    ASSERT_EQ(src.get_num_elems(), 4);
+}
+
+
+TYPED_TEST(Array, CopyViewToView)
+{
+    TypeParam dst_buf[] = {1, 2, 3};
+    auto dst = gko::Array<TypeParam>::view(this->exec, 3, dst_buf);
+    TypeParam src_buf[] = {5, 4, 2};
+    auto src = gko::Array<TypeParam>::view(this->exec, 3, src_buf);
+    TypeParam big_buf[] = {5, 4, 2, 1};
+    auto big = gko::Array<TypeParam>::view(this->exec, 4, big_buf);
+
+    dst = src;
+    src.get_data()[0] = 2;  // later write to the source, after the copy
+
+    EXPECT_EQ(dst_buf[0], TypeParam{5});
+    EXPECT_EQ(dst_buf[1], TypeParam{4});
+    EXPECT_EQ(dst_buf[2], TypeParam{2});
+    EXPECT_EQ(dst.get_num_elems(), 3);
+    EXPECT_EQ(src.get_num_elems(), 3);
+    EXPECT_EQ(src.get_data()[0], TypeParam{2});
+    ASSERT_THROW(src = big, gko::OutOfBoundsError);  // 4 elems into 3
+}
+
+
+TYPED_TEST(Array, CopyViewToArray)
+{
+    TypeParam src_buf[] = {1, 2, 3, 4};
+    auto src = gko::Array<TypeParam>::view(this->exec, 4, src_buf);
+    gko::Array<TypeParam> owner(this->exec, {5, 4, 2});
+
+    owner = src;
+    src.get_data()[0] = 2;  // later write to the view, after the copy
+
+    EXPECT_EQ(owner.get_data()[0], TypeParam{1});
+    EXPECT_EQ(owner.get_data()[1], TypeParam{2});
+    EXPECT_EQ(owner.get_data()[2], TypeParam{3});
+    EXPECT_EQ(owner.get_data()[3], TypeParam{4});
+    EXPECT_EQ(owner.get_num_elems(), 4);
+    ASSERT_EQ(src.get_num_elems(), 4);
+}
+
+
+TYPED_TEST(Array, CopyArrayToView)
+{
+    TypeParam dst_buf[] = {1, 2, 3};
+    auto dst = gko::Array<TypeParam>::view(this->exec, 3, dst_buf);
+    gko::Array<TypeParam> shorter(this->exec, {5, 4});
+    gko::Array<TypeParam> longer(this->exec, {5, 4, 2, 1});
+
+    dst = shorter;
+
+    EXPECT_EQ(dst_buf[0], TypeParam{5});
+    EXPECT_EQ(dst_buf[1], TypeParam{4});
+    EXPECT_EQ(dst_buf[2], TypeParam{3});  // third slot keeps its old value
+    EXPECT_EQ(dst.get_num_elems(), 3);
+    EXPECT_EQ(shorter.get_num_elems(), 2);
+    ASSERT_THROW(dst = longer, gko::OutOfBoundsError);  // 4 elems into 3
+}
+
 
-    ASSERT_EQ(x.get_executor(), omp);
-    assert_equal_to_original_x(x);
+TYPED_TEST(Array, MoveArrayToArray)
+{
+    gko::Array<TypeParam> array(this->exec, {1, 2, 3});
+    gko::Array<TypeParam> array2(this->exec, {5, 4, 2, 1});
+    auto data2 = array2.get_data();
+
+    array = std::move(array2);
+
+    EXPECT_EQ(array.get_data(), data2);
+    EXPECT_EQ(array.get_data()[0], TypeParam{5});
+    EXPECT_EQ(array.get_data()[1], TypeParam{4});
+    EXPECT_EQ(array.get_data()[2], TypeParam{2});
+    EXPECT_EQ(array.get_data()[3], TypeParam{1});
+    EXPECT_EQ(array.get_num_elems(), 4);
+    EXPECT_EQ(array2.get_data(), nullptr);
+    ASSERT_EQ(array2.get_num_elems(), 0);
 }
 
 
-TEST_F(Array, CanCreateView)
+TYPED_TEST(Array, MoveViewToView)
 {
-    int data[] = {1, 2, 3};
+    TypeParam data[] = {1, 2, 3, 4};
+    auto view = gko::Array<TypeParam>::view(this->exec, 4, data);
+    TypeParam data2[] = {5, 4, 2};
+    auto view2 = gko::Array<TypeParam>::view(this->exec, 3, data2);
+
+    view = std::move(view2);
+
+    EXPECT_EQ(view.get_data(), data2);
+    EXPECT_EQ(view.get_data()[0], TypeParam{5});
+    EXPECT_EQ(view.get_data()[1], TypeParam{4});
+    EXPECT_EQ(view.get_data()[2], TypeParam{2});
+    EXPECT_EQ(view.get_num_elems(), 3);
+    EXPECT_EQ(view2.get_data(), nullptr);
+    EXPECT_EQ(view2.get_num_elems(), 0);
+    EXPECT_NE(data, nullptr);
+    EXPECT_EQ(data[0], TypeParam{1});
+    EXPECT_EQ(data[1], TypeParam{2});
+    EXPECT_EQ(data[2], TypeParam{3});
+    ASSERT_EQ(data[3], TypeParam{4});
+}
 
-    auto view = gko::Array<int>::view(exec, 3, data);
-    view = gko::Array<int>{exec, {5, 4, 2}};
 
-    EXPECT_EQ(data[0], 5);
-    EXPECT_EQ(data[1], 4);
-    EXPECT_EQ(data[2], 2);
+TYPED_TEST(Array, MoveViewToArray)
+{
+    TypeParam buf[] = {1, 2, 3, 4};
+    gko::Array<TypeParam> owner(this->exec, {5, 4, 2});
+    auto alias = gko::Array<TypeParam>::view(this->exec, 4, buf);
+
+    owner = std::move(alias);
+
+    EXPECT_EQ(owner.get_data(), buf);  // pointer is taken over, not copied
+    EXPECT_EQ(owner.get_data()[0], TypeParam{1});
+    EXPECT_EQ(owner.get_data()[1], TypeParam{2});
+    EXPECT_EQ(owner.get_data()[2], TypeParam{3});
+    EXPECT_EQ(owner.get_data()[3], TypeParam{4});
+    EXPECT_EQ(owner.get_num_elems(), 4);
+    EXPECT_EQ(buf[0], TypeParam{1});
+    EXPECT_EQ(buf[1], TypeParam{2});
+    EXPECT_EQ(buf[2], TypeParam{3});
+    EXPECT_EQ(buf[3], TypeParam{4});
+    EXPECT_EQ(alias.get_data(), nullptr);  // moved-from view is emptied
+    ASSERT_EQ(alias.get_num_elems(), 0);
+}
+
+
+TYPED_TEST(Array, MoveArrayToView)
+{
+    TypeParam data[] = {1, 2, 3};
+    auto view = gko::Array<TypeParam>::view(this->exec, 3, data);
+    gko::Array<TypeParam> array_size2(this->exec, {5, 4});
+    gko::Array<TypeParam> array_size4(this->exec, {5, 4, 2, 1});
+    auto size2_ptr = array_size2.get_data();
+    auto size4_ptr = array_size4.get_data();
+
+    view = std::move(array_size2);
+
+    EXPECT_EQ(view.get_data()[0], TypeParam{5});
+    EXPECT_EQ(view.get_data()[1], TypeParam{4});
+    EXPECT_EQ(view.get_num_elems(), 2);
+    EXPECT_NE(view.get_data(), data);
+    EXPECT_EQ(view.get_data(), size2_ptr);
+    EXPECT_NO_THROW(view = std::move(array_size4));
+    EXPECT_EQ(view.get_data(), size4_ptr);
+    EXPECT_EQ(array_size2.get_data(), nullptr);
+    ASSERT_EQ(array_size2.get_num_elems(), 0);
 }
 
 
diff --git a/core/test/base/combination.cpp b/core/test/base/combination.cpp
index f22772d5f1a..8c2eaf35b6f 100644
--- a/core/test/base/combination.cpp
+++ b/core/test/base/combination.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -55,6 +58,7 @@ struct DummyOperator : public gko::EnableLinOp<DummyOperator> {
 };
 
 
+template <typename T>
 class Combination : public ::testing::Test {
 protected:
     Combination()
@@ -70,10 +74,12 @@ class Combination : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> coefficients;
 };
 
+TYPED_TEST_CASE(Combination, gko::test::ValueTypes);
+
 
-TEST_F(Combination, CanBeEmpty)
+TYPED_TEST(Combination, CanBeEmpty)
 {
-    auto cmb = gko::Combination<>::create(exec);
+    auto cmb = gko::Combination<TypeParam>::create(this->exec);
 
     ASSERT_EQ(cmb->get_size(), gko::dim<2>(0, 0));
     ASSERT_EQ(cmb->get_coefficients().size(), 0);
@@ -81,34 +87,35 @@ TEST_F(Combination, CanBeEmpty)
 }
 
 
-TEST_F(Combination, CanCreateFromIterators)
+TYPED_TEST(Combination, CanCreateFromIterators)
 {
-    auto cmb =
-        gko::Combination<>::create(begin(coefficients), end(coefficients),
-                                   begin(operators), end(operators));
+    auto cmb = gko::Combination<TypeParam>::create(
+        begin(this->coefficients), end(this->coefficients),
+        begin(this->operators), end(this->operators));
 
     ASSERT_EQ(cmb->get_size(), gko::dim<2>(1, 1));
     ASSERT_EQ(cmb->get_coefficients().size(), 2);
     ASSERT_EQ(cmb->get_operators().size(), 2);
-    ASSERT_EQ(cmb->get_coefficients()[0], coefficients[0]);
-    ASSERT_EQ(cmb->get_operators()[0], operators[0]);
-    ASSERT_EQ(cmb->get_coefficients()[1], coefficients[1]);
-    ASSERT_EQ(cmb->get_operators()[1], operators[1]);
+    ASSERT_EQ(cmb->get_coefficients()[0], this->coefficients[0]);
+    ASSERT_EQ(cmb->get_operators()[0], this->operators[0]);
+    ASSERT_EQ(cmb->get_coefficients()[1], this->coefficients[1]);
+    ASSERT_EQ(cmb->get_operators()[1], this->operators[1]);
 }
 
 
-TEST_F(Combination, CanCreateFromList)
+TYPED_TEST(Combination, CanCreateFromList)
 {
-    auto cmb = gko::Combination<>::create(coefficients[0], operators[0],
-                                          coefficients[1], operators[1]);
+    auto cmb = gko::Combination<TypeParam>::create(
+        this->coefficients[0], this->operators[0], this->coefficients[1],
+        this->operators[1]);
 
     ASSERT_EQ(cmb->get_size(), gko::dim<2>(1, 1));
     ASSERT_EQ(cmb->get_coefficients().size(), 2);
     ASSERT_EQ(cmb->get_operators().size(), 2);
-    ASSERT_EQ(cmb->get_coefficients()[0], coefficients[0]);
-    ASSERT_EQ(cmb->get_operators()[0], operators[0]);
-    ASSERT_EQ(cmb->get_coefficients()[1], coefficients[1]);
-    ASSERT_EQ(cmb->get_operators()[1], operators[1]);
+    ASSERT_EQ(cmb->get_coefficients()[0], this->coefficients[0]);
+    ASSERT_EQ(cmb->get_operators()[0], this->operators[0]);
+    ASSERT_EQ(cmb->get_coefficients()[1], this->coefficients[1]);
+    ASSERT_EQ(cmb->get_operators()[1], this->operators[1]);
 }
 
 
diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp
index bea2dbd2755..aa9df458456 100644
--- a/core/test/base/composition.cpp
+++ b/core/test/base/composition.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -56,6 +59,7 @@ struct DummyOperator : public gko::EnableLinOp<DummyOperator> {
 };
 
 
+template <typename T>
 class Composition : public ::testing::Test {
 protected:
     Composition()
@@ -68,35 +72,39 @@ class Composition : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> operators;
 };
 
+TYPED_TEST_CASE(Composition, gko::test::ValueTypes);
+
 
-TEST_F(Composition, CanBeEmpty)
+TYPED_TEST(Composition, CanBeEmpty)
 {
-    auto cmp = gko::Composition<>::create(exec);
+    auto cmp = gko::Composition<TypeParam>::create(this->exec);
 
     ASSERT_EQ(cmp->get_size(), gko::dim<2>(0, 0));
     ASSERT_EQ(cmp->get_operators().size(), 0);
 }
 
 
-TEST_F(Composition, CanCreateFromIterators)
+TYPED_TEST(Composition, CanCreateFromIterators)
 {
-    auto cmp = gko::Composition<>::create(begin(operators), end(operators));
+    auto cmp = gko::Composition<TypeParam>::create(begin(this->operators),
+                                                   end(this->operators));
 
     ASSERT_EQ(cmp->get_size(), gko::dim<2>(2, 3));
     ASSERT_EQ(cmp->get_operators().size(), 2);
-    ASSERT_EQ(cmp->get_operators()[0], operators[0]);
-    ASSERT_EQ(cmp->get_operators()[1], operators[1]);
+    ASSERT_EQ(cmp->get_operators()[0], this->operators[0]);
+    ASSERT_EQ(cmp->get_operators()[1], this->operators[1]);
 }
 
 
-TEST_F(Composition, CanCreateFromList)
+TYPED_TEST(Composition, CanCreateFromList)
 {
-    auto cmp = gko::Composition<>::create(operators[0], operators[1]);
+    auto cmp = gko::Composition<TypeParam>::create(this->operators[0],
+                                                   this->operators[1]);
 
     ASSERT_EQ(cmp->get_size(), gko::dim<2>(2, 3));
     ASSERT_EQ(cmp->get_operators().size(), 2);
-    ASSERT_EQ(cmp->get_operators()[0], operators[0]);
-    ASSERT_EQ(cmp->get_operators()[1], operators[1]);
+    ASSERT_EQ(cmp->get_operators()[0], this->operators[0]);
+    ASSERT_EQ(cmp->get_operators()[1], this->operators[1]);
 }
 
 
diff --git a/core/test/base/dim.cpp b/core/test/base/dim.cpp
index 80143caf9aa..b94ef6672c1 100644
--- a/core/test/base/dim.cpp
+++ b/core/test/base/dim.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/dim.hpp>
 
 
+#include <memory>
+
+
 #include <gtest/gtest.h>
 
 
@@ -48,6 +51,16 @@ TEST(Dim, ConstructsCorrectObject)
 }
 
 
+TEST(Dim, ConstructsCorrectConstexprObject)
+{
+    constexpr gko::dim<3> extents{4, 5, 6};  // built in a constant expression
+
+    ASSERT_EQ(extents[0], 4);
+    ASSERT_EQ(extents[1], 5);
+    ASSERT_EQ(extents[2], 6);
+}
+
+
 TEST(Dim, ConstructsSquareObject)
 {
     gko::dim<2> d{5};
@@ -66,6 +79,34 @@ TEST(Dim, ConstructsNullObject)
 }
 
 
+class dim_manager {  // owns one gko::dim<3>; instances come from create()
+public:
+    using dim = gko::dim<3>;
+    const dim &get_size() const { return size_; }  // reference to the member
+
+    static std::unique_ptr<dim_manager> create(const dim &size)
+    {
+        return std::unique_ptr<dim_manager>{new dim_manager{size}};  // heap
+    }
+
+private:
+    dim_manager(const dim &size) : size_{size} {}  // private: use create()
+    dim size_;
+};
+
+
+TEST(Dim, CopiesProperlyOnHeap)
+{
+    auto holder = dim_manager::create(gko::dim<3>{1, 2, 3});
+
+    const auto snapshot = holder->get_size();  // by-value copy of the member
+
+    ASSERT_EQ(snapshot[0], 1);
+    ASSERT_EQ(snapshot[1], 2);
+    ASSERT_EQ(snapshot[2], 3);
+}
+
+
 TEST(Dim, ConvertsToBool)
 {
     gko::dim<2> d1{};
diff --git a/core/test/base/exception.cpp b/core/test/base/exception.cpp
index 815a08991af..b04d7553103 100644
--- a/core/test/base/exception.cpp
+++ b/core/test/base/exception.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -98,6 +98,30 @@ TEST(ExceptionClasses, CusparseErrorReturnsCorrectWhatMessage)
 }
 
 
+TEST(ExceptionClasses, HipErrorReturnsCorrectWhatMessage)
+{
+    const std::string prefix{"test_file.cpp:123: test_func: "};
+    const gko::HipError err("test_file.cpp", 123, "test_func", 1);
+    ASSERT_EQ(prefix, std::string(err.what()).substr(0, prefix.size()));
+}
+
+
+TEST(ExceptionClasses, HipblasErrorReturnsCorrectWhatMessage)
+{
+    const std::string prefix{"test_file.cpp:123: test_func: "};
+    const gko::HipblasError err("test_file.cpp", 123, "test_func", 1);
+    ASSERT_EQ(prefix, std::string(err.what()).substr(0, prefix.size()));
+}
+
+
+TEST(ExceptionClasses, HipsparseErrorReturnsCorrectWhatMessage)
+{
+    const std::string prefix{"test_file.cpp:123: test_func: "};
+    const gko::HipsparseError err("test_file.cpp", 123, "test_func", 1);
+    ASSERT_EQ(prefix, std::string(err.what()).substr(0, prefix.size()));
+}
+
+
 TEST(ExceptionClasses, DimensionMismatchReturnsCorrectWhatMessage)
 {
     gko::DimensionMismatch error("test_file.cpp", 243, "test_func", "a", 3, 4,
diff --git a/core/test/base/exception_helpers.cpp b/core/test/base/exception_helpers.cpp
index dd013835300..cad8a3d5684 100644
--- a/core/test/base/exception_helpers.cpp
+++ b/core/test/base/exception_helpers.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -55,11 +55,48 @@ TEST(NotCompiled, ThrowsWhenUsed)
 }
 
 
-void does_not_support_int() { GKO_NOT_SUPPORTED(int); }
+template <typename Expected, typename T>
+void test_not_supported_impl(const T &obj)
+{
+    try {
+        GKO_NOT_SUPPORTED(obj);
+        FAIL();
+    } catch (const gko::NotSupported &m) {
+        // what() must end with Expected's name; size guard bounds std::equal
+        const std::string msg{m.what()};
+        auto expected = gko::name_demangling::get_type_name(typeid(Expected));
+        ASSERT_TRUE(msg.size() >= expected.size() &&
+            std::equal(expected.rbegin(), expected.rend(), msg.rbegin()));
+    }
+}
 
-TEST(NotSupported, ReturnsNotSupportedException)
+
+TEST(NotSupported, ReturnsIntNotSupportedException)
 {
-    ASSERT_THROW(does_not_support_int(), gko::NotSupported);
+    test_not_supported_impl<int>(int{});
+}
+
+
+struct Base {
+    virtual ~Base() = default;  // virtual member makes Base polymorphic
+};
+
+struct Derived : Base {};
+
+
+TEST(NotSupported, ReturnsPtrNotSupportedException)
+{
+    Derived object;
+    Base *as_base = &object;  // static type Base*, pointee is a Derived
+    test_not_supported_impl<Derived>(as_base);
+}
+
+
+TEST(NotSupported, ReturnsRefNotSupportedException)
+{
+    Derived object;
+    Base &as_base = object;  // static type Base, refers to a Derived
+    test_not_supported_impl<Derived>(as_base);
 }
 
 
@@ -87,6 +124,30 @@ TEST(CudaError, ReturnsCusparseError)
 }
 
 
+void throws_hip_error() { throw GKO_HIP_ERROR(0); }
+// The object produced by GKO_HIP_ERROR(0) must be catchable as gko::HipError.
+TEST(HipError, ReturnsHipError)
+{
+    ASSERT_THROW(throws_hip_error(), gko::HipError);
+}
+
+
+void throws_hipblas_error() { throw GKO_HIPBLAS_ERROR(0); }
+// GKO_HIPBLAS_ERROR(0) must produce an object catchable as gko::HipblasError.
+TEST(HipError, ReturnsHipblasError)
+{
+    ASSERT_THROW(throws_hipblas_error(), gko::HipblasError);
+}
+
+
+void throws_hipsparse_error() { throw GKO_HIPSPARSE_ERROR(0); }
+// GKO_HIPSPARSE_ERROR(0) must produce an object catchable as HipsparseError.
+TEST(HipError, ReturnsHipsparseError)
+{
+    ASSERT_THROW(throws_hipsparse_error(), gko::HipsparseError);
+}
+
+
 TEST(AssertIsSquareMatrix, DoesNotThrowWhenIsSquareMatrix)
 {
     ASSERT_NO_THROW(GKO_ASSERT_IS_SQUARE_MATRIX(gko::dim<2>(3, 3)));
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index dca77292eda..1b2e1b0698e 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,10 +59,14 @@ class ExampleOperation : public gko::Operation {
     {
         value = 2;
     }
-    void run(std::shared_ptr<const gko::ReferenceExecutor>) const override
+    void run(std::shared_ptr<const gko::HipExecutor>) const override
     {
         value = 3;
     }
+    void run(std::shared_ptr<const gko::ReferenceExecutor>) const override
+    {
+        value = 4;
+    }
 
     int &value;
 };
@@ -83,9 +87,10 @@ TEST(OmpExecutor, RunsCorrectLambdaOperation)
     int value = 0;
     auto omp_lambda = [&value]() { value = 1; };
     auto cuda_lambda = [&value]() { value = 2; };
+    auto hip_lambda = [&value]() { value = 3; };
     exec_ptr omp = gko::OmpExecutor::create();
 
-    omp->run(omp_lambda, cuda_lambda);
+    omp->run(omp_lambda, cuda_lambda, hip_lambda);
     ASSERT_EQ(1, value);
 }
 
@@ -128,7 +133,7 @@ TEST(OmpExecutor, CopiesData)
     int *copy = omp->alloc<int>(num_elems);
 
     // user code is run on the OMP, so local variables are in OMP memory
-    omp->copy_from(omp.get(), num_elems, orig, copy);
+    omp->copy(num_elems, orig, copy);
     EXPECT_EQ(3, copy[0]);
     EXPECT_EQ(8, copy[1]);
 
@@ -150,7 +155,7 @@ TEST(ReferenceExecutor, RunsCorrectOperation)
     exec_ptr ref = gko::ReferenceExecutor::create();
 
     ref->run(ExampleOperation(value));
-    ASSERT_EQ(3, value);
+    ASSERT_EQ(4, value);
 }
 
 
@@ -159,9 +164,10 @@ TEST(ReferenceExecutor, RunsCorrectLambdaOperation)
     int value = 0;
     auto omp_lambda = [&value]() { value = 1; };
     auto cuda_lambda = [&value]() { value = 2; };
+    auto hip_lambda = [&value]() { value = 3; };
     exec_ptr ref = gko::ReferenceExecutor::create();
 
-    ref->run(omp_lambda, cuda_lambda);
+    ref->run(omp_lambda, cuda_lambda, hip_lambda);
     ASSERT_EQ(1, value);
 }
 
@@ -204,7 +210,7 @@ TEST(ReferenceExecutor, CopiesData)
     int *copy = ref->alloc<int>(num_elems);
 
     // ReferenceExecutor is a type of OMP executor, so this is O.K.
-    ref->copy_from(ref.get(), num_elems, orig, copy);
+    ref->copy(num_elems, orig, copy);
     EXPECT_EQ(3, copy[0]);
     EXPECT_EQ(8, copy[1]);
 
@@ -212,6 +218,18 @@ TEST(ReferenceExecutor, CopiesData)
 }
 
 
+TEST(ReferenceExecutor, CopiesSingleValue)
+{
+    constexpr int expected = 83683;
+    exec_ptr ref = gko::ReferenceExecutor::create();
+    int *slot = ref->alloc<int>(1);
+    *slot = expected;
+    // EXPECT (not ASSERT) so the buffer below is still released on failure
+    EXPECT_EQ(expected, ref->copy_val_to_host(slot));
+    ref->free(slot);
+}
+
+
 TEST(ReferenceExecutor, CopiesDataFromOmp)
 {
     int orig[] = {3, 8};
@@ -257,7 +275,8 @@ TEST(ReferenceExecutor, IsItsOwnMaster)
 TEST(CudaExecutor, RunsCorrectOperation)
 {
     int value = 0;
-    exec_ptr cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    exec_ptr cuda =
+        gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
 
     cuda->run(ExampleOperation(value));
     ASSERT_EQ(2, value);
@@ -269,9 +288,11 @@ TEST(CudaExecutor, RunsCorrectLambdaOperation)
     int value = 0;
     auto omp_lambda = [&value]() { value = 1; };
     auto cuda_lambda = [&value]() { value = 2; };
-    exec_ptr cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    auto hip_lambda = [&value]() { value = 3; };
+    exec_ptr cuda =
+        gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
 
-    cuda->run(omp_lambda, cuda_lambda);
+    cuda->run(omp_lambda, cuda_lambda, hip_lambda);
     ASSERT_EQ(2, value);
 }
 
@@ -294,6 +315,105 @@ TEST(CudaExecutor, KnowsItsDeviceId)
 }
 
 
+TEST(CudaExecutor, CanGetDeviceResetBoolean)
+{
+    auto omp = gko::OmpExecutor::create();
+    auto cuda = gko::CudaExecutor::create(0, omp);
+    // create() got no reset argument, so the getter must report false
+    ASSERT_FALSE(cuda->get_device_reset());
+}
+
+
+TEST(CudaExecutor, CanSetDefaultDeviceResetBoolean)
+{
+    auto omp = gko::OmpExecutor::create();
+    auto cuda = gko::CudaExecutor::create(0, omp, true);
+    // the `true` passed as third create() argument must be reported back
+    ASSERT_TRUE(cuda->get_device_reset());
+}
+
+
+TEST(CudaExecutor, CanSetDeviceResetBoolean)
+{
+    auto omp = gko::OmpExecutor::create();
+    auto cuda = gko::CudaExecutor::create(0, omp);
+    // change the flag after construction ...
+    cuda->set_device_reset(true);
+    // ... and make sure the getter reflects the new value
+    ASSERT_TRUE(cuda->get_device_reset());
+}
+
+
+TEST(HipExecutor, RunsCorrectOperation)
+{
+    int result = 0;
+    exec_ptr hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    // ExampleOperation's HipExecutor overload is the one that stores 3
+    hip->run(ExampleOperation(result));
+    ASSERT_EQ(3, result);
+}
+
+
+TEST(HipExecutor, RunsCorrectLambdaOperation)
+{
+    int marker = 0;
+    exec_ptr hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    auto on_omp = [&marker] { marker = 1; };
+    auto on_cuda = [&marker] { marker = 2; };
+    auto on_hip = [&marker] { marker = 3; };
+    // each lambda leaves a distinct marker; the HIP one must be what ran
+    hip->run(on_omp, on_cuda, on_hip);
+    ASSERT_EQ(3, marker);
+}
+
+
+TEST(HipExecutor, KnowsItsMaster)
+{
+    auto host = gko::OmpExecutor::create();
+    exec_ptr hip = gko::HipExecutor::create(0, host);
+    // the executor handed to create() must come back as the master
+    ASSERT_EQ(host, hip->get_master());
+}
+
+
+TEST(HipExecutor, KnowsItsDeviceId)
+{
+    auto host = gko::OmpExecutor::create();
+    auto hip = gko::HipExecutor::create(0, host);
+    // the id given as first create() argument must be reported back
+    ASSERT_EQ(0, hip->get_device_id());
+}
+
+
+TEST(HipExecutor, CanGetDeviceResetBoolean)
+{
+    auto omp = gko::OmpExecutor::create();
+    auto hip = gko::HipExecutor::create(0, omp);
+    // create() got no reset argument, so the getter must report false
+    ASSERT_FALSE(hip->get_device_reset());
+}
+
+
+TEST(HipExecutor, CanSetDefaultDeviceResetBoolean)
+{
+    auto omp = gko::OmpExecutor::create();
+    auto hip = gko::HipExecutor::create(0, omp, true);
+    // the `true` passed as third create() argument must be reported back
+    ASSERT_TRUE(hip->get_device_reset());
+}
+
+
+TEST(HipExecutor, CanSetDeviceResetBoolean)
+{
+    auto omp = gko::OmpExecutor::create();
+    auto hip = gko::HipExecutor::create(0, omp);
+    // change the flag after construction ...
+    hip->set_device_reset(true);
+    // ... and make sure the getter reflects the new value
+    ASSERT_TRUE(hip->get_device_reset());
+}
+
+
 template <typename T>
 struct mock_free : T {
     /**
diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp
index ef31e5c9210..f2f7da597bf 100644
--- a/core/test/base/extended_float.cpp
+++ b/core/test/base/extended_float.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,16 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <core/base/extended_float.hpp>
-
-
-#include <gtest/gtest.h>
+#include "core/base/extended_float.hpp"
 
 
 #include <bitset>
 #include <string>
 
 
+#include <gtest/gtest.h>
+
+
 namespace {
 
 
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index 6e350e40382..b2b873a627a 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,23 +40,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class IteratorFactory : public ::testing::Test {
 protected:
-    using int_type = int;
-    using double_type = double;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     IteratorFactory()
-        : reversed_int{100, 50, 10, 9, 8, 7, 5, 5, 4, 3, 2, 1, 0, -1, -2},
-          ordered_int{-2, -1, 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 50, 100},
-          reversed_double{15., 14., 13., 12., 11., 10., 9., 7.,
-                          7.,  6.,  5.,  4.,  3.,  2.,  -1.},
-          ordered_double{-1., 2.,  3.,  4.,  5.,  6.,  7., 7.,
-                         9.,  10., 11., 12., 13., 14., 15.}
+        : reversed_index{100, 50, 10, 9, 8, 7, 5, 5, 4, 3, 2, 1, 0, -1, -2},
+          ordered_index{-2, -1, 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 50, 100},
+          reversed_value{15., 14., 13., 12., 11., 10., 9., 7.,
+                         7.,  6.,  5.,  4.,  3.,  2.,  -1.},
+          ordered_value{-1., 2.,  3.,  4.,  5.,  6.,  7., 7.,
+                        9.,  10., 11., 12., 13., 14., 15.}
     {}
 
     template <typename T1, typename T2>
@@ -87,16 +90,20 @@ class IteratorFactory : public ::testing::Test {
         return true;
     }
 
-    const std::vector<int_type> reversed_int;
-    const std::vector<int_type> ordered_int;
-    const std::vector<double_type> reversed_double;
-    const std::vector<double_type> ordered_double;
+    const std::vector<index_type> reversed_index;
+    const std::vector<index_type> ordered_index;
+    const std::vector<value_type> reversed_value;
+    const std::vector<value_type> ordered_value;
 };
 
+TYPED_TEST_CASE(IteratorFactory, gko::test::ValueIndexTypes);
 
-TEST_F(IteratorFactory, EmptyIterator)
+
+TYPED_TEST(IteratorFactory, EmptyIterator)
 {
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         nullptr, nullptr, 0);
 
     ASSERT_TRUE(test_iter.begin() == test_iter.end());
@@ -104,66 +111,78 @@ TEST_F(IteratorFactory, EmptyIterator)
 }
 
 
-TEST_F(IteratorFactory, SortingReversedWithIterator)
+TYPED_TEST(IteratorFactory, SortingReversedWithIterator)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->reversed_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
     std::sort(test_iter.begin(), test_iter.end());
 
-    check_vector_equal(vec1, ordered_int);
-    check_vector_equal(vec2, reversed_double);
+    this->check_vector_equal(vec1, this->ordered_index);
+    this->check_vector_equal(vec2, this->reversed_value);
 }
 
 
-TEST_F(IteratorFactory, SortingAlreadySortedWithIterator)
+TYPED_TEST(IteratorFactory, SortingAlreadySortedWithIterator)
 {
-    std::vector<int_type> vec1{ordered_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->ordered_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
     std::sort(test_iter.begin(), test_iter.end());
 
-    check_vector_equal(vec1, ordered_int);
-    check_vector_equal(vec2, ordered_double);
+    this->check_vector_equal(vec1, this->ordered_index);
+    this->check_vector_equal(vec2, this->ordered_value);
 }
 
 
-TEST_F(IteratorFactory, IteratorReferenceOperatorSmaller)
+TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->reversed_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
-    bool is_sorted = is_sorted_iterator(test_iter.begin(), test_iter.end());
+    bool is_sorted =
+        this->is_sorted_iterator(test_iter.begin(), test_iter.end());
 
     ASSERT_FALSE(is_sorted);
 }
 
 
-TEST_F(IteratorFactory, IteratorReferenceOperatorSmaller2)
+TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller2)
 {
-    std::vector<int_type> vec1{ordered_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->ordered_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
-    bool is_sorted = is_sorted_iterator(test_iter.begin(), test_iter.end());
+    bool is_sorted =
+        this->is_sorted_iterator(test_iter.begin(), test_iter.end());
 
     ASSERT_TRUE(is_sorted);
 }
 
 
-TEST_F(IteratorFactory, IncreasingIterator)
+TYPED_TEST(IteratorFactory, IncreasingIterator)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->reversed_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
     auto begin = test_iter.begin();
     auto plus_2 = begin + 2;
@@ -187,12 +206,14 @@ TEST_F(IteratorFactory, IncreasingIterator)
 }
 
 
-TEST_F(IteratorFactory, DecreasingIterator)
+TYPED_TEST(IteratorFactory, DecreasingIterator)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->reversed_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
     auto iter = test_iter.begin() + 5;
     auto minus_2 = iter - 2;
@@ -216,16 +237,18 @@ TEST_F(IteratorFactory, DecreasingIterator)
 }
 
 
-TEST_F(IteratorFactory, CorrectDereferencing)
+TYPED_TEST(IteratorFactory, CorrectDereferencing)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type_it = typename TestFixture::index_type;
+    using value_type_it = typename TestFixture::value_type;
+    std::vector<index_type_it> vec1{this->reversed_index};
+    std::vector<value_type_it> vec2{this->ordered_value};
     constexpr int element_to_test = 3;
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type_it, value_type_it>(
         vec1.data(), vec2.data(), vec1.size());
     auto begin = test_iter.begin();
-    using value_type = decltype(begin)::value_type;
+    using value_type = typename decltype(begin)::value_type;
     auto to_test_ref = *(begin + element_to_test);
     value_type to_test_pair = to_test_ref;  // Testing implicit conversion
 
@@ -236,51 +259,55 @@ TEST_F(IteratorFactory, CorrectDereferencing)
 }
 
 
-TEST_F(IteratorFactory, CorrectSwapping)
+TYPED_TEST(IteratorFactory, CorrectSwapping)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->reversed_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
     auto first_el_reference = *test_iter.begin();
     auto second_el_reference = *(test_iter.begin() + 1);
     swap(first_el_reference, second_el_reference);
 
-    ASSERT_TRUE(vec1[0] == reversed_int[1]);
-    ASSERT_TRUE(vec1[1] == reversed_int[0]);
-    ASSERT_TRUE(vec2[0] == ordered_double[1]);
-    ASSERT_TRUE(vec2[1] == ordered_double[0]);
+    ASSERT_TRUE(vec1[0] == this->reversed_index[1]);
+    ASSERT_TRUE(vec1[1] == this->reversed_index[0]);
+    ASSERT_TRUE(vec2[0] == this->ordered_value[1]);
+    ASSERT_TRUE(vec2[1] == this->ordered_value[0]);
     // Make sure the other values were not touched.
     for (size_t i = 2; i < vec1.size(); ++i) {
-        ASSERT_TRUE(vec1[i] == reversed_int[i]);
-        ASSERT_TRUE(vec2[i] == ordered_double[i]);
+        ASSERT_TRUE(vec1[i] == this->reversed_index[i]);
+        ASSERT_TRUE(vec2[i] == this->ordered_value[i]);
     }
 }
 
 
-TEST_F(IteratorFactory, CorrectHandWrittenSwapping)
+TYPED_TEST(IteratorFactory, CorrectHandWrittenSwapping)
 {
-    std::vector<int_type> vec1{reversed_int};
-    std::vector<double_type> vec2{ordered_double};
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    std::vector<index_type> vec1{this->reversed_index};
+    std::vector<value_type> vec2{this->ordered_value};
 
-    auto test_iter = gko::detail::IteratorFactory<int_type, double_type>(
+    auto test_iter = gko::detail::IteratorFactory<index_type, value_type>(
         vec1.data(), vec2.data(), vec1.size());
     auto first_el_reference = *test_iter.begin();
     auto second_el_reference = *(test_iter.begin() + 1);
-    auto temp = static_cast<decltype(test_iter.begin())::value_type>(
+    auto temp = static_cast<typename decltype(test_iter.begin())::value_type>(
         first_el_reference);
     first_el_reference = second_el_reference;
     second_el_reference = temp;
 
-    ASSERT_TRUE(vec1[0] == reversed_int[1]);
-    ASSERT_TRUE(vec1[1] == reversed_int[0]);
-    ASSERT_TRUE(vec2[0] == ordered_double[1]);
-    ASSERT_TRUE(vec2[1] == ordered_double[0]);
+    ASSERT_TRUE(vec1[0] == this->reversed_index[1]);
+    ASSERT_TRUE(vec1[1] == this->reversed_index[0]);
+    ASSERT_TRUE(vec2[0] == this->ordered_value[1]);
+    ASSERT_TRUE(vec2[1] == this->ordered_value[0]);
     // Make sure the other values were not touched.
     for (size_t i = 2; i < vec1.size(); ++i) {
-        ASSERT_TRUE(vec1[i] == reversed_int[i]);
-        ASSERT_TRUE(vec2[i] == ordered_double[i]);
+        ASSERT_TRUE(vec1[i] == this->reversed_index[i]);
+        ASSERT_TRUE(vec2[i] == this->ordered_value[i]);
     }
 }
 
diff --git a/core/test/base/lin_op.cpp b/core/test/base/lin_op.cpp
index 2ade2cd50b9..a8c2866f3cc 100644
--- a/core/test/base/lin_op.cpp
+++ b/core/test/base/lin_op.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -236,6 +236,12 @@ TEST_F(EnableLinOp, ExtendedApplyCopiesBackOnlyX)
 }
 
 
+TEST_F(EnableLinOp, ApplyUsesInitialGuessReturnsFalse)
+{
+    ASSERT_FALSE(op->apply_uses_initial_guess());
+}
+
+
 template <typename T = int>
 class DummyLinOpWithFactory
     : public gko::EnableLinOp<DummyLinOpWithFactory<T>> {
diff --git a/core/test/base/math.cpp b/core/test/base/math.cpp
index efe0a05943b..c63cd4ae8e9 100644
--- a/core/test/base/math.cpp
+++ b/core/test/base/math.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cmath>
 #include <complex>
 #include <limits>
+#include <type_traits>
 
 
 #include <gtest/gtest.h>
@@ -44,30 +45,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
+static_assert(
+    std::is_same<double, decltype(real(std::complex<double>()))>::value,
+    "real must return a real type");
+static_assert(
+    std::is_same<double, decltype(imag(std::complex<double>()))>::value,
+    "imag must return a real type");
+
+
 template <typename T>
-void test_real_isfinite()
+void test_real_is_finite()
 {
     using limits = std::numeric_limits<T>;
     constexpr auto inf = limits::infinity();
     // Use volatile to avoid MSVC report divided by zero.
     volatile const T zero{0};
-    ASSERT_TRUE(gko::isfinite(T{0}));
-    ASSERT_TRUE(gko::isfinite(-T{0}));
-    ASSERT_TRUE(gko::isfinite(T{1}));
-    ASSERT_FALSE(gko::isfinite(inf));
-    ASSERT_FALSE(gko::isfinite(-inf));
-    ASSERT_FALSE(gko::isfinite(limits::quiet_NaN()));
-    ASSERT_FALSE(gko::isfinite(limits::signaling_NaN()));
-    ASSERT_FALSE(gko::isfinite(inf - inf));    // results in nan
-    ASSERT_FALSE(gko::isfinite(inf / inf));    // results in nan
-    ASSERT_FALSE(gko::isfinite(inf * T{2}));   // results in inf
-    ASSERT_FALSE(gko::isfinite(T{1} / zero));  // results in inf
-    ASSERT_FALSE(gko::isfinite(T{0} / zero));  // results in nan
+    ASSERT_TRUE(gko::is_finite(T{0}));
+    ASSERT_TRUE(gko::is_finite(-T{0}));
+    ASSERT_TRUE(gko::is_finite(T{1}));
+    ASSERT_FALSE(gko::is_finite(inf));
+    ASSERT_FALSE(gko::is_finite(-inf));
+    ASSERT_FALSE(gko::is_finite(limits::quiet_NaN()));
+    ASSERT_FALSE(gko::is_finite(limits::signaling_NaN()));
+    ASSERT_FALSE(gko::is_finite(inf - inf));    // results in nan
+    ASSERT_FALSE(gko::is_finite(inf / inf));    // results in nan
+    ASSERT_FALSE(gko::is_finite(inf * T{2}));   // results in inf
+    ASSERT_FALSE(gko::is_finite(T{1} / zero));  // results in inf
+    ASSERT_FALSE(gko::is_finite(T{0} / zero));  // results in nan
 }
 
 
 template <typename ComplexType>
-void test_complex_isfinite()
+void test_complex_is_finite()
 {
     static_assert(gko::is_complex_s<ComplexType>::value,
                   "Template type must be a complex type.");
@@ -78,31 +87,52 @@ void test_complex_isfinite()
     constexpr auto quiet_nan = limits::quiet_NaN();
     constexpr auto signaling_nan = limits::signaling_NaN();
 
-    ASSERT_TRUE(gko::isfinite(c_type{T{0}, T{0}}));
-    ASSERT_TRUE(gko::isfinite(c_type{-T{0}, -T{0}}));
-    ASSERT_TRUE(gko::isfinite(c_type{T{1}, T{0}}));
-    ASSERT_TRUE(gko::isfinite(c_type{T{0}, T{1}}));
-    ASSERT_FALSE(gko::isfinite(c_type{inf, T{0}}));
-    ASSERT_FALSE(gko::isfinite(c_type{-inf, T{0}}));
-    ASSERT_FALSE(gko::isfinite(c_type{quiet_nan, T{0}}));
-    ASSERT_FALSE(gko::isfinite(c_type{signaling_nan, T{0}}));
-    ASSERT_FALSE(gko::isfinite(c_type{T{0}, inf}));
-    ASSERT_FALSE(gko::isfinite(c_type{T{0}, -inf}));
-    ASSERT_FALSE(gko::isfinite(c_type{T{0}, quiet_nan}));
-    ASSERT_FALSE(gko::isfinite(c_type{T{0}, signaling_nan}));
+    ASSERT_TRUE(gko::is_finite(c_type{T{0}, T{0}}));
+    ASSERT_TRUE(gko::is_finite(c_type{-T{0}, -T{0}}));
+    ASSERT_TRUE(gko::is_finite(c_type{T{1}, T{0}}));
+    ASSERT_TRUE(gko::is_finite(c_type{T{0}, T{1}}));
+    ASSERT_FALSE(gko::is_finite(c_type{inf, T{0}}));
+    ASSERT_FALSE(gko::is_finite(c_type{-inf, T{0}}));
+    ASSERT_FALSE(gko::is_finite(c_type{quiet_nan, T{0}}));
+    ASSERT_FALSE(gko::is_finite(c_type{signaling_nan, T{0}}));
+    ASSERT_FALSE(gko::is_finite(c_type{T{0}, inf}));
+    ASSERT_FALSE(gko::is_finite(c_type{T{0}, -inf}));
+    ASSERT_FALSE(gko::is_finite(c_type{T{0}, quiet_nan}));
+    ASSERT_FALSE(gko::is_finite(c_type{T{0}, signaling_nan}));
 }
 
 
-TEST(IsFinite, Float) { test_real_isfinite<float>(); }
+TEST(IsFinite, Float) { test_real_is_finite<float>(); }
+
+
+TEST(IsFinite, Double) { test_real_is_finite<double>(); }
+
+
+TEST(IsFinite, FloatComplex) { test_complex_is_finite<std::complex<float>>(); }
+
 
+TEST(IsFinite, DoubleComplex)
+{
+    test_complex_is_finite<std::complex<double>>();
+}
 
-TEST(IsFinite, Double) { test_real_isfinite<double>(); }
 
+TEST(Conjugate, FloatComplex)
+{
+    std::complex<float> a(1, 1);
+    std::complex<float> b(1, -1);
 
-TEST(IsFinite, FloatComplex) { test_complex_isfinite<std::complex<float>>(); }
+    ASSERT_EQ(conj(a), b);
+}
 
 
-TEST(IsFinite, DoubleComplex) { test_complex_isfinite<std::complex<double>>(); }
+TEST(Conjugate, DoubleComplex)
+{
+    std::complex<double> a(1, 1);
+    std::complex<double> b(1, -1);
+
+    ASSERT_EQ(conj(a), b);
+}
 
 
 }  // namespace
diff --git a/core/test/base/matrix_data.cpp b/core/test/base/matrix_data.cpp
index 129c5c8c470..fcb2f48f29b 100644
--- a/core/test/base/matrix_data.cpp
+++ b/core/test/base/matrix_data.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/matrix_data.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
 namespace {
diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp
index aa733b74781..a5a17beeefc 100644
--- a/core/test/base/mtx_io.cpp
+++ b/core/test/base/mtx_io.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/mtx_io.hpp>
 
 
-#include <gtest/gtest.h>
+#include <sstream>
 
 
-#include <sstream>
+#include <gtest/gtest.h>
 
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
-TEST(MtxReader, ReadsDenseRealMtx)
+TEST(MtxReader, ReadsDenseDoubleRealMtx)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
     std::istringstream iss(
@@ -72,9 +75,35 @@ TEST(MtxReader, ReadsDenseRealMtx)
 }
 
 
-TEST(MtxReader, ReadsDenseIntegerMtx)
+TEST(MtxReader, ReadsDenseDoubleRealMtxWith64Index)
 {
-    using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
+    using tpl = gko::matrix_data<double, gko::int64>::nonzero_type;
+    std::istringstream iss(
+        "%%MatrixMarket matrix array real general\n"
+        "2 3\n"
+        "1.0\n"
+        "0.0\n"
+        "3.0\n"
+        "5.0\n"
+        "2.0\n"
+        "0.0\n");
+
+    auto data = gko::read_raw<double, gko::int64>(iss);
+
+    ASSERT_EQ(data.size, gko::dim<2>(2, 3));
+    auto &v = data.nonzeros;
+    ASSERT_EQ(v[0], tpl(0, 0, 1.0));
+    ASSERT_EQ(v[1], tpl(0, 1, 3.0));
+    ASSERT_EQ(v[2], tpl(0, 2, 2.0));
+    ASSERT_EQ(v[3], tpl(1, 0, 0.0));
+    ASSERT_EQ(v[4], tpl(1, 1, 5.0));
+    ASSERT_EQ(v[5], tpl(1, 2, 0.0));
+}
+
+
+TEST(MtxReader, ReadsDenseFloatIntegerMtx)
+{
+    using tpl = gko::matrix_data<float, gko::int32>::nonzero_type;
     std::istringstream iss(
         "%%MatrixMarket matrix array integer general\n"
         "2 3\n"
@@ -85,7 +114,7 @@ TEST(MtxReader, ReadsDenseIntegerMtx)
         "2\n"
         "0\n");
 
-    auto data = gko::read_raw<double, gko::int32>(iss);
+    auto data = gko::read_raw<float, gko::int32>(iss);
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     auto &v = data.nonzeros;
@@ -98,9 +127,89 @@ TEST(MtxReader, ReadsDenseIntegerMtx)
 }
 
 
-TEST(MtxReader, ReadsDenseComplexMtx)
+TEST(MtxReader, ReadsDenseFloatIntegerMtxWith64Index)
+{
+    using entry = gko::matrix_data<float, gko::int64>::nonzero_type;
+    std::istringstream mtx_stream(
+        "%%MatrixMarket matrix array integer general\n"
+        "2 3\n"
+        "1\n"
+        "0\n"
+        "3\n"
+        "5\n"
+        "2\n"
+        "0\n");
+    // Parse the integer array file into float values with 64-bit indices.
+    const auto parsed = gko::read_raw<float, gko::int64>(mtx_stream);
+    // The file lists values column by column; nonzeros are checked row by row.
+    ASSERT_EQ(parsed.size, gko::dim<2>(2, 3));
+    const auto &nz = parsed.nonzeros;
+    ASSERT_EQ(nz[0], entry(0, 0, 1.0));
+    ASSERT_EQ(nz[1], entry(0, 1, 3.0));
+    ASSERT_EQ(nz[2], entry(0, 2, 2.0));
+    ASSERT_EQ(nz[3], entry(1, 0, 0.0));
+    ASSERT_EQ(nz[4], entry(1, 1, 5.0));
+    ASSERT_EQ(nz[5], entry(1, 2, 0.0));
+}
+
+
+TEST(MtxReader, ReadsDenseComplexDoubleMtx)
+{
+    using value = std::complex<double>;
+    using entry = gko::matrix_data<value, gko::int32>::nonzero_type;
+    std::istringstream mtx_stream(
+        "%%MatrixMarket matrix array complex general\n"
+        "2 3\n"
+        "1.0 2.0\n"
+        "0.0 0.0\n"
+        "3.0 1.0\n"
+        "5.0 3.0\n"
+        "2.0 4.0\n"
+        "0.0 0.0\n");
+    // Each data line carries one entry as "<real> <imaginary>".
+    const auto parsed = gko::read_raw<value, gko::int32>(mtx_stream);
+    // Entries are listed column by column but are checked row by row.
+    ASSERT_EQ(parsed.size, gko::dim<2>(2, 3));
+    const auto &nz = parsed.nonzeros;
+    ASSERT_EQ(nz[0], entry(0, 0, value(1.0, 2.0)));
+    ASSERT_EQ(nz[1], entry(0, 1, value(3.0, 1.0)));
+    ASSERT_EQ(nz[2], entry(0, 2, value(2.0, 4.0)));
+    ASSERT_EQ(nz[3], entry(1, 0, value(0.0, 0.0)));
+    ASSERT_EQ(nz[4], entry(1, 1, value(5.0, 3.0)));
+    ASSERT_EQ(nz[5], entry(1, 2, value(0.0, 0.0)));
+}
+
+
+TEST(MtxReader, ReadsDenseComplexDoubleMtxWith64Index)
 {
     using cpx = std::complex<double>;
+    using tpl = gko::matrix_data<cpx, gko::int64>::nonzero_type;
+    std::istringstream iss(
+        "%%MatrixMarket matrix array complex general\n"
+        "2 3\n"
+        "1.0 2.0\n"
+        "0.0 0.0\n"
+        "3.0 1.0\n"
+        "5.0 3.0\n"
+        "2.0 4.0\n"
+        "0.0 0.0\n");
+
+    auto data = gko::read_raw<cpx, gko::int64>(iss);
+
+    ASSERT_EQ(data.size, gko::dim<2>(2, 3));
+    auto &v = data.nonzeros;
+    ASSERT_EQ(v[0], tpl(0, 0, cpx(1.0, 2.0)));
+    ASSERT_EQ(v[1], tpl(0, 1, cpx(3.0, 1.0)));
+    ASSERT_EQ(v[2], tpl(0, 2, cpx(2.0, 4.0)));
+    ASSERT_EQ(v[3], tpl(1, 0, cpx(0.0, 0.0)));
+    ASSERT_EQ(v[4], tpl(1, 1, cpx(5.0, 3.0)));
+    ASSERT_EQ(v[5], tpl(1, 2, cpx(0.0, 0.0)));
+}
+
+
+TEST(MtxReader, ReadsDenseComplexFloatMtx)
+{
+    using cpx = std::complex<float>;
     using tpl = gko::matrix_data<cpx, gko::int32>::nonzero_type;
     std::istringstream iss(
         "%%MatrixMarket matrix array complex general\n"
@@ -125,6 +234,33 @@ TEST(MtxReader, ReadsDenseComplexMtx)
 }
 
 
+TEST(MtxReader, ReadsDenseComplexFloatMtxWith64Index)
+{
+    using scalar = std::complex<float>;
+    using triplet = gko::matrix_data<scalar, gko::int64>::nonzero_type;
+    std::istringstream input(
+        "%%MatrixMarket matrix array complex general\n"
+        "2 3\n"
+        "1.0 2.0\n"
+        "0.0 0.0\n"
+        "3.0 1.0\n"
+        "5.0 3.0\n"
+        "2.0 4.0\n"
+        "0.0 0.0\n");
+    // Single-precision complex values combined with 64-bit indices.
+    const auto mtx = gko::read_raw<scalar, gko::int64>(input);
+    // All six entries of the 2x3 array are expected, zeros included.
+    ASSERT_EQ(mtx.size, gko::dim<2>(2, 3));
+    const auto &elems = mtx.nonzeros;
+    ASSERT_EQ(elems[0], triplet(0, 0, scalar(1.0, 2.0)));
+    ASSERT_EQ(elems[1], triplet(0, 1, scalar(3.0, 1.0)));
+    ASSERT_EQ(elems[2], triplet(0, 2, scalar(2.0, 4.0)));
+    ASSERT_EQ(elems[3], triplet(1, 0, scalar(0.0, 0.0)));
+    ASSERT_EQ(elems[4], triplet(1, 1, scalar(5.0, 3.0)));
+    ASSERT_EQ(elems[5], triplet(1, 2, scalar(0.0, 0.0)));
+}
+
+
 TEST(MtxReader, ReadsSparseRealMtx)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
@@ -273,10 +409,57 @@ TEST(MtxReader, FailsWhenReadingSparseComplexMtxToRealMtx)
 }
 
 
-TEST(MatrixData, WritesRealMatrixToMatrixMarketArray)
+TEST(MatrixData, WritesDoubleRealMatrixToMatrixMarketArray)
+{
+    // clang-format off
+    gko::matrix_data<double, gko::int32> dense{
+        {1.0, 2.0},
+        {2.1, 0.0},
+        {3.0, 3.2}};
+    // clang-format on
+    std::ostringstream out;
+    // No layout argument is passed; the expected header below says "array".
+    write_raw(out, dense);
+    // Values appear column by column, including the explicit zero.
+    ASSERT_EQ(out.str(),
+              "%%MatrixMarket matrix array real general\n"
+              "3 2\n"
+              "1\n"
+              "2.1\n"
+              "3\n"
+              "2\n"
+              "0\n"
+              "3.2\n");
+}
+
+
+TEST(MatrixData, WritesFloatRealMatrixToMatrixMarketCoordinate)
 {
     // clang-format off
-    gko::matrix_data<> data{
+    gko::matrix_data<float, gko::int32> data{
+        {1.0, 2.0},
+        {2.1, 0.0},
+        {3.0, 3.2}};
+    // clang-format on
+    std::ostringstream oss{};
+
+    write_raw(oss, data, gko::layout_type::coordinate);
+
+    ASSERT_EQ(oss.str(),
+              "%%MatrixMarket matrix coordinate real general\n"
+              "3 2 5\n"
+              "1 1 1\n"
+              "1 2 2\n"
+              "2 1 2.1\n"
+              "3 1 3\n"
+              "3 2 3.2\n");
+}
+
+
+TEST(MatrixData, WritesDoubleRealMatrixToMatrixMarketArrayWith64Index)
+{
+    // clang-format off
+    gko::matrix_data<double, gko::int64> data{
         {1.0, 2.0},
         {2.1, 0.0},
         {3.0, 3.2}};
@@ -297,10 +480,10 @@ TEST(MatrixData, WritesRealMatrixToMatrixMarketArray)
 }
 
 
-TEST(MatrixData, WritesRealMatrixToMatrixMarketCoordinate)
+TEST(MatrixData, WritesFloatRealMatrixToMatrixMarketCoordinateWith64Index)
 {
     // clang-format off
-    gko::matrix_data<> data{
+    gko::matrix_data<float, gko::int64> data{
         {1.0, 2.0},
         {2.1, 0.0},
         {3.0, 3.2}};
@@ -320,10 +503,57 @@ TEST(MatrixData, WritesRealMatrixToMatrixMarketCoordinate)
 }
 
 
-TEST(MatrixData, WritesComplexMatrixToMatrixMarketArray)
+TEST(MatrixData, WritesComplexDoubleMatrixToMatrixMarketArray)
+{
+    // clang-format off
+    gko::matrix_data<std::complex<double>, gko::int32> dense{
+        {{1.0, 0.0}, {2.0, 3.2}},
+        {{2.1, 2.2}, {0.0, 0.0}},
+        {{0.0, 3.0}, {3.2, 5.3}}};
+    // clang-format on
+    std::ostringstream out;
+    // No layout argument is passed; the expected header below says "array".
+    write_raw(out, dense);
+    // One "<real> <imaginary>" pair per line, walking the columns in order.
+    ASSERT_EQ(out.str(),
+              "%%MatrixMarket matrix array complex general\n"
+              "3 2\n"
+              "1 0\n"
+              "2.1 2.2\n"
+              "0 3\n"
+              "2 3.2\n"
+              "0 0\n"
+              "3.2 5.3\n");
+}
+
+
+TEST(MatrixData, WritesComplexFloatMatrixToMatrixMarketCoordinate)
+{
+    // clang-format off
+    gko::matrix_data<std::complex<float>, gko::int32> sparse{
+        {{1.0, 0.0}, {2.0, 3.2}},
+        {{2.1, 2.2}, {0.0, 0.0}},
+        {{0.0, 3.0}, {3.2, 5.3}}};
+    // clang-format on
+    std::ostringstream out;
+    // Explicitly request the coordinate layout instead of the default one.
+    write_raw(out, sparse, gko::layout_type::coordinate);
+    // Expected: "rows cols count", then 1-based entries; the zero is left out.
+    ASSERT_EQ(out.str(),
+              "%%MatrixMarket matrix coordinate complex general\n"
+              "3 2 5\n"
+              "1 1 1 0\n"
+              "1 2 2 3.2\n"
+              "2 1 2.1 2.2\n"
+              "3 1 0 3\n"
+              "3 2 3.2 5.3\n");
+}
+
+
+TEST(MatrixData, WritesComplexDoubleMatrixToMatrixMarketArrayWith64Index)
 {
     // clang-format off
-    gko::matrix_data<std::complex<double>> data{
+    gko::matrix_data<std::complex<double>, gko::int64> data{
         {{1.0, 0.0}, {2.0, 3.2}},
         {{2.1, 2.2}, {0.0, 0.0}},
         {{0.0, 3.0}, {3.2, 5.3}}};
@@ -344,10 +574,10 @@ TEST(MatrixData, WritesComplexMatrixToMatrixMarketArray)
 }
 
 
-TEST(MatrixData, WritesComplexMatrixToMatrixMarketCoordinate)
+TEST(MatrixData, WritesComplexFloatMatrixToMatrixMarketCoordinateWith64Index)
 {
     // clang-format off
-    gko::matrix_data<std::complex<double>> data{
+    gko::matrix_data<std::complex<float>, gko::int64> data{
         {{1.0, 0.0}, {2.0, 3.2}},
         {{2.1, 2.2}, {0.0, 0.0}},
         {{0.0, 3.0}, {3.2, 5.3}}};
@@ -401,9 +631,23 @@ class DummyLinOp
 };
 
 
-TEST(MtxReader, ReadsLinOpFromStream)
+template <typename ValueIndexType>
+class RealDummyLinOpTest : public ::testing::Test {
+protected:
+    // Element 0 of the tuple-like type parameter: value type of the test.
+    using value_type = typename std::tuple_element<0, ValueIndexType>::type;
+    // Element 1 of the tuple-like type parameter: index type of the test.
+    using index_type = typename std::tuple_element<1, ValueIndexType>::type;
+};
+
+TYPED_TEST_CASE(RealDummyLinOpTest, gko::test::RealValueIndexTypes);
+
+
+TYPED_TEST(RealDummyLinOpTest, ReadsLinOpFromStream)
 {
-    using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
     std::istringstream iss(
         "%%MatrixMarket matrix array real general\n"
         "2 3\n"
@@ -414,7 +658,7 @@ TEST(MtxReader, ReadsLinOpFromStream)
         "2.0\n"
         "0.0\n");
 
-    auto lin_op = gko::read<DummyLinOp<double, gko::int32>>(
+    auto lin_op = gko::read<DummyLinOp<value_type, index_type>>(
         iss, gko::ReferenceExecutor::create());
 
     const auto &data = lin_op->data_;
@@ -429,9 +673,10 @@ TEST(MtxReader, ReadsLinOpFromStream)
 }
 
 
-TEST(MtxReader, WritesLinOpToStream)
+TYPED_TEST(RealDummyLinOpTest, WritesLinOpToStream)
 {
-    using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     std::istringstream iss(
         "%%MatrixMarket matrix array real general\n"
         "2 3\n"
@@ -441,7 +686,7 @@ TEST(MtxReader, WritesLinOpToStream)
         "5.0\n"
         "2.0\n"
         "0.0\n");
-    auto lin_op = gko::read<DummyLinOp<double, gko::int32>>(
+    auto lin_op = gko::read<DummyLinOp<value_type, index_type>>(
         iss, gko::ReferenceExecutor::create());
     std::ostringstream oss{};
 
@@ -459,4 +704,77 @@ TEST(MtxReader, WritesLinOpToStream)
 }
 
 
+template <typename ValueIndexType>
+class ComplexDummyLinOpTest : public ::testing::Test {
+protected:
+    // Element 0 of the tuple-like type parameter: value type of the test.
+    using value_type = typename std::tuple_element<0, ValueIndexType>::type;
+    // Element 1 of the tuple-like type parameter: index type of the test.
+    using index_type = typename std::tuple_element<1, ValueIndexType>::type;
+};
+
+TYPED_TEST_CASE(ComplexDummyLinOpTest, gko::test::ComplexValueIndexTypes);
+
+
+TYPED_TEST(ComplexDummyLinOpTest, ReadsLinOpFromStream)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using data_type = gko::matrix_data<value_type, index_type>;
+    using entry = typename data_type::nonzero_type;
+    std::istringstream mtx_stream(
+        "%%MatrixMarket matrix array complex general\n"
+        "2 3\n"
+        "1.0 2.0\n"
+        "0.0 0.0\n"
+        "3.0 4.0\n"
+        "5.0 6.0\n"
+        "2.0 3.0\n"
+        "0.0 0.0\n");
+    // The checks below inspect the data_ member of the operator that was read.
+    auto op = gko::read<DummyLinOp<value_type, index_type>>(
+        mtx_stream, gko::ReferenceExecutor::create());
+    const auto &parsed = op->data_;
+    ASSERT_EQ(parsed.size, gko::dim<2>(2, 3));
+    const auto &nz = parsed.nonzeros;
+    ASSERT_EQ(nz[0], entry(0, 0, value_type{1.0, 2.0}));
+    ASSERT_EQ(nz[1], entry(0, 1, value_type{3.0, 4.0}));
+    ASSERT_EQ(nz[2], entry(0, 2, value_type{2.0, 3.0}));
+    ASSERT_EQ(nz[3], entry(1, 0, value_type{0.0, 0.0}));
+    ASSERT_EQ(nz[4], entry(1, 1, value_type{5.0, 6.0}));
+    ASSERT_EQ(nz[5], entry(1, 2, value_type{0.0, 0.0}));
+}
+
+
+TYPED_TEST(ComplexDummyLinOpTest, WritesLinOpToStream)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    std::istringstream mtx_stream(
+        "%%MatrixMarket matrix array complex general\n"
+        "2 3\n"
+        "1.0 2.0\n"
+        "0.0 0.0\n"
+        "3.0 4.0\n"
+        "5.0 6.0\n"
+        "2.0 3.0\n"
+        "0.0 0.0\n");
+    auto op = gko::read<DummyLinOp<value_type, index_type>>(
+        mtx_stream, gko::ReferenceExecutor::create());
+    std::ostringstream out;
+    // Serialize the operator that was just read back into Matrix Market text.
+    write(out, lend(op));
+    // Same entries as the input, printed without the trailing ".0".
+    ASSERT_EQ(out.str(),
+              "%%MatrixMarket matrix array complex general\n"
+              "2 3\n"
+              "1 2\n"
+              "0 0\n"
+              "3 4\n"
+              "5 6\n"
+              "2 3\n"
+              "0 0\n");
+}
+
+
 }  // namespace
diff --git a/core/test/base/perturbation.cpp b/core/test/base/perturbation.cpp
index 950d81be2df..c0ddbd73cdc 100644
--- a/core/test/base/perturbation.cpp
+++ b/core/test/base/perturbation.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/base/polymorphic_object.cpp b/core/test/base/polymorphic_object.cpp
index 70d1f849a5d..2b2e32bd409 100644
--- a/core/test/base/polymorphic_object.cpp
+++ b/core/test/base/polymorphic_object.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/base/range.cpp b/core/test/base/range.cpp
index 4df4e3ba65e..6be342ca380 100644
--- a/core/test/base/range.cpp
+++ b/core/test/base/range.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/range.hpp>
 
 
-#include <gtest/gtest.h>
+#include <array>
 
 
-#include <array>
+#include <gtest/gtest.h>
 
 
 namespace {
diff --git a/core/test/base/range_accessors.cpp b/core/test/base/range_accessors.cpp
index 232c5feff7b..9066566a33a 100644
--- a/core/test/base/range_accessors.cpp
+++ b/core/test/base/range_accessors.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/base/sanitizers.cpp b/core/test/base/sanitizers.cpp
new file mode 100644
index 00000000000..724b7f38871
--- /dev/null
+++ b/core/test/base/sanitizers.cpp
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <limits>
+#include <memory>
+#include <thread>
+
+
+#include <gtest/gtest.h>
+
+
+TEST(Sanitizers, UseAfterFree)
+{
+    char *x = new char[50];
+    x[0] = 'H';
+    x[1] = 'I';
+    x[2] = '\n';
+    // Release with delete[] so the deallocation matches the new[] above.
+    delete[] x;
+    // Intentional read of the released buffer: the defect this test plants.
+    static volatile char z = x[0];
+}
+
+
+TEST(Sanitizers, MemoryLeak)
+{
+    char *x = new char[50];  // deliberately never released in this test
+    x[0] = 'H';
+    x[1] = 'I';
+    x[2] = '\n';
+}  // x goes out of scope without a delete[], so the allocation is leaked
+
+
+TEST(Sanitizers, UndefinedBehavior)
+{
+    int x = std::numeric_limits<int>::max();
+    int y = 10001;
+    // x is already the largest int, so adding a positive y overflows.
+    static volatile int z = x + y;
+}
+
+
+int Global = 0;  // written by Thread() and by the RaceCondition test body
+void *Thread(void *x)
+{
+    Global = 42;  // plain store, no lock or atomic involved
+    return x;     // hands the argument back unchanged
+}
+
+
+TEST(Sanitizers, RaceCondition)
+{
+    std::thread t(Thread, &Global);
+    // Store from this thread while t may be storing 42: no synchronization.
+    Global = 43;
+    t.join();
+}
diff --git a/core/test/base/types.cpp b/core/test/base/types.cpp
index 21e444e31ea..50979cd8a67 100644
--- a/core/test/base/types.cpp
+++ b/core/test/base/types.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp
index 07bcd6e41b0..df187cfb81f 100644
--- a/core/test/base/utils.cpp
+++ b/core/test/base/utils.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -50,11 +50,17 @@ struct Base {
 struct Derived : Base {};
 
 
-struct NonRelated {
-    virtual ~NonRelated() = default;
+struct NonRelated : Base {};
+
+
+struct Base2 {
+    virtual ~Base2() = default;
 };
 
 
+struct MultipleDerived : Base, Base2 {};
+
+
 struct ClonableDerived : Base {
     ClonableDerived(std::shared_ptr<const gko::Executor> exec = nullptr)
         : executor(exec)
@@ -248,7 +254,15 @@ TEST(As, FailsToConvertIfNotRelated)
     Derived d;
     Base *b = &d;
 
-    ASSERT_THROW(gko::as<NonRelated>(b), gko::NotSupported);
+    try {
+        gko::as<NonRelated>(b);
+        FAIL();
+    } catch (gko::NotSupported &m) {
+        std::string msg{m.what()};
+        auto expected = gko::name_demangling::get_type_name(typeid(Derived));
+        ASSERT_TRUE(
+            std::equal(expected.rbegin(), expected.rend(), msg.rbegin()));
+    }
 }
 
 
@@ -266,7 +280,98 @@ TEST(As, FailsToConvertConstantIfNotRelated)
     Derived d;
     const Base *b = &d;
 
-    ASSERT_THROW(gko::as<NonRelated>(b), gko::NotSupported);
+    try {
+        gko::as<NonRelated>(b);
+        FAIL();
+    } catch (gko::NotSupported &m) {
+        std::string msg{m.what()};
+        auto expected = gko::name_demangling::get_type_name(typeid(Derived));
+        ASSERT_TRUE(
+            std::equal(expected.rbegin(), expected.rend(), msg.rbegin()));
+    }
+}
+
+
+TEST(As, ConvertsPolymorphicTypeUniquePtr)
+{
+    Derived *const raw = new Derived{};
+    // Ownership moves into the temporary unique_ptr; as<Derived> must hand
+    // back a pointer to the very same object.
+    ASSERT_EQ(gko::as<Derived>(std::unique_ptr<Base>{raw}).get(), raw);
+}
+
+
+TEST(As, FailsToConvertUniquePtrIfNotRelated)
+{
+    Derived *const raw = new Derived{};
+    // The object's dynamic type is Derived, so it is not a NonRelated.
+    ASSERT_THROW(gko::as<NonRelated>(std::unique_ptr<Base>{raw}),
+                 gko::NotSupported);
+}
+
+
+TEST(As, ConvertsPolymorphicTypeSharedPtr)
+{
+    Derived *const raw = new Derived{};
+    // The shared_ptr<Base> takes ownership; as<Derived> must return a
+    // pointer to the same object.
+    ASSERT_EQ(gko::as<Derived>(std::shared_ptr<Base>{raw}).get(), raw);
+}
+
+
+TEST(As, FailsToConvertSharedPtrIfNotRelated)
+{
+    Derived *const raw = new Derived{};
+    // The object's dynamic type is Derived, so it is not a NonRelated.
+    ASSERT_THROW(gko::as<NonRelated>(std::shared_ptr<Base>{raw}),
+                 gko::NotSupported);
+}
+
+
+TEST(As, ConvertsConstPolymorphicTypeSharedPtr)
+{
+    Derived *const raw = new Derived{};
+    // Same check as the non-const case, but through shared_ptr<const Base>;
+    // the address must be preserved.
+    ASSERT_EQ(gko::as<Derived>(std::shared_ptr<const Base>{raw}).get(), raw);
+}
+
+
+TEST(As, FailsToConvertConstSharedPtrIfNotRelated)
+{
+    Derived *const raw = new Derived{};
+    // The object's dynamic type is Derived, so it is not a NonRelated.
+    ASSERT_THROW(gko::as<NonRelated>(std::shared_ptr<const Base>{raw}),
+                 gko::NotSupported);
+}
+
+
+TEST(As, CanCrossCastUniquePtr)
+{
+    std::unique_ptr<MultipleDerived> owner{new MultipleDerived{}};
+    MultipleDerived *const raw = owner.get();
+    auto as_base = gko::as<Base>(std::move(owner));
+    // Base -> Base2 is a sideways cast; going back down must give raw again.
+    auto as_base2 = gko::as<Base2>(std::move(as_base));
+    ASSERT_EQ(gko::as<MultipleDerived>(std::move(as_base2)).get(), raw);
+}
+
+
+TEST(As, CanCrossCastSharedPtr)
+{
+    auto derived = std::make_shared<MultipleDerived>();
+    auto as_base = gko::as<Base>(derived);
+    // Cross-cast to the sibling base, then back down to the derived type.
+    ASSERT_EQ(gko::as<MultipleDerived>(gko::as<Base2>(as_base)), as_base);
+}
+
+
+TEST(As, CanCrossCastConstSharedPtr)
+{
+    auto obj = std::make_shared<const MultipleDerived>();
+    auto base = gko::as<const Base>(obj);
+
+    ASSERT_EQ(gko::as<const MultipleDerived>(gko::as<const Base2>(base)), base);
 }
 
 
diff --git a/core/test/base/version.cpp b/core/test/base/version.cpp
index ce209e05bb5..0fc86a03b7f 100644
--- a/core/test/base/version.cpp
+++ b/core/test/base/version.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/version.hpp>
 
 
-#include <gtest/gtest.h>
+#include <sstream>
 
 
-#include <sstream>
+#include <gtest/gtest.h>
 
 
 namespace {
diff --git a/core/test/factorization/CMakeLists.txt b/core/test/factorization/CMakeLists.txt
index 16f1fe27d91..9b2e3082e51 100644
--- a/core/test/factorization/CMakeLists.txt
+++ b/core/test/factorization/CMakeLists.txt
@@ -1 +1,2 @@
 ginkgo_create_test(par_ilu)
+ginkgo_create_test(par_ilut)
diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp
index 5dc94c752b4..75c714fcab1 100644
--- a/core/test/factorization/par_ilu.cpp
+++ b/core/test/factorization/par_ilu.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,13 +39,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class ParIlu : public ::testing::Test {
 public:
-    using value_type = gko::default_precision;
-    using index_type = gko::int32;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using ilu_factory_type = gko::factorization::ParIlu<value_type, index_type>;
 
 protected:
@@ -54,29 +60,35 @@ class ParIlu : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
+TYPED_TEST_CASE(ParIlu, gko::test::ValueIndexTypes);
+
 
-TEST_F(ParIlu, SetIterations)
+TYPED_TEST(ParIlu, SetIterations)
 {
-    auto factory = ilu_factory_type::build().with_iterations(5u).on(ref);
+    auto factory =
+        TestFixture::ilu_factory_type::build().with_iterations(5u).on(
+            this->ref);
 
     ASSERT_EQ(factory->get_parameters().iterations, 5u);
 }
 
 
-TEST_F(ParIlu, SetSkip)
+TYPED_TEST(ParIlu, SetSkip)
 {
-    auto factory = ilu_factory_type::build().with_skip_sorting(true).on(ref);
+    auto factory =
+        TestFixture::ilu_factory_type::build().with_skip_sorting(true).on(
+            this->ref);
 
     ASSERT_EQ(factory->get_parameters().skip_sorting, true);
 }
 
 
-TEST_F(ParIlu, SetEverything)
+TYPED_TEST(ParIlu, SetEverything)
 {
-    auto factory = ilu_factory_type::build()
+    auto factory = TestFixture::ilu_factory_type::build()
                        .with_skip_sorting(false)
                        .with_iterations(7u)
-                       .on(ref);
+                       .on(this->ref);
 
     ASSERT_EQ(factory->get_parameters().skip_sorting, false);
     ASSERT_EQ(factory->get_parameters().iterations, 7u);
diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp
new file mode 100644
index 00000000000..e33f0bc35b5
--- /dev/null
+++ b/core/test/factorization/par_ilut.cpp
@@ -0,0 +1,131 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/par_ilut.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace {
+
+
+class ParIlut : public ::testing::Test {
+public:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using ilut_factory_type =
+        gko::factorization::ParIlut<value_type, index_type>;
+
+protected:
+    ParIlut() : ref(gko::ReferenceExecutor::create()) {}
+    // Reference executor that every test passes to the factory's on() call.
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+};
+
+
+TEST_F(ParIlut, SetIterations)
+{
+    auto ilut = ilut_factory_type::build().with_iterations(6u).on(ref);
+    // The builder value must be readable back from the factory parameters.
+    ASSERT_EQ(ilut->get_parameters().iterations, 6u);
+}
+
+
+TEST_F(ParIlut, SetSkip)
+{
+    auto ilut = ilut_factory_type::build().with_skip_sorting(true).on(ref);
+    // The builder value must be readable back from the factory parameters.
+    ASSERT_EQ(ilut->get_parameters().skip_sorting, true);
+}
+
+
+TEST_F(ParIlut, SetApprox)
+{
+    // Build with approximate selection switched off and read it back.
+    auto builder = ilut_factory_type::build().with_approximate_select(false);
+    auto ilut = builder.on(ref);
+    ASSERT_EQ(ilut->get_parameters().approximate_select, false);
+}
+
+
+TEST_F(ParIlut, SetDeterministic)
+{
+    // Build with deterministic sampling switched on and read it back.
+    auto builder = ilut_factory_type::build().with_deterministic_sample(true);
+    auto ilut = builder.on(ref);
+    ASSERT_EQ(ilut->get_parameters().deterministic_sample, true);
+}
+
+
+TEST_F(ParIlut, SetFillIn)
+{
+    auto ilut = ilut_factory_type::build().with_fill_in_limit(1.2).on(ref);
+    // The same literal is passed in and compared, so equality is exact.
+    ASSERT_EQ(ilut->get_parameters().fill_in_limit, 1.2);
+}
+
+
+TEST_F(ParIlut, SetDefaults)
+{
+    auto ilut = ilut_factory_type::build().on(ref);
+    const auto &params = ilut->get_parameters();
+    ASSERT_EQ(params.skip_sorting, false);
+    ASSERT_EQ(params.iterations, 5u);
+    ASSERT_EQ(params.approximate_select, true);
+    ASSERT_EQ(params.deterministic_sample, false);
+    ASSERT_EQ(params.fill_in_limit, 2.0);
+}
+
+
+TEST_F(ParIlut, SetEverything)
+{
+    auto ilut = ilut_factory_type::build()
+                    .with_skip_sorting(true)
+                    .with_iterations(7u)
+                    .with_approximate_select(false)
+                    .with_deterministic_sample(true)
+                    .with_fill_in_limit(1.2)
+                    .on(ref);
+    const auto &params = ilut->get_parameters();
+    ASSERT_EQ(params.skip_sorting, true);
+    ASSERT_EQ(params.iterations, 7u);
+    ASSERT_EQ(params.approximate_select, false);
+    ASSERT_EQ(params.deterministic_sample, true);
+    ASSERT_EQ(params.fill_in_limit, 1.2);
+}
+
+
+}  // namespace
diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp
index 116895d178f..bb05007817b 100644
--- a/core/test/log/convergence.cpp
+++ b/core/test/log/convergence.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,13 +39,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
+template <typename T>
+class Convergence : public ::testing::Test {};  // stateless typed fixture
+
+TYPED_TEST_CASE(Convergence, gko::test::ValueTypes);
+
 
-TEST(Record, CanGetData)
+TYPED_TEST(Convergence, CanGetData)
 {
     auto exec = gko::ReferenceExecutor::create();
-    auto logger = gko::log::Convergence<>::create(
+    auto logger = gko::log::Convergence<TypeParam>::create(
         exec, gko::log::Logger::iteration_complete_mask);
 
     ASSERT_EQ(logger->get_num_iterations(), 0);
diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp
index 92e4cdfad5e..e051e16f692 100644
--- a/core/test/log/logger.cpp
+++ b/core/test/log/logger.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,14 +31,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 #include <ginkgo/core/log/logger.hpp>
-#include <ginkgo/core/log/record.hpp>
-#include <ginkgo/core/log/stream.hpp>
 
 
-#include <gtest/gtest.h>
 #include <memory>
 
 
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/log/record.hpp>
+#include <ginkgo/core/log/stream.hpp>
+
+
 namespace {
 
 
diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp
index 676aaf5261b..d318a29f228 100644
--- a/core/test/log/papi.cpp
+++ b/core/test/log/papi.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,28 +30,32 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/papi.hpp>
 
 
+#include <stdexcept>
+
+
 #include <gtest/gtest.h>
 #include <papi.h>
-#include <stdexcept>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Papi : public ::testing::Test {
 protected:
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<T>;
 
     Papi() : exec(gko::ReferenceExecutor::create()), eventset(PAPI_NULL) {}
 
@@ -69,11 +73,11 @@ class Papi : public ::testing::Test {
 
     void TearDown() { eventset = PAPI_NULL; }
 
-    template <typename T>
+    template <typename U>
     const std::string init(const gko::log::Logger::mask_type &event,
-                           const std::string &event_name, T *ptr)
+                           const std::string &event_name, U *ptr)
     {
-        logger = gko::log::Papi<>::create(exec, event);
+        logger = gko::log::Papi<T>::create(exec, event);
         std::ostringstream os;
         os << "sde:::" << logger->get_handle_name() << "::" << event_name
            << "::" << reinterpret_cast<gko::uintptr>(ptr);
@@ -110,352 +114,377 @@ class Papi : public ::testing::Test {
         }
     }
 
-    std::shared_ptr<const gko::log::Papi<>> logger;
+    std::shared_ptr<const gko::log::Papi<T>> logger;
     std::shared_ptr<const gko::Executor> exec;
     int eventset;
 };
 
+TYPED_TEST_CASE(Papi, gko::test::ValueTypes);
+
 
-TEST_F(Papi, CatchesAllocationStarted)
+TYPED_TEST(Papi, CatchesAllocationStarted)
 {
     int logged_value = 42;
-    auto str = init(gko::log::Logger::allocation_started_mask,
-                    "allocation_started", exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::allocation_started_mask,
+                          "allocation_started", this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::allocation_started>(exec.get(), logged_value);
+    this->start();
+    this->logger->template on<gko::log::Logger::allocation_started>(
+        this->exec.get(), logged_value);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, logged_value);
 }
 
 
-TEST_F(Papi, CatchesAllocationCompleted)
+TYPED_TEST(Papi, CatchesAllocationCompleted)
 {
     int logged_value = 42;
-    auto str = init(gko::log::Logger::allocation_completed_mask,
-                    "allocation_completed", exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::allocation_completed_mask,
+                          "allocation_completed", this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::allocation_completed>(exec.get(), logged_value,
-                                                       0);
+    this->start();
+    this->logger->template on<gko::log::Logger::allocation_completed>(
+        this->exec.get(), logged_value, 0);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, logged_value);
 }
 
 
-TEST_F(Papi, CatchesFreeStarted)
+TYPED_TEST(Papi, CatchesFreeStarted)
 {
-    auto str =
-        init(gko::log::Logger::free_started_mask, "free_started", exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::free_started_mask, "free_started",
+                          this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::free_started>(exec.get(), 0);
+    this->start();
+    this->logger->template on<gko::log::Logger::free_started>(this->exec.get(),
+                                                              0);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesFreeCompleted)
+TYPED_TEST(Papi, CatchesFreeCompleted)
 {
-    auto str = init(gko::log::Logger::free_completed_mask, "free_completed",
-                    exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::free_completed_mask,
+                          "free_completed", this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::free_completed>(exec.get(), 0);
+    this->start();
+    this->logger->template on<gko::log::Logger::free_completed>(
+        this->exec.get(), 0);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesCopyStarted)
+TYPED_TEST(Papi, CatchesCopyStarted)
 {
     auto logged_value = 42;
-    auto str = init(gko::log::Logger::copy_started_mask, "copy_started_from",
-                    exec.get());
+    auto str = this->init(gko::log::Logger::copy_started_mask,
+                          "copy_started_from", this->exec.get());
     std::ostringstream os_out;
-    os_out << "sde:::" << logger->get_handle_name() << "::copy_started_to::"
-           << reinterpret_cast<gko::uintptr>(exec.get());
-    add_event(str);
-    add_event(os_out.str());
-
-    start();
-    logger->on<gko::log::Logger::copy_started>(exec.get(), exec.get(), 0, 0,
-                                               logged_value);
+    os_out << "sde:::" << this->logger->get_handle_name()
+           << "::copy_started_to::"
+           << reinterpret_cast<gko::uintptr>(this->exec.get());
+    this->add_event(str);
+    this->add_event(os_out.str());
+
+    this->start();
+    this->logger->template on<gko::log::Logger::copy_started>(
+        this->exec.get(), this->exec.get(), 0, 0, logged_value);
     long long int values[2];
-    stop(values);
+    this->stop(values);
 
     ASSERT_EQ(values[0], logged_value);
     ASSERT_EQ(values[1], logged_value);
 }
 
 
-TEST_F(Papi, CatchesCopyCompleted)
+TYPED_TEST(Papi, CatchesCopyCompleted)
 {
     auto logged_value = 42;
-    auto str = init(gko::log::Logger::copy_completed_mask,
-                    "copy_completed_from", exec.get());
+    auto str = this->init(gko::log::Logger::copy_completed_mask,
+                          "copy_completed_from", this->exec.get());
     std::ostringstream os_out;
-    os_out << "sde:::" << logger->get_handle_name() << "::copy_completed_to::"
-           << reinterpret_cast<gko::uintptr>(exec.get());
-    add_event(str);
-    add_event(os_out.str());
-
-    start();
-    logger->on<gko::log::Logger::copy_completed>(exec.get(), exec.get(), 0, 0,
-                                                 logged_value);
+    os_out << "sde:::" << this->logger->get_handle_name()
+           << "::copy_completed_to::"
+           << reinterpret_cast<gko::uintptr>(this->exec.get());
+    this->add_event(str);
+    this->add_event(os_out.str());
+
+    this->start();
+    this->logger->template on<gko::log::Logger::copy_completed>(
+        this->exec.get(), this->exec.get(), 0, 0, logged_value);
     long long int values[2];
-    stop(values);
+    this->stop(values);
 
     ASSERT_EQ(values[0], logged_value);
     ASSERT_EQ(values[1], logged_value);
 }
 
 
-TEST_F(Papi, CatchesOperationLaunched)
+TYPED_TEST(Papi, CatchesOperationLaunched)
 {
-    auto str = init(gko::log::Logger::operation_launched_mask,
-                    "operation_launched", exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::operation_launched_mask,
+                          "operation_launched", this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::operation_launched>(exec.get(), nullptr);
+    this->start();
+    this->logger->template on<gko::log::Logger::operation_launched>(
+        this->exec.get(), nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesOperationCompleted)
+TYPED_TEST(Papi, CatchesOperationCompleted)
 {
-    auto str = init(gko::log::Logger::operation_completed_mask,
-                    "operation_completed", exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::operation_completed_mask,
+                          "operation_completed", this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::operation_completed>(exec.get(), nullptr);
+    this->start();
+    this->logger->template on<gko::log::Logger::operation_completed>(
+        this->exec.get(), nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesPolymorphicObjectCreateStarted)
+TYPED_TEST(Papi, CatchesPolymorphicObjectCreateStarted)
 {
-    auto str = init(gko::log::Logger::polymorphic_object_create_started_mask,
-                    "polymorphic_object_create_started", exec.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::polymorphic_object_create_started>(exec.get(),
-                                                                    nullptr);
+    auto str =
+        this->init(gko::log::Logger::polymorphic_object_create_started_mask,
+                   "polymorphic_object_create_started", this->exec.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger
+        ->template on<gko::log::Logger::polymorphic_object_create_started>(
+            this->exec.get(), nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesPolymorphicObjectCreateCompleted)
+TYPED_TEST(Papi, CatchesPolymorphicObjectCreateCompleted)
 {
-    auto str = init(gko::log::Logger::polymorphic_object_create_completed_mask,
-                    "polymorphic_object_create_completed", exec.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::polymorphic_object_create_completed>(
-        exec.get(), nullptr, nullptr);
+    auto str =
+        this->init(gko::log::Logger::polymorphic_object_create_completed_mask,
+                   "polymorphic_object_create_completed", this->exec.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger
+        ->template on<gko::log::Logger::polymorphic_object_create_completed>(
+            this->exec.get(), nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesPolymorphicObjectCopyStarted)
+TYPED_TEST(Papi, CatchesPolymorphicObjectCopyStarted)
 {
-    auto str = init(gko::log::Logger::polymorphic_object_copy_started_mask,
-                    "polymorphic_object_copy_started", exec.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::polymorphic_object_copy_started>(
-        exec.get(), nullptr, nullptr);
+    auto str =
+        this->init(gko::log::Logger::polymorphic_object_copy_started_mask,
+                   "polymorphic_object_copy_started", this->exec.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger
+        ->template on<gko::log::Logger::polymorphic_object_copy_started>(
+            this->exec.get(), nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesPolymorphicObjectCopyCompleted)
+TYPED_TEST(Papi, CatchesPolymorphicObjectCopyCompleted)
 {
-    auto str = init(gko::log::Logger::polymorphic_object_copy_completed_mask,
-                    "polymorphic_object_copy_completed", exec.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::polymorphic_object_copy_completed>(
-        exec.get(), nullptr, nullptr);
+    auto str =
+        this->init(gko::log::Logger::polymorphic_object_copy_completed_mask,
+                   "polymorphic_object_copy_completed", this->exec.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger
+        ->template on<gko::log::Logger::polymorphic_object_copy_completed>(
+            this->exec.get(), nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesPolymorphicObjectDeleted)
+TYPED_TEST(Papi, CatchesPolymorphicObjectDeleted)
 {
-    auto str = init(gko::log::Logger::polymorphic_object_deleted_mask,
-                    "polymorphic_object_deleted", exec.get());
-    add_event(str);
+    auto str = this->init(gko::log::Logger::polymorphic_object_deleted_mask,
+                          "polymorphic_object_deleted", this->exec.get());
+    this->add_event(str);
 
-    start();
-    logger->on<gko::log::Logger::polymorphic_object_deleted>(exec.get(),
-                                                             nullptr);
+    this->start();
+    this->logger->template on<gko::log::Logger::polymorphic_object_deleted>(
+        this->exec.get(), nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesLinOpApplyStarted)
+TYPED_TEST(Papi, CatchesLinOpApplyStarted)
 {
-    auto A = Dense::create(exec);
-    auto str = init(gko::log::Logger::linop_apply_started_mask,
-                    "linop_apply_started", A.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::linop_apply_started>(A.get(), nullptr,
-                                                      nullptr);
+    using Dense = typename TestFixture::Dense;
+    auto A = Dense::create(this->exec);
+    auto str = this->init(gko::log::Logger::linop_apply_started_mask,
+                          "linop_apply_started", A.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::linop_apply_started>(
+        A.get(), nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesLinOpApplyCompleted)
+TYPED_TEST(Papi, CatchesLinOpApplyCompleted)
 {
-    auto A = Dense::create(exec);
-    auto str = init(gko::log::Logger::linop_apply_completed_mask,
-                    "linop_apply_completed", A.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::linop_apply_completed>(A.get(), nullptr,
-                                                        nullptr);
+    using Dense = typename TestFixture::Dense;
+    auto A = Dense::create(this->exec);
+    auto str = this->init(gko::log::Logger::linop_apply_completed_mask,
+                          "linop_apply_completed", A.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::linop_apply_completed>(
+        A.get(), nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesLinOpAdvancedApplyStarted)
+TYPED_TEST(Papi, CatchesLinOpAdvancedApplyStarted)
 {
-    auto A = Dense::create(exec);
-    auto str = init(gko::log::Logger::linop_advanced_apply_started_mask,
-                    "linop_advanced_apply_started", A.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::linop_advanced_apply_started>(
+    using Dense = typename TestFixture::Dense;
+    auto A = Dense::create(this->exec);
+    auto str = this->init(gko::log::Logger::linop_advanced_apply_started_mask,
+                          "linop_advanced_apply_started", A.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::linop_advanced_apply_started>(
         A.get(), nullptr, nullptr, nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesLinOpAdvancedApplyCompleted)
+TYPED_TEST(Papi, CatchesLinOpAdvancedApplyCompleted)
 {
-    auto A = Dense::create(exec);
-    auto str = init(gko::log::Logger::linop_advanced_apply_completed_mask,
-                    "linop_advanced_apply_completed", A.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::linop_advanced_apply_completed>(
+    using Dense = typename TestFixture::Dense;
+    auto A = Dense::create(this->exec);
+    auto str = this->init(gko::log::Logger::linop_advanced_apply_completed_mask,
+                          "linop_advanced_apply_completed", A.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::linop_advanced_apply_completed>(
         A.get(), nullptr, nullptr, nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesLinOpFactoryGenerateStarted)
+TYPED_TEST(Papi, CatchesLinOpFactoryGenerateStarted)
 {
     auto factory =
-        gko::solver::Bicgstab<>::build()
+        gko::solver::Bicgstab<TypeParam>::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto str = init(gko::log::Logger::linop_factory_generate_started_mask,
-                    "linop_factory_generate_started", factory.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::linop_factory_generate_started>(factory.get(),
-                                                                 nullptr);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto str = this->init(gko::log::Logger::linop_factory_generate_started_mask,
+                          "linop_factory_generate_started", factory.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::linop_factory_generate_started>(
+        factory.get(), nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesLinOpFactoryGenerateCompleted)
+TYPED_TEST(Papi, CatchesLinOpFactoryGenerateCompleted)
 {
     auto factory =
-        gko::solver::Bicgstab<>::build()
+        gko::solver::Bicgstab<TypeParam>::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto str = init(gko::log::Logger::linop_factory_generate_completed_mask,
-                    "linop_factory_generate_completed", factory.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::linop_factory_generate_completed>(
-        factory.get(), nullptr, nullptr);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    TypeParam dummy;
+    auto str =
+        this->init(gko::log::Logger::linop_factory_generate_completed_mask,
+                   "linop_factory_generate_completed", factory.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger
+        ->template on<gko::log::Logger::linop_factory_generate_completed>(
+            factory.get(), nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, 1);
 }
 
 
-TEST_F(Papi, CatchesIterationComplete)
+TYPED_TEST(Papi, CatchesIterationComplete)
 {
+    using Dense = typename TestFixture::Dense;
     int logged_value = 42;
-    auto A = Dense::create(exec);
-    auto str = init(gko::log::Logger::iteration_complete_mask,
-                    "iteration_complete", A.get());
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::iteration_complete>(A.get(), 42, nullptr,
-                                                     nullptr, nullptr);
+    auto A = Dense::create(this->exec);
+    auto str = this->init(gko::log::Logger::iteration_complete_mask,
+                          "iteration_complete", A.get());
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::iteration_complete>(
+        A.get(), logged_value, nullptr, nullptr, nullptr);
     long long int value = 0;
-    stop(&value);
+    this->stop(&value);
 
     ASSERT_EQ(value, logged_value);
 }
diff --git a/core/test/log/record.cpp b/core/test/log/record.cpp
index 369e12c6af9..dd829d39a8c 100644
--- a/core/test/log/record.cpp
+++ b/core/test/log/record.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,13 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/test/utils/assertions.hpp"
+
+
 namespace {
 
 
diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp
index 4f22c49927c..163a54fd74a 100644
--- a/core/test/log/stream.cpp
+++ b/core/test/log/stream.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,32 +33,43 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/log/stream.hpp>
 
 
-#include <gtest/gtest.h>
 #include <iomanip>
 #include <sstream>
+#include <string>
+
+
+#include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
 constexpr int num_iters = 10;
 
 
-TEST(Stream, CatchesAllocationStarted)
+template <typename T>
+class Stream : public ::testing::Test {};
+
+TYPED_TEST_CASE(Stream, gko::test::ValueTypes);
+
+
+TYPED_TEST(Stream, CatchesAllocationStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::allocation_started_mask, out);
 
-    logger->on<gko::log::Logger::allocation_started>(exec.get(), 42);
+    logger->template on<gko::log::Logger::allocation_started>(exec.get(), 42);
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "allocation started on");
@@ -66,17 +77,17 @@ TEST(Stream, CatchesAllocationStarted)
 }
 
 
-TEST(Stream, CatchesAllocationCompleted)
+TYPED_TEST(Stream, CatchesAllocationCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::allocation_completed_mask, out);
     int dummy = 1;
     std::stringstream ptrstream;
     ptrstream << std::hex << "0x" << reinterpret_cast<gko::uintptr>(&dummy);
 
-    logger->on<gko::log::Logger::allocation_completed>(
+    logger->template on<gko::log::Logger::allocation_completed>(
         exec.get(), 42, reinterpret_cast<gko::uintptr>(&dummy));
 
     auto os = out.str();
@@ -86,17 +97,17 @@ TEST(Stream, CatchesAllocationCompleted)
 }
 
 
-TEST(Stream, CatchesFreeStarted)
+TYPED_TEST(Stream, CatchesFreeStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::free_started_mask, out);
     int dummy = 1;
     std::stringstream ptrstream;
     ptrstream << std::hex << "0x" << reinterpret_cast<gko::uintptr>(&dummy);
 
-    logger->on<gko::log::Logger::free_started>(
+    logger->template on<gko::log::Logger::free_started>(
         exec.get(), reinterpret_cast<gko::uintptr>(&dummy));
 
     auto os = out.str();
@@ -105,17 +116,17 @@ TEST(Stream, CatchesFreeStarted)
 }
 
 
-TEST(Stream, CatchesFreeCompleted)
+TYPED_TEST(Stream, CatchesFreeCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::free_completed_mask, out);
     int dummy = 1;
     std::stringstream ptrstream;
     ptrstream << std::hex << "0x" << reinterpret_cast<gko::uintptr>(&dummy);
 
-    logger->on<gko::log::Logger::free_completed>(
+    logger->template on<gko::log::Logger::free_completed>(
         exec.get(), reinterpret_cast<gko::uintptr>(&dummy));
 
     auto os = out.str();
@@ -124,11 +135,11 @@ TEST(Stream, CatchesFreeCompleted)
 }
 
 
-TEST(Stream, CatchesCopyStarted)
+TYPED_TEST(Stream, CatchesCopyStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::copy_started_mask, out);
     int dummy_in = 1;
     int dummy_out = 1;
@@ -139,7 +150,7 @@ TEST(Stream, CatchesCopyStarted)
     ptrstream_out << std::hex << "0x"
                   << reinterpret_cast<gko::uintptr>(&dummy_out);
 
-    logger->on<gko::log::Logger::copy_started>(
+    logger->template on<gko::log::Logger::copy_started>(
         exec.get(), exec.get(), reinterpret_cast<gko::uintptr>(&dummy_in),
         reinterpret_cast<gko::uintptr>(&dummy_out), 42);
 
@@ -151,11 +162,11 @@ TEST(Stream, CatchesCopyStarted)
 }
 
 
-TEST(Stream, CatchesCopyCompleted)
+TYPED_TEST(Stream, CatchesCopyCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::copy_completed_mask, out);
     int dummy_in = 1;
     int dummy_out = 1;
@@ -166,7 +177,7 @@ TEST(Stream, CatchesCopyCompleted)
     ptrstream_out << std::hex << "0x"
                   << reinterpret_cast<gko::uintptr>(&dummy_out);
 
-    logger->on<gko::log::Logger::copy_completed>(
+    logger->template on<gko::log::Logger::copy_completed>(
         exec.get(), exec.get(), reinterpret_cast<gko::uintptr>(&dummy_in),
         reinterpret_cast<gko::uintptr>(&dummy_out), 42);
 
@@ -178,17 +189,17 @@ TEST(Stream, CatchesCopyCompleted)
 }
 
 
-TEST(Stream, CatchesOperationLaunched)
+TYPED_TEST(Stream, CatchesOperationLaunched)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::operation_launched_mask, out);
     gko::Operation op;
     std::stringstream ptrstream;
     ptrstream << &op;
 
-    logger->on<gko::log::Logger::operation_launched>(exec.get(), &op);
+    logger->template on<gko::log::Logger::operation_launched>(exec.get(), &op);
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "started on");
@@ -196,17 +207,17 @@ TEST(Stream, CatchesOperationLaunched)
 }
 
 
-TEST(Stream, CatchesOperationCompleted)
+TYPED_TEST(Stream, CatchesOperationCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::operation_completed_mask, out);
     gko::Operation op;
     std::stringstream ptrstream;
     ptrstream << &op;
 
-    logger->on<gko::log::Logger::operation_completed>(exec.get(), &op);
+    logger->template on<gko::log::Logger::operation_completed>(exec.get(), &op);
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "completed on");
@@ -214,18 +225,18 @@ TEST(Stream, CatchesOperationCompleted)
 }
 
 
-TEST(Stream, CatchesPolymorphicObjectCreateStarted)
+TYPED_TEST(Stream, CatchesPolymorphicObjectCreateStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::polymorphic_object_create_started_mask, out);
-    auto po = gko::matrix::Dense<>::create(exec);
+    auto po = gko::matrix::Dense<TypeParam>::create(exec);
     std::stringstream ptrstream;
     ptrstream << po.get();
 
-    logger->on<gko::log::Logger::polymorphic_object_create_started>(exec.get(),
-                                                                    po.get());
+    logger->template on<gko::log::Logger::polymorphic_object_create_started>(
+        exec.get(), po.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, ptrstream.str());
@@ -233,20 +244,20 @@ TEST(Stream, CatchesPolymorphicObjectCreateStarted)
 }
 
 
-TEST(Stream, CatchesPolymorphicObjectCreateCompleted)
+TYPED_TEST(Stream, CatchesPolymorphicObjectCreateCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::polymorphic_object_create_completed_mask, out);
-    auto po = gko::matrix::Dense<>::create(exec);
-    auto output = gko::matrix::Dense<>::create(exec);
+    auto po = gko::matrix::Dense<TypeParam>::create(exec);
+    auto output = gko::matrix::Dense<TypeParam>::create(exec);
     std::stringstream ptrstream_in;
     ptrstream_in << po.get();
     std::stringstream ptrstream_out;
     ptrstream_out << output.get();
 
-    logger->on<gko::log::Logger::polymorphic_object_create_completed>(
+    logger->template on<gko::log::Logger::polymorphic_object_create_completed>(
         exec.get(), po.get(), output.get());
 
     auto os = out.str();
@@ -256,20 +267,20 @@ TEST(Stream, CatchesPolymorphicObjectCreateCompleted)
 }
 
 
-TEST(Stream, CatchesPolymorphicObjectCopyStarted)
+TYPED_TEST(Stream, CatchesPolymorphicObjectCopyStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::polymorphic_object_copy_started_mask, out);
-    auto from = gko::matrix::Dense<>::create(exec);
-    auto to = gko::matrix::Dense<>::create(exec);
+    auto from = gko::matrix::Dense<TypeParam>::create(exec);
+    auto to = gko::matrix::Dense<TypeParam>::create(exec);
     std::stringstream ptrstream_from;
     ptrstream_from << from.get();
     std::stringstream ptrstream_to;
     ptrstream_to << to.get();
 
-    logger->on<gko::log::Logger::polymorphic_object_copy_started>(
+    logger->template on<gko::log::Logger::polymorphic_object_copy_started>(
         exec.get(), from.get(), to.get());
 
     auto os = out.str();
@@ -279,20 +290,20 @@ TEST(Stream, CatchesPolymorphicObjectCopyStarted)
 }
 
 
-TEST(Stream, CatchesPolymorphicObjectCopyCompleted)
+TYPED_TEST(Stream, CatchesPolymorphicObjectCopyCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::polymorphic_object_copy_completed_mask, out);
-    auto from = gko::matrix::Dense<>::create(exec);
-    auto to = gko::matrix::Dense<>::create(exec);
+    auto from = gko::matrix::Dense<TypeParam>::create(exec);
+    auto to = gko::matrix::Dense<TypeParam>::create(exec);
     std::stringstream ptrstream_from;
     ptrstream_from << from.get();
     std::stringstream ptrstream_to;
     ptrstream_to << to.get();
 
-    logger->on<gko::log::Logger::polymorphic_object_copy_completed>(
+    logger->template on<gko::log::Logger::polymorphic_object_copy_completed>(
         exec.get(), from.get(), to.get());
 
     auto os = out.str();
@@ -302,18 +313,18 @@ TEST(Stream, CatchesPolymorphicObjectCopyCompleted)
 }
 
 
-TEST(Stream, CatchesPolymorphicObjectDeleted)
+TYPED_TEST(Stream, CatchesPolymorphicObjectDeleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::polymorphic_object_deleted_mask, out);
-    auto po = gko::matrix::Dense<>::create(exec);
+    auto po = gko::matrix::Dense<TypeParam>::create(exec);
     std::stringstream ptrstream;
     ptrstream << po.get();
 
-    logger->on<gko::log::Logger::polymorphic_object_deleted>(exec.get(),
-                                                             po.get());
+    logger->template on<gko::log::Logger::polymorphic_object_deleted>(
+        exec.get(), po.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, ptrstream.str());
@@ -321,12 +332,12 @@ TEST(Stream, CatchesPolymorphicObjectDeleted)
 }
 
 
-TEST(Stream, CatchesLinOpApplyStarted)
+TYPED_TEST(Stream, CatchesLinOpApplyStarted)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_apply_started_mask, out);
     auto A = Dense::create(exec);
     auto b = Dense::create(exec);
@@ -338,8 +349,8 @@ TEST(Stream, CatchesLinOpApplyStarted)
     std::stringstream ptrstream_x;
     ptrstream_x << x.get();
 
-    logger->on<gko::log::Logger::linop_apply_started>(A.get(), b.get(),
-                                                      x.get());
+    logger->template on<gko::log::Logger::linop_apply_started>(A.get(), b.get(),
+                                                               x.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "apply started on A");
@@ -349,19 +360,19 @@ TEST(Stream, CatchesLinOpApplyStarted)
 }
 
 
-TEST(Stream, CatchesLinOpApplyStartedWithVerbose)
+TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_apply_started_mask, out, true);
     auto A = gko::initialize<Dense>({1.1}, exec);
     auto b = gko::initialize<Dense>({-2.2}, exec);
     auto x = gko::initialize<Dense>({3.3}, exec);
 
-    logger->on<gko::log::Logger::linop_apply_started>(A.get(), b.get(),
-                                                      x.get());
+    logger->template on<gko::log::Logger::linop_apply_started>(A.get(), b.get(),
+                                                               x.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "1.1");
@@ -370,12 +381,12 @@ TEST(Stream, CatchesLinOpApplyStartedWithVerbose)
 }
 
 
-TEST(Stream, CatchesLinOpApplyCompleted)
+TYPED_TEST(Stream, CatchesLinOpApplyCompleted)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_apply_completed_mask, out);
     auto A = Dense::create(exec);
     auto b = Dense::create(exec);
@@ -387,8 +398,8 @@ TEST(Stream, CatchesLinOpApplyCompleted)
     std::stringstream ptrstream_x;
     ptrstream_x << x.get();
 
-    logger->on<gko::log::Logger::linop_apply_completed>(A.get(), b.get(),
-                                                        x.get());
+    logger->template on<gko::log::Logger::linop_apply_completed>(
+        A.get(), b.get(), x.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "apply completed on A");
@@ -398,19 +409,19 @@ TEST(Stream, CatchesLinOpApplyCompleted)
 }
 
 
-TEST(Stream, CatchesLinOpApplyCompletedWithVerbose)
+TYPED_TEST(Stream, CatchesLinOpApplyCompletedWithVerbose)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_apply_completed_mask, out, true);
     auto A = gko::initialize<Dense>({1.1}, exec);
     auto b = gko::initialize<Dense>({-2.2}, exec);
     auto x = gko::initialize<Dense>({3.3}, exec);
 
-    logger->on<gko::log::Logger::linop_apply_completed>(A.get(), b.get(),
-                                                        x.get());
+    logger->template on<gko::log::Logger::linop_apply_completed>(
+        A.get(), b.get(), x.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "1.1");
@@ -419,12 +430,12 @@ TEST(Stream, CatchesLinOpApplyCompletedWithVerbose)
 }
 
 
-TEST(Stream, CatchesLinOpAdvancedApplyStarted)
+TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStarted)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_advanced_apply_started_mask, out);
     auto A = Dense::create(exec);
     auto alpha = Dense::create(exec);
@@ -442,7 +453,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyStarted)
     std::stringstream ptrstream_x;
     ptrstream_x << x.get();
 
-    logger->on<gko::log::Logger::linop_advanced_apply_started>(
+    logger->template on<gko::log::Logger::linop_advanced_apply_started>(
         A.get(), alpha.get(), b.get(), beta.get(), x.get());
 
     auto os = out.str();
@@ -455,12 +466,12 @@ TEST(Stream, CatchesLinOpAdvancedApplyStarted)
 }
 
 
-TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose)
+TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_advanced_apply_started_mask, out, true);
     auto A = gko::initialize<Dense>({1.1}, exec);
     auto alpha = gko::initialize<Dense>({-4.4}, exec);
@@ -468,7 +479,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose)
     auto beta = gko::initialize<Dense>({-5.5}, exec);
     auto x = gko::initialize<Dense>({3.3}, exec);
 
-    logger->on<gko::log::Logger::linop_advanced_apply_started>(
+    logger->template on<gko::log::Logger::linop_advanced_apply_started>(
         A.get(), alpha.get(), b.get(), beta.get(), x.get());
 
     auto os = out.str();
@@ -480,12 +491,12 @@ TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose)
 }
 
 
-TEST(Stream, CatchesLinOpAdvancedApplyCompleted)
+TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompleted)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_advanced_apply_completed_mask, out);
     auto A = Dense::create(exec);
     auto alpha = Dense::create(exec);
@@ -503,7 +514,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompleted)
     std::stringstream ptrstream_x;
     ptrstream_x << x.get();
 
-    logger->on<gko::log::Logger::linop_advanced_apply_completed>(
+    logger->template on<gko::log::Logger::linop_advanced_apply_completed>(
         A.get(), alpha.get(), b.get(), beta.get(), x.get());
 
     auto os = out.str();
@@ -516,12 +527,12 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompleted)
 }
 
 
-TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose)
+TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_advanced_apply_completed_mask, out, true);
     auto A = gko::initialize<Dense>({1.1}, exec);
     auto alpha = gko::initialize<Dense>({-4.4}, exec);
@@ -529,7 +540,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose)
     auto beta = gko::initialize<Dense>({-5.5}, exec);
     auto x = gko::initialize<Dense>({3.3}, exec);
 
-    logger->on<gko::log::Logger::linop_advanced_apply_completed>(
+    logger->template on<gko::log::Logger::linop_advanced_apply_completed>(
         A.get(), alpha.get(), b.get(), beta.get(), x.get());
 
     auto os = out.str();
@@ -541,25 +552,25 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose)
 }
 
 
-TEST(Stream, CatchesLinopFactoryGenerateStarted)
+TYPED_TEST(Stream, CatchesLinopFactoryGenerateStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_factory_generate_started_mask, out);
     auto factory =
-        gko::solver::Bicgstab<>::build()
+        gko::solver::Bicgstab<TypeParam>::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(3u).on(exec))
             .on(exec);
-    auto input = factory->generate(gko::matrix::Dense<>::create(exec));
+    auto input = factory->generate(gko::matrix::Dense<TypeParam>::create(exec));
     std::stringstream ptrstream_factory;
     ptrstream_factory << factory.get();
     std::stringstream ptrstream_input;
     ptrstream_input << input.get();
 
-    logger->on<gko::log::Logger::linop_factory_generate_started>(factory.get(),
-                                                                 input.get());
+    logger->template on<gko::log::Logger::linop_factory_generate_started>(
+        factory.get(), input.get());
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "generate started for");
@@ -568,19 +579,20 @@ TEST(Stream, CatchesLinopFactoryGenerateStarted)
 }
 
 
-TEST(Stream, CatchesLinopFactoryGenerateCompleted)
+TYPED_TEST(Stream, CatchesLinopFactoryGenerateCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::linop_factory_generate_completed_mask, out);
     auto factory =
-        gko::solver::Bicgstab<>::build()
+        gko::solver::Bicgstab<TypeParam>::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(3u).on(exec))
             .on(exec);
-    auto input = factory->generate(gko::matrix::Dense<>::create(exec));
-    auto output = factory->generate(gko::matrix::Dense<>::create(exec));
+    auto input = factory->generate(gko::matrix::Dense<TypeParam>::create(exec));
+    auto output =
+        factory->generate(gko::matrix::Dense<TypeParam>::create(exec));
     std::stringstream ptrstream_factory;
     ptrstream_factory << factory.get();
     std::stringstream ptrstream_input;
@@ -588,7 +600,7 @@ TEST(Stream, CatchesLinopFactoryGenerateCompleted)
     std::stringstream ptrstream_output;
     ptrstream_output << output.get();
 
-    logger->on<gko::log::Logger::linop_factory_generate_completed>(
+    logger->template on<gko::log::Logger::linop_factory_generate_completed>(
         factory.get(), input.get(), output.get());
 
     auto os = out.str();
@@ -599,11 +611,11 @@ TEST(Stream, CatchesLinopFactoryGenerateCompleted)
 }
 
 
-TEST(Stream, CatchesCriterionCheckStarted)
+TYPED_TEST(Stream, CatchesCriterionCheckStarted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::criterion_check_started_mask, out);
     auto criterion =
         gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate(
@@ -614,7 +626,7 @@ TEST(Stream, CatchesCriterionCheckStarted)
     std::stringstream true_in_stream;
     true_in_stream << true;
 
-    logger->on<gko::log::Logger::criterion_check_started>(
+    logger->template on<gko::log::Logger::criterion_check_started>(
         criterion.get(), 1, nullptr, nullptr, nullptr, RelativeStoppingId,
         true);
 
@@ -626,11 +638,11 @@ TEST(Stream, CatchesCriterionCheckStarted)
 }
 
 
-TEST(Stream, CatchesCriterionCheckCompleted)
+TYPED_TEST(Stream, CatchesCriterionCheckCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::criterion_check_completed_mask, out);
     auto criterion =
         gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate(
@@ -642,7 +654,7 @@ TEST(Stream, CatchesCriterionCheckCompleted)
     std::stringstream true_in_stream;
     true_in_stream << true;
 
-    logger->on<gko::log::Logger::criterion_check_completed>(
+    logger->template on<gko::log::Logger::criterion_check_completed>(
         criterion.get(), 1, nullptr, nullptr, nullptr, RelativeStoppingId, true,
         &stop_status, true, true);
 
@@ -657,11 +669,11 @@ TEST(Stream, CatchesCriterionCheckCompleted)
 }
 
 
-TEST(Stream, CatchesCriterionCheckCompletedWithVerbose)
+TYPED_TEST(Stream, CatchesCriterionCheckCompletedWithVerbose)
 {
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::criterion_check_completed_mask, out, true);
     auto criterion =
         gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate(
@@ -673,7 +685,7 @@ TEST(Stream, CatchesCriterionCheckCompletedWithVerbose)
 
     stop_status.get_data()->reset();
     stop_status.get_data()->stop(RelativeStoppingId);
-    logger->on<gko::log::Logger::criterion_check_completed>(
+    logger->template on<gko::log::Logger::criterion_check_completed>(
         criterion.get(), 1, nullptr, nullptr, nullptr, RelativeStoppingId, true,
         &stop_status, true, true);
 
@@ -685,12 +697,12 @@ TEST(Stream, CatchesCriterionCheckCompletedWithVerbose)
 }
 
 
-TEST(Stream, CatchesIterations)
+TYPED_TEST(Stream, CatchesIterations)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::iteration_complete_mask, out);
     auto solver = Dense::create(exec);
     auto residual = Dense::create(exec);
@@ -701,25 +713,26 @@ TEST(Stream, CatchesIterations)
     std::stringstream ptrstream_residual;
     ptrstream_residual << residual.get();
 
-    logger->on<gko::log::Logger::iteration_complete>(solver.get(), num_iters,
-                                                     residual.get());
+    logger->template on<gko::log::Logger::iteration_complete>(
+        solver.get(), num_iters, residual.get());
 
-    GKO_ASSERT_STR_CONTAINS(out.str(), "iteration " + num_iters);
+    GKO_ASSERT_STR_CONTAINS(out.str(),
+                            "iteration " + std::to_string(num_iters));
     GKO_ASSERT_STR_CONTAINS(out.str(), ptrstream_solver.str());
     GKO_ASSERT_STR_CONTAINS(out.str(), ptrstream_residual.str());
 }
 
 
-TEST(Stream, CatchesIterationsWithVerbose)
+TYPED_TEST(Stream, CatchesIterationsWithVerbose)
 {
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<TypeParam>;
     auto exec = gko::ReferenceExecutor::create();
     std::stringstream out;
-    auto logger = gko::log::Stream<>::create(
+    auto logger = gko::log::Stream<TypeParam>::create(
         exec, gko::log::Logger::iteration_complete_mask, out, true);
 
     auto factory =
-        gko::solver::Bicgstab<>::build()
+        gko::solver::Bicgstab<TypeParam>::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(3u).on(exec))
             .on(exec);
@@ -728,7 +741,7 @@ TEST(Stream, CatchesIterationsWithVerbose)
     auto solution = gko::initialize<Dense>({-2.2}, exec);
     auto residual_norm = gko::initialize<Dense>({-3.3}, exec);
 
-    logger->on<gko::log::Logger::iteration_complete>(
+    logger->template on<gko::log::Logger::iteration_complete>(
         solver.get(), num_iters, residual.get(), solution.get(),
         residual_norm.get());
 
diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt
index ba370edb60f..68382fa4b8f 100644
--- a/core/test/matrix/CMakeLists.txt
+++ b/core/test/matrix/CMakeLists.txt
@@ -4,5 +4,8 @@ ginkgo_create_test(dense)
 ginkgo_create_test(ell)
 ginkgo_create_test(hybrid)
 ginkgo_create_test(identity)
+ginkgo_create_test(permutation)
 ginkgo_create_test(sellp)
 ginkgo_create_test(sparsity_csr)
+ginkgo_create_test(csr_builder)
+ginkgo_create_test(coo_builder)
diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp
index 3f78fa76ad5..92a999febb6 100644
--- a/core/test/matrix/coo.cpp
+++ b/core/test/matrix/coo.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,20 +36,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Coo : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Coo<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Coo<value_type, index_type>;
 
     Coo()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::Coo<>::create(exec, gko::dim<2>{2, 3}, 4))
+          mtx(gko::matrix::Coo<value_type, index_type>::create(
+              exec, gko::dim<2>{2, 3}, 4))
     {
-        Mtx::value_type *v = mtx->get_values();
-        Mtx::index_type *c = mtx->get_col_idxs();
-        Mtx::index_type *r = mtx->get_row_idxs();
+        value_type *v = mtx->get_values();
+        index_type *c = mtx->get_col_idxs();
+        index_type *r = mtx->get_row_idxs();
         r[0] = 0;
         r[1] = 0;
         r[2] = 0;
@@ -82,10 +91,10 @@ class Coo : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{3.0});
+        EXPECT_EQ(v[2], value_type{2.0});
+        EXPECT_EQ(v[3], value_type{5.0});
     }
 
     void assert_empty(const Mtx *m)
@@ -98,35 +107,44 @@ class Coo : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Coo, gko::test::ValueIndexTypes);
 
-TEST_F(Coo, KnowsItsSize)
+
+TYPED_TEST(Coo, KnowsItsSize)
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(mtx->get_num_stored_elements(), 4);
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(this->mtx->get_num_stored_elements(), 4);
 }
 
 
-TEST_F(Coo, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); }
+TYPED_TEST(Coo, ContainsCorrectData)
+{
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
 
 
-TEST_F(Coo, CanBeEmpty)
+TYPED_TEST(Coo, CanBeEmpty)
 {
-    auto mtx = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec);
 
-    assert_empty(mtx.get());
+    this->assert_empty(mtx.get());
 }
 
 
-TEST_F(Coo, CanBeCreatedFromExistingData)
+TYPED_TEST(Coo, CanBeCreatedFromExistingData)
 {
-    double values[] = {1.0, 2.0, 3.0, 4.0};
-    gko::int32 col_idxs[] = {0, 1, 1, 0};
-    gko::int32 row_idxs[] = {0, 0, 1, 2};
-
-    auto mtx = gko::matrix::Coo<>::create(
-        exec, gko::dim<2>{3, 2}, gko::Array<double>::view(exec, 4, values),
-        gko::Array<gko::int32>::view(exec, 4, col_idxs),
-        gko::Array<gko::int32>::view(exec, 4, row_idxs));
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    value_type values[] = {1.0, 2.0, 3.0, 4.0};
+    index_type col_idxs[] = {0, 1, 1, 0};
+    index_type row_idxs[] = {0, 0, 1, 2};
+
+    auto mtx = gko::matrix::Coo<value_type, index_type>::create(
+        this->exec, gko::dim<2>{3, 2},
+        gko::Array<value_type>::view(this->exec, 4, values),
+        gko::Array<index_type>::view(this->exec, 4, col_idxs),
+        gko::Array<index_type>::view(this->exec, 4, row_idxs));
 
     ASSERT_EQ(mtx->get_const_values(), values);
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
@@ -134,48 +152,53 @@ TEST_F(Coo, CanBeCreatedFromExistingData)
 }
 
 
-TEST_F(Coo, CanBeCopied)
+TYPED_TEST(Coo, CanBeCopied)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(mtx.get());
+    copy->copy_from(this->mtx.get());
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 5.0;
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(Coo, CanBeMoved)
+TYPED_TEST(Coo, CanBeMoved)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(std::move(mtx));
+    copy->copy_from(std::move(this->mtx));
 
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(Coo, CanBeCloned)
+TYPED_TEST(Coo, CanBeCloned)
 {
-    auto clone = mtx->clone();
+    using Mtx = typename TestFixture::Mtx;
+    auto clone = this->mtx->clone();
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 5.0;
+    this->assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
 }
 
-TEST_F(Coo, CanBeCleared)
+
+TYPED_TEST(Coo, CanBeCleared)
 {
-    mtx->clear();
+    this->mtx->clear();
 
-    assert_empty(mtx.get());
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(Coo, CanBeReadFromMatrixData)
+TYPED_TEST(Coo, CanBeReadFromMatrixData)
 {
-    auto m = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec);
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -184,23 +207,26 @@ TEST_F(Coo, CanBeReadFromMatrixData)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_mtx(m.get());
 }
 
 
-TEST_F(Coo, GeneratesCorrectMatrixData)
+TYPED_TEST(Coo, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
 
-    mtx->write(data);
+    this->mtx->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 4);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0));
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0}));
+    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0}));
 }
 
 
diff --git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp
new file mode 100644
index 00000000000..de5844b0bbe
--- /dev/null
+++ b/core/test/matrix/coo_builder.cpp
@@ -0,0 +1,88 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/coo_builder.hpp"
+
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class CooBuilder : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Coo<value_type, index_type>;
+
+    CooBuilder()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<Mtx> mtx;
+};
+
+TYPED_TEST_CASE(CooBuilder, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(CooBuilder, ReturnsCorrectArrays)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    gko::matrix::CooBuilder<value_type, index_type> builder{this->mtx.get()};
+
+    auto builder_row_idxs = builder.get_row_idx_array().get_data();
+    auto builder_col_idxs = builder.get_col_idx_array().get_data();
+    auto builder_values = builder.get_value_array().get_data();
+    auto ref_row_idxs = this->mtx->get_row_idxs();
+    auto ref_col_idxs = this->mtx->get_col_idxs();
+    auto ref_values = this->mtx->get_values();
+
+    ASSERT_EQ(builder_row_idxs, ref_row_idxs);
+    ASSERT_EQ(builder_col_idxs, ref_col_idxs);
+    ASSERT_EQ(builder_values, ref_values);
+}
+
+
+}  // namespace
diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp
index 155f2c8ce21..f927861afce 100644
--- a/core/test/matrix/csr.cpp
+++ b/core/test/matrix/csr.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,23 +36,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Csr : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Csr<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
 
     Csr()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::Csr<>::create(
+          mtx(gko::matrix::Csr<value_type, index_type>::create(
               exec, gko::dim<2>{2, 3}, 4,
-              std::make_shared<Mtx::load_balance>(2)))
+              std::make_shared<typename Mtx::load_balance>(2)))
     {
-        Mtx::value_type *v = mtx->get_values();
-        Mtx::index_type *c = mtx->get_col_idxs();
-        Mtx::index_type *r = mtx->get_row_ptrs();
-        Mtx::index_type *s = mtx->get_srow();
+        value_type *v = mtx->get_values();
+        index_type *c = mtx->get_col_idxs();
+        index_type *r = mtx->get_row_ptrs();
+        index_type *s = mtx->get_srow();
         r[0] = 0;
         r[1] = 3;
         r[2] = 4;
@@ -85,10 +93,10 @@ class Csr : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{3.0});
+        EXPECT_EQ(v[2], value_type{2.0});
+        EXPECT_EQ(v[3], value_type{5.0});
         EXPECT_EQ(s[0], 0);
     }
 
@@ -98,41 +106,51 @@ class Csr : public ::testing::Test {
         ASSERT_EQ(m->get_num_stored_elements(), 0);
         ASSERT_EQ(m->get_const_values(), nullptr);
         ASSERT_EQ(m->get_const_col_idxs(), nullptr);
-        ASSERT_EQ(m->get_const_row_ptrs(), nullptr);
+        ASSERT_NE(m->get_const_row_ptrs(), nullptr);
         ASSERT_EQ(m->get_const_srow(), nullptr);
     }
 };
 
+TYPED_TEST_CASE(Csr, gko::test::ValueIndexTypes);
+
 
-TEST_F(Csr, KnowsItsSize)
+TYPED_TEST(Csr, KnowsItsSize)
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(mtx->get_num_stored_elements(), 4);
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(this->mtx->get_num_stored_elements(), 4);
 }
 
 
-TEST_F(Csr, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); }
+TYPED_TEST(Csr, ContainsCorrectData)
+{
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
 
 
-TEST_F(Csr, CanBeEmpty)
+TYPED_TEST(Csr, CanBeEmpty)
 {
-    auto mtx = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec);
 
-    assert_empty(mtx.get());
+    this->assert_empty(mtx.get());
 }
 
 
-TEST_F(Csr, CanBeCreatedFromExistingData)
+TYPED_TEST(Csr, CanBeCreatedFromExistingData)
 {
-    double values[] = {1.0, 2.0, 3.0, 4.0};
-    gko::int32 col_idxs[] = {0, 1, 1, 0};
-    gko::int32 row_ptrs[] = {0, 2, 3, 4};
-
-    auto mtx = gko::matrix::Csr<>::create(
-        exec, gko::dim<2>{3, 2}, gko::Array<double>::view(exec, 4, values),
-        gko::Array<gko::int32>::view(exec, 4, col_idxs),
-        gko::Array<gko::int32>::view(exec, 4, row_ptrs),
-        std::make_shared<Mtx::load_balance>(2));
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    value_type values[] = {1.0, 2.0, 3.0, 4.0};
+    index_type col_idxs[] = {0, 1, 1, 0};
+    index_type row_ptrs[] = {0, 2, 3, 4};
+
+    auto mtx = gko::matrix::Csr<value_type, index_type>::create(
+        this->exec, gko::dim<2>{3, 2},
+        gko::Array<value_type>::view(this->exec, 4, values),
+        gko::Array<index_type>::view(this->exec, 4, col_idxs),
+        gko::Array<index_type>::view(this->exec, 4, row_ptrs),
+        std::make_shared<typename Mtx::load_balance>(2));
 
     ASSERT_EQ(mtx->get_num_srow_elements(), 1);
     ASSERT_EQ(mtx->get_const_values(), values);
@@ -142,49 +160,54 @@ TEST_F(Csr, CanBeCreatedFromExistingData)
 }
 
 
-TEST_F(Csr, CanBeCopied)
+TYPED_TEST(Csr, CanBeCopied)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(mtx.get());
+    copy->copy_from(this->mtx.get());
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 5.0;  // write into the source after copying
+    this->assert_equal_to_original_mtx(copy.get());  // copy must be unaffected
 }
 
 
-TEST_F(Csr, CanBeMoved)
+TYPED_TEST(Csr, CanBeMoved)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(std::move(mtx));
+    copy->copy_from(std::move(this->mtx));
 
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(Csr, CanBeCloned)
+TYPED_TEST(Csr, CanBeCloned)
 {
-    auto clone = mtx->clone();
+    using Mtx = typename TestFixture::Mtx;
+    auto clone = this->mtx->clone();  // cast back to Mtx for the check below
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 5.0;  // write into the source after cloning
+    this->assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
 }
 
 
-TEST_F(Csr, CanBeCleared)
+TYPED_TEST(Csr, CanBeCleared)
 {
-    mtx->clear();
+    this->mtx->clear();
 
-    assert_empty(mtx.get());
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(Csr, CanBeReadFromMatrixData)
+TYPED_TEST(Csr, CanBeReadFromMatrixData)
 {
-    auto m = Mtx::create(exec, std::make_shared<Mtx::load_balance>(2));
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec,
+                         std::make_shared<typename Mtx::load_balance>(2));
 
     m->read({{2, 3},
              {{0, 0, 1.0},
@@ -194,23 +217,25 @@ TEST_F(Csr, CanBeReadFromMatrixData)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_mtx(m.get());
 }
 
 
-TEST_F(Csr, GeneratesCorrectMatrixData)
+TYPED_TEST(Csr, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
 
-    mtx->write(data);
+    this->mtx->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 4);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0));
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0}));
+    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0}));
 }
 
 
diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp
new file mode 100644
index 00000000000..9a1bfb6eb5f
--- /dev/null
+++ b/core/test/matrix/csr_builder.cpp
@@ -0,0 +1,119 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/csr_builder.hpp"
+
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class CsrBuilder : public ::testing::Test {
+public:
+    // ValueIndexType is a tuple-like type: element 0 is the value type and
+    // element 1 the index type of the Csr matrix used by the tests.
+    using value_type = typename std::tuple_element<0, ValueIndexType>::type;
+    using index_type = typename std::tuple_element<1, ValueIndexType>::type;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+
+protected:
+    CsrBuilder()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<Mtx> mtx;
+};
+
+TYPED_TEST_CASE(CsrBuilder, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(CsrBuilder, ReturnsCorrectArrays)
+{
+    // The arrays exposed by the builder must alias the matrix's own storage,
+    // so the raw data pointers are compared rather than the contents.
+    using Builder = gko::matrix::CsrBuilder<typename TestFixture::value_type,
+                                            typename TestFixture::index_type>;
+    auto *const matrix = this->mtx.get();
+    Builder builder{matrix};
+    const auto col_idxs_from_builder = builder.get_col_idx_array().get_data();
+    const auto values_from_builder = builder.get_value_array().get_data();
+
+    ASSERT_EQ(col_idxs_from_builder, matrix->get_col_idxs());
+    ASSERT_EQ(values_from_builder, matrix->get_values());
+}
+
+
+TYPED_TEST(CsrBuilder, UpdatesSrowOnDestruction)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using idx_array = gko::Array<typename TestFixture::index_type>;
+    using Builder = gko::matrix::CsrBuilder<typename TestFixture::value_type,
+                                            typename TestFixture::index_type>;
+
+    // Strategy stub: process() only raises a caller-owned flag, which is held
+    // by pointer so that instances produced by copy() report to it as well.
+    struct recording_strategy : public Mtx::strategy_type {
+        recording_strategy(bool &flag) : Mtx::strategy_type(""), seen(&flag) {}
+
+        void process(const idx_array &, idx_array *) override { *seen = true; }
+
+        int64_t clac_size(const int64_t) override { return 0; }
+
+        std::shared_ptr<typename Mtx::strategy_type> copy() override
+        {
+            return std::make_shared<recording_strategy>(*seen);
+        }
+
+        bool *seen;
+    };
+    bool process_ran = false;
+    this->mtx->set_strategy(std::make_shared<recording_strategy>(process_ran));
+    process_ran = false;  // forget any call made while installing the stub
+
+    Builder{this->mtx.get()};  // unnamed temporary, destroyed immediately
+
+    ASSERT_TRUE(process_ran);
+}
+
+
+}  // namespace
diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp
index eec17387754..c89f9a740e5 100644
--- a/core/test/matrix/dense.cpp
+++ b/core/test/matrix/dense.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,59 +40,67 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/range.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Dense : public ::testing::Test {
 protected:
+    using value_type = T;
     Dense()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::initialize<gko::matrix::Dense<>>(
+          mtx(gko::initialize<gko::matrix::Dense<value_type>>(
               4, {{1.0, 2.0, 3.0}, {1.5, 2.5, 3.5}}, exec))
     {}
 
 
-    static void assert_equal_to_original_mtx(gko::matrix::Dense<> *m)
+    static void assert_equal_to_original_mtx(gko::matrix::Dense<value_type> *m)
     {
         ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
         ASSERT_EQ(m->get_stride(), 4);
         ASSERT_EQ(m->get_num_stored_elements(), 2 * 4);
-        EXPECT_EQ(m->at(0, 0), 1.0);
-        EXPECT_EQ(m->at(0, 1), 2.0);
-        EXPECT_EQ(m->at(0, 2), 3.0);
-        EXPECT_EQ(m->at(1, 0), 1.5);
-        EXPECT_EQ(m->at(1, 1), 2.5);
-        ASSERT_EQ(m->at(1, 2), 3.5);
+        EXPECT_EQ(m->at(0, 0), value_type{1.0});
+        EXPECT_EQ(m->at(0, 1), value_type{2.0});
+        EXPECT_EQ(m->at(0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(1, 0), value_type{1.5});
+        EXPECT_EQ(m->at(1, 1), value_type{2.5});
+        ASSERT_EQ(m->at(1, 2), value_type{3.5});
     }
 
-    static void assert_empty(gko::matrix::Dense<> *m)
+    static void assert_empty(gko::matrix::Dense<value_type> *m)
     {
         ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0));
         ASSERT_EQ(m->get_num_stored_elements(), 0);
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<gko::matrix::Dense<>> mtx;
+    std::unique_ptr<gko::matrix::Dense<value_type>> mtx;
 };
 
+TYPED_TEST_CASE(Dense, gko::test::ValueTypes);
 
-TEST_F(Dense, CanBeEmpty)
+
+TYPED_TEST(Dense, CanBeEmpty)
 {
-    auto empty = gko::matrix::Dense<>::create(exec);
-    assert_empty(empty.get());
+    auto empty = gko::matrix::Dense<TypeParam>::create(this->exec);
+    this->assert_empty(empty.get());
 }
 
 
-TEST_F(Dense, ReturnsNullValuesArrayWhenEmpty)
+TYPED_TEST(Dense, ReturnsNullValuesArrayWhenEmpty)
 {
-    auto empty = gko::matrix::Dense<>::create(exec);
+    auto empty = gko::matrix::Dense<TypeParam>::create(this->exec);
     ASSERT_EQ(empty->get_const_values(), nullptr);
 }
 
 
-TEST_F(Dense, CanBeConstructedWithSize)
+TYPED_TEST(Dense, CanBeConstructedWithSize)
 {
-    auto m = gko::matrix::Dense<>::create(exec, gko::dim<2>{2, 3});
+    auto m =
+        gko::matrix::Dense<TypeParam>::create(this->exec, gko::dim<2>{2, 3});
 
     ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
     EXPECT_EQ(m->get_stride(), 3);
@@ -100,9 +108,10 @@ TEST_F(Dense, CanBeConstructedWithSize)
 }
 
 
-TEST_F(Dense, CanBeConstructedWithSizeAndStride)
+TYPED_TEST(Dense, CanBeConstructedWithSizeAndStride)
 {
-    auto m = gko::matrix::Dense<>::create(exec, gko::dim<2>{2, 3}, 4);
+    auto m =
+        gko::matrix::Dense<TypeParam>::create(this->exec, gko::dim<2>{2, 3}, 4);
 
     ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
     EXPECT_EQ(m->get_stride(), 4);
@@ -110,172 +119,187 @@ TEST_F(Dense, CanBeConstructedWithSizeAndStride)
 }
 
 
-TEST_F(Dense, CanBeConstructedFromExistingData)
+TYPED_TEST(Dense, CanBeConstructedFromExistingData)
 {
+    using value_type = typename TestFixture::value_type;
     // clang-format off
-    double data[] = {
+    value_type data[] = {
         1.0, 2.0, -1.0,
         3.0, 4.0, -1.0,
         5.0, 6.0, -1.0};
     // clang-format on
 
-    auto m = gko::matrix::Dense<>::create(
-        exec, gko::dim<2>{3, 2}, gko::Array<double>::view(exec, 9, data), 3);
+    auto m = gko::matrix::Dense<TypeParam>::create(
+        this->exec, gko::dim<2>{3, 2},
+        gko::Array<value_type>::view(this->exec, 9, data), 3);
 
     ASSERT_EQ(m->get_const_values(), data);
-    ASSERT_EQ(m->at(2, 1), 6.0);
+    ASSERT_EQ(m->at(2, 1), value_type{6.0});
 }
 
 
-TEST_F(Dense, KnowsItsSizeAndValues)
+TYPED_TEST(Dense, KnowsItsSizeAndValues)
 {
-    assert_equal_to_original_mtx(mtx.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
 }
 
 
-TEST_F(Dense, CanBeListConstructed)
+TYPED_TEST(Dense, CanBeListConstructed)
 {
-    auto m = gko::initialize<gko::matrix::Dense<>>({1.0, 2.0}, exec);
+    using value_type = typename TestFixture::value_type;
+    auto m =
+        gko::initialize<gko::matrix::Dense<TypeParam>>({1.0, 2.0}, this->exec);
 
     ASSERT_EQ(m->get_size(), gko::dim<2>(2, 1));
     ASSERT_EQ(m->get_num_stored_elements(), 2);
-    EXPECT_EQ(m->at(0), 1);
-    EXPECT_EQ(m->at(1), 2);
+    EXPECT_EQ(m->at(0), value_type{1});
+    EXPECT_EQ(m->at(1), value_type{2});
 }
 
 
-TEST_F(Dense, CanBeListConstructedWithstride)
+TYPED_TEST(Dense, CanBeListConstructedWithstride)
 {
-    auto m = gko::initialize<gko::matrix::Dense<>>(2, {1.0, 2.0}, exec);
+    using value_type = typename TestFixture::value_type;
+    auto m = gko::initialize<gko::matrix::Dense<TypeParam>>(2, {1.0, 2.0},
+                                                            this->exec);
     ASSERT_EQ(m->get_size(), gko::dim<2>(2, 1));
     ASSERT_EQ(m->get_num_stored_elements(), 4);
-    EXPECT_EQ(m->at(0), 1.0);
-    EXPECT_EQ(m->at(1), 2.0);
+    EXPECT_EQ(m->at(0), value_type{1.0});
+    EXPECT_EQ(m->at(1), value_type{2.0});
 }
 
 
-TEST_F(Dense, CanBeDoubleListConstructed)
+TYPED_TEST(Dense, CanBeDoubleListConstructed)
 {
-    auto m = gko::initialize<gko::matrix::Dense<>>(
-        {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, exec);
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;  // short alias used in the I<T> rows below
+    auto m = gko::initialize<gko::matrix::Dense<TypeParam>>(
+        {I<T>{1.0, 2.0}, I<T>{3.0, 4.0}, I<T>{5.0, 6.0}}, this->exec);
 
     ASSERT_EQ(m->get_size(), gko::dim<2>(3, 2));
     ASSERT_EQ(m->get_num_stored_elements(), 6);
-    EXPECT_EQ(m->at(0), 1.0);
-    EXPECT_EQ(m->at(1), 2.0);
-    EXPECT_EQ(m->at(2), 3.0);
-    ASSERT_EQ(m->at(3), 4.0);
-    EXPECT_EQ(m->at(4), 5.0);
+    EXPECT_EQ(m->at(0), value_type{1.0});
+    EXPECT_EQ(m->at(1), value_type{2.0});
+    EXPECT_EQ(m->at(2), value_type{3.0});
+    ASSERT_EQ(m->at(3), value_type{4.0});
+    EXPECT_EQ(m->at(4), value_type{5.0});  // NOTE(review): at(5) is unchecked
 }
 
 
-TEST_F(Dense, CanBeDoubleListConstructedWithstride)
+TYPED_TEST(Dense, CanBeDoubleListConstructedWithstride)
 {
-    auto m = gko::initialize<gko::matrix::Dense<>>(
-        4, {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, exec);
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto m = gko::initialize<gko::matrix::Dense<TypeParam>>(
+        4, {I<T>{1.0, 2.0}, I<T>{3.0, 4.0}, I<T>{5.0, 6.0}}, this->exec);
 
     ASSERT_EQ(m->get_size(), gko::dim<2>(3, 2));
     ASSERT_EQ(m->get_num_stored_elements(), 12);
-    EXPECT_EQ(m->at(0), 1.0);
-    EXPECT_EQ(m->at(1), 2.0);
-    EXPECT_EQ(m->at(2), 3.0);
-    ASSERT_EQ(m->at(3), 4.0);
-    EXPECT_EQ(m->at(4), 5.0);
+    EXPECT_EQ(m->at(0), value_type{1.0});
+    EXPECT_EQ(m->at(1), value_type{2.0});
+    EXPECT_EQ(m->at(2), value_type{3.0});
+    ASSERT_EQ(m->at(3), value_type{4.0});
+    EXPECT_EQ(m->at(4), value_type{5.0});
 }
 
 
-TEST_F(Dense, CanBeCopied)
+TYPED_TEST(Dense, CanBeCopied)
 {
-    auto mtx_copy = gko::matrix::Dense<>::create(exec);
-    mtx_copy->copy_from(mtx.get());
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->at(0) = 7;
-    assert_equal_to_original_mtx(mtx_copy.get());
+    auto mtx_copy = gko::matrix::Dense<TypeParam>::create(this->exec);
+    mtx_copy->copy_from(this->mtx.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->at(0) = 7;  // modify the source only; the copy is checked next
+    this->assert_equal_to_original_mtx(mtx_copy.get());  // must be unaffected
 }
 
 
-TEST_F(Dense, CanBeMoved)
+TYPED_TEST(Dense, CanBeMoved)
 {
-    auto mtx_copy = gko::matrix::Dense<>::create(exec);
-    mtx_copy->copy_from(std::move(mtx));
-    assert_equal_to_original_mtx(mtx_copy.get());
+    auto mtx_copy = gko::matrix::Dense<TypeParam>::create(this->exec);
+    mtx_copy->copy_from(std::move(this->mtx));
+    this->assert_equal_to_original_mtx(mtx_copy.get());
 }
 
 
-TEST_F(Dense, CanBeCloned)
+TYPED_TEST(Dense, CanBeCloned)
 {
-    auto mtx_clone = mtx->clone();
-    assert_equal_to_original_mtx(
-        dynamic_cast<decltype(mtx.get())>(mtx_clone.get()));
+    auto mtx_clone = this->mtx->clone();
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
 }
 
 
-TEST_F(Dense, CanBeCleared)
+TYPED_TEST(Dense, CanBeCleared)
 {
-    mtx->clear();
-    assert_empty(mtx.get());
+    this->mtx->clear();
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(Dense, CanBeReadFromMatrixData)
+TYPED_TEST(Dense, CanBeReadFromMatrixData)
 {
-    auto m = gko::matrix::Dense<>::create(exec);
-    m->read(gko::matrix_data<>{{2, 3},
-                               {{0, 0, 1.0},
-                                {0, 1, 3.0},
-                                {0, 2, 2.0},
-                                {1, 0, 0.0},
-                                {1, 1, 5.0},
-                                {1, 2, 0.0}}});
+    using value_type = typename TestFixture::value_type;
+    auto m = gko::matrix::Dense<TypeParam>::create(this->exec);
+    m->read(gko::matrix_data<TypeParam>{{2, 3},
+                                        {{0, 0, 1.0},
+                                         {0, 1, 3.0},
+                                         {0, 2, 2.0},
+                                         {1, 0, 0.0},
+                                         {1, 1, 5.0},
+                                         {1, 2, 0.0}}});
 
     ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
     ASSERT_EQ(m->get_num_stored_elements(), 6);
-    EXPECT_EQ(m->at(0, 0), 1.0);
-    EXPECT_EQ(m->at(1, 0), 0.0);
-    EXPECT_EQ(m->at(0, 1), 3.0);
-    EXPECT_EQ(m->at(1, 1), 5.0);
-    EXPECT_EQ(m->at(0, 2), 2.0);
-    ASSERT_EQ(m->at(1, 2), 0.0);
+    EXPECT_EQ(m->at(0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(0, 2), value_type{2.0});
+    ASSERT_EQ(m->at(1, 2), value_type{0.0});
 }
 
 
-TEST_F(Dense, GeneratesCorrectMatrixData)
+TYPED_TEST(Dense, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using tpl = typename gko::matrix_data<TypeParam>::nonzero_type;
+    gko::matrix_data<TypeParam> data;
 
-    mtx->write(data);
+    this->mtx->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 6);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 2.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 3.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 0, 1.5));
-    EXPECT_EQ(data.nonzeros[4], tpl(1, 1, 2.5));
-    EXPECT_EQ(data.nonzeros[5], tpl(1, 2, 3.5));
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{2.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data.nonzeros[3], tpl(1, 0, value_type{1.5}));
+    EXPECT_EQ(data.nonzeros[4], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data.nonzeros[5], tpl(1, 2, value_type{3.5}));
 }
 
 
-TEST_F(Dense, CanCreateSubmatrix)
+TYPED_TEST(Dense, CanCreateSubmatrix)
 {
-    auto submtx = mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2});
+    using value_type = typename TestFixture::value_type;
+    auto submtx = this->mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2});
 
-    EXPECT_EQ(submtx->at(0, 0), 2.0);
-    EXPECT_EQ(submtx->at(0, 1), 3.0);
-    EXPECT_EQ(submtx->at(1, 0), 2.5);
-    EXPECT_EQ(submtx->at(1, 1), 3.5);
+    EXPECT_EQ(submtx->at(0, 0), value_type{2.0});
+    EXPECT_EQ(submtx->at(0, 1), value_type{3.0});
+    EXPECT_EQ(submtx->at(1, 0), value_type{2.5});
+    EXPECT_EQ(submtx->at(1, 1), value_type{3.5});
 }
 
 
-TEST_F(Dense, CanCreateSubmatrixWithStride)
+TYPED_TEST(Dense, CanCreateSubmatrixWithStride)
 {
-    auto submtx = mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2}, 3);
-
-    EXPECT_EQ(submtx->at(0, 0), 2.0);
-    EXPECT_EQ(submtx->at(0, 1), 3.0);
-    EXPECT_EQ(submtx->at(1, 0), 1.5);
-    EXPECT_EQ(submtx->at(1, 1), 2.5);
+    using value_type = typename TestFixture::value_type;
+    auto submtx =
+        this->mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2}, 3);
+
+    EXPECT_EQ(submtx->at(0, 0), value_type{2.0});
+    EXPECT_EQ(submtx->at(0, 1), value_type{3.0});
+    EXPECT_EQ(submtx->at(1, 0), value_type{1.5});
+    EXPECT_EQ(submtx->at(1, 1), value_type{2.5});
 }
 
 
diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp
index fdefa61712e..6e92f1251ba 100644
--- a/core/test/matrix/ell.cpp
+++ b/core/test/matrix/ell.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,19 +36,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Ell : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Ell<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Ell<value_type, index_type>;
 
     Ell()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::Ell<>::create(exec, gko::dim<2>{2, 3}, 3))
+          mtx(gko::matrix::Ell<value_type, index_type>::create(
+              exec, gko::dim<2>{2, 3}, 3))
     {
-        Mtx::value_type *v = mtx->get_values();
-        Mtx::index_type *c = mtx->get_col_idxs();
+        value_type *v = mtx->get_values();
+        index_type *c = mtx->get_col_idxs();
         c[0] = 0;
         c[1] = 1;
         c[2] = 1;
@@ -82,12 +91,12 @@ class Ell : public ::testing::Test {
         EXPECT_EQ(c[3], 0);
         EXPECT_EQ(c[4], 2);
         EXPECT_EQ(c[5], 0);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 5.0);
-        EXPECT_EQ(v[2], 3.0);
-        EXPECT_EQ(v[3], 0.0);
-        EXPECT_EQ(v[4], 2.0);
-        EXPECT_EQ(v[5], 0.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{5.0});
+        EXPECT_EQ(v[2], value_type{3.0});
+        EXPECT_EQ(v[3], value_type{0.0});
+        EXPECT_EQ(v[4], value_type{2.0});
+        EXPECT_EQ(v[5], value_type{0.0});
     }
 
     void assert_empty(const Mtx *m)
@@ -101,84 +110,97 @@ class Ell : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Ell, gko::test::ValueIndexTypes);
+
 
-TEST_F(Ell, KnowsItsSize)
+TYPED_TEST(Ell, KnowsItsSize)
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(mtx->get_num_stored_elements(), 6);
-    ASSERT_EQ(mtx->get_num_stored_elements_per_row(), 3);
-    ASSERT_EQ(mtx->get_stride(), 2);
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(this->mtx->get_num_stored_elements(), 6);
+    ASSERT_EQ(this->mtx->get_num_stored_elements_per_row(), 3);
+    ASSERT_EQ(this->mtx->get_stride(), 2);
 }
 
 
-TEST_F(Ell, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); }
+TYPED_TEST(Ell, ContainsCorrectData)
+{
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
 
 
-TEST_F(Ell, CanBeEmpty)
+TYPED_TEST(Ell, CanBeEmpty)
 {
-    auto mtx = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec);
 
-    assert_empty(mtx.get());
+    this->assert_empty(mtx.get());
 }
 
 
-TEST_F(Ell, CanBeCreatedFromExistingData)
+TYPED_TEST(Ell, CanBeCreatedFromExistingData)
 {
-    double values[] = {1.0, 3.0, 4.0, -1.0, 2.0, 0.0, 0.0, -1.0};
-    gko::int32 col_idxs[] = {0, 1, 0, -1, 1, 0, 0, -1};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    value_type values[] = {1.0, 3.0, 4.0, -1.0, 2.0, 0.0, 0.0, -1.0};
+    index_type col_idxs[] = {0, 1, 0, -1, 1, 0, 0, -1};
 
-    auto mtx = gko::matrix::Ell<>::create(
-        exec, gko::dim<2>{3, 2}, gko::Array<double>::view(exec, 8, values),
-        gko::Array<gko::int32>::view(exec, 8, col_idxs), 2, 4);
+    auto mtx = gko::matrix::Ell<value_type, index_type>::create(
+        this->exec, gko::dim<2>{3, 2},
+        gko::Array<value_type>::view(this->exec, 8, values),
+        gko::Array<index_type>::view(this->exec, 8, col_idxs), 2, 4);
 
     ASSERT_EQ(mtx->get_const_values(), values);
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
 }
 
 
-TEST_F(Ell, CanBeCopied)
+TYPED_TEST(Ell, CanBeCopied)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(mtx.get());
+    copy->copy_from(this->mtx.get());
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 7.0;  // fixture stores 5.0 here; must differ
+    this->assert_equal_to_original_mtx(copy.get());  // copy must be unaffected
 }
 
 
-TEST_F(Ell, CanBeMoved)
+TYPED_TEST(Ell, CanBeMoved)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(std::move(mtx));
+    copy->copy_from(std::move(this->mtx));
 
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(Ell, CanBeCloned)
+TYPED_TEST(Ell, CanBeCloned)
 {
-    auto clone = mtx->clone();
+    using Mtx = typename TestFixture::Mtx;
+    auto clone = this->mtx->clone();  // cast back to Mtx for the check below
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(static_cast<Mtx *>(clone.get()));
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 7.0;  // fixture stores 5.0 here; must differ
+    this->assert_equal_to_original_mtx(static_cast<Mtx *>(clone.get()));
 }
 
 
-TEST_F(Ell, CanBeCleared)
+TYPED_TEST(Ell, CanBeCleared)
 {
-    mtx->clear();
+    this->mtx->clear();
 
-    assert_empty(mtx.get());
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(Ell, CanBeReadFromMatrixData)
+TYPED_TEST(Ell, CanBeReadFromMatrixData)
 {
-    auto m = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec);
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -187,23 +209,25 @@ TEST_F(Ell, CanBeReadFromMatrixData)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_mtx(m.get());
 }
 
 
-TEST_F(Ell, GeneratesCorrectMatrixData)
+TYPED_TEST(Ell, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
 
-    mtx->write(data);
+    this->mtx->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 4);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0));
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0}));
+    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0}));
 }
 
 
diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp
index 3a8ca343d10..dac9da86167 100644
--- a/core/test/matrix/hybrid.cpp
+++ b/core/test/matrix/hybrid.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,19 +36,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Hybrid : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Hybrid<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Hybrid<value_type, index_type>;
 
     Hybrid()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::Hybrid<>::create(exec, gko::dim<2>{2, 3}, 2, 2, 1))
+          mtx(gko::matrix::Hybrid<value_type, index_type>::create(
+              exec, gko::dim<2>{2, 3}, 2, 2, 1))
     {
-        Mtx::value_type *v = mtx->get_ell_values();
-        Mtx::index_type *c = mtx->get_ell_col_idxs();
+        value_type *v = mtx->get_ell_values();
+        index_type *c = mtx->get_ell_col_idxs();
         c[0] = 0;
         c[1] = 1;
         c[2] = 1;
@@ -80,11 +89,11 @@ class Hybrid : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 1);
         EXPECT_EQ(c[3], 0);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 5.0);
-        EXPECT_EQ(v[2], 3.0);
-        EXPECT_EQ(v[3], 0.0);
-        EXPECT_EQ(m->get_const_coo_values()[0], 2.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{5.0});
+        EXPECT_EQ(v[2], value_type{3.0});
+        EXPECT_EQ(v[3], value_type{0.0});
+        EXPECT_EQ(m->get_const_coo_values()[0], value_type{2.0});
         EXPECT_EQ(m->get_const_coo_col_idxs()[0], 2);
         EXPECT_EQ(m->get_const_coo_row_idxs()[0], 0);
     }
@@ -103,71 +112,83 @@ class Hybrid : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Hybrid, gko::test::ValueIndexTypes);
+
 
-TEST_F(Hybrid, KnowsItsSize)
+TYPED_TEST(Hybrid, KnowsItsSize)  // runs once per type in ValueIndexTypes
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(mtx->get_ell_num_stored_elements(), 4);
-    ASSERT_EQ(mtx->get_ell_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(mtx->get_ell_stride(), 2);
-    ASSERT_EQ(mtx->get_coo_num_stored_elements(), 1);
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3));  // fixture's size
+    ASSERT_EQ(this->mtx->get_ell_num_stored_elements(), 4);
+    ASSERT_EQ(this->mtx->get_ell_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(this->mtx->get_ell_stride(), 2);
+    ASSERT_EQ(this->mtx->get_coo_num_stored_elements(), 1);
 }
 
 
-TEST_F(Hybrid, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); }
+TYPED_TEST(Hybrid, ContainsCorrectData)
+{
+    // this-> is required: the fixture is now a class template.
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
 
 
-TEST_F(Hybrid, CanBeEmpty)
+TYPED_TEST(Hybrid, CanBeEmpty)
 {
-    auto mtx = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec);  // local; fixture's is this->mtx
 
-    assert_empty(mtx.get());
+    this->assert_empty(mtx.get());
 }
 
 
-TEST_F(Hybrid, CanBeCopied)
+TYPED_TEST(Hybrid, CanBeCopied)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(mtx.get());
+    copy->copy_from(this->mtx.get());
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_ell_values()[1] = 5.0;
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_ell_values()[1] = 7.0;  // differs from the stored 5.0
+    this->assert_equal_to_original_mtx(copy.get());  // copy must be unchanged
 }
 
 
-TEST_F(Hybrid, CanBeMoved)
+TYPED_TEST(Hybrid, CanBeMoved)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(std::move(mtx));
+    copy->copy_from(std::move(this->mtx));  // this->mtx is not used afterwards
 
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(Hybrid, CanBeCloned)
+TYPED_TEST(Hybrid, CanBeCloned)
 {
-    auto clone = mtx->clone();
+    using Mtx = typename TestFixture::Mtx;
+    auto clone = this->mtx->clone();
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_ell_values()[1] = 5.0;
-    assert_equal_to_original_mtx(static_cast<Mtx *>(clone.get()));
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_ell_values()[1] = 7.0;  // differs from the stored 5.0
+    this->assert_equal_to_original_mtx(static_cast<Mtx *>(clone.get()));
 }
 
 
-TEST_F(Hybrid, CanBeCleared)
+TYPED_TEST(Hybrid, CanBeCleared)
 {
-    mtx->clear();
+    this->mtx->clear();  // acts on the fixture's populated matrix
 
-    assert_empty(mtx.get());
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(Hybrid, CanBeReadFromMatrixDataAutomatically)
+TYPED_TEST(Hybrid, CanBeReadFromMatrixDataAutomatically)
 {
-    auto m = Mtx::create(exec, std::make_shared<Mtx::automatic>());
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto m =
+        Mtx::create(this->exec, std::make_shared<typename Mtx::automatic>());
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -194,16 +215,18 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataAutomatically)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], value_type{1.0});
+    EXPECT_EQ(v[1], value_type{3.0});
+    EXPECT_EQ(v[2], value_type{2.0});
+    EXPECT_EQ(v[3], value_type{5.0});
 }
 
 
-TEST_F(Hybrid, CanBeReadFromMatrixDataByColumns2)
+TYPED_TEST(Hybrid, CanBeReadFromMatrixDataByColumns2)
 {
-    auto m = Mtx::create(exec, std::make_shared<Mtx::column_limit>(2));
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec,
+                         std::make_shared<typename Mtx::column_limit>(2));
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -212,13 +235,16 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataByColumns2)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_mtx(m.get());
 }
 
 
-TEST_F(Hybrid, CanBeReadFromMatrixDataByPercent40)
+TYPED_TEST(Hybrid, CanBeReadFromMatrixDataByPercent40)
 {
-    auto m = Mtx::create(exec, std::make_shared<Mtx::imbalance_limit>(0.4));
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto m = Mtx::create(this->exec,
+                         std::make_shared<typename Mtx::imbalance_limit>(0.4));
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -237,15 +263,15 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataByPercent40)
     EXPECT_EQ(p, 2);
     EXPECT_EQ(c[0], 0);
     EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 5.0);
+    EXPECT_EQ(v[0], value_type{1.0});
+    EXPECT_EQ(v[1], value_type{5.0});
 
     auto coo_v = m->get_const_coo_values();
     auto coo_c = m->get_const_coo_col_idxs();
     auto coo_r = m->get_const_coo_row_idxs();
     ASSERT_EQ(m->get_coo_num_stored_elements(), 2);
-    EXPECT_EQ(coo_v[0], 3.0);
-    EXPECT_EQ(coo_v[1], 2.0);
+    EXPECT_EQ(coo_v[0], value_type{3.0});
+    EXPECT_EQ(coo_v[1], value_type{2.0});
     EXPECT_EQ(coo_c[0], 1);
     EXPECT_EQ(coo_c[1], 2);
     EXPECT_EQ(coo_r[0], 0);
@@ -253,19 +279,21 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataByPercent40)
 }
 
 
-TEST_F(Hybrid, GeneratesCorrectMatrixData)
+TYPED_TEST(Hybrid, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
 
-    mtx->write(data);
+    this->mtx->write(data);  // exports the fixture matrix into data
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 4);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0));
+    // Literals are converted to the fixture's value_type before comparing.
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0}));
 }
 
 
diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp
index 58a08ac37d6..f890a9dd039 100644
--- a/core/test/matrix/identity.cpp
+++ b/core/test/matrix/identity.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,43 +36,53 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>  // T: value type under test
 class Identity : public ::testing::Test {
 protected:
-    using Id = gko::matrix::Identity<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type = T;  // re-exported for tests via TestFixture
+    using Id = gko::matrix::Identity<T>;
+    using Vec = gko::matrix::Dense<T>;
 
     Identity() : exec(gko::ReferenceExecutor::create()) {}
 
     std::shared_ptr<const gko::Executor> exec;
 };
 
+TYPED_TEST_CASE(Identity, gko::test::ValueTypes);
 
-TEST_F(Identity, CanBeEmpty)
+
+TYPED_TEST(Identity, CanBeEmpty)
 {
-    auto empty = Id::create(exec);
+    using Id = typename TestFixture::Id;  // dependent name: needs typename
+    auto empty = Id::create(this->exec);
     ASSERT_EQ(empty->get_size(), gko::dim<2>(0, 0));
 }
 
 
-TEST_F(Identity, CanBeConstructedWithSize)
+TYPED_TEST(Identity, CanBeConstructedWithSize)
 {
-    auto identity = Id::create(exec, 5);
+    using Id = typename TestFixture::Id;
+    auto identity = Id::create(this->exec, 5);  // one size, square result
     ASSERT_EQ(identity->get_size(), gko::dim<2>(5, 5));
 }
 
 
-TEST_F(Identity, AppliesToVector)
+TYPED_TEST(Identity, AppliesToVector)
 {
-    auto identity = Id::create(exec, 3);
-    auto x = Vec::create(exec, gko::dim<2>{3, 1});
-    auto b = gko::initialize<Vec>({2.0, 1.0, 5.0}, exec);
+    using Id = typename TestFixture::Id;
+    using Vec = typename TestFixture::Vec;
+    auto identity = Id::create(this->exec, 3);
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 1});
+    auto b = gko::initialize<Vec>({2.0, 1.0, 5.0}, this->exec);
 
     identity->apply(b.get(), x.get());
 
@@ -80,12 +90,15 @@ TEST_F(Identity, AppliesToVector)
 }
 
 
-TEST_F(Identity, AppliesToMultipleVectors)
+TYPED_TEST(Identity, AppliesToMultipleVectors)
 {
-    auto identity = Id::create(exec, 3);
-    auto x = Vec::create(exec, gko::dim<2>{3, 2}, 3);
-    auto b =
-        gko::initialize<Vec>(3, {{2.0, 3.0}, {1.0, 2.0}, {5.0, -1.0}}, exec);
+    using Id = typename TestFixture::Id;
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto identity = Id::create(this->exec, 3);
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2}, 3);
+    auto b = gko::initialize<Vec>(
+        3, {I<T>{2.0, 3.0}, I<T>{1.0, 2.0}, I<T>{5.0, -1.0}}, this->exec);
 
     identity->apply(b.get(), x.get());
 
@@ -93,11 +106,20 @@ TEST_F(Identity, AppliesToMultipleVectors)
 }
 
 
-TEST(IdentityFactory, CanGenerateIdentityMatrix)
+template <typename T>  // T: value type under test
+class IdentityFactory : public ::testing::Test {  // stateless typed fixture
+protected:
+    using value_type = T;
+};
+
+TYPED_TEST_CASE(IdentityFactory, gko::test::ValueTypes);
+
+
+TYPED_TEST(IdentityFactory, CanGenerateIdentityMatrix)
 {
     auto exec = gko::ReferenceExecutor::create();
-    auto id_factory = gko::matrix::IdentityFactory<>::create(exec);
-    auto mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{5, 5});
+    auto id_factory = gko::matrix::IdentityFactory<TypeParam>::create(exec);
+    auto mtx = gko::matrix::Dense<TypeParam>::create(exec, gko::dim<2>{5, 5});
 
     auto id = id_factory->generate(std::move(mtx));
 
diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp
new file mode 100644
index 00000000000..c64f39fb3e2
--- /dev/null
+++ b/core/test/matrix/permutation.cpp
@@ -0,0 +1,294 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>  // tuple-like: <0> value, <1> index type
+class Permutation : public ::testing::Test {
+protected:
+    using v_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using i_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Vec = gko::matrix::Dense<v_type>;  // NOTE(review): unused in this file
+    using Csr = gko::matrix::Csr<v_type, i_type>;  // NOTE(review): unused here
+    Permutation()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::matrix::Permutation<i_type>::create(  // qualified on purpose
+              exec, gko::dim<2>{4, 3}, gko::Array<i_type>{exec, {1, 0, 2, 3}}))
+    {}  // reference matrix: 4x3 with permutation entries {1, 0, 2, 3}
+
+
+    static void assert_equal_to_original_mtx(  // mirrors the ctor's values
+        gko::matrix::Permutation<i_type> *m)
+    {
+        auto perm = m->get_permutation();
+        ASSERT_EQ(m->get_size(), gko::dim<2>(4, 3));
+        ASSERT_EQ(m->get_permutation_size(), 4);
+        ASSERT_EQ(perm[0], 1);
+        ASSERT_EQ(perm[1], 0);
+        ASSERT_EQ(perm[2], 2);
+        ASSERT_EQ(perm[3], 3);
+    }
+
+    static void assert_empty(gko::matrix::Permutation<i_type> *m)
+    {
+        ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0));  // no rows, no columns
+        ASSERT_EQ(m->get_permutation_size(), 0);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;  // reference executor
+    std::unique_ptr<gko::matrix::Permutation<i_type>> mtx;  // built in the ctor
+};
+
+TYPED_TEST_CASE(Permutation, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(Permutation, CanBeEmpty)
+{
+    // Executor-only creation must give a 0x0 matrix with no entries.
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto no_data = Perm::create(this->exec);
+    this->assert_empty(no_data.get());
+}
+
+
+TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    // An executor-only matrix is expected to expose no permutation storage.
+    auto no_data = Perm::create(this->exec);
+    ASSERT_EQ(no_data->get_const_permutation(), nullptr);
+}
+
+
+TYPED_TEST(Permutation, CanBeConstructedWithSize)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    // Only a size is supplied; the permutation length must equal the rows.
+    auto sized = Perm::create(this->exec, gko::dim<2>{2, 3});
+
+    ASSERT_EQ(sized->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(sized->get_permutation_size(), 2);
+}
+
+
+TYPED_TEST(Permutation, FactorySetsCorrectPermuteMask)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto fresh = Perm::create(this->exec);
+
+    // No mask was passed to create(), so row permutation is expected.
+    ASSERT_EQ(fresh->get_permute_mask(), gko::matrix::row_permute);
+}
+
+
+TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData)
+{
+    using i_type = typename TestFixture::i_type;
+    // Caller-owned buffer: the matrix must reference it rather than copy it.
+    i_type data[] = {1, 0, 2};
+
+    auto m = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3, 5},
+        gko::Array<i_type>::view(this->exec, 3, data));
+
+    ASSERT_EQ(m->get_const_permutation(), data);
+}
+
+
+TYPED_TEST(Permutation, CanBeConstructedWithSizeAndMask)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto col_perm = Perm::create(this->exec, gko::dim<2>{2, 3},
+                                 gko::matrix::column_permute);
+
+    ASSERT_EQ(col_perm->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(col_perm->get_permutation_size(), 2);
+    ASSERT_EQ(col_perm->get_permute_mask(), gko::matrix::column_permute);
+}
+
+
+TYPED_TEST(Permutation, CanExplicitlyOverrideSetPermuteMask)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto perm = Perm::create(this->exec, gko::dim<2>{2, 3},
+                             gko::matrix::column_permute);
+
+    // Precondition: the mask handed to create() is the one reported back.
+    ASSERT_EQ(perm->get_permute_mask(), gko::matrix::column_permute);
+
+    const auto new_mask =
+        gko::matrix::row_permute | gko::matrix::inverse_permute;
+    perm->set_permute_mask(new_mask);
+
+    // The setter must replace the construction-time mask.
+    ASSERT_EQ(perm->get_permute_mask(), new_mask);
+}
+
+
+TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions)
+{
+    using i_type = typename TestFixture::i_type;
+    using Perm = gko::matrix::Permutation<i_type>;
+    i_type idxs[] = {0, 2, 1};
+    // Three entries cannot row-permute a matrix that has four rows.
+    ASSERT_THROW(Perm::create(this->exec, gko::dim<2>{4, 2},
+                              gko::Array<i_type>::view(this->exec, 3, idxs)),
+                 gko::ValueMismatch);
+}
+
+
+TYPED_TEST(Permutation, SettingMaskDoesNotModifyData)
+{
+    using i_type = typename TestFixture::i_type;
+    using Perm = gko::matrix::Permutation<i_type>;
+    i_type idxs[] = {1, 0, 2};
+    auto perm = Perm::create(this->exec, gko::dim<2>{3, 5},
+                             gko::Array<i_type>::view(this->exec, 3, idxs));
+
+    const auto old_mask = perm->get_permute_mask();
+    ASSERT_EQ(perm->get_const_permutation(), idxs);
+    ASSERT_EQ(old_mask, gko::matrix::row_permute);
+
+    const auto new_mask =
+        gko::matrix::row_permute | gko::matrix::inverse_permute;
+    perm->set_permute_mask(new_mask);
+
+    // The mask changed; the permutation pointer must still be the buffer.
+    ASSERT_EQ(perm->get_permute_mask(), new_mask);
+    ASSERT_EQ(perm->get_const_permutation(), idxs);
+}
+
+
+TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions)
+{
+    using i_type = typename TestFixture::i_type;
+    using Perm = gko::matrix::Permutation<i_type>;
+    i_type idxs[] = {0, 2, 1};
+    // Three entries cannot column-permute a matrix that has four columns.
+    ASSERT_THROW(Perm::create(this->exec, gko::dim<2>{3, 4},
+                              gko::Array<i_type>::view(this->exec, 3, idxs),
+                              gko::matrix::column_permute),
+                 gko::ValueMismatch);
+}
+
+
+TYPED_TEST(Permutation, KnowsItsSizeAndValues)
+{
+    TestFixture::assert_equal_to_original_mtx(this->mtx.get());  // static
+}
+
+
+TYPED_TEST(Permutation, CanBeCopied)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto duplicate = Perm::create(this->exec);
+    duplicate->copy_from(this->mtx.get());
+
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    // Overwrite a source entry (1 -> 3); the copy must not observe it.
+    this->mtx->get_permutation()[0] = 3;
+    this->assert_equal_to_original_mtx(duplicate.get());
+}
+
+
+TYPED_TEST(Permutation, CanBeMoved)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto target = Perm::create(this->exec);
+
+    // Hands over the fixture's unique_ptr; this->mtx is not used afterwards.
+    target->copy_from(std::move(this->mtx));
+    this->assert_equal_to_original_mtx(target.get());
+}
+
+
+TYPED_TEST(Permutation, CopyingPreservesMask)
+{
+    using Perm = gko::matrix::Permutation<typename TestFixture::i_type>;
+    auto duplicate = Perm::create(this->exec);
+
+    // Step 1: a fresh copy carries the source's row mask.
+    duplicate->copy_from(this->mtx.get());
+    auto src_mask = this->mtx->get_permute_mask();
+    auto dup_mask = duplicate->get_permute_mask();
+    ASSERT_EQ(src_mask, gko::matrix::row_permute);
+    ASSERT_EQ(src_mask, dup_mask);
+
+    // Step 2: changing the source afterwards leaves the copy untouched.
+    this->mtx->set_permute_mask(gko::matrix::column_permute);
+    src_mask = this->mtx->get_permute_mask();
+    dup_mask = duplicate->get_permute_mask();
+    ASSERT_EQ(src_mask, gko::matrix::column_permute);
+    ASSERT_NE(src_mask, dup_mask);
+
+    // Step 3: copying again brings the new mask across.
+    duplicate->copy_from(this->mtx.get());
+    dup_mask = duplicate->get_permute_mask();
+    ASSERT_EQ(src_mask, dup_mask);
+}
+
+
+TYPED_TEST(Permutation, CanBeCloned)
+{
+    auto cloned = this->mtx->clone();
+    // Cast back to the fixture's concrete pointer type before comparing.
+    using MtxPtr = decltype(this->mtx.get());
+    this->assert_equal_to_original_mtx(dynamic_cast<MtxPtr>(cloned.get()));
+}
+
+
+TYPED_TEST(Permutation, CanBeCleared)
+{
+    auto *const cleared = this->mtx.get();
+    cleared->clear();  // must leave a 0x0 matrix with no entries
+    TestFixture::assert_empty(cleared);
+}
+
+
+}  // namespace
diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp
index bc3d790ee34..d6f139ba82e 100644
--- a/core/test/matrix/sellp.cpp
+++ b/core/test/matrix/sellp.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,16 +36,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Sellp : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Sellp<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Sellp<value_type, index_type>;
 
     Sellp()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::Sellp<>::create(exec, gko::dim<2>{2, 3}, 3))
+          mtx(gko::matrix::Sellp<value_type, index_type>::create(
+              exec, gko::dim<2>{2, 3}, 3))
     {
         mtx->read(
             {{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, {0, 2, 2.0}, {1, 1, 5.0}}});
@@ -77,12 +86,12 @@ class Sellp : public ::testing::Test {
         EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0);
         EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
         EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 5.0);
-        EXPECT_EQ(v[gko::matrix::default_slice_size], 3.0);
-        EXPECT_EQ(v[gko::matrix::default_slice_size + 1], 0.0);
-        EXPECT_EQ(v[2 * gko::matrix::default_slice_size], 2.0);
-        EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], 0.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{5.0});
+        EXPECT_EQ(v[gko::matrix::default_slice_size], value_type{3.0});
+        EXPECT_EQ(v[gko::matrix::default_slice_size + 1], value_type{0.0});
+        EXPECT_EQ(v[2 * gko::matrix::default_slice_size], value_type{2.0});
+        EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], value_type{0.0});
     }
 
     void assert_equal_to_original_mtx_with_slice_size_and_stride_factor(
@@ -109,12 +118,12 @@ class Sellp : public ::testing::Test {
         EXPECT_EQ(c[3], 0);
         EXPECT_EQ(c[4], 2);
         EXPECT_EQ(c[5], 0);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 5.0);
-        EXPECT_EQ(v[2], 3.0);
-        EXPECT_EQ(v[3], 0.0);
-        EXPECT_EQ(v[4], 2.0);
-        EXPECT_EQ(v[5], 0.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{5.0});
+        EXPECT_EQ(v[2], value_type{3.0});
+        EXPECT_EQ(v[3], value_type{0.0});
+        EXPECT_EQ(v[4], value_type{2.0});
+        EXPECT_EQ(v[5], value_type{0.0});
     }
 
     void assert_empty(const Mtx *m)
@@ -125,34 +134,43 @@ class Sellp : public ::testing::Test {
         ASSERT_EQ(m->get_const_values(), nullptr);
         ASSERT_EQ(m->get_const_col_idxs(), nullptr);
         ASSERT_EQ(m->get_const_slice_lengths(), nullptr);
-        ASSERT_EQ(m->get_const_slice_sets(), nullptr);
+        ASSERT_NE(m->get_const_slice_sets(), nullptr);
     }
 };
 
+TYPED_TEST_CASE(Sellp, gko::test::ValueIndexTypes);
+
 
-TEST_F(Sellp, KnowsItsSize)
+TYPED_TEST(Sellp, KnowsItsSize)  // runs once per type in ValueIndexTypes
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(mtx->get_num_stored_elements(), 192);
-    ASSERT_EQ(mtx->get_slice_size(), gko::matrix::default_slice_size);
-    ASSERT_EQ(mtx->get_stride_factor(), gko::matrix::default_stride_factor);
-    ASSERT_EQ(mtx->get_total_cols(), 3);
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3));
+    // NOTE(review): 192 is presumably default_slice_size * 3 - confirm.
+    ASSERT_EQ(this->mtx->get_num_stored_elements(), 192);
+    ASSERT_EQ(this->mtx->get_slice_size(), gko::matrix::default_slice_size);
+    ASSERT_EQ(this->mtx->get_stride_factor(),
+              gko::matrix::default_stride_factor);
 }
 
 
-TEST_F(Sellp, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); }
+TYPED_TEST(Sellp, ContainsCorrectData)
+{
+    // this-> is required: the fixture is now a class template.
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
 
 
-TEST_F(Sellp, CanBeEmpty)
+TYPED_TEST(Sellp, CanBeEmpty)
 {
-    auto mtx = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec);  // local; fixture's is this->mtx
 
-    assert_empty(mtx.get());
+    this->assert_empty(mtx.get());
 }
 
-TEST_F(Sellp, CanBeConstructedWithSliceSizeAndStrideFactor)
+
+TYPED_TEST(Sellp, CanBeConstructedWithSliceSizeAndStrideFactor)
 {
-    auto mtx = Mtx::create(exec, gko::dim<2>{2, 3}, 2, 2, 3);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec, gko::dim<2>{2, 3}, 2, 2, 3);
 
     ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
     ASSERT_EQ(mtx->get_num_stored_elements(), 6);
@@ -162,49 +180,53 @@ TEST_F(Sellp, CanBeConstructedWithSliceSizeAndStrideFactor)
 }
 
 
-TEST_F(Sellp, CanBeCopied)
+TYPED_TEST(Sellp, CanBeCopied)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(mtx.get());
+    copy->copy_from(this->mtx.get());
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 7.0;  // differs from the stored 5.0
+    this->assert_equal_to_original_mtx(copy.get());  // copy must be unchanged
 }
 
 
-TEST_F(Sellp, CanBeMoved)
+TYPED_TEST(Sellp, CanBeMoved)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(std::move(mtx));
+    copy->copy_from(std::move(this->mtx));  // this->mtx is not used afterwards
 
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(Sellp, CanBeCloned)
+TYPED_TEST(Sellp, CanBeCloned)
 {
-    auto clone = mtx->clone();
+    using Mtx = typename TestFixture::Mtx;
+    auto clone = this->mtx->clone();
 
-    assert_equal_to_original_mtx(mtx.get());
-    mtx->get_values()[1] = 5.0;
-    assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[1] = 7.0;  // differs from the stored 5.0
+    this->assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
 }
 
 
-TEST_F(Sellp, CanBeCleared)
+TYPED_TEST(Sellp, CanBeCleared)
 {
-    mtx->clear();
+    this->mtx->clear();  // acts on the fixture's populated matrix
 
-    assert_empty(mtx.get());
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(Sellp, CanBeReadFromMatrixData)
+TYPED_TEST(Sellp, CanBeReadFromMatrixData)
 {
-    auto m = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec);
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -213,12 +235,14 @@ TEST_F(Sellp, CanBeReadFromMatrixData)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_mtx(m.get());
 }
 
-TEST_F(Sellp, CanBeReadFromMatrixDataWithSliceSizeAndStrideFactor)
+
+TYPED_TEST(Sellp, CanBeReadFromMatrixDataWithSliceSizeAndStrideFactor)
 {
-    auto m = Mtx::create(exec, gko::dim<2>{2, 3}, 2, 2, 3);
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec, gko::dim<2>{2, 3}, 2, 2, 3);
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
@@ -227,22 +251,26 @@ TEST_F(Sellp, CanBeReadFromMatrixDataWithSliceSizeAndStrideFactor)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx_with_slice_size_and_stride_factor(m.get());
+    this->assert_equal_to_original_mtx_with_slice_size_and_stride_factor(
+        m.get());
 }
 
-TEST_F(Sellp, GeneratesCorrectMatrixData)
+
+TYPED_TEST(Sellp, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
 
-    mtx->write(data);
+    this->mtx->write(data);  // exports the fixture matrix into data
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 4);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0));
+    // Literals are converted to the fixture's value_type before comparing.
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0}));
 }
 
 
diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp
index 6d11271e2d3..7e26fee9c88 100644
--- a/core/test/matrix/sparsity_csr.cpp
+++ b/core/test/matrix/sparsity_csr.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,19 +43,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/dim.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class SparsityCsr : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::SparsityCsr<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::SparsityCsr<value_type, index_type>;
 
     SparsityCsr()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::SparsityCsr<>::create(exec, gko::dim<2>{2, 3}, 4))
+          mtx(gko::matrix::SparsityCsr<value_type, index_type>::create(
+              exec, gko::dim<2>{2, 3}, 4))
     {
-        Mtx::index_type *c = mtx->get_col_idxs();
-        Mtx::index_type *r = mtx->get_row_ptrs();
+        index_type *c = mtx->get_col_idxs();
+        index_type *r = mtx->get_row_ptrs();
         r[0] = 0;
         r[1] = 3;
         r[2] = 4;
@@ -82,7 +91,7 @@ class SparsityCsr : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
+        EXPECT_EQ(v[0], value_type{1.0});
     }
 
     void assert_empty(Mtx *m)
@@ -90,106 +99,117 @@ class SparsityCsr : public ::testing::Test {
         ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0));
         ASSERT_EQ(m->get_num_nonzeros(), 0);
         ASSERT_EQ(m->get_const_col_idxs(), nullptr);
-        ASSERT_EQ(m->get_const_row_ptrs(), nullptr);
-        ASSERT_EQ(m->get_const_value(), nullptr);
+        ASSERT_NE(m->get_const_row_ptrs(), nullptr);
+        ASSERT_NE(m->get_const_value(), nullptr);
         ASSERT_EQ(m->get_col_idxs(), nullptr);
-        ASSERT_EQ(m->get_row_ptrs(), nullptr);
-        ASSERT_EQ(m->get_value(), nullptr);
+        ASSERT_NE(m->get_row_ptrs(), nullptr);
+        ASSERT_NE(m->get_value(), nullptr);
     }
 };
 
+TYPED_TEST_CASE(SparsityCsr, gko::test::ValueIndexTypes);
+
 
-TEST_F(SparsityCsr, KnowsItsSize)
+TYPED_TEST(SparsityCsr, KnowsItsSize)
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(mtx->get_num_nonzeros(), 4);
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(this->mtx->get_num_nonzeros(), 4);
 }
 
 
-TEST_F(SparsityCsr, ContainsCorrectData)
+TYPED_TEST(SparsityCsr, ContainsCorrectData)
 {
-    assert_equal_to_original_mtx(mtx.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
 }
 
 
-TEST_F(SparsityCsr, CanBeEmpty)
+TYPED_TEST(SparsityCsr, CanBeEmpty)
 {
-    auto mtx = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec);
 
-    assert_empty(mtx.get());
+    this->assert_empty(mtx.get());
 }
 
 
-TEST_F(SparsityCsr, SetsCorrectDefaultValue)
+TYPED_TEST(SparsityCsr, SetsCorrectDefaultValue)
 {
-    auto mtx = gko::matrix::SparsityCsr<>::create(
-        exec, gko::dim<2>{3, 2}, static_cast<gko::size_type>(0));
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto mtx = gko::matrix::SparsityCsr<value_type, index_type>::create(
+        this->exec, gko::dim<2>{3, 2}, static_cast<gko::size_type>(0));
 
-    ASSERT_EQ(mtx->get_const_value()[0], 1.0);
-    ASSERT_EQ(mtx->get_value()[0], 1.0);
+    ASSERT_EQ(mtx->get_const_value()[0], value_type{1.0});
+    ASSERT_EQ(mtx->get_value()[0], value_type{1.0});
 }
 
 
-TEST_F(SparsityCsr, CanBeCreatedFromExistingData)
+TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingData)
 {
-    gko::int32 col_idxs[] = {0, 1, 1, 0};
-    gko::int32 row_ptrs[] = {0, 2, 3, 4};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    index_type col_idxs[] = {0, 1, 1, 0};
+    index_type row_ptrs[] = {0, 2, 3, 4};
 
-    auto mtx = gko::matrix::SparsityCsr<>::create(
-        exec, gko::dim<2>{3, 2},
-        gko::Array<gko::int32>::view(exec, 4, col_idxs),
-        gko::Array<gko::int32>::view(exec, 4, row_ptrs), 2.0);
+    auto mtx = gko::matrix::SparsityCsr<value_type, index_type>::create(
+        this->exec, gko::dim<2>{3, 2},
+        gko::Array<index_type>::view(this->exec, 4, col_idxs),
+        gko::Array<index_type>::view(this->exec, 4, row_ptrs), 2.0);
 
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
     ASSERT_EQ(mtx->get_const_row_ptrs(), row_ptrs);
-    ASSERT_EQ(mtx->get_const_value()[0], 2.0);
+    ASSERT_EQ(mtx->get_const_value()[0], value_type{2.0});
     ASSERT_EQ(mtx->get_col_idxs(), col_idxs);
     ASSERT_EQ(mtx->get_row_ptrs(), row_ptrs);
-    ASSERT_EQ(mtx->get_value()[0], 2.0);
+    ASSERT_EQ(mtx->get_value()[0], value_type{2.0});
 }
 
 
-TEST_F(SparsityCsr, CanBeCopied)
+TYPED_TEST(SparsityCsr, CanBeCopied)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(mtx.get());
+    copy->copy_from(this->mtx.get());
 
-    assert_equal_to_original_mtx(mtx.get());
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(SparsityCsr, CanBeMoved)
+TYPED_TEST(SparsityCsr, CanBeMoved)
 {
-    auto copy = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = Mtx::create(this->exec);
 
-    copy->copy_from(std::move(mtx));
+    copy->copy_from(std::move(this->mtx));
 
-    assert_equal_to_original_mtx(copy.get());
+    this->assert_equal_to_original_mtx(copy.get());
 }
 
 
-TEST_F(SparsityCsr, CanBeCloned)
+TYPED_TEST(SparsityCsr, CanBeCloned)
 {
-    auto clone = mtx->clone();
+    using Mtx = typename TestFixture::Mtx;
+    auto clone = this->mtx->clone();
 
-    assert_equal_to_original_mtx(mtx.get());
-    assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->assert_equal_to_original_mtx(dynamic_cast<Mtx *>(clone.get()));
 }
 
 
-TEST_F(SparsityCsr, CanBeCleared)
+TYPED_TEST(SparsityCsr, CanBeCleared)
 {
-    mtx->clear();
+    this->mtx->clear();
 
-    assert_empty(mtx.get());
+    this->assert_empty(this->mtx.get());
 }
 
 
-TEST_F(SparsityCsr, CanBeReadFromMatrixData)
+TYPED_TEST(SparsityCsr, CanBeReadFromMatrixData)
 {
-    auto m = Mtx::create(exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto m = Mtx::create(this->exec);
 
     m->read({{2, 3},
              {{0, 0, 1.0},
@@ -199,23 +219,25 @@ TEST_F(SparsityCsr, CanBeReadFromMatrixData)
               {1, 1, 5.0},
               {1, 2, 0.0}}});
 
-    assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_mtx(m.get());
 }
 
 
-TEST_F(SparsityCsr, GeneratesCorrectMatrixData)
+TYPED_TEST(SparsityCsr, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
 
-    mtx->write(data);
+    this->mtx->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>(2, 3));
     ASSERT_EQ(data.nonzeros.size(), 4);
-    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0));
-    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 1.0));
-    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 1.0));
-    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 1.0));
+    EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{1.0}));
+    EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{1.0}));
 }
 
 
diff --git a/core/test/preconditioner/CMakeLists.txt b/core/test/preconditioner/CMakeLists.txt
index 82eec1105ca..efbeed1af2e 100644
--- a/core/test/preconditioner/CMakeLists.txt
+++ b/core/test/preconditioner/CMakeLists.txt
@@ -1,2 +1,3 @@
 ginkgo_create_test(ilu)
+ginkgo_create_test(isai)
 ginkgo_create_test(jacobi)
diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp
index 0bfa2392833..2103e00d958 100644
--- a/core/test/preconditioner/ilu.cpp
+++ b/core/test/preconditioner/ilu.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,15 +43,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class IluFactory : public ::testing::Test {
 protected:
-    using value_type = double;
+    using value_type = T;
     using l_solver_type = gko::solver::Bicgstab<value_type>;
     using u_solver_type = gko::solver::Bicgstab<value_type>;
     using ilu_prec_type =
@@ -64,34 +65,41 @@ class IluFactory : public ::testing::Test {
     {}
 
     std::shared_ptr<const gko::Executor> exec;
-    std::shared_ptr<l_solver_type::Factory> l_factory;
-    std::shared_ptr<u_solver_type::Factory> u_factory;
+    std::shared_ptr<typename l_solver_type::Factory> l_factory;
+    std::shared_ptr<typename u_solver_type::Factory> u_factory;
 };
 
+TYPED_TEST_CASE(IluFactory, gko::test::ValueTypes);
 
-TEST_F(IluFactory, KnowsItsExecutor)
+
+TYPED_TEST(IluFactory, KnowsItsExecutor)
 {
-    auto ilu_factory = ilu_prec_type::build().on(exec);
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    auto ilu_factory = ilu_prec_type::build().on(this->exec);
 
-    ASSERT_EQ(ilu_factory->get_executor(), exec);
+    ASSERT_EQ(ilu_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(IluFactory, CanSetLSolverFactory)
+TYPED_TEST(IluFactory, CanSetLSolverFactory)
 {
-    auto ilu_factory =
-        ilu_prec_type::build().with_l_solver_factory(l_factory).on(exec);
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    auto ilu_factory = ilu_prec_type::build()
+                           .with_l_solver_factory(this->l_factory)
+                           .on(this->exec);
 
-    ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, l_factory);
+    ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory);
 }
 
 
-TEST_F(IluFactory, CanSetUSolverFactory)
+TYPED_TEST(IluFactory, CanSetUSolverFactory)
 {
-    auto ilu_factory =
-        ilu_prec_type::build().with_u_solver_factory(u_factory).on(exec);
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    auto ilu_factory = ilu_prec_type::build()
+                           .with_u_solver_factory(this->u_factory)
+                           .on(this->exec);
 
-    ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, u_factory);
+    ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, this->u_factory);
 }
 
 
diff --git a/core/test/preconditioner/isai.cpp b/core/test/preconditioner/isai.cpp
new file mode 100644
index 00000000000..1a549bca874
--- /dev/null
+++ b/core/test/preconditioner/isai.cpp
@@ -0,0 +1,183 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+struct DummyOperator : public gko::EnableLinOp<DummyOperator>,
+                       gko::EnableCreateMethod<DummyOperator> {  // test stub
+    DummyOperator(std::shared_ptr<const gko::Executor> exec,
+                  gko::dim<2> size = {})
+        : gko::EnableLinOp<DummyOperator>(exec, size)
+    {}
+
+    void apply_impl(const LinOp *b, LinOp *x) const override {}  // no-op
+
+    void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta,
+                    LinOp *x) const override
+    {}  // no-op
+};
+
+
+template <typename ValueIndexType>
+class IsaiFactory : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using LowerIsai = gko::preconditioner::LowerIsai<value_type, index_type>;
+    using UpperIsai = gko::preconditioner::UpperIsai<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    IsaiFactory()
+        : exec(gko::ReferenceExecutor::create()),
+          lower_isai_factory(LowerIsai::build().on(exec)),
+          upper_isai_factory(UpperIsai::build().on(exec))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<typename LowerIsai::Factory> lower_isai_factory;
+    std::unique_ptr<typename UpperIsai::Factory> upper_isai_factory;
+};
+
+TYPED_TEST_CASE(IsaiFactory, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(IsaiFactory, KnowsItsExecutor)
+{
+    ASSERT_EQ(this->lower_isai_factory->get_executor(), this->exec);
+    ASSERT_EQ(this->upper_isai_factory->get_executor(), this->exec);
+}
+
+
+TYPED_TEST(IsaiFactory, SetsSkipSortingCorrectly)
+{
+    using LowerIsai = typename TestFixture::LowerIsai;
+    using UpperIsai = typename TestFixture::UpperIsai;
+
+    auto l_isai_factory =
+        LowerIsai::build().with_skip_sorting(true).on(this->exec);
+    auto u_isai_factory =
+        UpperIsai::build().with_skip_sorting(true).on(this->exec);
+
+    ASSERT_EQ(l_isai_factory->get_parameters().skip_sorting, true);
+    ASSERT_EQ(u_isai_factory->get_parameters().skip_sorting, true);
+}
+
+
+TYPED_TEST(IsaiFactory, SetsDefaultSkipSortingCorrectly)
+{
+    ASSERT_EQ(this->lower_isai_factory->get_parameters().skip_sorting, false);
+    ASSERT_EQ(this->upper_isai_factory->get_parameters().skip_sorting, false);
+}
+
+
+TYPED_TEST(IsaiFactory, SetsSparsityPowerCorrectly)
+{
+    using LowerIsai = typename TestFixture::LowerIsai;
+    using UpperIsai = typename TestFixture::UpperIsai;
+
+    auto l_isai_factory =
+        LowerIsai::build().with_sparsity_power(2).on(this->exec);
+    auto u_isai_factory =
+        UpperIsai::build().with_sparsity_power(2).on(this->exec);
+
+    ASSERT_EQ(l_isai_factory->get_parameters().sparsity_power, 2);
+    ASSERT_EQ(u_isai_factory->get_parameters().sparsity_power, 2);
+}
+
+
+TYPED_TEST(IsaiFactory, SetsDefaultSparsityPowerCorrectly)
+{
+    ASSERT_EQ(this->lower_isai_factory->get_parameters().sparsity_power, 1);
+    ASSERT_EQ(this->upper_isai_factory->get_parameters().sparsity_power, 1);
+}
+
+
+TYPED_TEST(IsaiFactory, ThrowsWrongDimensionL)
+{
+    using Csr = typename TestFixture::Csr;
+    auto mtx = Csr::create(this->exec, gko::dim<2>{1, 2}, 1);
+
+    ASSERT_THROW(this->lower_isai_factory->generate(gko::share(mtx)),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(IsaiFactory, ThrowsWrongDimensionU)
+{
+    using Csr = typename TestFixture::Csr;
+    auto mtx = Csr::create(this->exec, gko::dim<2>{1, 2}, 1);
+
+    ASSERT_THROW(this->upper_isai_factory->generate(gko::share(mtx)),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(IsaiFactory, ThrowsNoConversionCsrL)
+{
+    // DummyOperator offers no conversion to Csr, so generate() must reject it.
+    auto mtx = DummyOperator::create(this->exec, gko::dim<2>{2, 2});
+
+    ASSERT_THROW(this->lower_isai_factory->generate(gko::share(mtx)),
+                 gko::NotSupported);
+}
+
+
+TYPED_TEST(IsaiFactory, ThrowsNoConversionCsrU)
+{
+    // DummyOperator offers no conversion to Csr, so generate() must reject it.
+    auto mtx = DummyOperator::create(this->exec, gko::dim<2>{2, 2});
+
+    ASSERT_THROW(this->upper_isai_factory->generate(gko::share(mtx)),
+                 gko::NotSupported);
+}
+
+
+}  // namespace
diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp
index d2c76f77bd4..0af14acbabf 100644
--- a/core/test/preconditioner/jacobi.cpp
+++ b/core/test/preconditioner/jacobi.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,19 +39,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class JacobiFactory : public ::testing::Test {
 protected:
-    using Bj = gko::preconditioner::Jacobi<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Bj = gko::preconditioner::Jacobi<value_type, index_type>;
 
     JacobiFactory()
         : exec(gko::ReferenceExecutor::create()),
           bj_factory(Bj::build().with_max_block_size(3u).on(exec)),
           block_pointers(exec, 2),
           block_precisions(exec, 2),
-          mtx(gko::matrix::Csr<>::create(exec, gko::dim<2>{5, 5}, 13))
+          mtx(gko::matrix::Csr<value_type, index_type>::create(
+              exec, gko::dim<2>{5, 5}, 13))
     {
         block_pointers.get_data()[0] = 2;
         block_pointers.get_data()[1] = 3;
@@ -60,31 +69,34 @@ class JacobiFactory : public ::testing::Test {
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<Bj::Factory> bj_factory;
-    gko::Array<gko::int32> block_pointers;
+    std::unique_ptr<typename Bj::Factory> bj_factory;
+    gko::Array<index_type> block_pointers;
     gko::Array<gko::precision_reduction> block_precisions;
-    std::shared_ptr<gko::matrix::Csr<>> mtx;
+    std::shared_ptr<gko::matrix::Csr<value_type, index_type>> mtx;
 };
 
+TYPED_TEST_CASE(JacobiFactory, gko::test::ValueIndexTypes);
 
-TEST_F(JacobiFactory, KnowsItsExecutor)
+
+TYPED_TEST(JacobiFactory, KnowsItsExecutor)
 {
-    ASSERT_EQ(bj_factory->get_executor(), exec);
+    ASSERT_EQ(this->bj_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(JacobiFactory, SavesMaximumBlockSize)
+TYPED_TEST(JacobiFactory, SavesMaximumBlockSize)
 {
-    ASSERT_EQ(bj_factory->get_parameters().max_block_size, 3);
+    ASSERT_EQ(this->bj_factory->get_parameters().max_block_size, 3);
 }
 
 
-TEST_F(JacobiFactory, CanSetBlockPointers)
+TYPED_TEST(JacobiFactory, CanSetBlockPointers)
 {
+    using Bj = typename TestFixture::Bj;
     auto bj_factory = Bj::build()
                           .with_max_block_size(3u)
-                          .with_block_pointers(block_pointers)
-                          .on(exec);
+                          .with_block_pointers(this->block_pointers)
+                          .on(this->exec);
 
     auto ptrs = bj_factory->get_parameters().block_pointers;
     EXPECT_EQ(ptrs.get_data()[0], 2);
@@ -92,12 +104,13 @@ TEST_F(JacobiFactory, CanSetBlockPointers)
 }
 
 
-TEST_F(JacobiFactory, CanMoveBlockPointers)
+TYPED_TEST(JacobiFactory, CanMoveBlockPointers)
 {
+    using Bj = typename TestFixture::Bj;
     auto bj_factory = Bj::build()
                           .with_max_block_size(3u)
-                          .with_block_pointers(std::move(block_pointers))
-                          .on(exec);
+                          .with_block_pointers(std::move(this->block_pointers))
+                          .on(this->exec);
 
     auto ptrs = bj_factory->get_parameters().block_pointers;
     EXPECT_EQ(ptrs.get_data()[0], 2);
@@ -105,12 +118,13 @@ TEST_F(JacobiFactory, CanMoveBlockPointers)
 }
 
 
-TEST_F(JacobiFactory, CanSetBlockPrecisions)
+TYPED_TEST(JacobiFactory, CanSetBlockPrecisions)
 {
+    using Bj = typename TestFixture::Bj;
     auto bj_factory = Bj::build()
                           .with_max_block_size(3u)
-                          .with_storage_optimization(block_precisions)
-                          .on(exec);
+                          .with_storage_optimization(this->block_precisions)
+                          .on(this->exec);
 
     auto prec = bj_factory->get_parameters().storage_optimization.block_wise;
     EXPECT_EQ(prec.get_data()[0], gko::precision_reduction(0, 1));
@@ -118,13 +132,14 @@ TEST_F(JacobiFactory, CanSetBlockPrecisions)
 }
 
 
-TEST_F(JacobiFactory, CanMoveBlockPrecisions)
+TYPED_TEST(JacobiFactory, CanMoveBlockPrecisions)
 {
+    using Bj = typename TestFixture::Bj;
     auto bj_factory =
         Bj::build()
             .with_max_block_size(3u)
-            .with_storage_optimization(std::move(block_precisions))
-            .on(exec);
+            .with_storage_optimization(std::move(this->block_precisions))
+            .on(this->exec);
 
     auto prec = bj_factory->get_parameters().storage_optimization.block_wise;
     EXPECT_EQ(prec.get_data()[0], gko::precision_reduction(0, 1));
@@ -132,41 +147,46 @@ TEST_F(JacobiFactory, CanMoveBlockPrecisions)
 }
 
 
+template <typename T>
 class BlockInterleavedStorageScheme : public ::testing::Test {
 protected:
+    using index_type = T;
     // groups of 4 blocks, offset of 3 within the group and 16 between groups
-    gko::preconditioner::block_interleaved_storage_scheme<gko::int32> s{3, 16,
+    gko::preconditioner::block_interleaved_storage_scheme<index_type> s{3, 16,
                                                                         2};
 };
 
+TYPED_TEST_CASE(BlockInterleavedStorageScheme, gko::test::IndexTypes);
+
 
-TEST_F(BlockInterleavedStorageScheme, ComputesStorageSpace)
+TYPED_TEST(BlockInterleavedStorageScheme, ComputesStorageSpace)
 {
-    ASSERT_EQ(s.compute_storage_space(10), 16 * 3);  // 3 groups of 16 elements
+    ASSERT_EQ(this->s.compute_storage_space(10),
+              16 * 3);  // 3 groups of 16 elements
 }
 
 
-TEST_F(BlockInterleavedStorageScheme, ComputesGroupOffset)
+TYPED_TEST(BlockInterleavedStorageScheme, ComputesGroupOffset)
 {
-    ASSERT_EQ(s.get_group_offset(17), 16 * 4);  // 5th group
+    ASSERT_EQ(this->s.get_group_offset(17), 16 * 4);  // 5th group
 }
 
 
-TEST_F(BlockInterleavedStorageScheme, ComputesBlockOffset)
+TYPED_TEST(BlockInterleavedStorageScheme, ComputesBlockOffset)
 {
-    ASSERT_EQ(s.get_block_offset(17), 1 * 3);  // 2nd in group
+    ASSERT_EQ(this->s.get_block_offset(17), 1 * 3);  // 2nd in group
 }
 
 
-TEST_F(BlockInterleavedStorageScheme, ComputesGlobalBlockOffset)
+TYPED_TEST(BlockInterleavedStorageScheme, ComputesGlobalBlockOffset)
 {
-    ASSERT_EQ(s.get_global_block_offset(17), 16 * 4 + 1 * 3);
+    ASSERT_EQ(this->s.get_global_block_offset(17), 16 * 4 + 1 * 3);
 }
 
 
-TEST_F(BlockInterleavedStorageScheme, ComputesStride)
+TYPED_TEST(BlockInterleavedStorageScheme, ComputesStride)
 {
-    ASSERT_EQ(s.get_stride(), 4 * 3);  // 4 offsets of 3
+    ASSERT_EQ(this->s.get_stride(), 4 * 3);  // 4 offsets of 3
 }
 
 
diff --git a/core/test/solver/CMakeLists.txt b/core/test/solver/CMakeLists.txt
index b8b6e6fe596..e017edd6bee 100644
--- a/core/test/solver/CMakeLists.txt
+++ b/core/test/solver/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_test(bicg)
 ginkgo_create_test(bicgstab)
 ginkgo_create_test(cg)
 ginkgo_create_test(cgs)
diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp
new file mode 100644
index 00000000000..0d8763cdc86
--- /dev/null
+++ b/core/test/solver/bicg.cpp
@@ -0,0 +1,303 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicg.hpp>
+
+
+#include <typeinfo>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class Bicg : public ::testing::Test {  // typed fixture, one per value type
+protected:
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Bicg<value_type>;
+
+    Bicg()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::initialize<Mtx>(
+              {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
+          bicg_factory(
+              Solver::build()
+                  .with_criteria(
+                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
+                          .on(exec))
+                  .on(exec)),
+          solver(bicg_factory->generate(mtx))  // solver for the 3x3 system
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<Mtx> mtx;
+    std::unique_ptr<typename Solver::Factory> bicg_factory;
+    std::unique_ptr<gko::LinOp> solver;  // held through the LinOp interface
+
+    static void assert_same_matrices(const Mtx *m1, const Mtx *m2)
+    {  // checks both dimensions first, then every entry
+        ASSERT_EQ(m1->get_size()[0], m2->get_size()[0]);
+        ASSERT_EQ(m1->get_size()[1], m2->get_size()[1]);
+        for (gko::size_type i = 0; i < m1->get_size()[0]; ++i) {
+            for (gko::size_type j = 0; j < m1->get_size()[1]; ++j) {
+                EXPECT_EQ(m1->at(i, j), m2->at(i, j));
+            }
+        }
+    }
+};
+
+TYPED_TEST_CASE(Bicg, gko::test::ValueTypes);
+
+
+TYPED_TEST(Bicg, BicgFactoryKnowsItsExecutor)
+{
+    ASSERT_EQ(this->bicg_factory->get_executor(), this->exec);
+}
+
+
+TYPED_TEST(Bicg, BicgFactoryCreatesCorrectSolver)
+{
+    using Solver = typename TestFixture::Solver;
+
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto bicg_solver = static_cast<Solver *>(this->solver.get());
+    ASSERT_NE(bicg_solver->get_system_matrix(), nullptr);
+    ASSERT_EQ(bicg_solver->get_system_matrix(), this->mtx);
+}
+
+
+TYPED_TEST(Bicg, CanBeCopied)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->bicg_factory->generate(Mtx::create(this->exec));
+
+    copy->copy_from(this->solver.get());
+
+    ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
+    auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
+}
+
+
+TYPED_TEST(Bicg, CanBeMoved)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->bicg_factory->generate(Mtx::create(this->exec));
+
+    copy->copy_from(std::move(this->solver));
+
+    ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
+    auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
+}
+
+
+TYPED_TEST(Bicg, CanBeCloned)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
+
+    ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
+    auto clone_mtx = static_cast<Solver *>(clone.get())->get_system_matrix();
+    this->assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()),
+                               this->mtx.get());
+}
+
+
+TYPED_TEST(Bicg, CanBeCleared)
+{
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
+
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
+    ASSERT_EQ(solver_mtx, nullptr);
+}
+
+
+TYPED_TEST(Bicg, ApplyUsesInitialGuessReturnsTrue)
+{
+    // The fixture stores the solver as a gko::LinOp; query it through that.
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Bicg, CanSetPreconditionerGenerator)
+{
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
+    auto bicg_factory =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(
+                        gko::remove_complex<value_type>(1e-6))
+                    .on(this->exec))
+            .with_preconditioner(  // an inner Bicg factory as the generator
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = bicg_factory->generate(this->mtx);
+    auto precond = dynamic_cast<const Solver *>(
+        static_cast<Solver *>(solver.get())
+            ->get_preconditioner()
+            .get());
+
+    ASSERT_NE(precond, nullptr);
+    ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3));
+    ASSERT_EQ(precond->get_system_matrix(), this->mtx);
+}
+
+
+TYPED_TEST(Bicg, CanSetPreconditionerInFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Solver> bicg_precond =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
+
+    auto bicg_factory =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_generated_preconditioner(bicg_precond)
+            .on(this->exec);
+    auto solver = bicg_factory->generate(this->mtx);
+    auto precond = solver->get_preconditioner();
+
+    ASSERT_NE(precond.get(), nullptr);
+    ASSERT_EQ(precond.get(), bicg_precond.get());
+}
+
+
+TYPED_TEST(Bicg, CanSetCriteriaAgain)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto bicg_factory = Solver::build().with_criteria(init_crit).on(this->exec);
+
+    ASSERT_EQ((bicg_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = bicg_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+
+    solver->set_stop_criterion_factory(new_crit);
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+
+    ASSERT_EQ(niter, 5);
+}
+
+
+TYPED_TEST(Bicg, ThrowsOnWrongPreconditionerInFactory)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
+    std::shared_ptr<Solver> bicg_precond =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(wrong_sized_mtx);
+
+    auto bicg_factory =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_generated_preconditioner(bicg_precond)
+            .on(this->exec);
+
+    ASSERT_THROW(bicg_factory->generate(this->mtx), gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Bicg, CanSetPreconditioner)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Solver> bicg_precond =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
+
+    auto bicg_factory =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = bicg_factory->generate(this->mtx);
+    solver->set_preconditioner(bicg_precond);
+    auto precond = solver->get_preconditioner();
+
+    ASSERT_NE(precond.get(), nullptr);
+    ASSERT_EQ(precond.get(), bicg_precond.get());
+}
+
+
+}  // namespace
diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp
index a582c1e5264..16d5b8a9bff 100644
--- a/core/test/solver/bicgstab.cpp
+++ b/core/test/solver/bicgstab.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,17 +40,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Bicgstab : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Bicgstab<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Bicgstab<value_type>;
 
     Bicgstab()
         : exec(gko::ReferenceExecutor::create()),
@@ -60,8 +65,8 @@ class Bicgstab : public ::testing::Test {
               Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
                           .on(exec))
                   .on(exec)),
           solver(bicgstab_factory->generate(mtx))
@@ -69,7 +74,7 @@ class Bicgstab : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<Solver::Factory> bicgstab_factory;
+    std::unique_ptr<typename Solver::Factory> bicgstab_factory;
     std::unique_ptr<gko::LinOp> solver;
 
     static void assert_same_matrices(const Mtx *m1, const Mtx *m2)
@@ -84,101 +89,154 @@ class Bicgstab : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Bicgstab, gko::test::ValueTypes);
+
 
-TEST_F(Bicgstab, BicgstabFactoryKnowsItsExecutor)
+TYPED_TEST(Bicgstab, BicgstabFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(bicgstab_factory->get_executor(), exec);
+    ASSERT_EQ(this->bicgstab_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(Bicgstab, BicgstabFactoryCreatesCorrectSolver)
+TYPED_TEST(Bicgstab, BicgstabFactoryCreatesCorrectSolver)
 {
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
-    auto bicgstab_solver = static_cast<Solver *>(solver.get());
+    using Solver = typename TestFixture::Solver;
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto bicgstab_solver = static_cast<Solver *>(this->solver.get());
     ASSERT_NE(bicgstab_solver->get_system_matrix(), nullptr);
-    ASSERT_EQ(bicgstab_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(bicgstab_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Bicgstab, CanBeCopied)
+TYPED_TEST(Bicgstab, CanBeCopied)
 {
-    auto copy = bicgstab_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->bicgstab_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(solver.get());
+    copy->copy_from(this->solver.get());
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Bicgstab, CanBeMoved)
+TYPED_TEST(Bicgstab, CanBeMoved)
 {
-    auto copy = bicgstab_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->bicgstab_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Bicgstab, CanBeCloned)
+TYPED_TEST(Bicgstab, CanBeCloned)
 {
-    auto clone = solver->clone();
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
     auto clone_mtx = static_cast<Solver *>(clone.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Bicgstab, CanBeCleared)
+TYPED_TEST(Bicgstab, CanBeCleared)
 {
-    solver->clear();
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
-    auto solver_mtx = static_cast<Solver *>(solver.get())->get_system_matrix();
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
 
-TEST_F(Bicgstab, CanSetPreconditionerGenerator)
+TYPED_TEST(Bicgstab, ApplyUsesInitialGuessReturnsTrue)
 {
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Bicgstab, CanSetPreconditionerGenerator)
+{
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto bicgstab_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .with_preconditioner(Solver::build().on(exec))
-            .on(exec);
-
-    auto solver = bicgstab_factory->generate(mtx);
-    auto precond = dynamic_cast<const gko::solver::Bicgstab<> *>(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_preconditioner(
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+
+    auto solver = bicgstab_factory->generate(this->mtx);
+    auto precond = dynamic_cast<const gko::solver::Bicgstab<value_type> *>(
         gko::lend(solver->get_preconditioner()));
 
     ASSERT_NE(precond, nullptr);
     ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3));
-    ASSERT_EQ(precond->get_system_matrix(), mtx);
+    ASSERT_EQ(precond->get_system_matrix(), this->mtx);
+}
+
+
+TYPED_TEST(Bicgstab, CanSetCriteriaAgain)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto bicgstab_factory =
+        Solver::build().with_criteria(init_crit).on(this->exec);
+    // The factory must keep the criterion it was configured with.
+    ASSERT_EQ((bicgstab_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = bicgstab_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+    // Swap the criterion on the generated solver and read it back.
+    solver->set_stop_criterion_factory(new_crit);
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+    // max_iters was set from 5u; compare against an unsigned literal too.
+    ASSERT_EQ(niter, 5u);
 }
 
 
-TEST_F(Bicgstab, CanSetPreconditionerInFactory)
+TYPED_TEST(Bicgstab, CanSetPreconditionerInFactory)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> bicgstab_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto bicgstab_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(bicgstab_precond)
-            .on(exec);
-    auto solver = bicgstab_factory->generate(mtx);
+            .on(this->exec);
+    auto solver = bicgstab_factory->generate(this->mtx);
     auto precond = solver->get_preconditioner();
 
     ASSERT_NE(precond.get(), nullptr);
@@ -186,42 +244,46 @@ TEST_F(Bicgstab, CanSetPreconditionerInFactory)
 }
 
 
-TEST_F(Bicgstab, ThrowsOnWrongPreconditionerInFactory)
+TYPED_TEST(Bicgstab, ThrowsOnWrongPreconditionerInFactory)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> bicgstab_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto bicgstab_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(bicgstab_precond)
-            .on(exec);
+            .on(this->exec);
 
-    ASSERT_THROW(bicgstab_factory->generate(mtx), gko::DimensionMismatch);
+    ASSERT_THROW(bicgstab_factory->generate(this->mtx), gko::DimensionMismatch);
 }
 
 
-TEST_F(Bicgstab, CanSetPreconditioner)
+TYPED_TEST(Bicgstab, CanSetPreconditioner)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> bicgstab_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto bicgstab_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = bicgstab_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = bicgstab_factory->generate(this->mtx);
     solver->set_preconditioner(bicgstab_precond);
     auto precond = solver->get_preconditioner();
 
diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp
index 9461ba0fc5a..e6652defb0e 100644
--- a/core/test/solver/cg.cpp
+++ b/core/test/solver/cg.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,16 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Cg : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Cg<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Cg<value_type>;
 
     Cg()
         : exec(gko::ReferenceExecutor::create()),
@@ -62,8 +67,8 @@ class Cg : public ::testing::Test {
               Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
                           .on(exec))
                   .on(exec)),
           solver(cg_factory->generate(mtx))
@@ -71,7 +76,7 @@ class Cg : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<Solver::Factory> cg_factory;
+    std::unique_ptr<typename Solver::Factory> cg_factory;
     std::unique_ptr<gko::LinOp> solver;
 
     static void assert_same_matrices(const Mtx *m1, const Mtx *m2)
@@ -86,105 +91,135 @@ class Cg : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Cg, gko::test::ValueTypes);
 
-TEST_F(Cg, CgFactoryKnowsItsExecutor)
+
+TYPED_TEST(Cg, CgFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(cg_factory->get_executor(), exec);
+    ASSERT_EQ(this->cg_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(Cg, CgFactoryCreatesCorrectSolver)
+TYPED_TEST(Cg, CgFactoryCreatesCorrectSolver)
 {
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
-    auto cg_solver = static_cast<Solver *>(solver.get());
+    using Solver = typename TestFixture::Solver;
+
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto cg_solver = static_cast<Solver *>(this->solver.get());
     ASSERT_NE(cg_solver->get_system_matrix(), nullptr);
-    ASSERT_EQ(cg_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(cg_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Cg, CanBeCopied)
+TYPED_TEST(Cg, CanBeCopied)
 {
-    auto copy = cg_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->cg_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(solver.get());
+    copy->copy_from(this->solver.get());
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Cg, CanBeMoved)
+TYPED_TEST(Cg, CanBeMoved)
 {
-    auto copy = cg_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->cg_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Cg, CanBeCloned)
+TYPED_TEST(Cg, CanBeCloned)
 {
-    auto clone = solver->clone();
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
     auto clone_mtx = static_cast<Solver *>(clone.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Cg, CanBeCleared)
+TYPED_TEST(Cg, CanBeCleared)
 {
-    solver->clear();
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
-    auto solver_mtx = static_cast<Solver *>(solver.get())->get_system_matrix();
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
 
-TEST_F(Cg, CanSetPreconditionerGenerator)
+TYPED_TEST(Cg, ApplyUsesInitialGuessReturnsTrue)
+{
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Cg, CanSetPreconditionerGenerator)
 {
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto cg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-6)
-                    .on(exec))
-            .with_preconditioner(Solver::build().on(exec))
-            .on(exec);
-    auto solver = cg_factory->generate(mtx);
-    auto precond = dynamic_cast<const gko::solver::Cg<> *>(
-        static_cast<gko::solver::Cg<> *>(solver.get())
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(
+                        gko::remove_complex<value_type>(1e-6))
+                    .on(this->exec))
+            .with_preconditioner(
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = cg_factory->generate(this->mtx);
+    auto precond = dynamic_cast<const gko::solver::Cg<value_type> *>(
+        static_cast<gko::solver::Cg<value_type> *>(solver.get())
             ->get_preconditioner()
             .get());
 
     ASSERT_NE(precond, nullptr);
     ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3));
-    ASSERT_EQ(precond->get_system_matrix(), mtx);
+    ASSERT_EQ(precond->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Cg, CanSetPreconditionerInFactory)
+TYPED_TEST(Cg, CanSetPreconditionerInFactory)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cg_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto cg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(cg_precond)
-            .on(exec);
-    auto solver = cg_factory->generate(mtx);
+            .on(this->exec);
+    auto solver = cg_factory->generate(this->mtx);
     auto precond = solver->get_preconditioner();
 
     ASSERT_NE(precond.get(), nullptr);
@@ -192,42 +227,70 @@ TEST_F(Cg, CanSetPreconditionerInFactory)
 }
 
 
-TEST_F(Cg, ThrowsOnWrongPreconditionerInFactory)
+TYPED_TEST(Cg, CanSetCriteriaAgain)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto cg_factory = Solver::build().with_criteria(init_crit).on(this->exec);
+    // The factory must keep the criterion it was configured with.
+    ASSERT_EQ((cg_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = cg_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+    // Swap the criterion on the generated solver and read it back.
+    solver->set_stop_criterion_factory(new_crit);
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+    // max_iters was set from 5u; compare against an unsigned literal too.
+    ASSERT_EQ(niter, 5u);
+}
+
+
+TYPED_TEST(Cg, ThrowsOnWrongPreconditionerInFactory)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> cg_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto cg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(cg_precond)
-            .on(exec);
+            .on(this->exec);
 
-    ASSERT_THROW(cg_factory->generate(mtx), gko::DimensionMismatch);
+    ASSERT_THROW(cg_factory->generate(this->mtx), gko::DimensionMismatch);
 }
 
 
-TEST_F(Cg, CanSetPreconditioner)
+TYPED_TEST(Cg, CanSetPreconditioner)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cg_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto cg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = cg_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = cg_factory->generate(this->mtx);
     solver->set_preconditioner(cg_precond);
     auto precond = solver->get_preconditioner();
 
diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp
index cc65423ddf8..04f7c31aab9 100644
--- a/core/test/solver/cgs.cpp
+++ b/core/test/solver/cgs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,16 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Cgs : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Cgs<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Cgs<value_type>;
 
     Cgs()
         : exec(gko::ReferenceExecutor::create()),
@@ -62,8 +67,8 @@ class Cgs : public ::testing::Test {
               Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
                           .on(exec))
                   .on(exec)),
           solver(cgs_factory->generate(mtx))
@@ -71,7 +76,7 @@ class Cgs : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<Solver::Factory> cgs_factory;
+    std::unique_ptr<typename Solver::Factory> cgs_factory;
     std::unique_ptr<gko::LinOp> solver;
 
     static void assert_same_matrices(const Mtx *m1, const Mtx *m2)
@@ -86,105 +91,159 @@ class Cgs : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Cgs, gko::test::ValueTypes);
 
-TEST_F(Cgs, CgsFactoryKnowsItsExecutor)
+
+TYPED_TEST(Cgs, CgsFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(cgs_factory->get_executor(), exec);
+    ASSERT_EQ(this->cgs_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(Cgs, CgsFactoryCreatesCorrectSolver)
+TYPED_TEST(Cgs, CgsFactoryCreatesCorrectSolver)
 {
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
-    auto cgs_solver = static_cast<Solver *>(solver.get());
+    using Solver = typename TestFixture::Solver;
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto cgs_solver = static_cast<Solver *>(this->solver.get());
     ASSERT_NE(cgs_solver->get_system_matrix(), nullptr);
-    ASSERT_EQ(cgs_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(cgs_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Cgs, CanBeCopied)
+TYPED_TEST(Cgs, CanBeCopied)
 {
-    auto copy = cgs_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->cgs_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(solver.get());
+    copy->copy_from(this->solver.get());
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Cgs, CanBeMoved)
+TYPED_TEST(Cgs, CanBeMoved)
 {
-    auto copy = cgs_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->cgs_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Cgs, CanBeCloned)
+TYPED_TEST(Cgs, CanBeCloned)
 {
-    auto clone = solver->clone();
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
     auto clone_mtx = static_cast<Solver *>(clone.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Cgs, CanBeCleared)
+TYPED_TEST(Cgs, CanBeCleared)
 {
-    solver->clear();
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
-    auto solver_mtx = static_cast<Solver *>(solver.get())->get_system_matrix();
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
 
-TEST_F(Cgs, CanSetPreconditionerGenerator)
+TYPED_TEST(Cgs, ApplyUsesInitialGuessReturnsTrue)
+{
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Cgs, CanSetPreconditionerGenerator)
 {
+    // The nested Solver factory acts as the preconditioner generator.
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto cgs_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-6)
-                    .on(exec))
-            .with_preconditioner(Solver::build().on(exec))
-            .on(exec);
-    auto solver = cgs_factory->generate(mtx);
-    auto precond = dynamic_cast<const gko::solver::Cgs<> *>(
-        static_cast<gko::solver::Cgs<> *>(solver.get())
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(
+                        gko::remove_complex<value_type>(1e-6))
+                    .on(this->exec))
+            .with_preconditioner(
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = cgs_factory->generate(this->mtx);
+    auto precond = dynamic_cast<const gko::solver::Cgs<value_type> *>(
+        static_cast<gko::solver::Cgs<value_type> *>(solver.get())
             ->get_preconditioner()
             .get());
 
     ASSERT_NE(precond, nullptr);
     ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3));
-    ASSERT_EQ(precond->get_system_matrix(), mtx);
+    ASSERT_EQ(precond->get_system_matrix(), this->mtx);
+}
+
+
+TYPED_TEST(Cgs, CanSetCriteriaAgain)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto cgs_factory = Solver::build().with_criteria(init_crit).on(this->exec);
+    // The factory must keep the criterion it was configured with.
+    ASSERT_EQ((cgs_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = cgs_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+    // Swap the criterion on the generated solver and read it back.
+    solver->set_stop_criterion_factory(new_crit);
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+    // max_iters was set from 5u; compare against an unsigned literal too.
+    ASSERT_EQ(niter, 5u);
 }
 
 
-TEST_F(Cgs, CanSetPreconditionerInFactory)
+TYPED_TEST(Cgs, CanSetPreconditionerInFactory)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cgs_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto cgs_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(cgs_precond)
-            .on(exec);
-    auto solver = cgs_factory->generate(mtx);
+            .on(this->exec);
+    auto solver = cgs_factory->generate(this->mtx);
     auto precond = solver->get_preconditioner();
 
     ASSERT_NE(precond.get(), nullptr);
@@ -192,42 +251,46 @@ TEST_F(Cgs, CanSetPreconditionerInFactory)
 }
 
 
-TEST_F(Cgs, ThrowsOnWrongPreconditionerInFactory)
+TYPED_TEST(Cgs, ThrowsOnWrongPreconditionerInFactory)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> cgs_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto cgs_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(cgs_precond)
-            .on(exec);
+            .on(this->exec);
 
-    ASSERT_THROW(cgs_factory->generate(mtx), gko::DimensionMismatch);
+    ASSERT_THROW(cgs_factory->generate(this->mtx), gko::DimensionMismatch);
 }
 
 
-TEST_F(Cgs, CanSetPreconditioner)
+TYPED_TEST(Cgs, CanSetPreconditioner)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cgs_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto cgs_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = cgs_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = cgs_factory->generate(this->mtx);
     solver->set_preconditioner(cgs_precond);
     auto precond = solver->get_preconditioner();
 
diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp
index bf4618b1a89..6b9c0e954a7 100644
--- a/core/test/solver/fcg.cpp
+++ b/core/test/solver/fcg.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,21 +36,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Fcg : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Fcg<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Fcg<value_type>;
 
     Fcg()
         : exec(gko::ReferenceExecutor::create()),
@@ -60,8 +64,8 @@ class Fcg : public ::testing::Test {
               Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
                           .on(exec))
                   .on(exec)),
           solver(fcg_factory->generate(mtx))
@@ -69,112 +73,162 @@ class Fcg : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<Solver::Factory> fcg_factory;
+    std::unique_ptr<typename Solver::Factory> fcg_factory;
     std::unique_ptr<gko::LinOp> solver;
 };
 
+TYPED_TEST_CASE(Fcg, gko::test::ValueTypes);
 
-TEST_F(Fcg, FcgFactoryKnowsItsExecutor)
+
+TYPED_TEST(Fcg, FcgFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(fcg_factory->get_executor(), exec);
+    ASSERT_EQ(this->fcg_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(Fcg, FcgFactoryCreatesCorrectSolver)
+TYPED_TEST(Fcg, FcgFactoryCreatesCorrectSolver)
 {
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
-    auto fcg_solver = dynamic_cast<Solver *>(solver.get());
+    using Solver = typename TestFixture::Solver;
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto fcg_solver = dynamic_cast<Solver *>(this->solver.get());
     ASSERT_NE(fcg_solver->get_system_matrix(), nullptr);
-    ASSERT_EQ(fcg_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(fcg_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Fcg, CanBeCopied)
+TYPED_TEST(Fcg, CanBeCopied)
 {
-    auto copy = fcg_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->fcg_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(solver.get());
+    copy->copy_from(this->solver.get());
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = dynamic_cast<Solver *>(copy.get())->get_system_matrix();
-    GKO_ASSERT_MTX_NEAR(dynamic_cast<const Mtx *>(copy_mtx.get()), mtx.get(),
-                        1e-14);
+    GKO_ASSERT_MTX_NEAR(dynamic_cast<const Mtx *>(copy_mtx.get()),
+                        this->mtx.get(), 0.0);
 }
 
 
-TEST_F(Fcg, CanBeMoved)
+TYPED_TEST(Fcg, CanBeMoved)
 {
-    auto copy = fcg_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->fcg_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = dynamic_cast<Solver *>(copy.get())->get_system_matrix();
-    GKO_ASSERT_MTX_NEAR(dynamic_cast<const Mtx *>(copy_mtx.get()), mtx.get(),
-                        1e-14);
+    GKO_ASSERT_MTX_NEAR(dynamic_cast<const Mtx *>(copy_mtx.get()),
+                        this->mtx.get(), 0.0);
 }
 
 
-TEST_F(Fcg, CanBeCloned)
+TYPED_TEST(Fcg, CanBeCloned)
 {
-    auto clone = solver->clone();
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
     auto clone_mtx = dynamic_cast<Solver *>(clone.get())->get_system_matrix();
-    GKO_ASSERT_MTX_NEAR(dynamic_cast<const Mtx *>(clone_mtx.get()), mtx.get(),
-                        1e-14);
+    GKO_ASSERT_MTX_NEAR(dynamic_cast<const Mtx *>(clone_mtx.get()),
+                        this->mtx.get(), 0.0);
 }
 
 
-TEST_F(Fcg, CanBeCleared)
+TYPED_TEST(Fcg, CanBeCleared)
 {
-    solver->clear();
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
-    auto solver_mtx = static_cast<Solver *>(solver.get())->get_system_matrix();
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
 
-TEST_F(Fcg, CanSetPreconditionerGenerator)
+TYPED_TEST(Fcg, ApplyUsesInitialGuessReturnsTrue)
+{
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Fcg, CanSetPreconditionerGenerator)
 {
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto fcg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-6)
-                    .on(exec))
-            .with_preconditioner(Solver::build().on(exec))
-            .on(exec);
-    auto solver = fcg_factory->generate(mtx);
-    auto precond = dynamic_cast<const gko::solver::Fcg<> *>(
-        static_cast<gko::solver::Fcg<> *>(solver.get())
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(
+                        gko::remove_complex<value_type>(1e-6))
+                    .on(this->exec))
+            .with_preconditioner(
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = fcg_factory->generate(this->mtx);
+    auto precond = dynamic_cast<const gko::solver::Fcg<value_type> *>(
+        static_cast<gko::solver::Fcg<value_type> *>(solver.get())
             ->get_preconditioner()
             .get());
 
     ASSERT_NE(precond, nullptr);
     ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3));
-    ASSERT_EQ(precond->get_system_matrix(), mtx);
+    ASSERT_EQ(precond->get_system_matrix(), this->mtx);
+}
+
+
+TYPED_TEST(Fcg, CanSetCriteriaAgain)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto fcg_factory = Solver::build().with_criteria(init_crit).on(this->exec);
+
+    ASSERT_EQ((fcg_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = fcg_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+
+    solver->set_stop_criterion_factory(new_crit);  // should replace init_crit
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+
+    ASSERT_EQ(niter, 5u);  // unsigned, like with_max_iters(5u) above
 }
 
 
-TEST_F(Fcg, CanSetPreconditionerInFactory)
+TYPED_TEST(Fcg, CanSetPreconditionerInFactory)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> fcg_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto fcg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(fcg_precond)
-            .on(exec);
-    auto solver = fcg_factory->generate(mtx);
+            .on(this->exec);
+    auto solver = fcg_factory->generate(this->mtx);
     auto precond = solver->get_preconditioner();
 
     ASSERT_NE(precond.get(), nullptr);
@@ -182,42 +236,46 @@ TEST_F(Fcg, CanSetPreconditionerInFactory)
 }
 
 
-TEST_F(Fcg, ThrowsOnWrongPreconditionerInFactory)
+TYPED_TEST(Fcg, ThrowsOnWrongPreconditionerInFactory)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> fcg_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto fcg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(fcg_precond)
-            .on(exec);
+            .on(this->exec);
 
-    ASSERT_THROW(fcg_factory->generate(mtx), gko::DimensionMismatch);
+    ASSERT_THROW(fcg_factory->generate(this->mtx), gko::DimensionMismatch);
 }
 
 
-TEST_F(Fcg, CanSetPreconditioner)
+TYPED_TEST(Fcg, CanSetPreconditioner)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> fcg_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto fcg_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = fcg_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = fcg_factory->generate(this->mtx);
     solver->set_preconditioner(fcg_precond);
     auto precond = solver->get_preconditioner();
 
diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp
index d6c6a78aab6..4765f07183b 100644
--- a/core/test/solver/gmres.cpp
+++ b/core/test/solver/gmres.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <core/solver/gmres.cpp>
 #include <ginkgo/core/solver/gmres.hpp>
 
 
@@ -44,18 +43,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Gmres : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Gmres<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Gmres<value_type>;
     using Big_solver = gko::solver::Gmres<double>;
 
+    static constexpr gko::remove_complex<T> reduction_factor =
+        gko::remove_complex<T>(1e-6);
+
     Gmres()
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
@@ -64,8 +71,8 @@ class Gmres : public ::testing::Test {
               Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(reduction_factor)
                           .on(exec))
                   .on(exec)),
           solver(gmres_factory->generate(mtx)),
@@ -74,8 +81,8 @@ class Gmres : public ::testing::Test {
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(128u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(reduction_factor)
                           .on(exec))
                   .on(exec)),
           big_solver(gmres_big_factory->generate(mtx))
@@ -83,7 +90,7 @@ class Gmres : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<Solver::Factory> gmres_factory;
+    std::unique_ptr<typename Solver::Factory> gmres_factory;
     std::unique_ptr<gko::LinOp> solver;
     std::unique_ptr<Big_solver::Factory> gmres_big_factory;
     std::unique_ptr<gko::LinOp> big_solver;
@@ -100,123 +107,200 @@ class Gmres : public ::testing::Test {
     }
 };
 
+template <typename T>
+constexpr gko::remove_complex<T> Gmres<T>::reduction_factor;
+
+TYPED_TEST_CASE(Gmres, gko::test::ValueTypes);
 
-TEST_F(Gmres, GmresFactoryKnowsItsExecutor)
+
+TYPED_TEST(Gmres, GmresFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(gmres_factory->get_executor(), exec);
+    ASSERT_EQ(this->gmres_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(Gmres, GmresFactoryCreatesCorrectSolver)
+TYPED_TEST(Gmres, GmresFactoryCreatesCorrectSolver)
 {
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
-    auto gmres_solver = static_cast<Solver *>(solver.get());
+    using Solver = typename TestFixture::Solver;
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto gmres_solver = static_cast<Solver *>(this->solver.get());
     ASSERT_NE(gmres_solver->get_system_matrix(), nullptr);
-    ASSERT_EQ(gmres_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(gmres_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Gmres, CanBeCopied)
+TYPED_TEST(Gmres, CanBeCopied)
 {
-    auto copy = gmres_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->gmres_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(solver.get());
+    copy->copy_from(this->solver.get());
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Gmres, CanBeMoved)
+TYPED_TEST(Gmres, CanBeMoved)
 {
-    auto copy = gmres_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->gmres_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Gmres, CanBeCloned)
+TYPED_TEST(Gmres, CanBeCloned)
 {
-    auto clone = solver->clone();
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
     auto clone_mtx = static_cast<Solver *>(clone.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Gmres, CanBeCleared)
+TYPED_TEST(Gmres, CanBeCleared)
 {
-    solver->clear();
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
-    auto solver_mtx = static_cast<Solver *>(solver.get())->get_system_matrix();
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
 
-TEST_F(Gmres, CanSetPreconditionerGenerator)
+TYPED_TEST(Gmres, ApplyUsesInitialGuessReturnsTrue)
+{
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Gmres, CanSetPreconditionerGenerator)
 {
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto gmres_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-6)
-                    .on(exec))
-            .with_preconditioner(Solver::build().on(exec))
-            .on(exec);
-    auto solver = gmres_factory->generate(mtx);
-    auto precond = dynamic_cast<const gko::solver::Gmres<> *>(
-        static_cast<gko::solver::Gmres<> *>(solver.get())
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(TestFixture::reduction_factor)
+                    .on(this->exec))
+            .with_preconditioner(
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = gmres_factory->generate(this->mtx);
+    auto precond = dynamic_cast<const gko::solver::Gmres<value_type> *>(
+        static_cast<gko::solver::Gmres<value_type> *>(solver.get())
             ->get_preconditioner()
             .get());
 
     ASSERT_NE(precond, nullptr);
     ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3));
-    ASSERT_EQ(precond->get_system_matrix(), mtx);
+    ASSERT_EQ(precond->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Gmres, CanSetKrylovDim)
+TYPED_TEST(Gmres, CanSetCriteriaAgain)
 {
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto gmres_factory =
+        Solver::build().with_criteria(init_crit).on(this->exec);
+
+    ASSERT_EQ((gmres_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = gmres_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+
+    solver->set_stop_criterion_factory(new_crit);  // should replace init_crit
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+
+    ASSERT_EQ(niter, 5u);  // unsigned, like with_max_iters(5u) above
+}
+
+
+TYPED_TEST(Gmres, CanSetKrylovDim)
+{
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto gmres_factory =
         Solver::build()
             .with_krylov_dim(4u)
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-6)
-                    .on(exec))
-            .on(exec);
-    auto solver = gmres_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(TestFixture::reduction_factor)
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = gmres_factory->generate(this->mtx);
     auto krylov_dim = solver->get_krylov_dim();
 
     ASSERT_EQ(krylov_dim, 4);
 }
 
 
-TEST_F(Gmres, CanSetPreconditionerInFactory)
+TYPED_TEST(Gmres, CanSetKrylovDimAgain)
+{
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto gmres_factory =
+        Solver::build().with_criteria(init_crit).with_krylov_dim(10u).on(
+            this->exec);
+
+    ASSERT_EQ(gmres_factory->get_parameters().krylov_dim, 10u);
+
+    auto solver = gmres_factory->generate(this->mtx);
+
+    solver->set_krylov_dim(20u);  // should override the factory value (10u)
+
+    ASSERT_EQ(solver->get_krylov_dim(), 20u);  // unsigned, as set above
+}
+
+
+TYPED_TEST(Gmres, CanSetPreconditionerInFactory)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> gmres_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto gmres_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(gmres_precond)
-            .on(exec);
-    auto solver = gmres_factory->generate(mtx);
+            .on(this->exec);
+    auto solver = gmres_factory->generate(this->mtx);
     auto precond = solver->get_preconditioner();
 
     ASSERT_NE(precond.get(), nullptr);
@@ -224,42 +308,46 @@ TEST_F(Gmres, CanSetPreconditionerInFactory)
 }
 
 
-TEST_F(Gmres, ThrowsOnWrongPreconditionerInFactory)
+TYPED_TEST(Gmres, ThrowsOnWrongPreconditionerInFactory)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> gmres_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto gmres_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_preconditioner(gmres_precond)
-            .on(exec);
+            .on(this->exec);
 
-    ASSERT_THROW(gmres_factory->generate(mtx), gko::DimensionMismatch);
+    ASSERT_THROW(gmres_factory->generate(this->mtx), gko::DimensionMismatch);
 }
 
 
-TEST_F(Gmres, CanSetPreconditioner)
+TYPED_TEST(Gmres, CanSetPreconditioner)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> gmres_precond =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto gmres_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = gmres_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = gmres_factory->generate(this->mtx);
     solver->set_preconditioner(gmres_precond);
     auto precond = solver->get_preconditioner();
 
diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp
index d38024b3806..b711c511e97 100644
--- a/core/test/solver/ir.cpp
+++ b/core/test/solver/ir.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,16 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Ir : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Ir<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Ir<value_type>;
 
     Ir()
         : exec(gko::ReferenceExecutor::create()),
@@ -62,8 +67,8 @@ class Ir : public ::testing::Test {
               Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-6)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           solver(ir_factory->generate(mtx))
@@ -71,7 +76,7 @@ class Ir : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<Solver::Factory> ir_factory;
+    std::unique_ptr<typename Solver::Factory> ir_factory;
     std::unique_ptr<gko::LinOp> solver;
 
     static void assert_same_matrices(const Mtx *m1, const Mtx *m2)
@@ -86,103 +91,131 @@ class Ir : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(Ir, gko::test::ValueTypes);
 
-TEST_F(Ir, IrFactoryKnowsItsExecutor)
+
+TYPED_TEST(Ir, IrFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(ir_factory->get_executor(), exec);
+    ASSERT_EQ(this->ir_factory->get_executor(), this->exec);
 }
 
 
-TEST_F(Ir, IrFactoryCreatesCorrectSolver)
+TYPED_TEST(Ir, IrFactoryCreatesCorrectSolver)
 {
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
-    auto cg_solver = static_cast<Solver *>(solver.get());
+    using Solver = typename TestFixture::Solver;
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
+    auto cg_solver = static_cast<Solver *>(this->solver.get());
     ASSERT_NE(cg_solver->get_system_matrix(), nullptr);
-    ASSERT_EQ(cg_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(cg_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Ir, CanBeCopied)
+TYPED_TEST(Ir, CanBeCopied)
 {
-    auto copy = ir_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->ir_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(solver.get());
+    copy->copy_from(this->solver.get());
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Ir, CanBeMoved)
+TYPED_TEST(Ir, CanBeMoved)
 {
-    auto copy = ir_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->ir_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
     auto copy_mtx = static_cast<Solver *>(copy.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(copy_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Ir, CanBeCloned)
+TYPED_TEST(Ir, CanBeCloned)
 {
-    auto clone = solver->clone();
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto clone = this->solver->clone();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
     auto clone_mtx = static_cast<Solver *>(clone.get())->get_system_matrix();
-    assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()), mtx.get());
+    this->assert_same_matrices(static_cast<const Mtx *>(clone_mtx.get()),
+                               this->mtx.get());
 }
 
 
-TEST_F(Ir, CanBeCleared)
+TYPED_TEST(Ir, CanBeCleared)
 {
-    solver->clear();
+    using Solver = typename TestFixture::Solver;
+    this->solver->clear();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
-    auto solver_mtx = static_cast<Solver *>(solver.get())->get_system_matrix();
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
+    auto solver_mtx =
+        static_cast<Solver *>(this->solver.get())->get_system_matrix();
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
 
-TEST_F(Ir, CanSetInnerSolverInFactory)
+TYPED_TEST(Ir, ApplyUsesInitialGuessReturnsTrue)
+{
+    ASSERT_TRUE(this->solver->apply_uses_initial_guess());
+}
+
+
+TYPED_TEST(Ir, CanSetInnerSolverInFactory)
 {
+    using Solver = typename TestFixture::Solver;
+    using value_type = typename TestFixture::value_type;
     auto ir_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-6)
-                    .on(exec))
-            .with_solver(Solver::build().on(exec))
-            .on(exec);
-    auto solver = ir_factory->generate(mtx);
-    auto inner_solver = dynamic_cast<const gko::solver::Ir<> *>(
-        static_cast<gko::solver::Ir<> *>(solver.get())->get_solver().get());
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(r<value_type>::value)
+                    .on(this->exec))
+            .with_solver(
+                Solver::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(3u).on(
+                            this->exec))
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = ir_factory->generate(this->mtx);
+    auto inner_solver = dynamic_cast<const Solver *>(
+        static_cast<Solver *>(solver.get())->get_solver().get());
 
     ASSERT_NE(inner_solver, nullptr);
     ASSERT_EQ(inner_solver->get_size(), gko::dim<2>(3, 3));
-    ASSERT_EQ(inner_solver->get_system_matrix(), mtx);
+    ASSERT_EQ(inner_solver->get_system_matrix(), this->mtx);
 }
 
 
-TEST_F(Ir, CanSetGeneratedInnerSolverInFactory)
+TYPED_TEST(Ir, CanSetGeneratedInnerSolverInFactory)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto ir_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_solver(ir_solver)
-            .on(exec);
-    auto solver = ir_factory->generate(mtx);
+            .on(this->exec);
+    auto solver = ir_factory->generate(this->mtx);
     auto inner_solver = solver->get_solver();
 
     ASSERT_NE(inner_solver.get(), nullptr);
@@ -190,42 +223,70 @@ TEST_F(Ir, CanSetGeneratedInnerSolverInFactory)
 }
 
 
-TEST_F(Ir, ThrowsOnWrongInnerSolverInFactory)
+TYPED_TEST(Ir, CanSetCriteriaAgain)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<gko::stop::CriterionFactory> init_crit =
+        gko::stop::Iteration::build().with_max_iters(3u).on(this->exec);
+    auto ir_factory = Solver::build().with_criteria(init_crit).on(this->exec);
+
+    ASSERT_EQ((ir_factory->get_parameters().criteria).back(), init_crit);
+
+    auto solver = ir_factory->generate(this->mtx);
+    std::shared_ptr<gko::stop::CriterionFactory> new_crit =
+        gko::stop::Iteration::build().with_max_iters(5u).on(this->exec);
+    // Swap the criterion on the generated solver and read it back.
+    solver->set_stop_criterion_factory(new_crit);
+    auto new_crit_fac = solver->get_stop_criterion_factory();
+    auto niter =
+        static_cast<const gko::stop::Iteration::Factory *>(new_crit_fac.get())
+            ->get_parameters()
+            .max_iters;
+
+    ASSERT_EQ(niter, 5u);
+}
+
+
+TYPED_TEST(Ir, ThrowsOnWrongInnerSolverInFactory)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto ir_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
             .with_generated_solver(ir_solver)
-            .on(exec);
+            .on(this->exec);
 
-    ASSERT_THROW(ir_factory->generate(mtx), gko::DimensionMismatch);
+    ASSERT_THROW(ir_factory->generate(this->mtx), gko::DimensionMismatch);
 }
 
 
-TEST_F(Ir, CanSetInnerSolver)
+TYPED_TEST(Ir, CanSetInnerSolver)
 {
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
-            ->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
 
     auto ir_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = ir_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = ir_factory->generate(this->mtx);
     solver->set_solver(ir_solver);
     auto inner_solver = solver->get_solver();
 
@@ -234,25 +295,67 @@ TEST_F(Ir, CanSetInnerSolver)
 }
 
 
-TEST_F(Ir, ThrowOnWrongInnerSolverSet)
+TYPED_TEST(Ir, ThrowOnWrongInnerSolverSet)
 {
-    std::shared_ptr<Mtx> wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3});
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> wrong_sized_mtx =
+        Mtx::create(this->exec, gko::dim<2>{1, 3});
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec)
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto ir_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
-            .on(exec);
-    auto solver = ir_factory->generate(mtx);
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .on(this->exec);
+    auto solver = ir_factory->generate(this->mtx);
 
     ASSERT_THROW(solver->set_solver(ir_solver), gko::DimensionMismatch);
 }
 
 
+TYPED_TEST(Ir, DefaultRelaxationFactor)
+{
+    using value_type = typename TestFixture::value_type;
+
+    // No relaxation factor is passed here, so the factory default applies.
+    auto richardson =
+        gko::solver::Richardson<value_type>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(r<value_type>::value)
+                    .on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
+
+    ASSERT_EQ(richardson->get_parameters().relaxation_factor, value_type{1});
+}
+
+
+TYPED_TEST(Ir, UseAsRichardson)
+{
+    using value_type = typename TestFixture::value_type;
+    const value_type relaxation_factor{0.5};
+
+    auto richardson =
+        gko::solver::Richardson<value_type>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(r<value_type>::value)
+                    .on(this->exec))
+            .with_relaxation_factor(relaxation_factor)
+            .on(this->exec)
+            ->generate(this->mtx);
+
+    ASSERT_EQ(richardson->get_parameters().relaxation_factor, value_type{0.5});
+}
+
+
 }  // namespace
diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp
index c32afcc6e83..be12f10ef53 100644
--- a/core/test/solver/lower_trs.cpp
+++ b/core/test/solver/lower_trs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,15 +42,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class LowerTrs : public ::testing::Test {
 protected:
-    using Solver = gko::solver::LowerTrs<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Solver = gko::solver::LowerTrs<value_type, index_type>;
 
     LowerTrs()
         : exec(gko::ReferenceExecutor::create()),
@@ -58,13 +63,15 @@ class LowerTrs : public ::testing::Test {
     {}
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<Solver::Factory> lower_trs_factory;
+    std::unique_ptr<typename Solver::Factory> lower_trs_factory;
 };
 
+TYPED_TEST_CASE(LowerTrs, gko::test::ValueIndexTypes);
 
-TEST_F(LowerTrs, LowerTrsFactoryKnowsItsExecutor)
+
+TYPED_TEST(LowerTrs, LowerTrsFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(lower_trs_factory->get_executor(), exec);
+    ASSERT_EQ(this->lower_trs_factory->get_executor(), this->exec);
 }
 
 
diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp
index 2e8b629e186..1ec759fa47d 100644
--- a/core/test/solver/upper_trs.cpp
+++ b/core/test/solver/upper_trs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,15 +42,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class UpperTrs : public ::testing::Test {
 protected:
-    using Solver = gko::solver::UpperTrs<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Solver = gko::solver::UpperTrs<value_type, index_type>;
 
     UpperTrs()
         : exec(gko::ReferenceExecutor::create()),
@@ -58,13 +63,15 @@ class UpperTrs : public ::testing::Test {
     {}
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<Solver::Factory> upper_trs_factory;
+    std::unique_ptr<typename Solver::Factory> upper_trs_factory;
 };
 
+TYPED_TEST_CASE(UpperTrs, gko::test::ValueIndexTypes);
 
-TEST_F(UpperTrs, UpperTrsFactoryKnowsItsExecutor)
+
+TYPED_TEST(UpperTrs, UpperTrsFactoryKnowsItsExecutor)
 {
-    ASSERT_EQ(upper_trs_factory->get_executor(), exec);
+    ASSERT_EQ(this->upper_trs_factory->get_executor(), this->exec);
 }
 
 
diff --git a/core/test/stop/combined.cpp b/core/test/stop/combined.cpp
index 8aff6707cc3..8a443790429 100644
--- a/core/test/stop/combined.cpp
+++ b/core/test/stop/combined.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,12 +33,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/stop/combined.hpp>
 
 
-#include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/time.hpp>
+#include <thread>
+#include <vector>
 
 
 #include <gtest/gtest.h>
-#include <thread>
+
+
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/time.hpp>
 
 
 namespace {
@@ -86,4 +89,86 @@ TEST_F(Combined, CanCreateCriterion)
 }
 
 
+TEST_F(Combined, CanIgnoreNullptr)
+{
+    auto combined = gko::stop::Combined::build()
+                        .with_criteria(gko::stop::Iteration::build()
+                                           .with_max_iters(test_iterations)
+                                           .on(exec_),
+                                       nullptr)
+                        .on(exec_);
+
+    ASSERT_NO_THROW(combined->generate(nullptr, nullptr, nullptr));
+}
+
+
+TEST_F(Combined, CanThrowAllNullptr)
+{
+    auto combined =
+        gko::stop::Combined::build().with_criteria(nullptr, nullptr).on(exec_);
+
+    ASSERT_THROW(combined->generate(nullptr, nullptr, nullptr),
+                 gko::NotSupported);
+}
+
+
+TEST_F(Combined, CanThrowWithoutInput)
+{
+    auto combined = gko::stop::Combined::build().on(exec_);
+
+    ASSERT_THROW(combined->generate(nullptr, nullptr, nullptr),
+                 gko::NotSupported);
+}
+
+
+TEST_F(Combined, FunctionCanThrowWithoutInput)
+{
+    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
+        criterion_vec{};
+
+    ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported);
+}
+
+
+TEST_F(Combined, FunctionCanThrowOnlyOneNullptr)
+{
+    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
+        criterion_vec{nullptr};
+
+    ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported);
+}
+
+
+TEST_F(Combined, FunctionCanThrowAllNullptr)
+{
+    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
+        criterion_vec{nullptr, nullptr};
+
+    ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported);
+}
+
+
+TEST_F(Combined, FunctionCanThrowFirstIsInvalid)
+{
+    auto stop =
+        gko::stop::Iteration::build().with_max_iters(test_iterations).on(exec_);
+    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
+        criterion_vec{nullptr, gko::share(stop)};
+
+    ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported);
+}
+
+
+TEST_F(Combined, FunctionCanIgnoreNullptr)
+{
+    auto stop =
+        gko::stop::Iteration::build().with_max_iters(test_iterations).on(exec_);
+    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
+        criterion_vec{gko::share(stop), nullptr};
+    auto combined = gko::stop::combine(criterion_vec);
+
+    ASSERT_NO_THROW(combined->generate(nullptr, nullptr, nullptr));
+}
+
+
 }  // namespace
diff --git a/core/test/stop/iteration.cpp b/core/test/stop/iteration.cpp
index 2e8733167a7..aedc443eb76 100644
--- a/core/test/stop/iteration.cpp
+++ b/core/test/stop/iteration.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/stop/stopping_status.cpp b/core/test/stop/stopping_status.cpp
index bc42727083b..d9cdebc165e 100644
--- a/core/test/stop/stopping_status.cpp
+++ b/core/test/stop/stopping_status.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/core/test/stop/time.cpp b/core/test/stop/time.cpp
index e45fec32f47..53966fbacad 100644
--- a/core/test/stop/time.cpp
+++ b/core/test/stop/time.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,11 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/stop/time.hpp>
 
 
-#include <gtest/gtest.h>
 #include <chrono>
 #include <thread>
 
 
+#include <gtest/gtest.h>
+
+
 namespace {
 
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index d03ea69fe46..89b135a01f3 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,80 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_TEST_UTILS_HPP_
 
 
+#include <complex>
+#include <initializer_list>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 
 
+namespace gko {
+namespace test {
+
+
+using ValueTypes =
+    ::testing::Types<float, double, std::complex<float>, std::complex<double>>;
+
+
+using ComplexValueTypes =
+    ::testing::Types<std::complex<float>, std::complex<double>>;
+
+
+using IndexTypes = ::testing::Types<gko::int32, gko::int64>;
+
+
+using ValueAndIndexTypes =
+    ::testing::Types<float, double, std::complex<float>, std::complex<double>,
+                     gko::int32, gko::int64, gko::size_type>;
+
+
+using ValueIndexTypes = ::testing::Types<
+    std::tuple<float, gko::int32>, std::tuple<double, gko::int32>,
+    std::tuple<std::complex<float>, gko::int32>,
+    std::tuple<std::complex<double>, gko::int32>, std::tuple<float, gko::int64>,
+    std::tuple<double, gko::int64>, std::tuple<std::complex<float>, gko::int64>,
+    std::tuple<std::complex<double>, gko::int64>>;
+
+
+using RealValueIndexTypes = ::testing::Types<
+    std::tuple<float, gko::int32>, std::tuple<double, gko::int32>,
+    std::tuple<float, gko::int64>, std::tuple<double, gko::int64>>;
+
+
+using ComplexValueIndexTypes =
+    ::testing::Types<std::tuple<std::complex<float>, gko::int32>,
+                     std::tuple<std::complex<double>, gko::int32>,
+                     std::tuple<std::complex<float>, gko::int64>,
+                     std::tuple<std::complex<double>, gko::int64>>;
+
+
+template <typename T>
+struct reduction_factor {
+    static constexpr gko::remove_complex<T> value =
+        std::is_same<gko::remove_complex<T>, float>::value ? 1.0e-7 : 1.0e-14;
+};
+
+
+template <typename T>
+constexpr gko::remove_complex<T> reduction_factor<T>::value;
+
+
+}  // namespace test
+}  // namespace gko
+
+
+template <typename T>
+using r = typename gko::test::reduction_factor<T>;
+
+
+template <typename T>
+using I = std::initializer_list<T>;
+
+
 #endif  // GKO_CORE_TEST_UTILS_HPP_
diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp
index 2ba6811921f..88b38561511 100644
--- a/core/test/utils/assertions.hpp
+++ b/core/test/utils/assertions.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,24 +34,104 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_TEST_UTILS_ASSERTIONS_HPP_
 
 
-#include <gtest/gtest.h>
+#include <cctype>
 #include <cmath>
+#include <complex>
 #include <cstdlib>
+#include <fstream>
 #include <initializer_list>
 #include <string>
 #include <type_traits>
 
 
+#include <gtest/gtest.h>
+
+
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/extended_float.hpp"
+
+
 namespace gko {
 namespace test {
 namespace assertions {
 namespace detail {
 
 
+/**
+ * Helper trait returning the biggest valuetype able to contain values from
+ * both ValueType1 and ValueType2.
+ *
+ * @tparam ValueType1  the first valuetype to compare
+ * @tparam ValueType2  the second valuetype to compare
+ * @tparam T  enable_if placeholder used to select a specialization
+ */
+template <typename ValueType1, typename ValueType2, typename T = void>
+struct biggest_valuetype {
+    /** Fallback type; the enable_if specializations of this trait cover all
+     * type combinations, so this default should never be selected. */
+    using type = std::complex<long double>;
+};
+
+
+/**
+ * Specialization when ValueType1 and ValueType2 are the same type.
+ *
+ * @copydoc biggest_valuetype
+ */
+template <typename ValueType1, typename ValueType2>
+struct biggest_valuetype<ValueType1, ValueType2,
+                         typename std::enable_if<std::is_same<
+                             ValueType1, ValueType2>::value>::type> {
+    /** The type, which is the type shared by both arguments. */
+    using type = ValueType1;
+};
+
+
+/**
+ * Specialization when ValueType1 and ValueType2 are different and neither of
+ * them is complex.
+ *
+ * @copydoc biggest_valuetype
+ */
+template <typename ValueType1, typename ValueType2>
+struct biggest_valuetype<
+    ValueType1, ValueType2,
+    typename std::enable_if<!std::is_same<ValueType1, ValueType2>::value &&
+                            !(gko::is_complex_s<ValueType1>::value ||
+                              gko::is_complex_s<ValueType2>::value)>::type> {
+    /** The type: ValueType1 if its sizeof is greater, otherwise ValueType2. */
+    using type = typename std::conditional<xstd::greater(sizeof(ValueType1),
+                                                         sizeof(ValueType2)),
+                                           ValueType1, ValueType2>::type;
+};
+
+
+/**
+ * Specialization when ValueType1 and ValueType2 are different and at least
+ * one of them is complex.
+ *
+ * @copydoc biggest_valuetype
+ */
+template <typename ValueType1, typename ValueType2>
+class biggest_valuetype<
+    ValueType1, ValueType2,
+    typename std::enable_if<!std::is_same<ValueType1, ValueType2>::value &&
+                            (gko::is_complex_s<ValueType1>::value ||
+                             gko::is_complex_s<ValueType2>::value)>::type> {
+    using real_vt1 = remove_complex<ValueType1>;
+    using real_vt2 = remove_complex<ValueType2>;
+
+public:
+    /** The type: a std::complex over the larger of the two real types. */
+    using type = typename std::conditional<
+        xstd::greater(sizeof(real_vt1), sizeof(real_vt2)),
+        std::complex<real_vt1>, std::complex<real_vt2>>::type;
+};
+
+
 template <typename NonzeroIterator>
 auto get_next_value(NonzeroIterator &it, const NonzeroIterator &end,
                     size_type next_row, size_type next_col) ->
@@ -83,17 +163,23 @@ template <typename Ostream, typename MatrixData1, typename MatrixData2>
 void print_componentwise_error(Ostream &os, const MatrixData1 &first,
                                const MatrixData2 &second)
 {
-    using real_vt = remove_complex<typename MatrixData2::value_type>;
+    using std::abs;
+    using vt = typename detail::biggest_valuetype<
+        typename MatrixData1::value_type,
+        typename MatrixData2::value_type>::type;
+    using real_vt = remove_complex<vt>;
+
     auto first_it = begin(first.nonzeros);
     auto second_it = begin(second.nonzeros);
     for (size_type row = 0; row < first.size[0]; ++row) {
         os << "\t";
         for (size_type col = 0; col < first.size[1]; ++col) {
-            auto r = get_next_value(first_it, end(first.nonzeros), row, col);
-            auto e = get_next_value(second_it, end(second.nonzeros), row, col);
-            auto m =
-                max(static_cast<real_vt>(abs(r)), static_cast<real_vt>(abs(e)));
-            if (m == zero<real_vt>()) {
+            auto r =
+                vt{get_next_value(first_it, end(first.nonzeros), row, col)};
+            auto e =
+                vt{get_next_value(second_it, end(second.nonzeros), row, col)};
+            auto m = std::max(abs(r), abs(e));
+            if (m == zero<vt>()) {
                 os << abs(r - e) << "\t";
             } else {
                 os << abs((r - e) / m) << "\t";
@@ -103,21 +189,37 @@ void print_componentwise_error(Ostream &os, const MatrixData1 &first,
     }
 }
 
+template <typename Ostream, typename Iterator>
+void print_columns(Ostream &os, const Iterator &begin, const Iterator &end)
+{
+    for (auto it = begin; it != end; ++it) {
+        os << '\t' << it->column;
+    }
+    os << '\n';
+}
+
 
 template <typename MatrixData1, typename MatrixData2>
 double get_relative_error(const MatrixData1 &first, const MatrixData2 &second)
 {
-    double diff = 0.0;
-    double first_norm = 0.0;
-    double second_norm = 0.0;
+    using std::abs;
+    using vt = typename detail::biggest_valuetype<
+        typename MatrixData1::value_type,
+        typename MatrixData2::value_type>::type;
+    using real_vt = remove_complex<vt>;
+
+    real_vt diff = 0.0;
+    real_vt first_norm = 0.0;
+    real_vt second_norm = 0.0;
     auto first_it = begin(first.nonzeros);
     auto second_it = begin(second.nonzeros);
     for (size_type row = 0; row < first.size[0]; ++row) {
         for (size_type col = 0; col < first.size[1]; ++col) {
             const auto first_val =
-                get_next_value(first_it, end(first.nonzeros), row, col);
+                vt{get_next_value(first_it, end(first.nonzeros), row, col)};
             const auto second_val =
-                get_next_value(second_it, end(second.nonzeros), row, col);
+                vt{get_next_value(second_it, end(second.nonzeros), row, col)};
+
             diff += squared_norm(first_val - second_val);
             first_norm += squared_norm(first_val);
             second_norm += squared_norm(second_val);
@@ -126,7 +228,7 @@ double get_relative_error(const MatrixData1 &first, const MatrixData2 &second)
     if (first_norm == 0.0 && second_norm == 0.0) {
         first_norm = 1.0;
     }
-    return sqrt(diff / max(first_norm, second_norm));
+    return sqrt(diff / std::max(first_norm, second_norm));
 }
 
 
@@ -155,24 +257,112 @@ ::testing::AssertionResult matrices_near_impl(
              << second_expression << " is " << err << "\n"
              << "\twhich is larger than " << tolerance_expression
              << " (which is " << tolerance << ")\n";
-        fail << first_expression << " is:\n";
-        detail::print_matrix(fail, first);
-        fail << second_expression << " is:\n";
-        detail::print_matrix(fail, second);
-        fail << "component-wise relative error is:\n";
-        detail::print_componentwise_error(fail, first, second);
+        if (num_rows * num_cols <= 1000) {
+            fail << first_expression << " is:\n";
+            detail::print_matrix(fail, first);
+            fail << second_expression << " is:\n";
+            detail::print_matrix(fail, second);
+            fail << "component-wise relative error is:\n";
+            detail::print_componentwise_error(fail, first, second);
+        } else {
+            // build output filenames
+            auto test_case_info =
+                ::testing::UnitTest::GetInstance()->current_test_info();
+            auto testname =
+                test_case_info ? std::string{test_case_info->test_case_name()} +
+                                     "." + test_case_info->name()
+                               : std::string{"null"};
+            auto firstfile = testname + "." + first_expression + ".mtx";
+            auto secondfile = testname + "." + second_expression + ".mtx";
+            auto to_remove = [](char c) {
+                return !std::isalnum(c) && c != '_' && c != '.' && c != '-' &&
+                       c != '<' && c != '>';
+            };
+            // remove all but alphanumerical and _.-<> characters from
+            // expressions
+            firstfile.erase(
+                std::remove_if(firstfile.begin(), firstfile.end(), to_remove),
+                firstfile.end());
+            secondfile.erase(
+                std::remove_if(secondfile.begin(), secondfile.end(), to_remove),
+                secondfile.end());
+            // save matrices
+            std::ofstream first_stream{firstfile};
+            gko::write_raw(first_stream, first, gko::layout_type::coordinate);
+            std::ofstream second_stream{secondfile};
+            gko::write_raw(second_stream, second, gko::layout_type::coordinate);
+            fail << first_expression << " saved as " << firstfile << "\n";
+            fail << second_expression << " saved as " << secondfile << "\n";
+        }
         return fail;
     }
 }
 
 
+template <typename MatrixData1, typename MatrixData2>
+::testing::AssertionResult matrices_equal_sparsity_impl(
+    const std::string &first_expression, const std::string &second_expression,
+    const MatrixData1 &first, const MatrixData2 &second)
+{
+    auto num_rows = first.size[0];
+    auto num_cols = first.size[1];
+    if (num_rows != second.size[0] || num_cols != second.size[1]) {
+        return ::testing::AssertionFailure()
+               << "Expected matrices of equal size\n\t" << first_expression
+               << " is of size [" << num_rows << " x " << num_cols << "]\n\t"
+               << second_expression << " is of size [" << second.size[0]
+               << " x " << second.size[1] << "]";
+    }
+
+    auto fst_it = begin(first.nonzeros);
+    auto snd_it = begin(second.nonzeros);
+    auto fst_end = end(first.nonzeros);
+    auto snd_end = end(second.nonzeros);
+    using nz_type_f = typename std::decay<decltype(*fst_it)>::type;
+    using nz_type_s = typename std::decay<decltype(*snd_it)>::type;
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto cmp_l_f = [](nz_type_f nz, size_type row) { return nz.row < row; };
+        auto cmp_u_f = [](size_type row, nz_type_f nz) { return row < nz.row; };
+        auto cmp_l_s = [](nz_type_s nz, size_type row) { return nz.row < row; };
+        auto cmp_u_s = [](size_type row, nz_type_s nz) { return row < nz.row; };
+        auto col_eq = [](nz_type_f a, nz_type_s b) {
+            return a.column == b.column;
+        };
+        auto fst_row_begin = std::lower_bound(fst_it, fst_end, row, cmp_l_f);
+        auto snd_row_begin = std::lower_bound(snd_it, snd_end, row, cmp_l_s);
+        auto fst_row_end =
+            std::upper_bound(fst_row_begin, fst_end, row, cmp_u_f);
+        auto snd_row_end =
+            std::upper_bound(snd_row_begin, snd_end, row, cmp_u_s);
+        if (std::distance(fst_row_begin, fst_row_end) !=
+                std::distance(snd_row_begin, snd_row_end) ||
+            !std::equal(fst_row_begin, fst_row_end, snd_row_begin, col_eq)) {
+            auto fail = ::testing::AssertionFailure();
+            fail << "Sparsity pattern differs between " << first_expression
+                 << " and " << second_expression << "\nIn row " << row << " "
+                 << first_expression << " has " << (fst_row_end - fst_row_begin)
+                 << " columns:\n";
+            detail::print_columns(fail, fst_row_begin, fst_row_end);
+            fail << "and " << second_expression << " has "
+                 << (snd_row_end - snd_row_begin) << " columns:\n";
+            detail::print_columns(fail, snd_row_begin, snd_row_end);
+            return fail;
+        }
+        fst_it = fst_row_end;
+        snd_it = snd_row_end;
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+
 template <typename ValueType>
 ::testing::AssertionResult array_equal_impl(
     const std::string &first_expression, const std::string &second_expression,
-    const Array<ValueType> *first, const Array<ValueType> *second)
+    const Array<ValueType> &first, const Array<ValueType> &second)
 {
-    const auto num_elems1 = first->get_num_elems();
-    const auto num_elems2 = second->get_num_elems();
+    const auto num_elems1 = first.get_num_elems();
+    const auto num_elems2 = second.get_num_elems();
     if (num_elems1 != num_elems2) {
         auto fail = ::testing::AssertionFailure();
         fail << "Array " << first_expression << " contains " << num_elems1
@@ -181,10 +371,10 @@ ::testing::AssertionResult array_equal_impl(
         return fail;
     }
 
-    auto exec = first->get_executor()->get_master();
-    Array<ValueType> first_array(exec, *first);
-    Array<ValueType> second_array(exec, *second);
-    for (decltype(first->get_num_elems()) i = 0; i < num_elems1; ++i) {
+    auto exec = first.get_executor()->get_master();
+    Array<ValueType> first_array(exec, first);
+    Array<ValueType> second_array(exec, second);
+    for (decltype(first.get_num_elems()) i = 0; i < num_elems1; ++i) {
         if (!(first_array.get_const_data()[i] ==
               second_array.get_const_data()[i])) {
             auto fail = ::testing::AssertionFailure();
@@ -246,6 +436,85 @@ std::string remove_list_wrapper(const std::string &expression)
 }  // namespace detail
 
 
+/**
+ * This is a gtest predicate which checks if two values are near each other.
+ *
+ * This function should not be called directly, but used in conjunction with
+ * `ASSERT_PRED_FORMAT3` as follows:
+ *
+ * ```
+ * // Check if first and second are near
+ * ASSERT_PRED_FORMAT3(gko::test::assertions::values_near,
+ *                     first, second, tolerance);
+ * // Check if first and second are far
+ * ASSERT_PRED_FORMAT3(!gko::test::assertions::values_near,
+ *                     first, second, tolerance);
+ * ```
+ *
+ * @see GKO_ASSERT_NEAR
+ * @see GKO_EXPECT_NEAR
+ */
+template <typename T, typename U>
+::testing::AssertionResult values_near(const std::string &first_expression,
+                                       const std::string &second_expression,
+                                       const std::string &tolerance_expression,
+                                       T val1, U val2, double abs_error)
+{
+    static_assert(std::is_same<T, U>(),
+                  "The types of the operands should be the same.");
+    const double diff = abs(val1 - val2);
+    if (diff <= abs_error) return ::testing::AssertionSuccess();
+
+    return ::testing::AssertionFailure()
+           << "The difference between " << first_expression << " and "
+           << second_expression << " is " << diff << ", which exceeds "
+           << tolerance_expression << ", where\n"
+           << first_expression << " evaluates to " << val1 << ",\n"
+           << second_expression << " evaluates to " << val2 << ", and\n"
+           << tolerance_expression << " evaluates to " << abs_error << ".";
+}
+
+
+template <>
+::testing::AssertionResult values_near<gko::half, gko::half>(
+    const std::string &first_expression, const std::string &second_expression,
+    const std::string &tolerance_expression, gko::half val1, gko::half val2,
+    double abs_error)
+{
+    using T = float32;
+    const double diff = abs(T{val1} - T{val2});
+    if (diff <= abs_error) return ::testing::AssertionSuccess();
+
+    return ::testing::AssertionFailure()
+           << "The difference between " << first_expression << " and "
+           << second_expression << " is " << diff << ", which exceeds "
+           << tolerance_expression << ", where\n"
+           << first_expression << " evaluates to " << T{val1} << ",\n"
+           << second_expression << " evaluates to " << T{val2} << ", and\n"
+           << tolerance_expression << " evaluates to " << abs_error << ".";
+}
+
+
+template <>
+::testing::AssertionResult values_near<std::complex<half>, std::complex<half>>(
+    const std::string &first_expression, const std::string &second_expression,
+    const std::string &tolerance_expression, std::complex<half> val1,
+    std::complex<half> val2, double abs_error)
+{
+    using T = std::complex<float32>;
+    const double diff = abs(T{val1} - T{val2});
+    if (diff <= abs_error) return ::testing::AssertionSuccess();
+
+    return ::testing::AssertionFailure()
+           << "The difference between " << first_expression << " and "
+           << second_expression << " is " << diff << ", which exceeds "
+           << tolerance_expression << ", where\n"
+           << first_expression << " evaluates to " << T{val1} << ",\n"
+           << second_expression << " evaluates to " << T{val2} << ", and\n"
+           << tolerance_expression << " evaluates to " << abs_error << ".";
+}
+
+
 /**
  * This is a gtest predicate which checks if two matrices are relatively near.
  *
@@ -327,12 +596,11 @@ ::testing::AssertionResult matrices_near(
 template <typename ValueType>
 ::testing::AssertionResult array_equal(const std::string &first_expression,
                                        const std::string &second_expression,
-                                       const Array<ValueType> *first,
-                                       const Array<ValueType> *second)
+                                       const Array<ValueType> &first,
+                                       const Array<ValueType> &second)
 {
-    return detail::array_equal_impl(
-        detail::remove_pointer_wrapper(first_expression),
-        detail::remove_pointer_wrapper(second_expression), first, second);
+    return detail::array_equal_impl(first_expression, second_expression, first,
+                                    second);
 }
 
 
@@ -361,6 +629,52 @@ ::testing::AssertionResult str_contains(const std::string &first_expression,
 }
 
 
+/**
+ * This is a gtest predicate which checks if two matrices have the same sparsity
+ * pattern.
+ *
+ * This means that mtx1 and mtx2 have exactly the same non-zero locations
+ * (including zero values!)
+ *
+ * This function should not be called directly, but used in conjunction with
+ * `ASSERT_PRED_FORMAT2` as follows:
+ *
+ * ```
+ * // Check if first and second have the same sparsity pattern
+ * ASSERT_PRED_FORMAT2(gko::test::assertions::matrices_equal_sparsity,
+ *                     first, second);
+ * // Check if first and second have different sparsity patterns
+ * ASSERT_PRED_FORMAT2(!gko::test::assertions::matrices_equal_sparsity,
+ *                     first, second);
+ * ```
+ *
+ * @see GKO_ASSERT_MTX_EQ_SPARSITY
+ * @see GKO_EXPECT_MTX_EQ_SPARSITY
+ */
+template <typename LinOp1, typename LinOp2>
+::testing::AssertionResult matrices_equal_sparsity(
+    const std::string &first_expression, const std::string &second_expression,
+    const LinOp1 *first, const LinOp2 *second)
+{
+    auto exec = first->get_executor()->get_master();
+    matrix_data<typename LinOp1::value_type, typename LinOp1::index_type>
+        first_data;
+    matrix_data<typename LinOp2::value_type, typename LinOp2::index_type>
+        second_data;
+
+    first->write(first_data);
+    second->write(second_data);
+
+    first_data.ensure_row_major_order();
+    second_data.ensure_row_major_order();
+
+    return detail::matrices_equal_sparsity_impl(
+        detail::remove_pointer_wrapper(first_expression),
+        detail::remove_pointer_wrapper(second_expression), first_data,
+        second_data);
+}
+
+
 namespace detail {
 
 
@@ -383,7 +697,6 @@ T &&l(T &&matrix)
     return std::forward<T>(matrix);
 }
 
-
 template <typename T>
 T *plain_ptr(const std::shared_ptr<T> &ptr)
 {
@@ -409,6 +722,33 @@ T plain_ptr(T ptr)
 }  // namespace gko
 
 
+/**
+ * Checks if two values are near each other.
+ *
+ * Has to be called from within a google test unit test.
+ * Internally calls gko::test::assertions::values_near().
+ *
+ * @param _val1  first value
+ * @param _val2  second value
+ * @param _tol  tolerance level
+ */
+#define GKO_ASSERT_NEAR(_val1, _val2, _tol)                              \
+    {                                                                    \
+        ASSERT_PRED_FORMAT3(::gko::test::assertions::values_near, _val1, \
+                            _val2, _tol);                                \
+    }
+
+
+/**
+ * @copydoc GKO_ASSERT_NEAR
+ */
+#define GKO_EXPECT_NEAR(_val1, _val2, _tol)                              \
+    {                                                                    \
+        EXPECT_PRED_FORMAT3(::gko::test::assertions::values_near, _val1, \
+                            _val2, _tol);                                \
+    }
+
+
 /**
  * Checks if two matrices are near each other.
  *
@@ -445,6 +785,38 @@ T plain_ptr(T ptr)
                             plain_ptr(_mtx1), plain_ptr(_mtx2), _tol); \
     }
 
+/**
+ * Checks if two matrices have the same sparsity pattern.
+ *
+ * This means that mtx1 and mtx2 have exactly the same non-zero locations
+ * (including zero values!)
+ *
+ * Has to be called from within a google test unit test.
+ * Internally calls gko::test::assertions::matrices_equal_sparsity().
+ *
+ * @param _mtx1  first matrix
+ * @param _mtx2  second matrix
+ */
+#define GKO_ASSERT_MTX_EQ_SPARSITY(_mtx1, _mtx2)                              \
+    {                                                                         \
+        using ::gko::test::assertions::detail::l;                             \
+        using ::gko::test::assertions::detail::plain_ptr;                     \
+        ASSERT_PRED_FORMAT2(::gko::test::assertions::matrices_equal_sparsity, \
+                            plain_ptr(_mtx1), plain_ptr(_mtx2));              \
+    }
+
+
+/**
+ * @copydoc GKO_ASSERT_MTX_EQ_SPARSITY
+ */
+#define GKO_EXPECT_MTX_EQ_SPARSITY(_mtx1, _mtx2)                              \
+    {                                                                         \
+        using ::gko::test::assertions::detail::l;                             \
+        using ::gko::test::assertions::detail::plain_ptr;                     \
+        EXPECT_PRED_FORMAT2(::gko::test::assertions::matrices_equal_sparsity, \
+                            plain_ptr(_mtx1), plain_ptr(_mtx2));              \
+    }
+
 
 /**
  * Checks if two `gko::Array`s are equal.
@@ -458,11 +830,10 @@ T plain_ptr(T ptr)
  * @param _array1  first array
  * @param _array2  second array
  **/
-#define GKO_ASSERT_ARRAY_EQ(_array1, _array2)                        \
-    {                                                                \
-        using ::gko::test::assertions::detail::plain_ptr;            \
-        EXPECT_PRED_FORMAT2(::gko::test::assertions::array_equal,    \
-                            plain_ptr(_array1), plain_ptr(_array2)); \
+#define GKO_ASSERT_ARRAY_EQ(_array1, _array2)                              \
+    {                                                                      \
+        EXPECT_PRED_FORMAT2(::gko::test::assertions::array_equal, _array1, \
+                            _array2);                                      \
     }
 
 
diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp
index 444b9932c1f..89d15ca585a 100644
--- a/core/test/utils/assertions_test.cpp
+++ b/core/test/utils/assertions_test.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,12 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <core/test/utils/assertions.hpp>
+#include "core/test/utils/assertions.hpp"
+
+
+#include <type_traits>
 
 
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
@@ -45,20 +49,52 @@ namespace {
 class MatricesNear : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Dense<>;
+    using Sparse = gko::matrix::Csr<>;
+
+    template <typename Type, std::size_t size>
+    gko::Array<Type> make_view(std::array<Type, size> &array)
+    {
+        return gko::Array<Type>::view(exec, size, array.data());
+    }
+
     MatricesNear()
         : exec(gko::ReferenceExecutor::create()),
-          mtx1(gko::initialize<gko::matrix::Dense<>>(
-              {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, exec)),
-          mtx2(gko::initialize<gko::matrix::Dense<>>(
-              {{1.0, 2.0, 3.0}, {4.0, 0.0, 4.0}}, exec)),
-          mtx3(gko::initialize<gko::matrix::Dense<>>(
-              {{1.0, 2.0, 3.0}, {0.0, 4.1, 0.0}}, exec))
-    {}
+          mtx1(gko::initialize<Mtx>({{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, exec)),
+          mtx2(gko::initialize<Mtx>({{1.0, 2.0, 3.0}, {4.0, 0.0, 4.0}}, exec)),
+          mtx3(gko::initialize<Mtx>({{1.0, 2.0, 3.0}, {0.0, 4.1, 0.0}}, exec)),
+          mtx13_row_ptrs({0, 3, 4}),
+          mtx2_row_ptrs({0, 3, 5}),
+          mtx13_col_idxs({0, 1, 2, 1}),
+          mtx2_col_idxs({0, 1, 2, 0, 2}),
+          mtx1_vals({1.0, 2.0, 3.0, 4.0}),
+          mtx2_vals({1.0, 2.0, 3.0, 4.0, 4.0}),
+          mtx3_vals({1.0, 2.0, 3.0, 4.1})
+    {
+        mtx1_sp = Sparse::create(exec, mtx1->get_size(), make_view(mtx1_vals),
+                                 make_view(mtx13_col_idxs),
+                                 make_view(mtx13_row_ptrs));
+        mtx2_sp =
+            Sparse::create(exec, mtx2->get_size(), make_view(mtx2_vals),
+                           make_view(mtx2_col_idxs), make_view(mtx2_row_ptrs));
+        mtx3_sp = Sparse::create(exec, mtx3->get_size(), make_view(mtx3_vals),
+                                 make_view(mtx13_col_idxs),
+                                 make_view(mtx13_row_ptrs));
+    }
 
     std::shared_ptr<const gko::Executor> exec;
     std::unique_ptr<Mtx> mtx1;
     std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<Mtx> mtx3;
+    std::array<Sparse::index_type, 3> mtx13_row_ptrs;
+    std::array<Sparse::index_type, 3> mtx2_row_ptrs;
+    std::array<Sparse::index_type, 4> mtx13_col_idxs;
+    std::array<Sparse::index_type, 5> mtx2_col_idxs;
+    std::array<Sparse::value_type, 4> mtx1_vals;
+    std::array<Sparse::value_type, 5> mtx2_vals;
+    std::array<Sparse::value_type, 4> mtx3_vals;
+    std::unique_ptr<Sparse> mtx1_sp;
+    std::unique_ptr<Sparse> mtx2_sp;
+    std::unique_ptr<Sparse> mtx3_sp;
 };
 
 
@@ -66,6 +102,8 @@ TEST_F(MatricesNear, SuceedsIfSame)
 {
     ASSERT_PRED_FORMAT3(gko::test::assertions::matrices_near, mtx1.get(),
                         mtx1.get(), 0.0);
+    ASSERT_PRED_FORMAT2(gko::test::assertions::matrices_equal_sparsity,
+                        mtx1_sp.get(), mtx1_sp.get());
 }
 
 
@@ -73,6 +111,8 @@ TEST_F(MatricesNear, FailsIfDifferent)
 {
     ASSERT_PRED_FORMAT3(!gko::test::assertions::matrices_near, mtx1.get(),
                         mtx2.get(), 0.0);
+    ASSERT_PRED_FORMAT2(!gko::test::assertions::matrices_equal_sparsity,
+                        mtx1_sp.get(), mtx2_sp.get());
 }
 
 
@@ -82,6 +122,8 @@ TEST_F(MatricesNear, SucceedsIfClose)
                         mtx3.get(), 0.0);
     ASSERT_PRED_FORMAT3(gko::test::assertions::matrices_near, mtx1.get(),
                         mtx3.get(), 0.1);
+    ASSERT_PRED_FORMAT2(gko::test::assertions::matrices_equal_sparsity,
+                        mtx1_sp.get(), mtx3_sp.get());
 }
 
 
@@ -89,6 +131,8 @@ TEST_F(MatricesNear, CanUseShortNotation)
 {
     GKO_EXPECT_MTX_NEAR(mtx1, mtx1, 0.0);
     GKO_ASSERT_MTX_NEAR(mtx1, mtx3, 0.1);
+    GKO_EXPECT_MTX_EQ_SPARSITY(mtx1_sp, mtx3_sp);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx1_sp, mtx3_sp);
 }
 
 
@@ -99,4 +143,64 @@ TEST_F(MatricesNear, CanPassInitializerList)
 }
 
 
+TEST(BiggestValueType, SameNonComplex)
+{
+    using T1 = float;
+    using T2 = float;
+    using result =
+        gko::test::assertions::detail::biggest_valuetype<T1, T2>::type;
+
+    bool is_float = std::is_same<result, float>::value;
+    ASSERT_TRUE(is_float);
+}
+
+
+TEST(BiggestValueType, BetweenNonComplex)
+{
+    using T1 = float;
+    using T2 = double;
+    using result =
+        gko::test::assertions::detail::biggest_valuetype<T1, T2>::type;
+
+    bool is_double = std::is_same<result, double>::value;
+    ASSERT_TRUE(is_double);
+}
+
+
+TEST(BiggestValueType, WithSameComplex)
+{
+    using T1 = std::complex<float>;
+    using T2 = std::complex<float>;
+    using result =
+        gko::test::assertions::detail::biggest_valuetype<T1, T2>::type;
+
+    bool is_cpx_float = std::is_same<result, std::complex<float>>::value;
+    ASSERT_TRUE(is_cpx_float);
+}
+
+
+TEST(BiggestValueType, WithAComplex)
+{
+    using T1 = std::complex<float>;
+    using T2 = double;
+    using result =
+        gko::test::assertions::detail::biggest_valuetype<T1, T2>::type;
+
+    bool is_cpx_double = std::is_same<result, std::complex<double>>::value;
+    ASSERT_TRUE(is_cpx_double);
+}
+
+
+TEST(BiggestValueType, WithBothComplex)
+{
+    using T1 = std::complex<float>;
+    using T2 = std::complex<double>;
+    using result =
+        gko::test::assertions::detail::biggest_valuetype<T1, T2>::type;
+
+    bool is_cpx_double = std::is_same<result, std::complex<double>>::value;
+    ASSERT_TRUE(is_cpx_double);
+}
+
+
 }  // namespace
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index fa994bf3f4e..171e4b2dd69 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -186,7 +186,7 @@ std::unique_ptr<MatrixType> generate_random_sparsity_matrix(
 
 
 /**
- * Generates a random lower triangular matrix.
+ * Generates a random triangular matrix.
  *
  * @tparam MatrixType  type of matrix to generate (matrix::Dense must implement
  *                     the interface `ConvertibleTo<MatrixType>`)
@@ -197,6 +197,10 @@ std::unique_ptr<MatrixType> generate_random_sparsity_matrix(
  *
  * @param num_rows  number of rows
  * @param num_cols  number of columns
+ * @param ones_on_diagonal  `true` generates only ones on the diagonal,
+ *                          `false` generates random values on the diagonal
+ * @param lower_triangular  `true` generates a lower triangular matrix,
+ *                          `false` an upper triangular matrix
  * @param nonzero_dist  distribution of nonzeros per row
  * @param value_dist  distribution of matrix values
  * @param engine  a random engine
@@ -205,11 +209,11 @@ std::unique_ptr<MatrixType> generate_random_sparsity_matrix(
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
-std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
+std::unique_ptr<MatrixType> generate_random_triangular_matrix(
     size_type num_rows, size_type num_cols, bool ones_on_diagonal,
-    NonzeroDistribution &&nonzero_dist, ValueDistribution &&value_dist,
-    Engine &&engine, std::shared_ptr<const Executor> exec,
-    MatrixArgs &&... args)
+    bool lower_triangular, NonzeroDistribution &&nonzero_dist,
+    ValueDistribution &&value_dist, Engine &&engine,
+    std::shared_ptr<const Executor> exec, MatrixArgs &&... args)
 {
     using value_type = typename MatrixType::value_type;
     using index_type = typename MatrixType::index_type;
@@ -229,19 +233,34 @@ std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
         // select a subset of `nnz_in_row` column indexes, and fill these
         // locations with random values
         std::shuffle(begin(col_idx), end(col_idx), engine);
-        std::for_each(begin(col_idx), begin(col_idx) + nnz_in_row,
-                      [&](size_type col) {
-                          if (col <= row) {
-                              if (ones_on_diagonal && col == row) {
-                                  data.nonzeros.emplace_back(row, col, one);
-                              } else {
-                                  data.nonzeros.emplace_back(
-                                      row, col,
-                                      detail::get_rand_value<value_type>(
-                                          value_dist, engine));
-                              }
-                          }
-                      });
+        // add non-zeros
+        bool has_diagonal{};
+        for (size_type nz = 0; nz < nnz_in_row; ++nz) {
+            auto col = col_idx[nz];
+            // skip non-zeros outside triangle
+            if ((col > row && lower_triangular) ||
+                (col < row && !lower_triangular)) {
+                continue;
+            }
+
+            // generate and store non-zero
+            auto val = detail::get_rand_value<value_type>(value_dist, engine);
+            if (col == row) {
+                has_diagonal = true;
+                if (ones_on_diagonal) {
+                    val = one;
+                }
+            }
+            data.nonzeros.emplace_back(row, col, val);
+        }
+
+        // add diagonal if it hasn't been added yet
+        if (!has_diagonal) {
+            auto val = ones_on_diagonal ? one
+                                        : detail::get_rand_value<value_type>(
+                                              value_dist, engine);
+            data.nonzeros.emplace_back(row, row, val);
+        }
     }
 
     data.ensure_row_major_order();
@@ -254,7 +273,7 @@ std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
 
 
 /**
- * Generates a random upper triangular matrix.
+ * Generates a random lower triangular matrix.
  *
  * @tparam MatrixType  type of matrix to generate (matrix::Dense must implement
  *                     the interface `ConvertibleTo<MatrixType>`)
@@ -265,6 +284,8 @@ std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
  *
  * @param num_rows  number of rows
  * @param num_cols  number of columns
+ * @param ones_on_diagonal  `true` generates only ones on the diagonal,
+ *                          `false` generates random values on the diagonal
  * @param nonzero_dist  distribution of nonzeros per row
  * @param value_dist  distribution of matrix values
  * @param engine  a random engine
@@ -273,51 +294,49 @@ std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
-std::unique_ptr<MatrixType> generate_random_upper_triangular_matrix(
+std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
     size_type num_rows, size_type num_cols, bool ones_on_diagonal,
     NonzeroDistribution &&nonzero_dist, ValueDistribution &&value_dist,
     Engine &&engine, std::shared_ptr<const Executor> exec,
     MatrixArgs &&... args)
 {
-    using value_type = typename MatrixType::value_type;
-    using index_type = typename MatrixType::index_type;
-    using std::begin;
-    using std::end;
-
-    matrix_data<value_type, index_type> data{gko::dim<2>{num_rows, num_cols},
-                                             {}};
-    value_type one = 1.0;
-    std::vector<size_type> col_idx(num_cols);
-    std::iota(begin(col_idx), end(col_idx), size_type(0));
-
-    for (size_type row = 0; row < num_rows; ++row) {
-        // randomly generate number of nonzeros in this row
-        auto nnz_in_row = static_cast<size_type>(nonzero_dist(engine));
-        nnz_in_row = std::max(size_type(0), std::min(nnz_in_row, num_cols));
-        // select a subset of `nnz_in_row` column indexes, and fill these
-        // locations with random values
-        std::shuffle(begin(col_idx), end(col_idx), engine);
-        std::for_each(begin(col_idx), begin(col_idx) + nnz_in_row,
-                      [&](size_type col) {
-                          if (col >= row) {
-                              if (ones_on_diagonal && col == row) {
-                                  data.nonzeros.emplace_back(row, col, one);
-                              } else {
-                                  data.nonzeros.emplace_back(
-                                      row, col,
-                                      detail::get_rand_value<value_type>(
-                                          value_dist, engine));
-                              }
-                          }
-                      });
-    }
+    return generate_random_triangular_matrix<MatrixType>(
+        num_rows, num_cols, ones_on_diagonal, true, nonzero_dist, value_dist,
+        engine, std::move(exec), std::forward<MatrixArgs>(args)...);
+}
 
-    data.ensure_row_major_order();
 
-    // convert to the correct matrix type
-    auto result = MatrixType::create(exec, std::forward<MatrixArgs>(args)...);
-    result->read(data);
-    return result;
+/**
+ * Generates a random upper triangular matrix.
+ *
+ * @tparam MatrixType  type of matrix to generate (matrix::Dense must implement
+ *                     the interface `ConvertibleTo<MatrixType>`)
+ * @tparam NonzeroDistribution  type of nonzero distribution
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
+ * @tparam MatrixArgs  the arguments from the matrix to be forwarded.
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param ones_on_diagonal  `true` generates only ones on the diagonal,
+ *                          `false` generates random values on the diagonal
+ * @param nonzero_dist  distribution of nonzeros per row
+ * @param value_dist  distribution of matrix values
+ * @param engine  a random engine
+ * @param exec  executor where the matrix should be allocated
+ * @param args  additional arguments for the matrix constructor
+ */
+template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
+          typename ValueDistribution, typename Engine, typename... MatrixArgs>
+std::unique_ptr<MatrixType> generate_random_upper_triangular_matrix(
+    size_type num_rows, size_type num_cols, bool ones_on_diagonal,
+    NonzeroDistribution &&nonzero_dist, ValueDistribution &&value_dist,
+    Engine &&engine, std::shared_ptr<const Executor> exec,
+    MatrixArgs &&... args)
+{
+    return generate_random_triangular_matrix<MatrixType>(
+        num_rows, num_cols, ones_on_diagonal, false, nonzero_dist, value_dist,
+        engine, std::move(exec), std::forward<MatrixArgs>(args)...);
 }
 
 
diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp
index 21710886bac..8a585994dc0 100644
--- a/core/test/utils/matrix_generator_test.cpp
+++ b/core/test/utils/matrix_generator_test.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,16 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <core/test/utils/matrix_generator.hpp>
-
-
-#include <gtest/gtest.h>
+#include "core/test/utils/matrix_generator.hpp"
 
 
 #include <cmath>
 #include <random>
 
 
+#include <gtest/gtest.h>
+
+
 namespace {
 
 
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index af9ba8efddd..381e454fcf8 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -9,9 +9,11 @@ endif()
 if(MSVC)
     # MSVC can not find CUDA automatically
     # Use CUDA_COMPILER PATH to define the CUDA TOOLKIT ROOT DIR
+    string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER})
     if("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" STREQUAL "")
-        string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER})
         set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/include")
+    endif()
+    if("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" STREQUAL "")
         set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/lib/x64")
     endif()
 
@@ -59,36 +61,50 @@ find_library(CUSPARSE cusparse
 add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 target_sources(ginkgo_cuda
     PRIVATE
-        base/exception.cpp
-        base/executor.cpp
-        base/version.cpp
-        components/zero_array.cu
-        factorization/par_ilu_kernels.cu
-        matrix/coo_kernels.cu
-        matrix/csr_kernels.cu
-        matrix/dense_kernels.cu
-        matrix/ell_kernels.cu
-        matrix/hybrid_kernels.cu
-        matrix/sellp_kernels.cu
-        matrix/sparsity_csr_kernels.cu
-        preconditioner/jacobi_advanced_apply_kernel.cu
-        preconditioner/jacobi_generate_kernel.cu
-        preconditioner/jacobi_kernels.cu
-        preconditioner/jacobi_simple_apply_kernel.cu
-        solver/bicgstab_kernels.cu
-        solver/cg_kernels.cu
-        solver/cgs_kernels.cu
-        solver/fcg_kernels.cu
-        solver/gmres_kernels.cu
-        solver/ir_kernels.cu
-        solver/lower_trs_kernels.cu
-        solver/upper_trs_kernels.cu
-        stop/criterion_kernels.cu
-        stop/residual_norm_reduction_kernels.cu)
+    base/exception.cpp
+    base/executor.cpp
+    base/version.cpp
+    components/fill_array.cu
+    components/precision_conversion.cu
+    components/prefix_sum.cu
+    factorization/ilu_kernels.cu
+    factorization/factorization_kernels.cu
+    factorization/par_ict_kernels.cu
+    factorization/par_ilu_kernels.cu
+    factorization/par_ilut_approx_filter_kernel.cu
+    factorization/par_ilut_filter_kernel.cu
+    factorization/par_ilut_select_kernel.cu
+    factorization/par_ilut_select_common.cu
+    factorization/par_ilut_spgeam_kernel.cu
+    factorization/par_ilut_sweep_kernel.cu
+    matrix/coo_kernels.cu
+    matrix/csr_kernels.cu
+    matrix/dense_kernels.cu
+    matrix/ell_kernels.cu
+    matrix/hybrid_kernels.cu
+    matrix/sellp_kernels.cu
+    matrix/sparsity_csr_kernels.cu
+    preconditioner/isai_kernels.cu
+    preconditioner/jacobi_advanced_apply_kernel.cu
+    preconditioner/jacobi_generate_kernel.cu
+    preconditioner/jacobi_kernels.cu
+    preconditioner/jacobi_simple_apply_kernel.cu
+    solver/bicg_kernels.cu
+    solver/bicgstab_kernels.cu
+    solver/cg_kernels.cu
+    solver/cgs_kernels.cu
+    solver/fcg_kernels.cu
+    solver/gmres_kernels.cu
+    solver/ir_kernels.cu
+    solver/lower_trs_kernels.cu
+    solver/upper_trs_kernels.cu
+    stop/criterion_kernels.cu
+    stop/residual_norm_kernels.cu)
 
 # This creates a compilation bug on nvcc 9.0.102 *with* the new array_deleter
-# merged at commit ed12b3df5d26
-if(NOT CMAKE_CUDA_COMPILER_VERSION MATCHES "9.0")
+# merged at commit ed12b3df5d26, and the parameter is not recognized by clang-cuda
+if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
+   NOT CMAKE_CUDA_COMPILER_VERSION MATCHES "9.0")
     # remove false positive CUDA warnings when calling one<T>() and zero<T>()
     target_compile_options(ginkgo_cuda
         PRIVATE
@@ -107,6 +123,9 @@ target_include_directories(ginkgo_cuda
     SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS})
 target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE})
 
+# Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method
+target_link_libraries(ginkgo_cuda PUBLIC ginkgo_hip)
+
 cas_target_cuda_architectures(ginkgo_cuda
     ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}
     UNSUPPORTED "20" "21")
@@ -114,6 +133,10 @@ cas_target_cuda_architectures(ginkgo_cuda
 ginkgo_default_includes(ginkgo_cuda)
 ginkgo_install_library(ginkgo_cuda cuda)
 
+if (GINKGO_CHECK_CIRCULAR_DEPS)
+    ginkgo_check_headers(ginkgo_cuda)
+endif()
+
 if(GINKGO_BUILD_TESTS)
     add_subdirectory(test)
 endif()
diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp
new file mode 100644
index 00000000000..cd69b6a2c56
--- /dev/null
+++ b/cuda/base/config.hpp
@@ -0,0 +1,82 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_BASE_CONFIG_HPP_
+#define GKO_CUDA_BASE_CONFIG_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "cuda/base/math.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+struct config {
+    /**
+     * The type containing a bitmask over all lanes of a warp.
+     */
+    using lane_mask_type = uint32;
+
+    /**
+     * The number of threads within a CUDA warp.
+     */
+    static constexpr uint32 warp_size = 32;
+
+    /**
+     * The bitmask of the entire warp.
+     */
+    static constexpr auto full_lane_mask = ~zero<lane_mask_type>();
+
+    /**
+     * The maximal number of threads allowed in a CUDA block.
+     */
+    static constexpr uint32 max_block_size = 1024;
+
+    /**
+     * The minimal number of warps that need to be scheduled for each block
+     * to maximize GPU occupancy.
+     */
+    static constexpr uint32 min_warps_per_block = 4;
+};
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_BASE_CONFIG_HPP_
diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
index 9dd4d67fa07..72a67d958e9 100644
--- a/cuda/base/cublas_bindings.hpp
+++ b/cuda/base/cublas_bindings.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,7 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/zero_array.hpp"
 
 
 namespace gko {
@@ -215,24 +214,9 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
 #undef GKO_BIND_CUBLAS_DOT
 
 
-#define GKO_BIND_CUBLAS_COMPLEX_NORM2(ValueType, CublasName)                 \
-    inline void norm2(cublasHandle_t handle, int n, const ValueType *x,      \
-                      int incx, ValueType *result)                           \
-    {                                                                        \
-        zero_array(n, result);                                               \
-        GKO_ASSERT_NO_CUBLAS_ERRORS(                                         \
-            CublasName(handle, n, as_culibs_type(x), incx,                   \
-                       reinterpret_cast<remove_complex<ValueType> *>(        \
-                           as_culibs_type(result))));                        \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-
 #define GKO_BIND_CUBLAS_NORM2(ValueType, CublasName)                           \
     inline void norm2(cublasHandle_t handle, int n, const ValueType *x,        \
-                      int incx, ValueType *result)                             \
+                      int incx, remove_complex<ValueType> *result)             \
     {                                                                          \
         GKO_ASSERT_NO_CUBLAS_ERRORS(CublasName(handle, n, as_culibs_type(x),   \
                                                incx, as_culibs_type(result))); \
@@ -244,8 +228,8 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
 
 GKO_BIND_CUBLAS_NORM2(float, cublasSnrm2);
 GKO_BIND_CUBLAS_NORM2(double, cublasDnrm2);
-GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<float>, cublasScnrm2);
-GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<double>, cublasDznrm2);
+GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2);
+GKO_BIND_CUBLAS_NORM2(std::complex<double>, cublasDznrm2);
 template <typename ValueType>
 GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented);
 
diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index e9da6b9952b..ed9f043f9ef 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,87 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 namespace gko {
-namespace solver {
-
-
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
-
-
-struct SolveStruct {
-    int algorithm;
-    csrsm2Info_t solve_info;
-    cusparseSolvePolicy_t policy;
-    cusparseMatDescr_t factor_descr;
-    size_t factor_work_size;
-    void *factor_work_vec;
-    SolveStruct()
-    {
-        factor_work_vec = nullptr;
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&solve_info));
-        algorithm = 0;
-        policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
-    }
-    SolveStruct(const SolveStruct &) : SolveStruct() {}
-    SolveStruct(SolveStruct &&) : SolveStruct() {}
-    SolveStruct &operator=(const SolveStruct &) { return *this; }
-    SolveStruct &operator=(SolveStruct &&) { return *this; }
-    ~SolveStruct()
-    {
-        cusparseDestroyMatDescr(factor_descr);
-        if (solve_info) {
-            cusparseDestroyCsrsm2Info(solve_info);
-        }
-        if (factor_work_vec != nullptr) {
-            cudaFree(factor_work_vec);
-            factor_work_vec = nullptr;
-        }
-    }
-};
-
-
-#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
-
-
-struct SolveStruct {
-    cusparseSolveAnalysisInfo_t solve_info;
-    cusparseMatDescr_t factor_descr;
-    SolveStruct()
-    {
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseCreateSolveAnalysisInfo(&solve_info));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(
-            cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
-    }
-    SolveStruct(const SolveStruct &) : SolveStruct() {}
-    SolveStruct(SolveStruct &&) : SolveStruct() {}
-    SolveStruct &operator=(const SolveStruct &) { return *this; }
-    SolveStruct &operator=(SolveStruct &&) { return *this; }
-    ~SolveStruct()
-    {
-        cusparseDestroyMatDescr(factor_descr);
-        cusparseDestroySolveAnalysisInfo(solve_info);
-    }
-};
-
-
-#endif
-
-
-}  // namespace solver
-
-
 namespace kernels {
 namespace cuda {
 /**
@@ -417,6 +336,129 @@ GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented);
 #undef GKO_BIND_CUSPARSE32_SPMV
 
 
+template <typename ValueType, typename IndexType>
+void spgemm_buffer_size(
+    cusparseHandle_t handle, IndexType m, IndexType n, IndexType k,
+    const ValueType *alpha, const cusparseMatDescr_t descrA, IndexType nnzA,
+    const IndexType *csrRowPtrA, const IndexType *csrColIndA,
+    const cusparseMatDescr_t descrB, IndexType nnzB,
+    const IndexType *csrRowPtrB, const IndexType *csrColIndB,
+    const ValueType *beta, const cusparseMatDescr_t descrD, IndexType nnzD,
+    const IndexType *csrRowPtrD, const IndexType *csrColIndD,
+    csrgemm2Info_t info, size_type &result) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(ValueType, CusparseName)          \
+    template <>                                                                \
+    inline void spgemm_buffer_size<ValueType, int32>(                          \
+        cusparseHandle_t handle, int32 m, int32 n, int32 k,                    \
+        const ValueType *alpha, const cusparseMatDescr_t descrA, int32 nnzA,   \
+        const int32 *csrRowPtrA, const int32 *csrColIndA,                      \
+        const cusparseMatDescr_t descrB, int32 nnzB, const int32 *csrRowPtrB,  \
+        const int32 *csrColIndB, const ValueType *beta,                        \
+        const cusparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD,  \
+        const int32 *csrColIndD, csrgemm2Info_t info, size_type &result)       \
+    {                                                                          \
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                         \
+            CusparseName(handle, m, n, k, as_culibs_type(alpha), descrA, nnzA, \
+                         csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB,     \
+                         csrColIndB, as_culibs_type(beta), descrD, nnzD,       \
+                         csrRowPtrD, csrColIndD, info, &result));              \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(float, cusparseScsrgemm2_bufferSizeExt);
+GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(double, cusparseDcsrgemm2_bufferSizeExt);
+GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(std::complex<float>,
+                                     cusparseCcsrgemm2_bufferSizeExt);
+GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(std::complex<double>,
+                                     cusparseZcsrgemm2_bufferSizeExt);
+
+
+#undef GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE
+
+
+template <typename IndexType>
+void spgemm_nnz(cusparseHandle_t handle, IndexType m, IndexType n, IndexType k,
+                const cusparseMatDescr_t descrA, IndexType nnzA,
+                const IndexType *csrRowPtrA, const IndexType *csrColIndA,
+                const cusparseMatDescr_t descrB, IndexType nnzB,
+                const IndexType *csrRowPtrB, const IndexType *csrColIndB,
+                const cusparseMatDescr_t descrD, IndexType nnzD,
+                const IndexType *csrRowPtrD, const IndexType *csrColIndD,
+                const cusparseMatDescr_t descrC, IndexType *csrRowPtrC,
+                IndexType *nnzC, csrgemm2Info_t info,
+                void *buffer) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void spgemm_nnz<int32>(
+    cusparseHandle_t handle, int32 m, int32 n, int32 k,
+    const cusparseMatDescr_t descrA, int32 nnzA, const int32 *csrRowPtrA,
+    const int32 *csrColIndA, const cusparseMatDescr_t descrB, int32 nnzB,
+    const int32 *csrRowPtrB, const int32 *csrColIndB,
+    const cusparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD,
+    const int32 *csrColIndD, const cusparseMatDescr_t descrC, int32 *csrRowPtrC,
+    int32 *nnzC, csrgemm2Info_t info, void *buffer)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrgemm2Nnz(
+        handle, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB,
+        csrRowPtrB, csrColIndB, descrD, nnzD, csrRowPtrD, csrColIndD, descrC,
+        csrRowPtrC, nnzC, info, buffer));
+}
+
+
+template <typename ValueType, typename IndexType>
+void spgemm(cusparseHandle_t handle, IndexType m, IndexType n, IndexType k,
+            const ValueType *alpha, const cusparseMatDescr_t descrA,
+            IndexType nnzA, const ValueType *csrValA,
+            const IndexType *csrRowPtrA, const IndexType *csrColIndA,
+            const cusparseMatDescr_t descrB, IndexType nnzB,
+            const ValueType *csrValB, const IndexType *csrRowPtrB,
+            const IndexType *csrColIndB, const ValueType *beta,
+            const cusparseMatDescr_t descrD, IndexType nnzD,
+            const ValueType *csrValD, const IndexType *csrRowPtrD,
+            const IndexType *csrColIndD, const cusparseMatDescr_t descrC,
+            ValueType *csrValC, const IndexType *csrRowPtrC,
+            IndexType *csrColIndC, csrgemm2Info_t info,
+            void *buffer) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_CUSPARSE_SPGEMM(ValueType, CusparseName)                      \
+    template <>                                                                \
+    inline void spgemm<ValueType, int32>(                                      \
+        cusparseHandle_t handle, int32 m, int32 n, int32 k,                    \
+        const ValueType *alpha, const cusparseMatDescr_t descrA, int32 nnzA,   \
+        const ValueType *csrValA, const int32 *csrRowPtrA,                     \
+        const int32 *csrColIndA, const cusparseMatDescr_t descrB, int32 nnzB,  \
+        const ValueType *csrValB, const int32 *csrRowPtrB,                     \
+        const int32 *csrColIndB, const ValueType *beta,                        \
+        const cusparseMatDescr_t descrD, int32 nnzD, const ValueType *csrValD, \
+        const int32 *csrRowPtrD, const int32 *csrColIndD,                      \
+        const cusparseMatDescr_t descrC, ValueType *csrValC,                   \
+        const int32 *csrRowPtrC, int32 *csrColIndC, csrgemm2Info_t info,       \
+        void *buffer)                                                          \
+    {                                                                          \
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName(                            \
+            handle, m, n, k, as_culibs_type(alpha), descrA, nnzA,              \
+            as_culibs_type(csrValA), csrRowPtrA, csrColIndA, descrB, nnzB,     \
+            as_culibs_type(csrValB), csrRowPtrB, csrColIndB,                   \
+            as_culibs_type(beta), descrD, nnzD, as_culibs_type(csrValD),       \
+            csrRowPtrD, csrColIndD, descrC, as_culibs_type(csrValC),           \
+            csrRowPtrC, csrColIndC, info, buffer));                            \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+GKO_BIND_CUSPARSE_SPGEMM(float, cusparseScsrgemm2);
+GKO_BIND_CUSPARSE_SPGEMM(double, cusparseDcsrgemm2);
+GKO_BIND_CUSPARSE_SPGEMM(std::complex<float>, cusparseCcsrgemm2);
+GKO_BIND_CUSPARSE_SPGEMM(std::complex<double>, cusparseZcsrgemm2);
+
+
+#undef GKO_BIND_CUSPARSE_SPGEMM
+
+
 #define GKO_BIND_CUSPARSE32_CSR2HYB(ValueType, CusparseName)                 \
     inline void csr2hyb(cusparseHandle_t handle, int32 m, int32 n,           \
                         const cusparseMatDescr_t descrA,                     \
@@ -573,6 +615,73 @@ inline void destroy(cusparseMatDescr_t descr)
 }
 
 
+inline csrgemm2Info_t create_spgemm_info()
+{
+    csrgemm2Info_t info{};
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrgemm2Info(&info));
+    return info;
+}
+
+
+inline void destroy(csrgemm2Info_t info)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrgemm2Info(info));
+}
+
+
+// CUDA versions 9.2 and above have csrsm2.
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
+
+
+inline csrsm2Info_t create_solve_info()
+{
+    csrsm2Info_t info{};
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&info));
+    return info;
+}
+
+
+inline void destroy(csrsm2Info_t info)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrsm2Info(info));
+}
+
+
+// CUDA versions 9.1 and below do not support csrsm2.
+#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
+
+
+inline cusparseSolveAnalysisInfo_t create_solve_info()
+{
+    cusparseSolveAnalysisInfo_t info{};
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateSolveAnalysisInfo(&info));
+    return info;
+}
+
+
+inline void destroy(cusparseSolveAnalysisInfo_t info)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySolveAnalysisInfo(info));
+}
+
+
+#endif
+
+
+inline csrilu02Info_t create_ilu0_info()
+{
+    csrilu02Info_t info{};
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrilu02Info(&info));
+    return info;
+}
+
+
+inline void destroy(csrilu02Info_t info)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrilu02Info(info));
+}
+
+
 // CUDA versions 9.2 and above have csrsm2.
 #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
 
@@ -788,7 +897,8 @@ GKO_BIND_CUSPARSE64_CSRSM_ANALYSIS(ValueType, detail::not_implemented);
         size_type n, const ValueType *one, const cusparseMatDescr_t descr,   \
         const ValueType *csrVal, const int32 *csrRowPtr,                     \
         const int32 *csrColInd, cusparseSolveAnalysisInfo_t factor_info,     \
-        ValueType *rhs, int32 rhs_stride, ValueType *sol, int32 sol_stride)  \
+        const ValueType *rhs, int32 rhs_stride, ValueType *sol,              \
+        int32 sol_stride)                                                    \
     {                                                                        \
         GKO_ASSERT_NO_CUSPARSE_ERRORS(                                       \
             CusparseName(handle, trans, m, n, as_culibs_type(one), descr,    \
@@ -806,8 +916,8 @@ GKO_BIND_CUSPARSE64_CSRSM_ANALYSIS(ValueType, detail::not_implemented);
         size_type n, const ValueType *one, const cusparseMatDescr_t descr,   \
         const ValueType *csrVal, const int64 *csrRowPtr,                     \
         const int64 *csrColInd, cusparseSolveAnalysisInfo_t factor_info,     \
-        ValueType *rhs, int64 rhs_stride, ValueType *sol, int64 sol_stride)  \
-        GKO_NOT_IMPLEMENTED;                                                 \
+        const ValueType *rhs, int64 rhs_stride, ValueType *sol,              \
+        int64 sol_stride) GKO_NOT_IMPLEMENTED;                               \
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
@@ -831,6 +941,180 @@ GKO_BIND_CUSPARSE64_CSRSM_SOLVE(ValueType, detail::not_implemented);
 #endif
 
 
+template <typename IndexType>
+void create_identity_permutation(cusparseHandle_t handle, IndexType size,
+                                 IndexType *permutation) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void create_identity_permutation<int32>(cusparseHandle_t handle,
+                                               int32 size, int32 *permutation)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(
+        cusparseCreateIdentityPermutation(handle, size, permutation));
+}
+
+
+template <typename IndexType>
+void csrsort_buffer_size(cusparseHandle_t handle, IndexType m, IndexType n,
+                         IndexType nnz, const IndexType *row_ptrs,
+                         const IndexType *col_idxs,
+                         size_type &buffer_size) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void csrsort_buffer_size<int32>(cusparseHandle_t handle, int32 m,
+                                       int32 n, int32 nnz,
+                                       const int32 *row_ptrs,
+                                       const int32 *col_idxs,
+                                       size_type &buffer_size)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrsort_bufferSizeExt(
+        handle, m, n, nnz, row_ptrs, col_idxs, &buffer_size));
+}
+
+
+template <typename IndexType>
+void csrsort(cusparseHandle_t handle, IndexType m, IndexType n, IndexType nnz,
+             const cusparseMatDescr_t descr, const IndexType *row_ptrs,
+             IndexType *col_idxs, IndexType *permutation,
+             void *buffer) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void csrsort<int32>(cusparseHandle_t handle, int32 m, int32 n, int32 nnz,
+                           const cusparseMatDescr_t descr,
+                           const int32 *row_ptrs, int32 *col_idxs,
+                           int32 *permutation, void *buffer)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrsort(
+        handle, m, n, nnz, descr, row_ptrs, col_idxs, permutation, buffer));
+}
+
+
+template <typename ValueType, typename IndexType>
+void gather(cusparseHandle_t handle, IndexType nnz, const ValueType *in,
+            ValueType *out, const IndexType *permutation) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_CUSPARSE_GATHER(ValueType, CusparseName)                      \
+    template <>                                                                \
+    inline void gather<ValueType, int32>(cusparseHandle_t handle, int32 nnz,   \
+                                         const ValueType *in, ValueType *out,  \
+                                         const int32 *permutation)             \
+    {                                                                          \
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                         \
+            CusparseName(handle, nnz, as_culibs_type(in), as_culibs_type(out), \
+                         permutation, CUSPARSE_INDEX_BASE_ZERO));              \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+GKO_BIND_CUSPARSE_GATHER(float, cusparseSgthr);
+GKO_BIND_CUSPARSE_GATHER(double, cusparseDgthr);
+GKO_BIND_CUSPARSE_GATHER(std::complex<float>, cusparseCgthr);
+GKO_BIND_CUSPARSE_GATHER(std::complex<double>, cusparseZgthr);
+
+#undef GKO_BIND_CUSPARSE_GATHER
+
+
+template <typename ValueType, typename IndexType>
+void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz,
+                      const cusparseMatDescr_t descr, const ValueType *vals,
+                      const IndexType *row_ptrs, const IndexType *col_idxs,
+                      csrilu02Info_t info,
+                      size_type &buffer_size) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(ValueType, CusparseName)          \
+    template <>                                                              \
+    inline void ilu0_buffer_size<ValueType, int32>(                          \
+        cusparseHandle_t handle, int32 m, int32 nnz,                         \
+        const cusparseMatDescr_t descr, const ValueType *vals,               \
+        const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info,   \
+        size_type &buffer_size)                                              \
+    {                                                                        \
+        int tmp_buffer_size{};                                               \
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                       \
+            CusparseName(handle, m, nnz, descr,                              \
+                         as_culibs_type(const_cast<ValueType *>(vals)),      \
+                         row_ptrs, col_idxs, info, &tmp_buffer_size));       \
+        buffer_size = tmp_buffer_size;                                       \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(float, cusparseScsrilu02_bufferSize);
+GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(double, cusparseDcsrilu02_bufferSize);
+GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(std::complex<float>,
+                                   cusparseCcsrilu02_bufferSize);
+GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(std::complex<double>,
+                                   cusparseZcsrilu02_bufferSize);
+
+#undef GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE
+
+
+template <typename ValueType, typename IndexType>
+void ilu0_analysis(cusparseHandle_t handle, IndexType m, IndexType nnz,
+                   const cusparseMatDescr_t descr, const ValueType *vals,
+                   const IndexType *row_ptrs, const IndexType *col_idxs,
+                   csrilu02Info_t info, cusparseSolvePolicy_t policy,
+                   void *buffer) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_CUSPARSE_ILU0_ANALYSIS(ValueType, CusparseName)             \
+    template <>                                                              \
+    inline void ilu0_analysis<ValueType, int32>(                             \
+        cusparseHandle_t handle, int32 m, int32 nnz,                         \
+        const cusparseMatDescr_t descr, const ValueType *vals,               \
+        const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info,   \
+        cusparseSolvePolicy_t policy, void *buffer)                          \
+    {                                                                        \
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                       \
+            CusparseName(handle, m, nnz, descr, as_culibs_type(vals),        \
+                         row_ptrs, col_idxs, info, policy, buffer));         \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_CUSPARSE_ILU0_ANALYSIS(float, cusparseScsrilu02_analysis);
+GKO_BIND_CUSPARSE_ILU0_ANALYSIS(double, cusparseDcsrilu02_analysis);
+GKO_BIND_CUSPARSE_ILU0_ANALYSIS(std::complex<float>,
+                                cusparseCcsrilu02_analysis);
+GKO_BIND_CUSPARSE_ILU0_ANALYSIS(std::complex<double>,
+                                cusparseZcsrilu02_analysis);
+
+#undef GKO_BIND_CUSPARSE_ILU0_ANALYSIS
+
+
+template <typename ValueType, typename IndexType>
+void ilu0(cusparseHandle_t handle, IndexType m, IndexType nnz,
+          const cusparseMatDescr_t descr, ValueType *vals,
+          const IndexType *row_ptrs, const IndexType *col_idxs,
+          csrilu02Info_t info, cusparseSolvePolicy_t policy,
+          void *buffer) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_CUSPARSE_ILU0(ValueType, CusparseName)                      \
+    template <>                                                              \
+    inline void ilu0<ValueType, int32>(                                      \
+        cusparseHandle_t handle, int32 m, int32 nnz,                         \
+        const cusparseMatDescr_t descr, ValueType *vals,                     \
+        const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info,   \
+        cusparseSolvePolicy_t policy, void *buffer)                          \
+    {                                                                        \
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                       \
+            CusparseName(handle, m, nnz, descr, as_culibs_type(vals),        \
+                         row_ptrs, col_idxs, info, policy, buffer));         \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_CUSPARSE_ILU0(float, cusparseScsrilu02);
+GKO_BIND_CUSPARSE_ILU0(double, cusparseDcsrilu02);
+GKO_BIND_CUSPARSE_ILU0(std::complex<float>, cusparseCcsrilu02);
+GKO_BIND_CUSPARSE_ILU0(std::complex<double>, cusparseZcsrilu02);
+
+#undef GKO_BIND_CUSPARSE_ILU0
+
+
 }  // namespace cusparse
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/base/device_guard.hpp b/cuda/base/device_guard.hpp
index 7cda48593d0..aa347994327 100644
--- a/cuda/base/device_guard.hpp
+++ b/cuda/base/device_guard.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 namespace gko {
+namespace cuda {
 
 
 /**
@@ -85,6 +86,7 @@ class device_guard {
 };
 
 
+}  // namespace cuda
 }  // namespace gko
 
 
diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp
index a781867cc27..93fcd5e7cfd 100644
--- a/cuda/base/exception.cpp
+++ b/cuda/base/exception.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,11 +33,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 
 
+#include <string>
+
+
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
 
 
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 
 
@@ -67,6 +73,8 @@ std::string CublasError::get_error(int64 error_code)
     GKO_REGISTER_CUBLAS_ERROR(CUBLAS_STATUS_NOT_SUPPORTED);
     GKO_REGISTER_CUBLAS_ERROR(CUBLAS_STATUS_LICENSE_ERROR);
     return "Unknown error";
+
+#undef GKO_REGISTER_CUBLAS_ERROR
 }
 
 
@@ -86,6 +94,8 @@ std::string CusparseError::get_error(int64 error_code)
     GKO_REGISTER_CUSPARSE_ERROR(CUSPARSE_STATUS_INTERNAL_ERROR);
     GKO_REGISTER_CUSPARSE_ERROR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
     return "Unknown error";
+
+#undef GKO_REGISTER_CUSPARSE_ERROR
 }
 
 
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 3faa5e7390f..a72ecef7591 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,76 +43,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "cuda/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/device_guard.hpp"
 
 
 namespace gko {
-namespace {
 
 
-// The function is copied from _ConvertSMVer2Cores of
-// cuda-9.2/samples/common/inc/helper_cuda.h
-inline int convert_sm_ver_to_cores(int major, int minor)
-{
-    // Defines for GPU Architecture types (using the SM version to determine
-    // the # of cores per SM
-    typedef struct {
-        int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
-        // and m = SM minor version
-        int Cores;
-    } sSMtoCores;
-
-    sSMtoCores nGpuArchCoresPerSM[] = {
-        {0x30, 192},  // Kepler Generation (SM 3.0) GK10x class
-        {0x32, 192},  // Kepler Generation (SM 3.2) GK10x class
-        {0x35, 192},  // Kepler Generation (SM 3.5) GK11x class
-        {0x37, 192},  // Kepler Generation (SM 3.7) GK21x class
-        {0x50, 128},  // Maxwell Generation (SM 5.0) GM10x class
-        {0x52, 128},  // Maxwell Generation (SM 5.2) GM20x class
-        {0x53, 128},  // Maxwell Generation (SM 5.3) GM20x class
-        {0x60, 64},   // Pascal Generation (SM 6.0) GP100 class
-        {0x61, 128},  // Pascal Generation (SM 6.1) GP10x class
-        {0x62, 128},  // Pascal Generation (SM 6.2) GP10x class
-        {0x70, 64},   // Volta Generation (SM 7.0) GV100 class
-        {0x72, 64},   // Volta Generation (SM 7.2) GV11b class
-        {0x75, 64},   // Turing Generation (SM 7.5) TU1xx class
-        {-1, -1}};
-
-    int index = 0;
-
-    while (nGpuArchCoresPerSM[index].SM != -1) {
-        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
-            return nGpuArchCoresPerSM[index].Cores;
-        }
-        index++;
-    }
-
-#if GKO_VERBOSE_LEVEL >= 1
-    // If we don't find the values, we use the last valid value by default
-    // to allow proper execution
-    std::cerr << "MapSMtoCores for SM " << major << "." << minor
-              << "is undefined. The default value of "
-              << nGpuArchCoresPerSM[index - 1].Cores << " Cores/SM is used."
-              << std::endl;
-#endif
-    return nGpuArchCoresPerSM[index - 1].Cores;
-}
-
-
-}  // namespace
+#include "common/base/executor.hpp.inc"
 
 
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
-    int device_id, std::shared_ptr<Executor> master)
+    int device_id, std::shared_ptr<Executor> master, bool device_reset)
 {
     return std::shared_ptr<CudaExecutor>(
-        new CudaExecutor(device_id, std::move(master)),
+        new CudaExecutor(device_id, std::move(master), device_reset),
         [device_id](CudaExecutor *exec) {
+            const auto reset = exec->get_device_reset();  // read before delete
             delete exec;
-            if (!CudaExecutor::get_num_execs(device_id)) {
-                device_guard g(device_id);
+            if (!CudaExecutor::get_num_execs(device_id) && reset) {
+                cuda::device_guard g(device_id);
                 cudaDeviceReset();
             }
         });
@@ -122,15 +74,17 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
 void OmpExecutor::raw_copy_to(const CudaExecutor *dest, size_type num_bytes,
                               const void *src_ptr, void *dest_ptr) const
 {
-    device_guard g(dest->get_device_id());
-    GKO_ASSERT_NO_CUDA_ERRORS(
-        cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyHostToDevice));
+    if (num_bytes > 0) {
+        cuda::device_guard g(dest->get_device_id());
+        GKO_ASSERT_NO_CUDA_ERRORS(
+            cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyHostToDevice));
+    }
 }
 
 
 void CudaExecutor::raw_free(void *ptr) const noexcept
 {
-    device_guard g(this->get_device_id());
+    cuda::device_guard g(this->get_device_id());
     auto error_code = cudaFree(ptr);
     if (error_code != cudaSuccess) {
 #if GKO_VERBOSE_LEVEL >= 1
@@ -148,7 +102,7 @@ void CudaExecutor::raw_free(void *ptr) const noexcept
 void *CudaExecutor::raw_alloc(size_type num_bytes) const
 {
     void *dev_ptr = nullptr;
-    device_guard g(this->get_device_id());
+    cuda::device_guard g(this->get_device_id());
     auto error_code = cudaMalloc(&dev_ptr, num_bytes);
     if (error_code != cudaErrorMemoryAllocation) {
         GKO_ASSERT_NO_CUDA_ERRORS(error_code);
@@ -161,24 +115,45 @@ void *CudaExecutor::raw_alloc(size_type num_bytes) const
 void CudaExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes,
                                const void *src_ptr, void *dest_ptr) const
 {
-    device_guard g(this->get_device_id());
-    GKO_ASSERT_NO_CUDA_ERRORS(
-        cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToHost));
+    if (num_bytes > 0) {
+        cuda::device_guard g(this->get_device_id());
+        GKO_ASSERT_NO_CUDA_ERRORS(
+            cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToHost));
+    }
 }
 
 
 void CudaExecutor::raw_copy_to(const CudaExecutor *src, size_type num_bytes,
                                const void *src_ptr, void *dest_ptr) const
 {
-    device_guard g(this->get_device_id());
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeer(
-        dest_ptr, this->device_id_, src_ptr, src->get_device_id(), num_bytes));
+    if (num_bytes > 0) {
+        cuda::device_guard g(this->get_device_id());
+        GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeer(dest_ptr, this->device_id_,
+                                                 src_ptr, src->get_device_id(),
+                                                 num_bytes));
+    }
+}
+
+
+void CudaExecutor::raw_copy_to(const HipExecutor *src, size_type num_bytes,
+                               const void *src_ptr, void *dest_ptr) const
+{
+#if GINKGO_HIP_PLATFORM_NVCC == 1
+    if (num_bytes > 0) {
+        cuda::device_guard g(this->get_device_id());
+        GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeer(dest_ptr, this->device_id_,
+                                                 src_ptr, src->get_device_id(),
+                                                 num_bytes));
+    }
+#else
+    GKO_NOT_SUPPORTED(this);
+#endif
 }
 
 
 void CudaExecutor::synchronize() const
 {
-    device_guard g(this->get_device_id());
+    cuda::device_guard g(this->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceSynchronize());
 }
 
@@ -186,7 +161,7 @@ void CudaExecutor::synchronize() const
 void CudaExecutor::run(const Operation &op) const
 {
     this->template log<log::Logger::operation_launched>(this, &op);
-    device_guard g(this->get_device_id());
+    cuda::device_guard g(this->get_device_id());
     op.run(
         std::static_pointer_cast<const CudaExecutor>(this->shared_from_this()));
     this->template log<log::Logger::operation_completed>(this, &op);
@@ -208,14 +183,16 @@ int CudaExecutor::get_num_devices()
 void CudaExecutor::set_gpu_property()
 {
     if (device_id_ < this->get_num_devices() && device_id_ >= 0) {
-        device_guard g(this->get_device_id());
+        cuda::device_guard g(this->get_device_id());
         GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
             &major_, cudaDevAttrComputeCapabilityMajor, device_id_));
         GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
             &minor_, cudaDevAttrComputeCapabilityMinor, device_id_));
         GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
             &num_multiprocessor_, cudaDevAttrMultiProcessorCount, device_id_));
-        num_cores_per_sm_ = convert_sm_ver_to_cores(major_, minor_);
+        num_warps_per_sm_ = convert_sm_ver_to_cores(major_, minor_) /
+                            kernels::cuda::config::warp_size;
+        warp_size_ = kernels::cuda::config::warp_size;
     }
 }
 
@@ -224,15 +201,15 @@ void CudaExecutor::init_handles()
 {
     if (device_id_ < this->get_num_devices() && device_id_ >= 0) {
         const auto id = this->get_device_id();
-        device_guard g(id);
+        cuda::device_guard g(id);
         this->cublas_handle_ = handle_manager<cublasContext>(
             kernels::cuda::cublas::init(), [id](cublasHandle_t handle) {
-                device_guard g(id);
+                cuda::device_guard g(id);
                 kernels::cuda::cublas::destroy(handle);
             });
         this->cusparse_handle_ = handle_manager<cusparseContext>(
             kernels::cuda::cusparse::init(), [id](cusparseHandle_t handle) {
-                device_guard g(id);
+                cuda::device_guard g(id);
                 kernels::cuda::cusparse::destroy(handle);
             });
     }
diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp
index bb425214a78..7e970486a1e 100644
--- a/cuda/base/math.hpp
+++ b/cuda/base/math.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -41,178 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 namespace gko {
-namespace detail {
-
-
-template <typename T>
-struct remove_complex_impl<thrust::complex<T>> {
-    using type = T;
-};
-
-
-template <typename T>
-struct is_complex_impl<thrust::complex<T>>
-    : public std::integral_constant<bool, true> {};
-
-
-template <typename T>
-struct truncate_type_impl<thrust::complex<T>> {
-    using type = thrust::complex<typename truncate_type_impl<T>::type>;
-};
-
-
-}  // namespace detail
-
-
-template <>
-__device__ GKO_INLINE std::complex<float> zero<std::complex<float>>()
-{
-    thrust::complex<float> z(0);
-    return reinterpret_cast<std::complex<float> &>(z);
-}
-
-template <>
-__device__ GKO_INLINE std::complex<double> zero<std::complex<double>>()
-{
-    thrust::complex<double> z(0);
-    return reinterpret_cast<std::complex<double> &>(z);
-}
-
-template <>
-__device__ GKO_INLINE std::complex<float> one<std::complex<float>>()
-{
-    thrust::complex<float> z(1);
-    return reinterpret_cast<std::complex<float> &>(z);
-}
-
-template <>
-__device__ GKO_INLINE std::complex<double> one<std::complex<double>>()
-{
-    thrust::complex<double> z(1);
-    return reinterpret_cast<std::complex<double> &>(z);
-}
-
-
-// This first part is specific for clang and intel in combination with the nvcc
-// compiler from the toolkit older than 9.2.
-// Both want to use their `__builtin_isfinite` function, which is not present
-// as a __device__ function, so it results in a compiler error.
-// Here, `isfinite` is written by hand, which might not be as performant as the
-// intrinsic function from CUDA, but it compiles and works.
-#if defined(__CUDA_ARCH__) &&                                           \
-    (defined(_MSC_VER) ||                                               \
-     (defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && \
-      (__CUDACC_VER_MAJOR__ * 1000 + __CUDACC_VER_MINOR__) < 9002 &&    \
-      (defined(__clang__) || defined(__ICC) || defined(__ICL))))
-
-
-namespace detail {
-
-
-/**
- * This structure can be used to get the exponent mask of a given floating
- * point type. Uses specialization to implement different types.
- */
-template <typename T>
-struct mask_creator {};
-
-template <>
-struct mask_creator<float> {
-    using int_type = int32;
-    static constexpr int_type number_exponent_bits = 8;
-    static constexpr int_type number_significand_bits = 23;
-    // integer representation of a floating point number, where all exponent
-    // bits are set
-    static constexpr int_type exponent_mask =
-        ((int_type{1} << number_exponent_bits) - 1) << number_significand_bits;
-    static __device__ int_type reinterpret_int(const float &value)
-    {
-        return __float_as_int(value);
-    }
-};
-
-template <>
-struct mask_creator<double> {
-    using int_type = int64;
-    static constexpr int_type number_exponent_bits = 11;
-    static constexpr int_type number_significand_bits = 52;
-    // integer representation of a floating point number, where all exponent
-    // bits are set
-    static constexpr int_type exponent_mask =
-        ((int_type{1} << number_exponent_bits) - 1) << number_significand_bits;
-    static __device__ int_type reinterpret_int(const double &value)
-    {
-        return __double_as_longlong(value);
-    }
-};
-
-
-}  // namespace detail
-
-
-/**
- * Checks if a given value is finite, meaning it is neither +/- infinity
- * nor NaN.
- *
- * @internal  It checks if all exponent bits are set. If all are set, the
- *            number either represents NaN or +/- infinity, meaning it is a
- *            non-finite number.
- *
- * @param value  value to check
- *
- * returns `true` if the given value is finite, meaning it is neither
- *         +/- infinity nor NaN.
- */
-#define GKO_DEFINE_ISFINITE_FOR_TYPE(_type)                               \
-    GKO_INLINE __device__ bool isfinite(const _type &value)               \
-    {                                                                     \
-        constexpr auto mask = detail::mask_creator<_type>::exponent_mask; \
-        const auto re_int =                                               \
-            detail::mask_creator<_type>::reinterpret_int(value);          \
-        return (re_int & mask) != mask;                                   \
-    }
-
-GKO_DEFINE_ISFINITE_FOR_TYPE(float)
-GKO_DEFINE_ISFINITE_FOR_TYPE(double)
-#undef GKO_DEFINE_ISFINITE_FOR_TYPE
-
-
-/**
- * Checks if all components of a complex value are finite, meaning they are
- * neither +/- infinity nor NaN.
- *
- * @internal required for the clang compiler. This function will be used rather
- *           than the `isfinite` function in the public `math.hpp` because
- *           there is no template parameter, so it is prefered during lookup.
- *
- * @tparam T  complex type of the value to check
- *
- * @param value  complex value to check
- *
- * returns `true` if both components of the given value are finite, meaning
- *         they are neither +/- infinity nor NaN.
- */
-#define GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE(_type)              \
-    GKO_INLINE __device__ bool isfinite(const _type &value)      \
-    {                                                            \
-        return isfinite(value.real()) && isfinite(value.imag()); \
-    }
-
-GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE(thrust::complex<float>)
-GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE(thrust::complex<double>)
-#undef GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE
-
-
-// For all other compiler in combination with CUDA, just use the provided
-// `isfinite` function
-#elif defined(__CUDA_ARCH__)
-
-
-// If it is compiled with the CUDA compiler, use their `isfinite`
-using ::isfinite;
-
-
-#endif  // defined(__CUDA_ARCH__)
+
+
+#include "common/base/math.hpp.inc"
 
 
 }  // namespace gko
diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp
index f9094d21f8e..89bd724bedf 100644
--- a/cuda/base/pointer_mode_guard.hpp
+++ b/cuda/base/pointer_mode_guard.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 4cc9e304ce7..3da51bd2ac9 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -35,9 +35,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cublas_v2.h>
+#include <cuda_fp16.h>
+#include <cusparse.h>
 #include <thrust/complex.h>
 
 
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 
 
@@ -190,6 +196,33 @@ constexpr cudaDataType_t cuda_data_type_impl<uint8>()
 }
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \
+    !(defined(_WIN32) || defined(__CYGWIN__))
+
+
+template <typename T>
+constexpr cusparseIndexType_t cusparse_index_type_impl()
+{
+    return CUSPARSE_INDEX_16U;
+}
+
+template <>
+constexpr cusparseIndexType_t cusparse_index_type_impl<int32>()
+{
+    return CUSPARSE_INDEX_32I;
+}
+
+template <>
+constexpr cusparseIndexType_t cusparse_index_type_impl<int64>()
+{
+    return CUSPARSE_INDEX_64I;
+}
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) &&
+        // !(defined(_WIN32) || defined(__CYGWIN__))
+
+
 }  // namespace detail
 
 
@@ -208,6 +241,29 @@ constexpr cudaDataType_t cuda_data_type()
 }
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \
+    !(defined(_WIN32) || defined(__CYGWIN__))
+
+
+/**
+ * This is an alias for the `cusparseIndexType_t` equivalent of `T`. By
+ * default, CUSPARSE_INDEX_16U is returned.
+ *
+ * @tparam T  a type
+ *
+ * @returns the actual `cusparseIndexType_t`
+ */
+template <typename T>
+constexpr cusparseIndexType_t cusparse_index_type()
+{
+    return detail::cusparse_index_type_impl<T>();
+}
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) &&
+        // !(defined(_WIN32) || defined(__CYGWIN__))
+
+
 /**
  * This is an alias for CUDA's equivalent of `T`.
  *
@@ -270,30 +326,6 @@ inline culibs_type<T> as_culibs_type(T val)
 }
 
 
-struct cuda_config {
-    /**
-     * The number of threads within a CUDA warp.
-     */
-    static constexpr uint32 warp_size = 32;
-
-    /**
-     * The bitmask of the entire warp.
-     */
-    static constexpr uint32 full_lane_mask = (1ll << warp_size) - 1;
-
-    /**
-     * The maximal number of threads allowed in a CUDA warp.
-     */
-    static constexpr uint32 max_block_size = 1024;
-
-    /**
-     * The minimal amount of warps that need to be scheduled for each block
-     * to maximize GPU occupancy.
-     */
-    static constexpr uint32 min_warps_per_block = 4;
-};
-
-
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/version.cpp b/cuda/base/version.cpp
index 8403ccbf50c..41785e5fc1f 100644
--- a/cuda/base/version.cpp
+++ b/cuda/base/version.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh
index 8031fe70b7d..7195ea85f61 100644
--- a/cuda/components/atomic.cuh
+++ b/cuda/components/atomic.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,147 +34,50 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CUDA_COMPONENTS_ATOMIC_CUH_
 
 
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-namespace detail {
-
-
-template <typename ValueType, typename = void>
-struct atomic_helper {
-    __forceinline__ __device__ static void atomic_add(ValueType *, ValueType)
-    {
-        static_assert(sizeof(ValueType) == 0,
-                      "This default function is not implemented, only the "
-                      "specializations are.");
-        // TODO: add proper implementation of generic atomic add
-    }
-};
-
-
-template <typename ResultType, typename ValueType>
-__forceinline__ __device__ ResultType reinterpret(ValueType val)
-{
-    static_assert(sizeof(ValueType) == sizeof(ResultType),
-                  "The type to reinterpret to must be of the same size as the "
-                  "original type.");
-    return reinterpret_cast<ResultType &>(val);
-}
-
-
-#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE)                     \
-    template <typename ValueType>                                            \
-    struct atomic_helper<ValueType,                                          \
-                         gko::xstd::enable_if_t<(sizeof(ValueType) ==        \
-                                                 sizeof(CONVERTER_TYPE))>> { \
-        __forceinline__ __device__ static void atomic_add(                   \
-            ValueType *__restrict__ addr, ValueType val)                     \
-        {                                                                    \
-            CONVERTER_TYPE *address_as_ull =                                 \
-                reinterpret_cast<CONVERTER_TYPE *>(addr);                    \
-            CONVERTER_TYPE old = *address_as_ull;                            \
-            CONVERTER_TYPE assumed;                                          \
-            do {                                                             \
-                assumed = old;                                               \
-                old = atomicCAS(address_as_ull, assumed,                     \
-                                reinterpret<CONVERTER_TYPE>(                 \
-                                    val + reinterpret<ValueType>(assumed))); \
-            } while (assumed != old);                                        \
-        }                                                                    \
-    };
-
-// Support 64-bit ATOMIC_ADD
-GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
-// Support 32-bit ATOMIC_ADD
-GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
-
+#include <ginkgo/core/base/std_extensions.hpp>
 
-#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10100))
-// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
-GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
-#endif
 
-#undef GKO_BIND_ATOMIC_HELPER_STRUCTURE
+#include "cuda/base/math.hpp"
+#include "cuda/base/types.hpp"
 
 
-}  // namespace detail
-
-
-template <typename T>
-__forceinline__ __device__ void atomic_add(T *__restrict__ addr, T val)
-{
-    detail::atomic_helper<T>::atomic_add(addr, val);
-}
-
-
-#define GKO_BIND_ATOMIC_ADD(ValueType)                                       \
-    __forceinline__ __device__ void atomic_add(ValueType *__restrict__ addr, \
-                                               ValueType val)                \
-    {                                                                        \
-        atomicAdd(addr, val);                                                \
-    }
-
-GKO_BIND_ATOMIC_ADD(int);
-GKO_BIND_ATOMIC_ADD(unsigned int);
-GKO_BIND_ATOMIC_ADD(unsigned long long int);
-GKO_BIND_ATOMIC_ADD(float);
-
-
-#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
-      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
-// CUDA 8.0 starts suppoting 64-bit double atomicAdd on devices of compute
-// capability 6.x and higher
-GKO_BIND_ATOMIC_ADD(double);
-#endif
-
-#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
-      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
-// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices
-// of compute capability 7.x and higher.
-GKO_BIND_ATOMIC_ADD(__half);
-#endif
+namespace gko {
+namespace kernels {
+namespace cuda {
 
-#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
-      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
-// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on
-// devices of compute capability 6.x and higher. note: The atomicity of the
-// __half2 add operation is guaranteed separately for each of the two __half
-// elements; the entire __half2 is not guaranteed to be atomic as a single
-// 32-bit access.
-GKO_BIND_ATOMIC_ADD(__half2);
-#endif
 
-#undef GKO_BIND_ATOMIC_ADD
+#include "common/components/atomic.hpp.inc"
 
 
 /**
  * @internal
  *
- * @note It is not 'real' complex<float> atomic add opeartion
+ * @note It is not 'real' complex<float> atomic add operation
  */
-__forceinline__ __device__ void atomic_add(
+__forceinline__ __device__ thrust::complex<float> atomic_add(
     thrust::complex<float> *__restrict__ address, thrust::complex<float> val)
 {
-    cuComplex *cuaddr = reinterpret_cast<cuComplex *>(address);
+    cuComplex *addr = reinterpret_cast<cuComplex *>(address);
     // Separate to real part and imag part
-    atomic_add(&(cuaddr->x), val.real());
-    atomic_add(&(cuaddr->y), val.imag());
+    auto real = atomic_add(&(addr->x), val.real());
+    auto imag = atomic_add(&(addr->y), val.imag());
+    return {real, imag};
 }
 
+
 /**
  * @internal
  *
- * @note It is not 'real' complex<double> atomic add opeartion
+ * @note It is not 'real' complex<double> atomic add operation
  */
-__forceinline__ __device__ void atomic_add(
+__forceinline__ __device__ thrust::complex<double> atomic_add(
     thrust::complex<double> *__restrict__ address, thrust::complex<double> val)
 {
-    cuDoubleComplex *cuaddr = reinterpret_cast<cuDoubleComplex *>(address);
+    cuDoubleComplex *addr = reinterpret_cast<cuDoubleComplex *>(address);
     // Separate to real part and imag part
-    atomic_add(&(cuaddr->x), val.real());
-    atomic_add(&(cuaddr->y), val.imag());
+    auto real = atomic_add(&(addr->x), val.real());
+    auto imag = atomic_add(&(addr->y), val.imag());
+    return {real, imag};
 }
 
 
diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index af9c1e68a06..e90f15fdf44 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,7 +40,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/std_extensions.hpp>
 
 
+#include "cuda/base/config.hpp"
+
+
 namespace gko {
+namespace kernels {
+namespace cuda {
 
 
 /**
@@ -59,7 +64,7 @@ namespace gko {
  * A cooperative group (both from standard CUDA and from Ginkgo) is not a
  * specific type, but a concept. That is, any type  satisfying the interface
  * imposed by the cooperative groups API is considered a cooperative
- * group (a.k.a. "duck typing"). To maximize the generality of components than
+ * group (a.k.a. "duck typing"). To maximize the generality of components that
  * need cooperative groups, instead of creating the group manually, consider
  * requesting one as an input parameter. Make sure its type is a template
  * parameter to maximize the set of groups for which your algorithm can be
@@ -228,19 +233,18 @@ public:
     __device__ unsigned thread_rank() const noexcept { return data_.rank; }
 
 private:
+    // clang-format off
     __device__ grid_group()
-        : data_{blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y *
-                    gridDim.z,
-                threadIdx.x +
-                    blockDim.x *
-                        (threadIdx.y +
-                         blockDim.y *
-                             (threadIdx.z +
-                              blockDim.z *
-                                  (blockIdx.x +
-                                   gridDim.x *
-                                       (blockIdx.y + gridDim.y * blockIdx.z))))}
+        : data_{
+                blockDim.x * blockDim.y * blockDim.z *
+                    gridDim.x * gridDim.y * gridDim.z,
+                threadIdx.x + blockDim.x *
+                    (threadIdx.y + blockDim.y *
+                        (threadIdx.z + blockDim.z *
+                            (blockIdx.x + gridDim.x *
+                                (blockIdx.y + gridDim.y * blockIdx.z))))}
     {}
+    // clang-format on
 
     struct alignas(8) {
         unsigned size;
@@ -341,7 +345,7 @@ private:
     template <typename ShuffleOperator, typename ValueType,
               typename SelectorType>
     static __device__ __forceinline__ ValueType
-    shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType &var,
+    shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var,
                  SelectorType selector)
     {
         static_assert(sizeof(ValueType) % sizeof(uint32) == 0,
@@ -450,15 +454,23 @@ __device__ __forceinline__ auto tiled_partition(const Group &g)
 }
 
 
+// Only support tiled_partition with 1, 2, 4, 8, 16, 32.
+// Reference:
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes
 template <size_type Size, typename Group>
-__device__ __forceinline__ thread_block_tile<Size> tiled_partition(
-    const Group &)
+__device__ __forceinline__ gko::xstd::enable_if_t<
+    (Size <= kernels::cuda::config::warp_size) && (Size > 0) &&
+        (kernels::cuda::config::warp_size % Size == 0),
+    thread_block_tile<Size>>
+tiled_partition(const Group &)
 {
     return thread_block_tile<Size>();
 }
 
 
 }  // namespace group
+}  // namespace cuda
+}  // namespace kernels
 }  // namespace gko
 
 
diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
index 482c780a9f3..ca2dacbbdef 100644
--- a/cuda/components/diagonal_block_manipulation.cuh
+++ b/cuda/components/diagonal_block_manipulation.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
 
 
+#include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
 
@@ -43,69 +44,8 @@ namespace kernels {
 namespace cuda {
 namespace csr {
 
-/**
- * @internal
- *
- * @note assumes that block dimensions are in "standard format":
- *       (subwarp_size, cuda_config::warp_size / subwarp_size, z)
- */
-template <
-    int max_block_size, int warps_per_block, typename Group, typename ValueType,
-    typename IndexType,
-    typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
-__device__ __forceinline__ void extract_transposed_diag_blocks(
-    const Group &group, int processed_blocks,
-    const IndexType *__restrict__ row_ptrs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values,
-    const IndexType *__restrict__ block_ptrs, size_type num_blocks,
-    ValueType *__restrict__ block_row, int increment,
-    ValueType *__restrict__ workspace)
-{
-    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    const auto warp = group::tiled_partition<cuda_config::warp_size>(group);
-    auto bid = static_cast<size_type>(blockIdx.x) * warps_per_block *
-                   processed_blocks +
-               threadIdx.z * processed_blocks;
-    auto bstart = (bid < num_blocks) ? block_ptrs[bid] : zero<IndexType>();
-    IndexType bsize = 0;
-#pragma unroll
-    for (int b = 0; b < processed_blocks; ++b, ++bid) {
-        if (bid >= num_blocks) {
-            break;
-        }
-        bstart += bsize;
-        bsize = block_ptrs[bid + 1] - bstart;
-#pragma unroll
-        for (int i = 0; i < max_block_size; ++i) {
-            if (i >= bsize) {
-                break;
-            }
-            if (threadIdx.y == b && threadIdx.x < max_block_size) {
-                workspace[threadIdx.x] = zero<ValueType>();
-            }
-            warp.sync();
-            const auto row = bstart + i;
-            const auto rstart = row_ptrs[row] + tid;
-            const auto rend = row_ptrs[row + 1];
-            // use the entire warp to ensure coalesced memory access
-            for (auto j = rstart; j < rend; j += cuda_config::warp_size) {
-                const auto col = col_idxs[j] - bstart;
-                if (col >= bsize) {
-                    break;
-                }
-                if (col >= 0) {
-                    workspace[col] = values[j];
-                }
-            }
-            warp.sync();
-            if (threadIdx.y == b && threadIdx.x < bsize) {
-                block_row[i * increment] = workspace[threadIdx.x];
-            }
-            warp.sync();
-        }
-    }
-}
+
+#include "common/components/diagonal_block_manipulation.hpp.inc"
 
 
 }  // namespace csr
diff --git a/cuda/components/zero_array.cu b/cuda/components/fill_array.cu
similarity index 71%
rename from cuda/components/zero_array.cu
rename to cuda/components/fill_array.cu
index 0596640c603..63344b7f94b 100644
--- a/cuda/components/zero_array.cu
+++ b/cuda/components/fill_array.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,50 +30,55 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "cuda/components/zero_array.hpp"
+#include "core/components/fill_array.hpp"
+
+
+#include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
 namespace kernels {
 namespace cuda {
+namespace components {
 
 
 constexpr int default_block_size = 512;
 
 
-namespace kernel {
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void zero_array(
-    size_type n, ValueType *__restrict__ array)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    if (tidx < n) {
-        array[tidx] = zero<ValueType>();
-    }
-}
-
-
-}  // namespace kernel
+#include "common/components/fill_array.hpp.inc"
 
 
+/**
+ * Launches `kernel::fill_array` on the default stream with at least `n`
+ * threads. The kernel itself lives in fill_array.hpp.inc and is expected to
+ * assign `val` to the first `n` entries of `array`.
+ *
+ * @param exec  the executor; not used by this function
+ * @param array  pointer to the entries to fill, passed on to the kernel
+ * @param n  the number of entries to fill
+ * @param val  the value to fill the entries with
+ */
 template <typename ValueType>
-void zero_array(size_type n, ValueType *array)
+void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType *array,
+                size_type n, ValueType val)
 {
     const dim3 block_size(default_block_size, 1, 1);
     const dim3 grid_size(ceildiv(n, block_size.x), 1, 1);
-    kernel::zero_array<<<grid_size, block_size, 0, 0>>>(n, array);
+    // An empty grid is an invalid launch configuration, so skip the launch
+    // when there is nothing to fill.
+    if (grid_size.x > 0) {
+        kernel::fill_array<<<grid_size, block_size, 0, 0>>>(
+            n, as_cuda_type(array), as_cuda_type(val));
+    }
 }
 
-
-#define GKO_DECLARE_ZERO_ARRAY(_type) \
-    void zero_array<_type>(size_type n, _type * array);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_ZERO_ARRAY);
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_ZERO_ARRAY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type);
 
 
+}  // namespace components
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh
index 557c8d70f8d..28206769f3e 100644
--- a/cuda/components/format_conversion.cuh
+++ b/cuda/components/format_conversion.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_
 
 
+#include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 
 
@@ -89,24 +90,24 @@ namespace host_kernel {
 /**
  * @internal
  *
- * It calculates the number of warps used in Coo Spmv by GPU architecture and
- * the number of stored elements.
+ * Returns the number of warps to use in Coo Spmv: `ceildiv(nnz, warp_size)`,
+ * capped at 8, 32 or 128 times `nwarps_in_cuda`, with the factor set by `nnz`.
  */
-template <size_type subwarp_size = cuda_config::warp_size>
+template <size_type subwarp_size = config::warp_size>
 __host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
                                     const size_type nnz)
 {
-    size_type warps_per_sm = exec->get_num_cores_per_sm() / subwarp_size;
+    size_type warps_per_sm =
+        exec->get_num_warps_per_sm() * config::warp_size / subwarp_size;
     size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm;
     size_type multiple = 8;
-    if (nnz >= 2000000) {
+    if (nnz >= 2e6) {
         multiple = 128;
-    } else if (nnz >= 200000) {
+    } else if (nnz >= 2e5) {
         multiple = 32;
     }
-    return std::min(
-        multiple * nwarps_in_cuda,
-        static_cast<size_type>(ceildiv(nnz, cuda_config::warp_size)));
+    return std::min(multiple * nwarps_in_cuda,
+                    size_type(ceildiv(nnz, config::warp_size)));
 }
 
 
diff --git a/cuda/components/intrinsics.cuh b/cuda/components/intrinsics.cuh
new file mode 100644
index 00000000000..7726062cfa7
--- /dev/null
+++ b/cuda/components/intrinsics.cuh
@@ -0,0 +1,53 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
+#define GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+#include "common/components/intrinsics.hpp.inc"
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
diff --git a/cuda/components/zero_array.hpp b/cuda/components/merging.cuh
similarity index 78%
rename from cuda/components/zero_array.hpp
rename to cuda/components/merging.cuh
index a4757a49082..80b300a4daf 100644
--- a/cuda/components/zero_array.hpp
+++ b/cuda/components/merging.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,13 +30,14 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
-#ifndef GKO_CUDA_COMPONENTS_ZERO_ARRAY_HPP_
-#define GKO_CUDA_COMPONENTS_ZERO_ARRAY_HPP_
+#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_
+#define GKO_CUDA_COMPONENTS_MERGING_CUH_
 
 
+#include "core/base/utils.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/searching.cuh"
 
 
 namespace gko {
@@ -44,16 +45,7 @@ namespace kernels {
 namespace cuda {
 
 
-/**
- * Zeroes an array allocated on a CUDA device.
- *
- * @tparam ValueType  the type of the array's elements
- *
- * @param n  the size of the array
- * @param array  the array to fill with zeros
- **/
-template <typename ValueType>
-void zero_array(size_type n, ValueType *array);
+#include "common/components/merging.hpp.inc"
 
 
 }  // namespace cuda
@@ -61,4 +53,4 @@ void zero_array(size_type n, ValueType *array);
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_COMPONENTS_ZERO_ARRAY_HPP_
+#endif  // GKO_CUDA_COMPONENTS_MERGING_CUH_
diff --git a/cuda/components/precision_conversion.cu b/cuda/components/precision_conversion.cu
new file mode 100644
index 00000000000..f98ef2cba32
--- /dev/null
+++ b/cuda/components/precision_conversion.cu
@@ -0,0 +1,81 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/precision_conversion.hpp"
+
+
+#include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace components {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/components/precision_conversion.hpp.inc"
+
+
+/**
+ * Launches the `convert_precision` kernel from precision_conversion.hpp.inc
+ * with at least `size` threads; the kernel is expected to write each of the
+ * `size` entries of `in`, converted to `TargetType`, to `out`.
+ *
+ * @param exec  the executor; not used by this function
+ * @param size  the number of entries to convert
+ * @param in  pointer to the source entries
+ * @param out  pointer to the converted entries
+ */
+template <typename SourceType, typename TargetType>
+void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
+                       size_type size, const SourceType *in, TargetType *out)
+{
+    auto num_blocks = ceildiv(size, default_block_size);
+    // An empty grid is an invalid launch configuration, so only launch when
+    // there is something to convert.
+    if (num_blocks > 0) {
+        convert_precision<<<num_blocks, default_block_size>>>(
+            size, as_cuda_type(in), as_cuda_type(out));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+
+
+}  // namespace components
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu
new file mode 100644
index 00000000000..ba4767a2547
--- /dev/null
+++ b/cuda/components/prefix_sum.cu
@@ -0,0 +1,87 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include "cuda/components/prefix_sum.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace components {
+
+
+constexpr int prefix_sum_block_size = 512;
+
+
+/**
+ * Computes the prefix sum of `counts` in place using two kernel launches:
+ * `start_prefix_sum` scans each block of `prefix_sum_block_size` entries and
+ * records the block totals, then `finalize_prefix_sum` adds the totals of all
+ * preceding blocks to each entry.
+ *
+ * @param exec  the executor used to allocate the temporary block totals
+ * @param counts  array to scan; passed to the kernels and overwritten
+ * @param num_entries  the number of entries in `counts`
+ */
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    // Launching an empty grid is an invalid configuration, and an empty array
+    // needs no work.
+    if (num_entries == 0) {
+        return;
+    }
+    auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
+    Array<IndexType> block_sum_array(exec, num_blocks);
+    auto block_sums = block_sum_array.get_data();
+    start_prefix_sum<prefix_sum_block_size>
+        <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
+                                                block_sums);
+    finalize_prefix_sum<prefix_sum_block_size>
+        <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
+                                                block_sums);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// instantiate for size_type as well, as this is used in the Sellp format
+template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh
index 8ce31f32a93..6a0cf5344c5 100644
--- a/cuda/components/prefix_sum.cuh
+++ b/cuda/components/prefix_sum.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -48,99 +48,7 @@ namespace kernels {
 namespace cuda {
 
 
-/**
- * @internal
- * First step of the calculation of a prefix sum. Calculates the prefix sum
- * in-place on parts of the array `elements`.
- *
- * @param block_size  thread block size for this kernel, also size of blocks on
- * which this kernel calculates the prefix sum in-place
- * @param elements  array on which the prefix sum is to be calculated
- * @param block_sum  array which stores the total sum of each block, requires at
- * least `ceildiv(num_elements, block_size)` elements
- * @param num_elements  total number of entries in `elements`
- *
- * @note To calculate the prefix sum over an array of size bigger than
- * `block_size`, `finalize_prefix_sum` has to be used as well.
- */
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void start_prefix_sum(
-    size_type num_elements, ValueType *__restrict__ elements,
-    ValueType *__restrict__ block_sum)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    const auto element_id = threadIdx.x;
-    __shared__ size_type prefix_helper[block_size];
-    prefix_helper[element_id] =
-        (tidx < num_elements) ? elements[tidx] : zero<ValueType>();
-    auto this_block = group::this_thread_block();
-    this_block.sync();
-
-    // Do a normal reduction
-#pragma unroll
-    for (int i = 1; i < block_size; i <<= 1) {
-        const auto ai = i * (2 * element_id + 1) - 1;
-        const auto bi = i * (2 * element_id + 2) - 1;
-        if (bi < block_size) {
-            prefix_helper[bi] += prefix_helper[ai];
-        }
-        this_block.sync();
-    }
-
-    if (element_id == 0) {
-        // Store the total sum
-        block_sum[blockIdx.x] = prefix_helper[block_size - 1];
-        prefix_helper[block_size - 1] = zero<ValueType>();
-    }
-
-    this_block.sync();
-
-    // Perform the down-sweep phase to get the true prefix sum
-#pragma unroll
-    for (int i = block_size >> 1; i > 0; i >>= 1) {
-        const auto ai = i * (2 * element_id + 1) - 1;
-        const auto bi = i * (2 * element_id + 2) - 1;
-        if (bi < block_size) {
-            auto tmp = prefix_helper[ai];
-            prefix_helper[ai] = prefix_helper[bi];
-            prefix_helper[bi] += tmp;
-        }
-        this_block.sync();
-    }
-    if (tidx < num_elements) {
-        elements[tidx] = prefix_helper[element_id];
-    }
-}
-
-
-/**
- * @internal
- * Second step of the calculation of a prefix sum. Increases the value of each
- * entry of `elements` by the total sum of all preceding blocks.
- *
- * @param block_size  thread block size for this kernel, has to be the same as
- * for `start_prefix_sum`
- * @param elements  array on which the prefix sum is to be calculated
- * @param block_sum  array storing the total sum of each block
- * @param num_elements  total number of entries in `elements`
- *
- * @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
- */
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void finalize_prefix_sum(
-    size_type num_elements, ValueType *__restrict__ elements,
-    const ValueType *__restrict__ block_sum)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tidx < num_elements) {
-        ValueType prefix_block_sum = zero<ValueType>();
-        for (size_type i = 0; i < blockIdx.x; i++) {
-            prefix_block_sum += block_sum[i];
-        }
-        elements[tidx] += prefix_block_sum;
-    }
-}
+#include "common/components/prefix_sum.hpp.inc"
 
 
 }  // namespace cuda
diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
index 839c90afed8..fd3522e6d99 100644
--- a/cuda/components/reduction.cuh
+++ b/cuda/components/reduction.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/std_extensions.hpp>
 
 
+#include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
@@ -53,152 +54,7 @@ namespace cuda {
 constexpr int default_block_size = 512;
 
 
-/**
- * @internal
- *
- * Computes a reduction using the binary operation `reduce_op` on a group
- * `group`. Each thread contributes with one element `local_data`. The local
- * thread element is always passed as the first parameter to the `reduce_op`.
- * The function returns the result of the reduction on all threads.
- *
- * @note The function is guarantied to return the correct value on all threads
- *       only if `reduce_op` is commutative (in addition to being associative).
- *       Otherwise, the correct value is returned only to the thread with
- *       subwarp index 0.
- */
-template <
-    typename Group, typename ValueType, typename Operator,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ ValueType reduce(const Group &group,
-                                            ValueType local_data,
-                                            Operator reduce_op = Operator{})
-{
-#pragma unroll
-    for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) {
-        const auto remote_data = group.shfl_xor(local_data, bitmask);
-        local_data = reduce_op(local_data, remote_data);
-    }
-    return local_data;
-}
-
-
-/**
- * @internal
- *
- * Returns the index of the thread that has the element with the largest
- * magnitude among all the threads in the group.
- * Only the values from threads which set `is_pivoted` to `false` will be
- * considered.
- */
-template <
-    typename Group, typename ValueType,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ int choose_pivot(const Group &group,
-                                            ValueType local_data,
-                                            bool is_pivoted)
-{
-    using real = remove_complex<ValueType>;
-    real lmag = is_pivoted ? -one<real>() : abs(local_data);
-    const auto pivot =
-        reduce(group, group.thread_rank(), [&](int lidx, int ridx) {
-            const auto rmag = group.shfl(lmag, ridx);
-            if (rmag > lmag) {
-                lmag = rmag;
-                lidx = ridx;
-            }
-            return lidx;
-        });
-    // pivot operator not commutative, make sure everyone has the same pivot
-    return group.shfl(pivot, 0);
-}
-
-
-/**
- * @internal
- *
- * Computes a reduction using the binary operation `reduce_op` on entire block.
- * The data for the reduction is taken from the `data` array which has to be of
- * size `block_size` and accessible from all threads. The `data` array is also
- * used as work space (so its content will be destroyed in the process), as well
- * as to store the return value - which is stored in the 0-th position of the
- * array.
- */
-template <
-    typename Group, typename ValueType, typename Operator,
-    typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
-__device__ void reduce(const Group &__restrict__ group,
-                       ValueType *__restrict__ data,
-                       Operator reduce_op = Operator{})
-{
-    const auto local_id = group.thread_rank();
-
-#pragma unroll
-    for (int k = group.size() / 2; k >= cuda_config::warp_size; k /= 2) {
-        group.sync();
-        if (local_id < k) {
-            data[local_id] = reduce_op(data[local_id], data[local_id + k]);
-        }
-    }
-
-    const auto warp = group::tiled_partition<cuda_config::warp_size>(group);
-    const auto warp_id = group.thread_rank() / warp.size();
-    if (warp_id > 0) {
-        return;
-    }
-    auto result = reduce(warp, data[warp.thread_rank()], reduce_op);
-    if (warp.thread_rank() == 0) {
-        data[0] = result;
-    }
-}
-
-
-/**
- * @internal
- *
- * Computes a reduction using the binary operation `reduce_op` on an array
- * `source` of any size. Has to be called a second time on `result` to reduce
- * an array larger than `block_size`.
- */
-template <typename Operator, typename ValueType>
-__device__ void reduce_array(size_type size,
-                             const ValueType *__restrict__ source,
-                             ValueType *__restrict__ result,
-                             Operator reduce_op = Operator{})
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    auto thread_result = zero<ValueType>();
-    for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) {
-        thread_result = reduce_op(thread_result, source[i]);
-    }
-    result[threadIdx.x] = thread_result;
-
-    group::this_thread_block().sync();
-
-    // Stores the result of the reduction inside `result[0]`
-    reduce(group::this_thread_block(), result, reduce_op);
-}
-
-
-/**
- * @internal
- *
- * Computes a reduction using the add operation (+) on an array
- * `source` of any size. Has to be called a second time on `result` to reduce
- * an array larger than `default_block_size`.
- */
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void reduce_add_array(
-    size_type size, const ValueType *__restrict__ source,
-    ValueType *__restrict__ result)
-{
-    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
-    reduce_array(size, source, static_cast<ValueType *>(block_sum),
-                 [](const ValueType &x, const ValueType &y) { return x + y; });
-
-    if (threadIdx.x == 0) {
-        result[blockIdx.x] = block_sum[0];
-    }
-}
+#include "common/components/reduction.hpp.inc"
 
 
 /**
@@ -233,9 +89,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
     reduce_add_array<<<1, default_block_size>>>(
         grid_dim, as_cuda_type(block_results_val),
         as_cuda_type(d_result.get_data()));
-    ValueType answer = zero<ValueType>();
-    exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
-                                  &answer);
+    auto answer = exec->copy_val_to_host(d_result.get_const_data());
     return answer;
 }
 
diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh
new file mode 100644
index 00000000000..186123e04f3
--- /dev/null
+++ b/cuda/components/searching.cuh
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_
+#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_
+
+
+#include "cuda/base/config.hpp"
+#include "cuda/components/intrinsics.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+#include "common/components/searching.hpp.inc"
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_COMPONENTS_SEARCHING_CUH_
diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh
index a0f87e4e555..37f5127da06 100644
--- a/cuda/components/segment_scan.cuh
+++ b/cuda/components/segment_scan.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -46,36 +45,7 @@ namespace kernels {
 namespace cuda {
 
 
-/**
- * @internal
- *
- * Compute a segement scan using add operation (+) of a subwarp. Each segment
- * performs suffix sum. Works on the source array and returns whether the thread
- * is the first element of its segment with same `ind`.
- */
-template <size_type subwarp_size, typename ValueType, typename IndexType>
-__device__ __forceinline__ bool segment_scan(
-    const group::thread_block_tile<subwarp_size> &group, const IndexType ind,
-    ValueType *__restrict__ val)
-{
-    bool head = true;
-#pragma unroll
-    for (int i = 1; i < subwarp_size; i <<= 1) {
-        const IndexType add_ind = group.shfl_up(ind, i);
-        ValueType add_val = zero<ValueType>();
-        if (add_ind == ind && threadIdx.x >= i) {
-            add_val = *val;
-            if (i == 1) {
-                head = false;
-            }
-        }
-        add_val = group.shfl_down(add_val, i);
-        if (threadIdx.x < subwarp_size - i) {
-            *val += add_val;
-        }
-    }
-    return head;
-}
+#include "common/components/segment_scan.hpp.inc"
 
 
 }  // namespace cuda
diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh
new file mode 100644
index 00000000000..9a5525f7a94
--- /dev/null
+++ b/cuda/components/sorting.cuh
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_COMPONENTS_SORTING_CUH_
+#define GKO_CUDA_COMPONENTS_SORTING_CUH_
+
+
+#include "cuda/base/config.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+#include "common/components/sorting.hpp.inc"
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_COMPONENTS_SORTING_CUH_
diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh
index fff091f4efd..31ebe0a28a6 100644
--- a/cuda/components/thread_ids.cuh
+++ b/cuda/components/thread_ids.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
 
 
+#include "cuda/base/config.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace cuda {
@@ -45,169 +48,7 @@ namespace cuda {
 namespace thread {
 
 
-/**
- * @internal
- *
- * Returns the ID of the block group this thread belongs to.
- *
- * @return the ID of the block group this thread belongs to
- *
- * @note Assumes that grid dimensions are in standard format:
- *       `(block_group_size, first_grid_dimension, second grid_dimension)`
- */
-__device__ __forceinline__ size_type get_block_group_id()
-{
-    return static_cast<size_type>(blockIdx.z) * gridDim.y + blockIdx.y;
-}
-
-/**
- * @internal
- *
- * Returns the ID of the block this thread belongs to.
- *
- * @return the ID of the block this thread belongs to
- *
- * @note Assumes that grid dimensions are in standard format:
- *       `(block_group_size, first_grid_dimension, second grid_dimension)`
- */
-__device__ __forceinline__ size_type get_block_id()
-{
-    return get_block_group_id() * gridDim.x + blockIdx.x;
-}
-
-
-/**
- * @internal
- *
- * Returns the local ID of the warp (relative to the block) this thread belongs
- * to.
- *
- * @return the local ID of the warp (relative to the block) this thread belongs
- *         to
- *
- * @note Assumes that block dimensions are in standard format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)`
- */
-__device__ __forceinline__ size_type get_local_warp_id()
-{
-    return static_cast<size_type>(threadIdx.z);
-}
-
-
-/**
- * @internal
- *
- * Returns the local ID of the sub-warp (relative to the block) this thread
- * belongs to.
- *
- * @tparam subwarp_size  size of the subwarp
- *
- * @return the local ID of the sub-warp (relative to the block) this thread
- *         belongs to
- *
- * @note Assumes that block dimensions are in standard format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)`
- */
-template <int subwarp_size>
-__device__ __forceinline__ size_type get_local_subwarp_id()
-{
-    constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size;
-    return get_local_warp_id() * subwarps_per_warp + threadIdx.y;
-}
-
-
-/**
- * @internal
- *
- * Returns the local ID of the thread (relative to the block).
- * to.
- *
- * @tparam subwarp_size  size of the subwarp
- *
- * @return the local ID of the thread (relative to the block)
- *
- * @note Assumes that block dimensions are in standard format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)`
- */
-template <int subwarp_size>
-__device__ __forceinline__ size_type get_local_thread_id()
-{
-    return get_local_subwarp_id<subwarp_size>() * subwarp_size + threadIdx.x;
-}
-
-
-/**
- * @internal
- *
- * Returns the global ID of the warp this thread belongs to.
- *
- * @tparam warps_per_block  number of warps within each block
- *
- * @return the global ID of the warp this thread belongs to.
- *
- * @note Assumes that block dimensions and grid dimensions are in standard
- *       format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)` and
- *       `(block_group_size, first_grid_dimension, second grid_dimension)`,
- *       respectively.
- */
-template <int warps_per_block>
-__device__ __forceinline__ size_type get_warp_id()
-{
-    return get_block_id() * warps_per_block + get_local_warp_id();
-}
-
-
-/**
- * @internal
- *
- * Returns the global ID of the sub-warp this thread belongs to.
- *
- * @tparam subwarp_size  size of the subwarp
- *
- * @return the global ID of the sub-warp this thread belongs to.
- *
- * @note Assumes that block dimensions and grid dimensions are in standard
- *       format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)` and
- *       `(block_group_size, first_grid_dimension, second grid_dimension)`,
- *       respectively.
- */
-template <int subwarp_size, int warps_per_block>
-__device__ __forceinline__ size_type get_subwarp_id()
-{
-    constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size;
-    return get_warp_id<warps_per_block>() * subwarps_per_warp + threadIdx.y;
-}
-
-
-/**
- * @internal
- *
- * Returns the global ID of the thread.
- *
- * @return the global ID of the thread.
- *
- * @tparam subwarp_size  size of the subwarp
- *
- * @note Assumes that block dimensions and grid dimensions are in standard
- *       format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)` and
- *       `(block_group_size, first_grid_dimension, second grid_dimension)`,
- *       respectively.
- */
-template <int subwarp_size, int warps_per_block>
-__device__ __forceinline__ size_type get_thread_id()
-{
-    return get_subwarp_id<subwarp_size, warps_per_block>() * subwarp_size +
-           threadIdx.x;
-}
+#include "common/components/thread_ids.hpp.inc"
 
 
 }  // namespace thread
diff --git a/cuda/components/uninitialized_array.hpp b/cuda/components/uninitialized_array.hpp
index e1d47d9e717..b3d9096f0c9 100644
--- a/cuda/components/uninitialized_array.hpp
+++ b/cuda/components/uninitialized_array.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,54 +42,7 @@ namespace kernels {
 namespace cuda {
 
 
-template <typename ValueType, size_type size>
-/**
- * Stores an array with uninitialized contents.
- */
-class UninitializedArray {
-public:
-    /**
-     * Operator for casting an UninitializedArray into its constexpr value
-     * pointer.
-     * @return the constexpr pointer to the first entry of the array.
-     */
-    constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
-    {
-        return &(*this)[0];
-    }
-
-    /**
-     * Operator for casting an UninitializedArray into its non-const value
-     * pointer.
-     * @return the non-const pointer to the first entry of the array.
-     */
-    GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; }
-
-    /**
-     * constexpr array access operator.
-     * @param pos The array index. Using a value outside [0, size) is undefined
-     * behavior.
-     * @return a reference to the array entry at the given index.
-     */
-    constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept
-    {
-        return reinterpret_cast<const ValueType *>(data_)[pos];
-    }
-
-    /**
-     * Non-const array access operator.
-     * @param pos The array index. Using a value outside [0, size) is undefined
-     * behavior.
-     * @return a reference to the array entry at the given index.
-     */
-    GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept
-    {
-        return reinterpret_cast<ValueType *>(data_)[pos];
-    }
-
-private:
-    unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
-};
+#include "common/components/uninitialized_array.hpp.inc"
 
 
 }  // namespace cuda
@@ -97,4 +50,4 @@ class UninitializedArray {
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_BASE_COMPONENTS_ARRAY_HPP_
+#endif  // GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh
index ba5906142a2..4ae18bfde18 100644
--- a/cuda/components/warp_blas.cuh
+++ b/cuda/components/warp_blas.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
 
 
+#include <cassert>
+
+
 #include <ginkgo/config.hpp>
 
 
@@ -41,374 +44,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/reduction.cuh"
 
 
-#include <cassert>
-
-
 namespace gko {
 namespace kernels {
 namespace cuda {
 
 
-/**
- * @internal
- *
- * Defines a postprocessing transformation that should be performed on the
- * result of a function call.
- *
- * @note This functionality should become useless once accessors and ranges are
- *       in place, as they will define the storage scheme.
- */
-enum postprocess_transformation { and_return, and_transpose };
-
-
-/**
- * @internal
- *
- * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan
- * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using
- * using the thread group `group.  Each thread contributes one `row` of the
- * matrix, and the routine uses warp shuffles to exchange data between rows. The
- * transform is performed by using the `key_row`-th row and `key_col`-th column
- * of the matrix.
- */
-template <
-    int max_problem_size, typename Group, typename ValueType,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ void apply_gauss_jordan_transform(
-    const Group &__restrict__ group, int32 key_row, int32 key_col,
-    ValueType *__restrict__ row, bool &__restrict__ status)
-{
-    auto key_col_elem = group.shfl(row[key_col], key_row);
-    if (key_col_elem == zero<ValueType>()) {
-        // TODO: implement error handling for GPUs to be able to properly
-        //       report it here
-        status = false;
-        return;
-    }
-    if (group.thread_rank() == key_row) {
-        key_col_elem = one<ValueType>() / key_col_elem;
-    } else {
-        key_col_elem = -row[key_col] / key_col_elem;
-    }
-#pragma unroll
-    for (int32 i = 0; i < max_problem_size; ++i) {
-        const auto key_row_elem = group.shfl(row[i], key_row);
-        if (group.thread_rank() == key_row) {
-            row[i] = zero<ValueType>();
-        }
-        row[i] += key_col_elem * key_row_elem;
-    }
-    row[key_col] = key_col_elem;
-}
-
-
-/**
- * @internal
- *
- * Inverts a matrix using Gauss-Jordan elimination. The inversion is
- * done in-place, so the original matrix will be overridden with the inverse.
- * The inversion routine uses implicit pivoting, so the returned matrix will be
- * a permuted inverse (from both sides). To obtain the correct inverse, the
- * rows of the result should be permuted with $P$, and the columns with
- * $ P^T $ (i.e.
- * $ A^{-1} = P X P $, where $ X $ is the returned matrix). These
- * permutation matrices are returned compressed as vectors `perm` and
- * `trans_perm`, respectively. `i`-th value of each of the vectors is returned
- * to thread of the group with rank `i`.
- *
- * @tparam max_problem_size  the maximum problem size that will be passed to the
- *                           inversion routine (a tighter bound results in
- *                           faster code
- * @tparam Group  type of the group of threads
- * @tparam ValueType  type of values stored in the matrix
- *
- * @param group  the group of threads which participate in the inversion
- * @param problem_size  the actual size of the matrix (cannot be larger than
- *                      max_problem_size)
- * @param row  a pointer to the matrix row (i-th thread in the group should
- *             pass the pointer to the i-th row), has to have at least
- *             max_problem_size elements
- * @param perm  a value to hold an element of permutation matrix $ P $
- * @param trans_perm  a value to hold an element of permutation matrix $ P^T
- * $
- *
- * @return true if the inversion succeeded, false otherwise
- */
-template <
-    int max_problem_size, typename Group, typename ValueType,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ bool invert_block(const Group &__restrict__ group,
-                                             uint32 problem_size,
-                                             ValueType *__restrict__ row,
-                                             uint32 &__restrict__ perm,
-                                             uint32 &__restrict__ trans_perm)
-{
-    GKO_ASSERT(problem_size <= max_problem_size);
-    // prevent rows after problem_size to become pivots
-    auto pivoted = group.thread_rank() >= problem_size;
-    auto status = true;
-#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
-#pragma unroll
-#else
-#pragma unroll 1
-#endif
-    for (int32 i = 0; i < max_problem_size; ++i) {
-        if (i >= problem_size) {
-            break;
-        }
-        const auto piv = choose_pivot(group, row[i], pivoted);
-        if (group.thread_rank() == piv) {
-            perm = i;
-            pivoted = true;
-        }
-        if (group.thread_rank() == i) {
-            trans_perm = piv;
-        }
-        apply_gauss_jordan_transform<max_problem_size>(group, piv, i, row,
-                                                       status);
-    }
-    return status;
-}
-
-
-/**
- * @internal
- *
- * Performs the correct index calculation for the given postprocess operation.
- */
-template <postprocess_transformation mod, typename T1, typename T2, typename T3>
-__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col,
-                                                             T3 stride) ->
-    typename std::enable_if<
-        mod != and_transpose,
-        typename std::decay<decltype(row * stride + col)>::type>::type
-{
-    return row * stride + col;
-}
-
-
-template <postprocess_transformation mod, typename T1, typename T2, typename T3>
-__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col,
-                                                             T3 stride) ->
-    typename std::enable_if<
-        mod == and_transpose,
-        typename std::decay<decltype(col * stride + row)>::type>::type
-{
-    return col * stride + row;
-}
-
-
-/**
- * @internal
- *
- * Copies a matrix stored as a collection of rows in different threads of the
- * warp in a block of memory accessible by all threads in row-major order.
- * Optionally permutes rows and columns of the matrix in the process.
- *
- * @tparam max_problem_size  maximum problem size passed to the routine
- * @tparam mod  the transformation to perform on the return data
- * @tparam Group  type of the group of threads
- * @tparam SourceValueType  type of values stored in the source matrix
- * @tparam ResultValueType  type of values stored in the result matrix
- *
- * @param group  group of threads participating in the copy
- * @param problem_size  actual size of the matrix
- *                      (`problem_size <= max_problem_size`)
- * @param source_row  pointer to memory used to store a row of the source matrix
- *                    `i`-th thread of the sub-warp should pass in the `i`-th
- *                    row of the matrix
- * @param increment  offset between two consecutive elements of the row
- * @param row_perm  permutation vector to apply on the rows of the matrix
- *                  (thread `i` supplies the `i`-th value of the vector)
- * @param col_perm  permutation vector to apply on the column of the matrix
- *                  (thread `i` supplies the `i`-th value of the vector)
- * @param destination  pointer to memory where the result will be stored
- *                     (all threads supply the same value)
- * @param stride  offset between two consecutive rows of the matrix
- */
-template <
-    int max_problem_size, postprocess_transformation mod = and_return,
-    typename Group, typename SourceValueType, typename ResultValueType,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ void copy_matrix(
-    const Group &__restrict__ group, uint32 problem_size,
-    const SourceValueType *__restrict__ source_row, uint32 increment,
-    uint32 row_perm, uint32 col_perm, ResultValueType *__restrict__ destination,
-    size_type stride)
-{
-    GKO_ASSERT(problem_size <= max_problem_size);
-#pragma unroll
-    for (int32 i = 0; i < max_problem_size; ++i) {
-        if (i >= problem_size) {
-            break;
-        }
-        const auto idx = group.shfl(col_perm, i);
-        if (group.thread_rank() < problem_size) {
-            destination[get_row_major_index<mod>(idx, row_perm, stride)] =
-                static_cast<ResultValueType>(source_row[i * increment]);
-        }
-    }
-}
-
-
-/**
- * @internal
- *
- * Multiplies a transposed vector and a matrix stored in column-major order.
- *
- * In mathematical terms, performs the operation $ res^T = vec^T \cdot mtx$.
- *
- * @tparam max_problem_size  maximum problem size passed to the routine
- * @tparam Group  type of the group of threads
- * @tparam MatrixValueType  type of values stored in the matrix
- * @tparam VectorValueType  type of values stored in the vectors
- *
- * @param group  group of threads participating in the operation
- * @param problem_size  actual size of the matrix
- *                      (`problem_size <= max_problem_size`)
- * @param vec  input vector to multiply (thread `i` supplies the `i`-th value of
- *             the vector)
- * @param mtx_row  pointer to memory used to store a row of the input matrix,
- *                    `i`-th thread of the sub-warp should pass in the
- *                    `i`-th row of the matrix
- * @param mtx_increment  offset between two consecutive elements of the row
- * @param res  pointer to a block of memory where the result will be written
- *             (only thread 0 of the group has to supply a valid value)
- * @param mtx_increment  offset between two consecutive elements of the result
- */
-template <
-    int max_problem_size, typename Group, typename MatrixValueType,
-    typename VectorValueType,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ void multiply_transposed_vec(
-    const Group &__restrict__ group, uint32 problem_size,
-    const VectorValueType &__restrict__ vec,
-    const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment,
-    VectorValueType *__restrict__ res, uint32 res_increment)
-{
-    GKO_ASSERT(problem_size <= max_problem_size);
-    auto mtx_elem = zero<VectorValueType>();
-#pragma unroll
-    for (int32 i = 0; i < max_problem_size; ++i) {
-        if (i >= problem_size) {
-            break;
-        }
-        if (group.thread_rank() < problem_size) {
-            mtx_elem = static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
-        }
-        const auto out =
-            reduce(group, mtx_elem * vec,
-                   [](VectorValueType x, VectorValueType y) { return x + y; });
-        if (group.thread_rank() == 0) {
-            res[i * res_increment] = out;
-        }
-    }
-}
-
-
-/**
- * @internal
- *
- * Multiplies a matrix and a vector stored in column-major order.
- *
- * In mathematical terms, performs the operation $res = mtx \cdot vec$.
- *
- * @tparam max_problem_size  maximum problem size passed to the routine
- * @tparam Group  type of the group of threads
- * @tparam MatrixValueType  type of values stored in the matrix
- * @tparam VectorValueType  type of values stored in the vectors
- * @tparam Closure  type of the function used to write the result
- *
- * @param group  group of threads participating in the operation
- * @param problem_size  actual size of the matrix
- *                      (`problem_size <= max_problem_size`)
- * @param vec  input vector to multiply (thread `i` supplies the `i`-th value of
- *             the vector)
- * @param mtx_row  pointer to memory used to store a row of the input matrix,
- *                    `i`-th thread of the sub-warp should pass in the
- *                    `i`-th row of the matrix
- * @param mtx_increment  offset between two consecutive elements of the row
- * @param res  pointer to a block of memory where the result will be written
- *             (only thread 0 of the group has to supply a valid value)
- * @param mtx_increment  offset between two consecutive elements of the result
- * @param closure_op  Operation that is performed when writing to
-                     `res[group.thread_rank() * res_increment]` as
-                     `closure_op(res[group.thread_rank() * res_increment], out)`
-                      where `out` is the result of $mtx \cdot vec$.
- */
-template <
-    int max_problem_size, typename Group, typename MatrixValueType,
-    typename VectorValueType, typename Closure,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ void multiply_vec(
-    const Group &__restrict__ group, uint32 problem_size,
-    const VectorValueType &__restrict__ vec,
-    const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment,
-    VectorValueType *__restrict__ res, uint32 res_increment, Closure closure_op)
-{
-    GKO_ASSERT(problem_size <= max_problem_size);
-    auto mtx_elem = zero<VectorValueType>();
-    auto out = zero<VectorValueType>();
-#pragma unroll
-    for (int32 i = 0; i < max_problem_size; ++i) {
-        if (i >= problem_size) {
-            break;
-        }
-        if (group.thread_rank() < problem_size) {
-            mtx_elem = static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
-        }
-        out += mtx_elem * group.shfl(vec, i);
-    }
-    if (group.thread_rank() < problem_size) {
-        closure_op(res[group.thread_rank() * res_increment], out);
-    }
-}
-
-
-/**
- * @internal
- *
- * Computes the infinity norm of a matrix. Each thread in the group supplies
- * one row of the matrix.
- *
- * @tparam max_problem_size  maximum problem size passed to the routine
- * @tparam Group  type of the group of threads
- * @tparam ValueType  type of values stored in the matrix
- *
- * @param group  group of threads participating in the operation
- * @param num_rows  number of rows of the matrix
- *                      (`num_rows <= max_problem_size`)
- * @param num_cols  number of columns of the matrix
- * @param row  pointer to memory used to store a row of the input matrix,
- *             `i`-th thread of the group should pass in the `i`-th row of the
- *             matrix
- *
- * @return the infinity norm of the matrix
- */
-template <
-    int max_problem_size, typename Group, typename ValueType,
-    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
-__device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
-    const Group &group, uint32 num_rows, uint32 num_cols, const ValueType *row)
-{
-    using result_type = remove_complex<ValueType>;
-    auto sum = zero<result_type>();
-    if (group.thread_rank() < num_rows) {
-#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
-#pragma unroll
-#else
-#pragma unroll 1
-#endif
-        for (uint32 i = 0; i < max_problem_size; ++i) {
-            if (i >= num_cols) {
-                break;
-            }
-            sum += abs(row[i]);
-        }
-    }
-    return reduce(group, sum,
-                  [](result_type x, result_type y) { return max(x, y); });
-}
+#include "common/components/warp_blas.hpp.inc"
 
 
 }  // namespace cuda
diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu
new file mode 100644
index 00000000000..6f5f6b4ee05
--- /dev/null
+++ b/cuda/factorization/factorization_kernels.cu
@@ -0,0 +1,252 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/factorization_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/searching.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+// Number of threads per block used by every kernel launch in this file.
+// NOTE(review): the kernels pulled in by the .hpp.inc include below presumably
+// reference this constant as well (e.g. in their launch bounds) -- confirm
+// before renaming or changing the value.
+constexpr int default_block_size{512};
+
+
+#include "common/factorization/factorization_kernels.hpp.inc"
+
+
+/**
+ * Makes sure that `mtx` stores an entry on the main diagonal of every row.
+ *
+ * The kernel `find_missing_diagonal_elements` is handed a per-row addition
+ * buffer and a device-side flag. The flag is copied back to the host and the
+ * function returns without modifying `mtx` if it is still `false`.
+ * Otherwise a prefix sum over the addition buffer yields the total number of
+ * additional entries, enlarged value / column index arrays are allocated and
+ * filled by `add_missing_diagonal_elements`, the row pointers are adjusted by
+ * `update_row_ptrs`, and the new arrays are moved into `mtx`.
+ *
+ * @param exec  executor on which the temporary arrays are allocated
+ * @param mtx  matrix that is modified in place
+ * @param is_sorted  selects the `<true>` or `<false>` instantiation of the
+ *                   search kernel (presumably whether the column indices of
+ *                   each row are sorted -- confirm against the kernel)
+ */
+template <typename ValueType, typename IndexType>
+void add_diagonal_elements(std::shared_ptr<const CudaExecutor> exec,
+                           matrix::Csr<ValueType, IndexType> *mtx,
+                           bool is_sorted)
+{
+    // TODO: Runtime can be optimized by choosing an appropriate size for the
+    //       subwarp dependent on the matrix properties
+    constexpr int subwarp_size = config::warp_size;
+    auto mtx_size = mtx->get_size();
+    auto num_rows = static_cast<IndexType>(mtx_size[0]);
+    auto num_cols = static_cast<IndexType>(mtx_size[1]);
+    if (num_rows == 0) {
+        // A matrix without rows has no diagonal; returning here also avoids
+        // launching the kernels below with an empty (invalid) grid.
+        return;
+    }
+    size_type row_ptrs_size = num_rows + 1;
+
+    Array<IndexType> row_ptrs_addition(exec, row_ptrs_size);
+    // The flag lives on the device; a host copy is used to initialize it and
+    // to read it back after the search kernel ran.
+    Array<bool> needs_change_host{exec->get_master(), 1};
+    needs_change_host.get_data()[0] = false;
+    Array<bool> needs_change_device{exec, 1};
+    needs_change_device = needs_change_host;
+
+    auto cuda_old_values = as_cuda_type(mtx->get_const_values());
+    auto cuda_old_col_idxs = as_cuda_type(mtx->get_const_col_idxs());
+    auto cuda_old_row_ptrs = as_cuda_type(mtx->get_row_ptrs());
+    auto cuda_row_ptrs_add = as_cuda_type(row_ptrs_addition.get_data());
+
+    // One subwarp per row.
+    const dim3 block_dim{default_block_size, 1, 1};
+    const dim3 grid_dim{
+        static_cast<uint32>(ceildiv(num_rows, block_dim.x / subwarp_size)), 1,
+        1};
+    if (is_sorted) {
+        kernel::find_missing_diagonal_elements<true, subwarp_size>
+            <<<grid_dim, block_dim>>>(
+                num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs,
+                cuda_row_ptrs_add,
+                as_cuda_type(needs_change_device.get_data()));
+    } else {
+        kernel::find_missing_diagonal_elements<false, subwarp_size>
+            <<<grid_dim, block_dim>>>(
+                num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs,
+                cuda_row_ptrs_add,
+                as_cuda_type(needs_change_device.get_data()));
+    }
+    needs_change_host = needs_change_device;
+    if (!needs_change_host.get_const_data()[0]) {
+        return;
+    }
+
+    components::prefix_sum(exec, cuda_row_ptrs_add, row_ptrs_size);
+    exec->synchronize();
+
+    // After the prefix sum, the last entry holds the total number of
+    // additions.
+    auto total_additions =
+        exec->copy_val_to_host(cuda_row_ptrs_add + row_ptrs_size - 1);
+    size_type new_num_elems = static_cast<size_type>(total_additions) +
+                              mtx->get_num_stored_elements();
+
+
+    Array<ValueType> new_values{exec, new_num_elems};
+    Array<IndexType> new_col_idxs{exec, new_num_elems};
+    auto cuda_new_values = as_cuda_type(new_values.get_data());
+    auto cuda_new_col_idxs = as_cuda_type(new_col_idxs.get_data());
+
+    kernel::add_missing_diagonal_elements<subwarp_size>
+        <<<grid_dim, block_dim>>>(num_rows, cuda_old_values, cuda_old_col_idxs,
+                                  cuda_old_row_ptrs, cuda_new_values,
+                                  cuda_new_col_idxs, cuda_row_ptrs_add);
+
+    // Size the grid from the number of row pointers (num_rows + 1) the
+    // kernel is asked to update, so that the last entry is covered by a
+    // thread even when num_rows is a multiple of the block size.
+    const dim3 grid_dim_row_ptrs_update{
+        static_cast<uint32>(
+            ceildiv(row_ptrs_size, static_cast<size_type>(block_dim.x))),
+        1, 1};
+    kernel::update_row_ptrs<<<grid_dim_row_ptrs_update, block_dim>>>(
+        num_rows + 1, cuda_old_row_ptrs, cuda_row_ptrs_add);
+
+    matrix::CsrBuilder<ValueType, IndexType> mtx_builder{mtx};
+    mtx_builder.get_value_array() = std::move(new_values);
+    mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
+
+
+/**
+ * Builds the row pointers of the L and U factors from the sparsity pattern of
+ * `system_matrix`.
+ *
+ * The kernel `count_nnz_per_l_u_row` is launched with one thread per row and
+ * writes into `l_row_ptrs` / `u_row_ptrs`; afterwards a prefix sum over
+ * `num_rows + 1` entries is applied to each of the two arrays.
+ *
+ * @param exec  executor passed on to the prefix sum
+ * @param system_matrix  matrix whose rows are inspected
+ * @param l_row_ptrs  device array with at least `num_rows + 1` entries
+ * @param u_row_ptrs  device array with at least `num_rows + 1` entries
+ */
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l_u(
+    std::shared_ptr<const CudaExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+
+    const dim3 block_size{default_block_size, 1, 1};
+    const uint32 number_blocks =
+        ceildiv(num_rows, static_cast<size_type>(block_size.x));
+    const dim3 grid_dim{number_blocks, 1, 1};
+
+    // A grid with zero blocks is an invalid launch configuration, so the
+    // counting kernel is skipped for a matrix without rows.
+    if (number_blocks > 0) {
+        kernel::count_nnz_per_l_u_row<<<grid_dim, block_size, 0, 0>>>(
+            num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()),
+            as_cuda_type(system_matrix->get_const_col_idxs()),
+            as_cuda_type(system_matrix->get_const_values()),
+            as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs));
+    }
+
+    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    components::prefix_sum(exec, u_row_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
+
+
+/**
+ * Fills the column indices and values of `csr_l` and `csr_u` from
+ * `system_matrix` by launching `kernel::initialize_l_u` with one thread per
+ * row.
+ *
+ * The row pointers of both factors are only read, so they have to be set up
+ * before this function is called.
+ *
+ * @param exec  executor (not used by the launch itself)
+ * @param system_matrix  source matrix
+ * @param csr_l  L factor whose column indices and values are written
+ * @param csr_u  U factor whose column indices and values are written
+ */
+template <typename ValueType, typename IndexType>
+void initialize_l_u(std::shared_ptr<const CudaExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *system_matrix,
+                    matrix::Csr<ValueType, IndexType> *csr_l,
+                    matrix::Csr<ValueType, IndexType> *csr_u)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const dim3 block_size{default_block_size, 1, 1};
+    const dim3 grid_dim{static_cast<uint32>(ceildiv(
+                            num_rows, static_cast<size_type>(block_size.x))),
+                        1, 1};
+    if (grid_dim.x == 0) {
+        // No rows means nothing to copy; launching with an empty grid would
+        // be an invalid launch configuration.
+        return;
+    }
+
+    kernel::initialize_l_u<<<grid_dim, block_size, 0, 0>>>(
+        num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()),
+        as_cuda_type(system_matrix->get_const_col_idxs()),
+        as_cuda_type(system_matrix->get_const_values()),
+        as_cuda_type(csr_l->get_const_row_ptrs()),
+        as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()),
+        as_cuda_type(csr_u->get_const_row_ptrs()),
+        as_cuda_type(csr_u->get_col_idxs()), as_cuda_type(csr_u->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+
+
+/**
+ * Builds the row pointers of the L factor from the sparsity pattern of
+ * `system_matrix`.
+ *
+ * The kernel `count_nnz_per_l_row` is launched with one thread per row and
+ * writes into `l_row_ptrs`; afterwards a prefix sum over `num_rows + 1`
+ * entries is applied to the array.
+ *
+ * @param exec  executor passed on to the prefix sum
+ * @param system_matrix  matrix whose rows are inspected
+ * @param l_row_ptrs  device array with at least `num_rows + 1` entries
+ */
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l(
+    std::shared_ptr<const CudaExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+
+    const dim3 block_size{default_block_size, 1, 1};
+    const uint32 number_blocks =
+        ceildiv(num_rows, static_cast<size_type>(block_size.x));
+    const dim3 grid_dim{number_blocks, 1, 1};
+
+    // A grid with zero blocks is an invalid launch configuration, so the
+    // counting kernel is skipped for a matrix without rows.
+    if (number_blocks > 0) {
+        kernel::count_nnz_per_l_row<<<grid_dim, block_size, 0, 0>>>(
+            num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()),
+            as_cuda_type(system_matrix->get_const_col_idxs()),
+            as_cuda_type(system_matrix->get_const_values()),
+            as_cuda_type(l_row_ptrs));
+    }
+
+    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
+
+
+/**
+ * Fills the column indices and values of `csr_l` from `system_matrix` by
+ * launching `kernel::initialize_l` with one thread per row.
+ *
+ * The row pointers of `csr_l` are only read, so they have to be set up before
+ * this function is called.
+ *
+ * @param exec  executor (not used by the launch itself)
+ * @param system_matrix  source matrix
+ * @param csr_l  L factor whose column indices and values are written
+ * @param diag_sqrt  forwarded unchanged to the kernel (presumably requests
+ *                   the square root of the diagonal entries -- confirm
+ *                   against the kernel)
+ */
+template <typename ValueType, typename IndexType>
+void initialize_l(std::shared_ptr<const CudaExecutor> exec,
+                  const matrix::Csr<ValueType, IndexType> *system_matrix,
+                  matrix::Csr<ValueType, IndexType> *csr_l, bool diag_sqrt)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const dim3 block_size{default_block_size, 1, 1};
+    const dim3 grid_dim{static_cast<uint32>(ceildiv(
+                            num_rows, static_cast<size_type>(block_size.x))),
+                        1, 1};
+    if (grid_dim.x == 0) {
+        // No rows means nothing to copy; launching with an empty grid would
+        // be an invalid launch configuration.
+        return;
+    }
+
+    kernel::initialize_l<<<grid_dim, block_size, 0, 0>>>(
+        num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()),
+        as_cuda_type(system_matrix->get_const_col_idxs()),
+        as_cuda_type(system_matrix->get_const_values()),
+        as_cuda_type(csr_l->get_const_row_ptrs()),
+        as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()),
+        diag_sqrt);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu
new file mode 100644
index 00000000000..b7debb21bc3
--- /dev/null
+++ b/cuda/factorization/ilu_kernels.cu
@@ -0,0 +1,95 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/ilu_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "cuda/base/cusparse_bindings.hpp"
+#include "cuda/base/device_guard.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace ilu_factorization {
+
+
+/**
+ * Runs the cuSPARSE ILU(0) routines on `m`.
+ *
+ * A matrix descriptor and an ILU(0) info object are created, the required
+ * workspace size is queried, a buffer of that size is allocated on `exec`,
+ * and the analysis and factorization steps are executed with the
+ * `CUSPARSE_SOLVE_POLICY_USE_LEVEL` policy. The factorization step receives
+ * the writable value array of `m`.
+ *
+ * The descriptor and the info object are released on every exit path, also
+ * when one of the intermediate steps (or the buffer allocation) throws.
+ *
+ * @param exec  executor providing the device id and the cuSPARSE handle
+ * @param m  matrix handed to the factorization routine
+ */
+template <typename ValueType, typename IndexType>
+void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
+                matrix::Csr<ValueType, IndexType> *m)
+{
+    const auto id = exec->get_device_id();
+    auto handle = exec->get_cusparse_handle();
+    gko::cuda::device_guard g{id};
+
+    IndexType num_rows = m->get_size()[0];
+    IndexType nnz = m->get_num_stored_elements();
+
+    auto desc = cusparse::create_mat_descr();
+    try {
+        auto info = cusparse::create_ilu0_info();
+        try {
+            // get buffer size for ILU
+            size_type buffer_size{};
+            cusparse::ilu0_buffer_size(handle, num_rows, nnz, desc,
+                                       m->get_const_values(),
+                                       m->get_const_row_ptrs(),
+                                       m->get_const_col_idxs(), info,
+                                       buffer_size);
+
+            Array<char> buffer{exec, buffer_size};
+
+            // set up ILU(0)
+            cusparse::ilu0_analysis(handle, num_rows, nnz, desc,
+                                    m->get_const_values(),
+                                    m->get_const_row_ptrs(),
+                                    m->get_const_col_idxs(), info,
+                                    CUSPARSE_SOLVE_POLICY_USE_LEVEL,
+                                    buffer.get_data());
+
+            cusparse::ilu0(handle, num_rows, nnz, desc, m->get_values(),
+                           m->get_const_row_ptrs(), m->get_const_col_idxs(),
+                           info, CUSPARSE_SOLVE_POLICY_USE_LEVEL,
+                           buffer.get_data());
+        } catch (...) {
+            // do not leak the info object when a step above failed
+            cusparse::destroy(info);
+            throw;
+        }
+        cusparse::destroy(info);
+    } catch (...) {
+        // do not leak the descriptor, whatever failed after its creation
+        cusparse::destroy(desc);
+        throw;
+    }
+    cusparse::destroy(desc);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+
+
+}  // namespace ilu_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
new file mode 100644
index 00000000000..98aa1c04831
--- /dev/null
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -0,0 +1,209 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/merging.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/searching.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+// Number of threads per block used by every kernel launch in this file.
+constexpr auto default_block_size = 512;
+
+
+// Subwarp sizes for which the warp-parallel kernels of this file
+// (add_candidates, compute_factor) are instantiated. The dispatch predicates
+// further down accept the first entry that is at least the average number of
+// non-zeros per row and use config::warp_size as the catch-all entry.
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ict_spgeam_kernels.hpp.inc"
+#include "common/factorization/par_ict_sweep_kernels.hpp.inc"
+
+
+namespace {
+
+
+/**
+ * Subwarp-size specific implementation behind the public `add_candidates`.
+ *
+ * Works in two passes: `ict_tri_spgeam_nnz` writes per-row counts into the
+ * row pointer array of `l_new`, which a prefix sum turns into row pointers;
+ * the value and column index arrays of `l_new` are then resized to the
+ * resulting number of non-zeros and filled by `ict_tri_spgeam_init`.
+ *
+ * @tparam subwarp_size  number of threads cooperating on a single row
+ *
+ * @param exec  executor used for the prefix sum and the device-to-host copy
+ * @param llt  first input matrix of both kernels
+ * @param a  second input matrix of both kernels
+ * @param l  current factor, read by the fill kernel only
+ * @param l_new  output matrix; its row pointers, column indices and values
+ *               are overwritten
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *llt,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    matrix::Csr<ValueType, IndexType> *l_new)
+{
+    auto num_rows = static_cast<IndexType>(llt->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    auto llt_row_ptrs = llt->get_const_row_ptrs();
+    auto llt_col_idxs = llt->get_const_col_idxs();
+    auto llt_vals = llt->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    // count non-zeros per row
+    // (skipped for a matrix without rows: a grid with zero blocks is an
+    // invalid launch configuration)
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size>>>(llt_row_ptrs, llt_col_idxs,
+                                                 a_row_ptrs, a_col_idxs,
+                                                 l_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs
+    components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+
+    // the pointers have to be fetched after the resize above
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+
+    // fill columns and values
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size>>>(
+                llt_row_ptrs, llt_col_idxs, as_cuda_type(llt_vals), a_row_ptrs,
+                a_col_idxs, as_cuda_type(a_vals), l_row_ptrs, l_col_idxs,
+                as_cuda_type(l_vals), l_new_row_ptrs, l_new_col_idxs,
+                as_cuda_type(l_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+/**
+ * Subwarp-size specific implementation behind the public `compute_factor`.
+ *
+ * Launches `ict_sweep` with one subwarp per stored element of `l`; the values
+ * of `l` are passed as the only writable array.
+ *
+ * @tparam subwarp_size  number of threads cooperating on a single element
+ *
+ * @param exec  executor (not used by the launch itself)
+ * @param a  system matrix, read only
+ * @param l  factor whose values are handed to the kernel for updating
+ * @param l_coo  supplies the row index of every stored element of `l`
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_factor(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Coo<ValueType, IndexType> *l_coo)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(total_nnz, subwarps_per_block);
+    if (num_blocks == 0) {
+        // Nothing is stored in l, and a grid with zero blocks would be an
+        // invalid launch configuration.
+        return;
+    }
+    kernel::ict_sweep<subwarp_size><<<num_blocks, default_block_size>>>(
+        a->get_const_row_ptrs(), a->get_const_col_idxs(),
+        as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(),
+        l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+        as_cuda_type(l->get_values()), total_nnz);
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
+
+
+}  // namespace
+
+
+/**
+ * Picks a subwarp size from `compiled_kernels` and forwards to the matching
+ * `add_candidates` instantiation.
+ *
+ * The first compiled size that is at least the average number of stored
+ * elements per row of `llt` and `a` combined is selected, with
+ * `config::warp_size` as the fallback.
+ *
+ * @param exec  executor forwarded to the implementation
+ * @param llt  forwarded to the implementation
+ * @param a  forwarded to the implementation; also provides the row count
+ * @param l  forwarded to the implementation
+ * @param l_new  forwarded to the implementation
+ */
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *llt,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    matrix::Csr<ValueType, IndexType> *l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        llt->get_num_stored_elements() + a->get_num_stored_elements();
+    // Guard the average against a matrix without rows: an integer division
+    // by zero is undefined behavior.
+    auto total_nnz_per_row = total_nnz;
+    if (num_rows == 0) {
+        total_nnz_per_row = 0;
+    } else {
+        total_nnz_per_row = total_nnz / num_rows;
+    }
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, llt, a, l, l_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+/**
+ * Picks a subwarp size from `compiled_kernels` and forwards to the matching
+ * `compute_factor` instantiation.
+ *
+ * The first compiled size that is at least twice the average number of stored
+ * elements per row of `l` is selected, with `config::warp_size` as the
+ * fallback.
+ *
+ * @param exec  executor forwarded to the implementation
+ * @param a  forwarded to the implementation; also provides the row count
+ * @param l  forwarded to the implementation
+ * @param l_coo  forwarded to the implementation
+ */
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Coo<ValueType, IndexType> *l_coo)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = 2 * l->get_num_stored_elements();
+    // Guard the average against a matrix without rows: an integer division
+    // by zero is undefined behavior.
+    auto total_nnz_per_row = total_nnz;
+    if (num_rows == 0) {
+        total_nnz_per_row = 0;
+    } else {
+        total_nnz_per_row = total_nnz / num_rows;
+    }
+    select_compute_factor(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index 6f212b9d75c..65e3798a881 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,14 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
-#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -57,186 +56,7 @@ namespace par_ilu_factorization {
 constexpr int default_block_size{512};
 
 
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
-    size_type num_rows, const IndexType *__restrict__ row_ptrs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row,
-    IndexType *__restrict__ u_nnz_row)
-{
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
-    if (row < num_rows) {
-        IndexType l_row_nnz{};
-        IndexType u_row_nnz{};
-        for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) {
-            auto col = col_idxs[idx];
-            l_row_nnz += (col <= row);
-            u_row_nnz += (row <= col);
-        }
-        l_nnz_row[row] = l_row_nnz;
-        u_nnz_row[row] = u_row_nnz;
-    }
-}
-
-
-}  // namespace kernel
-
-
-template <typename ValueType, typename IndexType>
-void initialize_row_ptrs_l_u(
-    std::shared_ptr<const CudaExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *system_matrix,
-    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
-{
-    const size_type num_rows{system_matrix->get_size()[0]};
-    const size_type num_row_ptrs{num_rows + 1};
-
-    const dim3 block_size{default_block_size, 1, 1};
-    const uint32 number_blocks =
-        ceildiv(num_rows, static_cast<size_type>(block_size.x));
-    const dim3 grid_dim{number_blocks, 1, 1};
-
-    kernel::count_nnz_per_l_u_row<<<grid_dim, block_size, 0, 0>>>(
-        num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()),
-        as_cuda_type(system_matrix->get_const_col_idxs()),
-        as_cuda_type(system_matrix->get_const_values()),
-        as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs));
-
-    Array<IndexType> block_sum(exec, grid_dim.x);
-    auto block_sum_ptr = block_sum.get_data();
-
-    start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
-    finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
-
-    start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
-    finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL);
-
-
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l_u(
-    size_type num_rows, const IndexType *__restrict__ row_ptrs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values,
-    const IndexType *__restrict__ l_row_ptrs,
-    IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values,
-    const IndexType *__restrict__ u_row_ptrs,
-    IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
-{
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row];
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            if (col <= row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = (col == row ? one<ValueType>() : val);
-                ++l_idx;
-            }
-            if (row <= col) {
-                u_col_idxs[u_idx] = col;
-                u_values[u_idx] = val;
-                ++u_idx;
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
-template <typename ValueType, typename IndexType>
-void initialize_l_u(std::shared_ptr<const CudaExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType> *system_matrix,
-                    matrix::Csr<ValueType, IndexType> *csr_l,
-                    matrix::Csr<ValueType, IndexType> *csr_u)
-{
-    const size_type num_rows{system_matrix->get_size()[0]};
-    const dim3 block_size{default_block_size, 1, 1};
-    const dim3 grid_dim{static_cast<uint32>(ceildiv(
-                            num_rows, static_cast<size_type>(block_size.x))),
-                        1, 1};
-
-    kernel::initialize_l_u<<<grid_dim, block_size, 0, 0>>>(
-        num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()),
-        as_cuda_type(system_matrix->get_const_col_idxs()),
-        as_cuda_type(system_matrix->get_const_values()),
-        as_cuda_type(csr_l->get_const_row_ptrs()),
-        as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()),
-        as_cuda_type(csr_u->get_const_row_ptrs()),
-        as_cuda_type(csr_u->get_col_idxs()), as_cuda_type(csr_u->get_values()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL);
-
-
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
-    size_type num_elements, const IndexType *__restrict__ row_idxs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values,
-    const IndexType *__restrict__ l_row_ptrs,
-    const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values,
-    const IndexType *__restrict__ u_row_ptrs,
-    const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
-{
-    const auto elem_id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (elem_id < num_elements) {
-        const auto row = row_idxs[elem_id];
-        const auto col = col_idxs[elem_id];
-        const auto val = values[elem_id];
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[col];
-        ValueType sum{val};
-        ValueType last_operation{};
-        while (l_idx < l_row_ptrs[row + 1] && u_idx < u_row_ptrs[col + 1]) {
-            const auto l_col = l_col_idxs[l_idx];
-            const auto u_col = u_col_idxs[u_idx];
-            last_operation = zero<ValueType>();
-            if (l_col == u_col) {
-                last_operation = l_values[l_idx] * u_values[u_idx];
-                sum -= last_operation;
-            }
-            l_idx += (l_col <= u_col);
-            u_idx += (u_col <= l_col);
-        }
-        sum += last_operation;  // undo the last operation
-        if (row > col) {
-            auto to_write = sum / u_values[u_row_ptrs[col + 1] - 1];
-            if (::gko::isfinite(to_write)) {
-                l_values[l_idx - 1] = to_write;
-            }
-        } else {
-            auto to_write = sum;
-            if (::gko::isfinite(to_write)) {
-                u_values[u_idx - 1] = to_write;
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
+#include "common/factorization/par_ilu_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu
new file mode 100644
index 00000000000..8b7b1a88443
--- /dev/null
+++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu
@@ -0,0 +1,206 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/atomic.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/sorting.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/factorization/par_ilut_select_common.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// Subwarp sizes for which the filter kernels are compiled.
+// The predicate handed to select_threshold_filter_approx further down accepts
+// an entry once the average number of stored elements per row fits into it,
+// and always accepts config::warp_size.
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_filter_kernels.hpp.inc"
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+/**
+ * Approximate threshold filter for one fixed subwarp size.
+ *
+ * Buckets the stored values of `m` with sampleselect_count, looks up the
+ * bucket containing `rank`, reports the splitter of that bucket through
+ * `threshold` and then runs the bucket_filter_nnz / bucket_filter kernels to
+ * build `m_out` (and optionally `m_out_coo`).
+ *
+ * @param exec  executor used for the prefix sum and device-to-host copies
+ * @param m  input matrix
+ * @param rank  rank handed to sampleselect_find_bucket
+ * @param tmp  scratch array; resized here and partitioned into total counts,
+ *             partial counts, oracles and the search tree
+ * @param threshold  receives the splitter of the located bucket, or zero if
+ *                   the located bucket is bucket 0
+ * @param m_out  output matrix; its row pointers are overwritten in place
+ *               (num_rows + 1 entries), column indices and values are resized
+ * @param m_out_coo  optional COO output; when non-null, its row index array
+ *                   is resized and its column index and value arrays become
+ *                   views of the arrays of `m_out`
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter_approx(syn::value_list<int, subwarp_size>,
+                             std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> *tmp,
+                             remove_complex<ValueType> *threshold,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto in_vals = m->get_const_values();
+    IndexType nnz = m->get_num_stored_elements();
+    auto thread_bound = ceildiv(nnz, items_per_thread);
+    auto block_bound = ceildiv(thread_bound, default_block_size);
+
+    // all section lengths are counted in units of sizeof(ValueType)
+    size_type totals_len =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type partials_len = ceildiv(
+        bucket_count * block_bound * sizeof(IndexType), sizeof(ValueType));
+    size_type oracles_len =
+        ceildiv(nnz * sizeof(unsigned char), sizeof(ValueType));
+    size_type tree_len =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    tmp->resize_and_reset(totals_len + partials_len + oracles_len + tree_len);
+
+    // carve the scratch array into its four consecutive sections
+    auto totals_begin = tmp->get_data();
+    auto partials_begin = totals_begin + totals_len;
+    auto oracles_begin = partials_begin + partials_len;
+    auto tree_begin = oracles_begin + oracles_len;
+    auto total_counts = reinterpret_cast<IndexType *>(totals_begin);
+    auto partial_counts = reinterpret_cast<IndexType *>(partials_begin);
+    auto oracles = reinterpret_cast<unsigned char *>(oracles_begin);
+    auto tree = reinterpret_cast<AbsType *>(tree_begin);
+
+    sampleselect_count(in_vals, nnz, tree, oracles, partial_counts,
+                       total_counts);
+
+    // look up the bucket that contains the requested rank
+    const auto located = sampleselect_find_bucket(exec, total_counts, rank);
+    auto bucket = static_cast<unsigned char>(located.idx);
+    auto splitter = tree + kernel::searchtree_inner_size + bucket;
+    *threshold = exec->copy_val_to_host(splitter);
+    // the first splitter is implicitly -inf, but 0 works as well
+    if (bucket == 0) {
+        *threshold = zero<AbsType>();
+    }
+
+    // first pass: compute the number of output entries for each row
+    auto num_rows = static_cast<IndexType>(m->get_size()[0]);
+    auto rows_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, rows_per_block);
+    auto in_row_ptrs = m->get_const_row_ptrs();
+    auto out_row_ptrs = m_out->get_row_ptrs();
+    kernel::bucket_filter_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        in_row_ptrs, oracles, num_rows, bucket, out_row_ptrs);
+
+    // turn the per-row counts into row pointers
+    components::prefix_sum(exec, out_row_ptrs, num_rows + 1);
+
+    // the last row pointer is the total size of the output
+    auto out_nnz = exec->copy_val_to_host(out_row_ptrs + num_rows);
+    matrix::CsrBuilder<ValueType, IndexType> csr_builder{m_out};
+    csr_builder.get_col_idx_array().resize_and_reset(out_nnz);
+    csr_builder.get_value_array().resize_and_reset(out_nnz);
+    // fetch the pointers only after resizing
+    auto out_col_idxs = m_out->get_col_idxs();
+    auto out_vals = m_out->get_values();
+    IndexType *out_row_idxs{};
+    if (m_out_coo) {
+        // the COO output shares column indices and values with m_out
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(out_nnz);
+        coo_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, out_nnz, out_col_idxs);
+        coo_builder.get_value_array() =
+            Array<ValueType>::view(exec, out_nnz, out_vals);
+        out_row_idxs = m_out_coo->get_row_idxs();
+    }
+
+    // second pass: fill the output arrays
+    auto in_col_idxs = m->get_const_col_idxs();
+    auto in_vals_cuda = as_cuda_type(in_vals);
+    kernel::bucket_filter<subwarp_size><<<num_blocks, default_block_size>>>(
+        in_row_ptrs, in_col_idxs, in_vals_cuda, oracles, num_rows, bucket,
+        out_row_ptrs, out_row_idxs, out_col_idxs, as_cuda_type(out_vals));
+}
+
+
+// Provides select_threshold_filter_approx, which the wrapper below calls with
+// compiled_kernels and a runtime predicate; presumably it forwards to the
+// threshold_filter_approx instantiation for the first accepted subwarp size
+// (see core/synthesizer/implementation_selection.hpp to confirm).
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx,
+                                    threshold_filter_approx);
+
+
+/**
+ * Entry point of the approximate threshold filter.
+ *
+ * Chooses a compiled subwarp size from the average number of stored elements
+ * per row of `m` and forwards all arguments to the matching
+ * threshold_filter_approx implementation.
+ *
+ * @param exec  executor forwarded to the implementation
+ * @param m  input matrix
+ * @param rank  rank forwarded to the implementation
+ * @param tmp  scratch array, forwarded by pointer
+ * @param threshold  output threshold, forwarded by pointer
+ * @param m_out  output matrix
+ * @param m_out_coo  optional COO output (may be null)
+ */
+template <typename ValueType, typename IndexType>
+void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> &tmp,
+                             remove_complex<ValueType> &threshold,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto num_rows = m->get_size()[0];
+    auto total_nnz = m->get_num_stored_elements();
+    // a matrix without rows must not trigger an integer division by zero;
+    // treat its average row length as zero, which picks the smallest subwarp
+    auto total_nnz_per_row =
+        num_rows > 0 ? total_nnz / num_rows : size_type{};
+    select_threshold_filter_approx(
+        compiled_kernels(),
+        // accept a subwarp size once the average row fits into it; the full
+        // warp is always accepted so that the selection cannot fail
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, m, rank, &tmp,
+        &threshold, m_out, m_out_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu
new file mode 100644
index 00000000000..1b2e6e921f8
--- /dev/null
+++ b/cuda/factorization/par_ilut_filter_kernel.cu
@@ -0,0 +1,162 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// Number of threads per block used for every kernel launch in this file.
+constexpr auto default_block_size = 512;
+
+
+// Subwarp sizes for which the filter kernels are compiled.
+// The predicate handed to select_threshold_filter further down accepts an
+// entry once the average number of stored elements per row fits into it, and
+// always accepts config::warp_size.
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_filter_kernels.hpp.inc"
+
+
+namespace {
+
+
+/**
+ * Threshold filter for one fixed subwarp size.
+ *
+ * Runs the threshold_filter_nnz kernel to obtain per-row counts, converts
+ * them into row pointers with a prefix sum, sizes the output arrays
+ * accordingly and finally runs the threshold_filter kernel to fill them.
+ *
+ * @param exec  executor used for the prefix sum and device-to-host copies
+ * @param a  input matrix
+ * @param threshold  threshold forwarded to both kernels
+ * @param m_out  output matrix; its row pointers are overwritten in place
+ *               (num_rows + 1 entries), column indices and values are resized
+ * @param m_out_coo  optional COO output; when non-null, its row index array
+ *                   is resized and its column index and value arrays become
+ *                   views of the arrays of `m_out`
+ * @param lower  flag forwarded unchanged to both kernels
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter(syn::value_list<int, subwarp_size>,
+                      std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType> *m_out,
+                      matrix::Coo<ValueType, IndexType> *m_out_coo, bool lower)
+{
+    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
+    auto rows_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, rows_per_block);
+    auto in_row_ptrs = a->get_const_row_ptrs();
+    auto in_col_idxs = a->get_const_col_idxs();
+    auto in_vals = as_cuda_type(a->get_const_values());
+    auto out_row_ptrs = m_out->get_row_ptrs();
+
+    // first pass: compute the number of output entries for each row
+    kernel::threshold_filter_nnz<subwarp_size>
+        <<<num_blocks, default_block_size>>>(in_row_ptrs, in_vals, num_rows,
+                                             threshold, out_row_ptrs, lower);
+
+    // turn the per-row counts into row pointers
+    components::prefix_sum(exec, out_row_ptrs, num_rows + 1);
+
+    // the last row pointer is the total size of the output
+    auto out_nnz = exec->copy_val_to_host(out_row_ptrs + num_rows);
+    matrix::CsrBuilder<ValueType, IndexType> csr_builder{m_out};
+    csr_builder.get_col_idx_array().resize_and_reset(out_nnz);
+    csr_builder.get_value_array().resize_and_reset(out_nnz);
+    // fetch the pointers only after resizing
+    auto out_col_idxs = m_out->get_col_idxs();
+    auto out_vals = m_out->get_values();
+    IndexType *out_row_idxs{};
+    if (m_out_coo) {
+        // the COO output shares column indices and values with m_out
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(out_nnz);
+        coo_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, out_nnz, out_col_idxs);
+        coo_builder.get_value_array() =
+            Array<ValueType>::view(exec, out_nnz, out_vals);
+        out_row_idxs = m_out_coo->get_row_idxs();
+    }
+
+    // second pass: fill the output arrays
+    kernel::threshold_filter<subwarp_size><<<num_blocks, default_block_size>>>(
+        in_row_ptrs, in_col_idxs, in_vals, num_rows, threshold, out_row_ptrs,
+        out_row_idxs, out_col_idxs, as_cuda_type(out_vals), lower);
+}
+
+
+// Provides select_threshold_filter, which the public wrapper below calls with
+// compiled_kernels and a runtime predicate; presumably it forwards to the
+// threshold_filter instantiation for the first accepted subwarp size (see
+// core/synthesizer/implementation_selection.hpp to confirm).
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
+
+
+}  // namespace
+
+/**
+ * Entry point of the threshold filter.
+ *
+ * Chooses a compiled subwarp size from the average number of stored elements
+ * per row of `a` and forwards all arguments to the matching threshold_filter
+ * implementation.
+ *
+ * @param exec  executor forwarded to the implementation
+ * @param a  input matrix
+ * @param threshold  threshold forwarded to the implementation
+ * @param m_out  output matrix
+ * @param m_out_coo  optional COO output (may be null)
+ * @param lower  flag forwarded to the implementation
+ */
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType> *m_out,
+                      matrix::Coo<ValueType, IndexType> *m_out_coo, bool lower)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = a->get_num_stored_elements();
+    // a matrix without rows must not trigger an integer division by zero;
+    // treat its average row length as zero, which picks the smallest subwarp
+    auto total_nnz_per_row =
+        num_rows > 0 ? total_nnz / num_rows : size_type{};
+    select_threshold_filter(
+        compiled_kernels(),
+        // accept a subwarp size once the average row fits into it; the full
+        // warp is always accepted so that the selection cannot fail
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
+        m_out_coo, lower);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu
new file mode 100644
index 00000000000..1b564801cee
--- /dev/null
+++ b/cuda/factorization/par_ilut_select_common.cu
@@ -0,0 +1,117 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/factorization/par_ilut_select_common.cuh"
+
+
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/components/atomic.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/searching.cuh"
+#include "cuda/components/sorting.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+/**
+ * Launches the four counting stages of the sample-select algorithm.
+ *
+ * @param values  input values
+ * @param size  number of input values
+ * @param tree  storage for the search tree built by build_searchtree
+ * @param oracles  per-element output of count_buckets
+ * @param partial_counts  per-block bucket counts, written by count_buckets
+ *                        and processed by block_prefix_sum
+ * @param total_counts  per-bucket totals; start_prefix_sum processes them in
+ *                      place and additionally receives a pointer to entry
+ *                      `bucket_count`
+ */
+template <typename ValueType, typename IndexType>
+void sampleselect_count(const ValueType *values, IndexType size,
+                        remove_complex<ValueType> *tree, unsigned char *oracles,
+                        IndexType *partial_counts, IndexType *total_counts)
+{
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto cuda_values = as_cuda_type(values);
+    // every thread handles items_per_thread elements
+    auto num_blocks = static_cast<IndexType>(
+        ceildiv(ceildiv(size, items_per_thread), default_block_size));
+
+    // pick sample, build searchtree
+    kernel::build_searchtree<<<1, bucket_count>>>(cuda_values, size, tree);
+
+    // determine bucket sizes
+    kernel::count_buckets<<<num_blocks, default_block_size>>>(
+        cuda_values, size, tree, partial_counts, oracles, items_per_thread);
+
+    // compute prefix sum and total sum over block-local values
+    kernel::block_prefix_sum<<<bucket_count, default_block_size>>>(
+        partial_counts, total_counts, num_blocks);
+
+    // compute prefix sum over bucket counts
+    auto counts_end = total_counts + bucket_count;
+    start_prefix_sum<bucket_count>
+        <<<1, bucket_count>>>(bucket_count, total_counts, counts_end);
+}
+
+
+// Signature of sampleselect_count for one (ValueType, IndexType) pair. It is
+// handed to the instantiation macro below so that the template, which is
+// defined only in this translation unit, is available to the other ParILUT
+// kernel files that include par_ilut_select_common.cuh.
+#define DECLARE_SSSS_COUNT(ValueType, IndexType)                               \
+    void sampleselect_count(const ValueType *values, IndexType size,           \
+                            remove_complex<ValueType> *tree,                   \
+                            unsigned char *oracles, IndexType *partial_counts, \
+                            IndexType *total_counts)
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
+
+
+/**
+ * Runs the find_bucket kernel and reads its result back to the host.
+ *
+ * After the kernel has run, the first three entries of `prefix_sum` are
+ * copied to the host and returned as (idx, begin, size).
+ *
+ * @param exec  executor owning `prefix_sum`
+ * @param prefix_sum  device array passed to the kernel; NOTE(review): passed
+ *                    as non-const, the kernel presumably overwrites its
+ *                    leading entries with the result
+ * @param rank  rank passed to the kernel
+ *
+ * @return the bucket description read back from the device
+ */
+template <typename IndexType>
+sampleselect_bucket<IndexType> sampleselect_find_bucket(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType *prefix_sum,
+    IndexType rank)
+{
+    constexpr int result_len = 3;
+    IndexType host_result[result_len]{};
+    kernel::find_bucket<<<1, config::warp_size>>>(prefix_sum, rank);
+    exec->get_master()->copy_from(exec.get(), result_len, prefix_sum,
+                                  host_result);
+    sampleselect_bucket<IndexType> found{host_result[0], host_result[1],
+                                         host_result[2]};
+    return found;
+}
+
+
+// Signature of sampleselect_find_bucket for one IndexType. It is handed to
+// the instantiation macro below so that the template, which is defined only
+// in this translation unit, is available to the other ParILUT kernel files
+// that include par_ilut_select_common.cuh.
+#define DECLARE_SSSS_FIND_BUCKET(IndexType)                                 \
+    sampleselect_bucket<IndexType> sampleselect_find_bucket(                \
+        std::shared_ptr<const DefaultExecutor> exec, IndexType *prefix_sum, \
+        IndexType rank)
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/factorization/par_ilut_select_common.cuh b/cuda/factorization/par_ilut_select_common.cuh
new file mode 100644
index 00000000000..1f2eded3b0b
--- /dev/null
+++ b/cuda/factorization/par_ilut_select_common.cuh
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_
+#define GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace par_ilut_factorization {
+
+
+// Number of threads per block used by the sample-select kernel launches.
+constexpr auto default_block_size = 512;
+// Number of input elements assigned to each thread; it determines the grid
+// size of the count/filter launches and is passed to those kernels.
+constexpr auto items_per_thread = 16;
+
+
+/**
+ * Builds the search tree for `values` and counts the elements per bucket.
+ *
+ * Defined and explicitly instantiated in par_ilut_select_common.cu.
+ *
+ * @param values  input values
+ * @param size  number of input values
+ * @param tree  storage for the search tree
+ * @param oracles  per-element storage written by the counting kernel
+ * @param partial_counts  storage for the per-block bucket counts
+ * @param total_counts  storage for the per-bucket totals and their prefix sum
+ */
+template <typename ValueType, typename IndexType>
+void sampleselect_count(const ValueType *values, IndexType size,
+                        remove_complex<ValueType> *tree, unsigned char *oracles,
+                        IndexType *partial_counts, IndexType *total_counts);
+
+
+/**
+ * Result of sampleselect_find_bucket.
+ *
+ * The three members are filled, in this order, from the first three values
+ * that the find_bucket kernel leaves in the prefix sum array.
+ */
+template <typename IndexType>
+struct sampleselect_bucket {
+    // index of the bucket; callers pass it to the filter step
+    IndexType idx;
+    // offset of the bucket; callers subtract it from the global rank to get
+    // the bucket-local rank
+    IndexType begin;
+    // number of elements in the bucket
+    IndexType size;
+};
+
+
+/**
+ * Determines the bucket that contains the element of the given rank.
+ *
+ * Defined and explicitly instantiated in par_ilut_select_common.cu.
+ *
+ * @param exec  executor owning `prefix_sum`
+ * @param prefix_sum  device array of bucket counts as produced by
+ *                    sampleselect_count (its `total_counts` argument);
+ *                    its first three entries are read back as the result
+ * @param rank  rank of the element to locate
+ *
+ * @return index, begin offset and size of the located bucket
+ */
+template <typename IndexType>
+sampleselect_bucket<IndexType> sampleselect_find_bucket(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType *prefix_sum,
+    IndexType rank);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_
\ No newline at end of file
diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu
new file mode 100644
index 00000000000..469bde6ccc6
--- /dev/null
+++ b/cuda/factorization/par_ilut_select_kernel.cu
@@ -0,0 +1,184 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/components/atomic.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/searching.cuh"
+#include "cuda/components/sorting.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/factorization/par_ilut_select_common.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+/**
+ * Launches the filter_bucket kernel for a single bucket.
+ *
+ * Uses the same grid size as the counting step: one thread per
+ * items_per_thread input elements.
+ *
+ * @param values  input values
+ * @param size  number of input values
+ * @param oracles  per-element oracles produced by sampleselect_count
+ * @param partial_counts  per-block counts produced by sampleselect_count
+ * @param bucket  bucket index passed to the kernel
+ * @param out  output storage passed to the kernel
+ */
+template <typename ValueType, typename IndexType>
+void sampleselect_filter(const ValueType *values, IndexType size,
+                         const unsigned char *oracles,
+                         const IndexType *partial_counts, IndexType bucket,
+                         remove_complex<ValueType> *out)
+{
+    auto cuda_values = as_cuda_type(values);
+    auto num_blocks = static_cast<IndexType>(
+        ceildiv(ceildiv(size, items_per_thread), default_block_size));
+    kernel::filter_bucket<<<num_blocks, default_block_size>>>(
+        cuda_values, size, bucket, oracles, partial_counts, out,
+        items_per_thread);
+}
+
+
+/**
+ * Selects the element of the given rank among the stored values of `m`.
+ *
+ * The values are split into buckets (sampleselect_count), the bucket holding
+ * the requested rank is extracted (sampleselect_filter), and the procedure is
+ * repeated on that bucket until it is small enough for the basecase_select
+ * kernel. After more than five repetitions the remaining bucket is copied to
+ * the host and std::nth_element is used instead.
+ *
+ * @param exec  executor used for device-to-host copies
+ * @param m  input matrix
+ * @param rank  rank of the element to select
+ * @param tmp1  scratch array; resized here and partitioned into total counts,
+ *              partial counts, oracles and the search tree. Its first entry
+ *              also receives the result of the base case.
+ * @param tmp2  scratch array holding the extracted buckets; resized here
+ * @param threshold  receives the selected value
+ */
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *m,
+                      IndexType rank, Array<ValueType> &tmp1,
+                      Array<remove_complex<ValueType>> &tmp2,
+                      remove_complex<ValueType> &threshold)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+
+    // all section lengths of tmp1 are counted in units of sizeof(ValueType)
+    size_type tmp_size_totals =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size_vals =
+        size / bucket_count * 4;  // pessimistic estimate for temporary storage
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp1.resize_and_reset(tmp_size);
+    tmp2.resize_and_reset(tmp_size_vals);
+
+    // carve tmp1 into its four consecutive sections
+    auto total_counts = reinterpret_cast<IndexType *>(tmp1.get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType *>(tmp1.get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char *>(
+        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType *>(tmp1.get_data() + tmp_size_totals +
+                                    tmp_size_partials + tmp_size_oracles);
+
+    sampleselect_count(values, size, tree, oracles, partial_counts,
+                       total_counts);
+
+    // determine bucket with correct rank, use bucket-local rank
+    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
+    rank -= bucket.begin;
+
+    // tmp2 is used as two halves of bucket.size entries each
+    if (bucket.size * 2 > tmp_size_vals) {
+        // we need to reallocate tmp2
+        tmp2.resize_and_reset(bucket.size * 2);
+    }
+    auto tmp21 = tmp2.get_data();
+    auto tmp22 = tmp2.get_data() + bucket.size;
+    // extract target bucket
+    sampleselect_filter(values, size, oracles, partial_counts, bucket.idx,
+                        tmp22);
+
+    // recursively select from smaller buckets
+    int step{};
+    while (bucket.size > kernel::basecase_size) {
+        // the output of the previous step becomes the input of this one
+        std::swap(tmp21, tmp22);
+        const auto *tmp_in = tmp21;
+        auto tmp_out = tmp22;
+
+        sampleselect_count(tmp_in, bucket.size, tree, oracles, partial_counts,
+                           total_counts);
+        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
+        // the oracles were just recomputed for the new search tree, so the
+        // bucket to extract is the one found in this step, not the bucket
+        // index of the previous level
+        sampleselect_filter(tmp_in, bucket.size, oracles, partial_counts,
+                            new_bucket.idx, tmp_out);
+
+        rank -= new_bucket.begin;
+        bucket.idx = new_bucket.idx;
+        bucket.size = new_bucket.size;
+        // we should never need more than 5 recursion steps, this would mean
+        // 256^5 = 2^40. fall back to standard library algorithm in that case.
+        ++step;
+        if (step > 5) {
+            // copy the remaining bucket to the host and select there
+            Array<AbsType> cpu_out_array{
+                exec->get_master(),
+                Array<AbsType>::view(exec, bucket.size, tmp_out)};
+            auto begin = cpu_out_array.get_data();
+            auto end = begin + bucket.size;
+            auto middle = begin + rank;
+            std::nth_element(begin, middle, end);
+            threshold = *middle;
+            return;
+        }
+    }
+
+    // base case: tmp22 holds the most recently extracted bucket; the result
+    // is written to the start of tmp1 and copied back to the host
+    auto out_ptr = reinterpret_cast<AbsType *>(tmp1.get_data());
+    kernel::basecase_select<<<1, kernel::basecase_block_size>>>(
+        tmp22, bucket.size, rank, out_ptr);
+    threshold = exec->copy_val_to_host(out_ptr);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernel.cu
new file mode 100644
index 00000000000..1efb704e272
--- /dev/null
+++ b/cuda/factorization/par_ilut_spgeam_kernel.cu
@@ -0,0 +1,179 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/merging.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/searching.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr auto default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_spgeam_kernels.hpp.inc"
+
+
+namespace {
+
+// Builds l_new/u_new: count per row, prefix-sum, resize, then fill entries.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *lu,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Csr<ValueType, IndexType> *u,
+                    matrix::Csr<ValueType, IndexType> *l_new,
+                    matrix::Csr<ValueType, IndexType> *u_new)
+{
+    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
+    auto lu_row_ptrs = lu->get_const_row_ptrs();
+    auto lu_col_idxs = lu->get_const_col_idxs();
+    auto lu_vals = lu->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    // count non-zeros per row (stored in l_new_row_ptrs / u_new_row_ptrs)
+    kernel::tri_spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs,
+        u_new_row_ptrs, num_rows);
+
+    // build row ptrs: turn the per-row counts into offsets via prefix sum
+    components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum(exec, u_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays to the total nnz read back from row_ptrs[num_rows]
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
+    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    // fill columns and values of l_new and u_new
+    kernel::tri_spgeam_init<subwarp_size><<<num_blocks, default_block_size>>>(
+        lu_row_ptrs, lu_col_idxs, as_cuda_type(lu_vals), a_row_ptrs, a_col_idxs,
+        as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, as_cuda_type(l_vals),
+        u_row_ptrs, u_col_idxs, as_cuda_type(u_vals), l_new_row_ptrs,
+        l_new_col_idxs, as_cuda_type(l_new_vals), u_new_row_ptrs,
+        u_new_col_idxs, as_cuda_type(u_new_vals), num_rows);
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+}  // namespace
+
+// Dispatches to the subwarp size matching the average row length of LU + A.
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *lu,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Csr<ValueType, IndexType> *u,
+                    matrix::Csr<ValueType, IndexType> *l_new,
+                    matrix::Csr<ValueType, IndexType> *u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto nnz = lu->get_num_stored_elements() + a->get_num_stored_elements();
+    // a matrix without rows has no average; avoid dividing by zero there
+    auto total_nnz_per_row = num_rows > 0 ? nnz / num_rows : 0;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
+        u_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu
new file mode 100644
index 00000000000..91b68b723da
--- /dev/null
+++ b/cuda/factorization/par_ilut_sweep_kernel.cu
@@ -0,0 +1,145 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/merging.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/searching.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr auto default_block_size = 512;
+
+
+// subwarp sizes for the sweep kernel
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_sweep_kernels.hpp.inc"
+
+
+namespace {
+
+// Launches the sweep kernel, sized for one subwarp per stored entry of L and U.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_l_u_factors(syn::value_list<int, subwarp_size>,
+                         std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *a,
+                         matrix::Csr<ValueType, IndexType> *l,
+                         const matrix::Coo<ValueType, IndexType> *l_coo,
+                         matrix::Csr<ValueType, IndexType> *u,
+                         const matrix::Coo<ValueType, IndexType> *u_coo,
+                         matrix::Csr<ValueType, IndexType> *u_csc)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements() +
+                                            u->get_num_stored_elements());
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(total_nnz, subwarps_per_block);
+    kernel::sweep<subwarp_size><<<num_blocks, default_block_size>>>(
+        a->get_const_row_ptrs(), a->get_const_col_idxs(),
+        as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(),
+        l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+        as_cuda_type(l->get_values()),
+        static_cast<IndexType>(l->get_num_stored_elements()),
+        u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
+        as_cuda_type(u->get_values()), u_csc->get_const_row_ptrs(),
+        u_csc->get_const_col_idxs(), as_cuda_type(u_csc->get_values()),
+        static_cast<IndexType>(u->get_num_stored_elements()));
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
+                                    compute_l_u_factors);
+
+
+}  // namespace
+
+// Dispatches to the subwarp size matching the average row length of L + U.
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *a,
+                         matrix::Csr<ValueType, IndexType> *l,
+                         const matrix::Coo<ValueType, IndexType> *l_coo,
+                         matrix::Csr<ValueType, IndexType> *u,
+                         const matrix::Coo<ValueType, IndexType> *u_coo,
+                         matrix::Csr<ValueType, IndexType> *u_csc)
+{
+    auto num_rows = a->get_size()[0];
+    auto nnz = l->get_num_stored_elements() + u->get_num_stored_elements();
+    // a matrix without rows has no average; avoid dividing by zero there
+    auto total_nnz_per_row = num_rows > 0 ? nnz / num_rows : 0;
+    select_compute_l_u_factors(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo,
+        u_csc);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
index 446280ba185..ef94a07a8a2 100644
--- a/cuda/matrix/coo_kernels.cu
+++ b/cuda/matrix/coo_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,7 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/components/fill_array.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
@@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/format_conversion.cuh"
 #include "cuda/components/segment_scan.cuh"
-#include "cuda/components/zero_array.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -69,188 +71,10 @@ namespace coo {
 
 constexpr int default_block_size = 512;
 constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * cuda_config::warp_size;
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
 
 
-namespace {
-
-
-/**
- * The device function of COO spmv
- *
- * @param nnz  the number of nonzeros in the matrix
- * @param num_lines  the maximum round of each warp
- * @param val  the value array of the matrix
- * @param col  the column index array of the matrix
- * @param row  the row index array of the matrix
- * @param b  the input dense vector
- * @param b_stride  the stride of the input dense vector
- * @param c  the output dense vector
- * @param c_stride  the stride of the output dense vector
- * @param scale  the function on the added value
- */
-template <int subwarp_size = cuda_config::warp_size, typename ValueType,
-          typename IndexType, typename Closure>
-__device__ void spmv_kernel(const size_type nnz, const size_type num_lines,
-                            const ValueType *__restrict__ val,
-                            const IndexType *__restrict__ col,
-                            const IndexType *__restrict__ row,
-                            const ValueType *__restrict__ b,
-                            const size_type b_stride, ValueType *__restrict__ c,
-                            const size_type c_stride, Closure scale)
-{
-    ValueType temp_val = zero<ValueType>();
-    const auto start = static_cast<size_type>(blockDim.x) * blockIdx.x *
-                           blockDim.y * num_lines +
-                       threadIdx.y * blockDim.x * num_lines;
-    const auto column_id = blockIdx.y;
-    size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size);
-    num = min(num, num_lines);
-    const IndexType ind_start = start + threadIdx.x;
-    const IndexType ind_end = ind_start + (num - 1) * subwarp_size;
-    IndexType ind = ind_start;
-    IndexType curr_row = (ind < nnz) ? row[ind] : 0;
-    const auto tile_block =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    for (; ind < ind_end; ind += subwarp_size) {
-        temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id]
-                                : zero<ValueType>();
-        auto next_row =
-            (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1];
-        // segmented scan
-        if (tile_block.any(curr_row != next_row)) {
-            bool is_first_in_segment =
-                segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
-            if (is_first_in_segment) {
-                atomic_add(&(c[curr_row * c_stride + column_id]),
-                           scale(temp_val));
-            }
-            temp_val = zero<ValueType>();
-        }
-        curr_row = next_row;
-    }
-    if (num > 0) {
-        ind = ind_end;
-        temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id]
-                                : zero<ValueType>();
-        // segmented scan
-        bool is_first_in_segment =
-            segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
-        if (is_first_in_segment) {
-            atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val));
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
-    const size_type nnz, const size_type num_lines,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
-    const IndexType *__restrict__ row, const ValueType *__restrict__ b,
-    const size_type b_stride, ValueType *__restrict__ c,
-    const size_type c_stride)
-{
-    spmv_kernel(nnz, num_lines, val, col, row, b, b_stride, c, c_stride,
-                [](const ValueType &x) { return x; });
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
-    const size_type nnz, const size_type num_lines,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const IndexType *__restrict__ row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    ValueType scale_factor = alpha[0];
-    spmv_kernel(
-        nnz, num_lines, val, col, row, b, b_stride, c, c_stride,
-        [&scale_factor](const ValueType &x) { return scale_factor * x; });
-}
-
-
-/**
- * The device function of COO spmm
- *
- * @param nnz  the number of nonzeros in the matrix
- * @param num_elems  the maximum number of nonzeros in each warp
- * @param val  the value array of the matrix
- * @param col  the column index array of the matrix
- * @param row  the row index array of the matrix
- * @param num_cols the number of columns of the matrix
- * @param b  the input dense vector
- * @param b_stride  the stride of the input dense vector
- * @param c  the output dense vector
- * @param c_stride  the stride of the output dense vector
- * @param scale  the function on the added value
- */
-template <typename ValueType, typename IndexType, typename Closure>
-__device__ void spmm_kernel(const size_type nnz, const size_type num_elems,
-                            const ValueType *__restrict__ val,
-                            const IndexType *__restrict__ col,
-                            const IndexType *__restrict__ row,
-                            const size_type num_cols,
-                            const ValueType *__restrict__ b,
-                            const size_type b_stride, ValueType *__restrict__ c,
-                            const size_type c_stride, Closure scale)
-{
-    ValueType temp = zero<ValueType>();
-    const auto coo_idx =
-        (static_cast<size_type>(blockDim.y) * blockIdx.x + threadIdx.y) *
-        num_elems;
-    const auto column_id = blockIdx.y * blockDim.x + threadIdx.x;
-    const auto coo_end =
-        (coo_idx + num_elems > nnz) ? nnz : coo_idx + num_elems;
-    if (column_id < num_cols && coo_idx < nnz) {
-        auto curr_row = row[coo_idx];
-        auto idx = coo_idx;
-        for (; idx < coo_end - 1; idx++) {
-            temp += val[idx] * b[col[idx] * b_stride + column_id];
-            const auto next_row = row[idx + 1];
-            if (next_row != curr_row) {
-                atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp));
-                curr_row = next_row;
-                temp = zero<ValueType>();
-            }
-        }
-        temp += val[idx] * b[col[idx] * b_stride + column_id];
-        atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp));
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_spmm(
-    const size_type nnz, const size_type num_elems,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
-    const IndexType *__restrict__ row, const size_type num_cols,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    spmm_kernel(nnz, num_elems, val, col, row, num_cols, b, b_stride, c,
-                c_stride, [](const ValueType &x) { return x; });
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_spmm(
-    const size_type nnz, const size_type num_elems,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const IndexType *__restrict__ row,
-    const size_type num_cols, const ValueType *__restrict__ b,
-    const size_type b_stride, ValueType *__restrict__ c,
-    const size_type c_stride)
-{
-    ValueType scale_factor = alpha[0];
-    spmm_kernel(
-        nnz, num_elems, val, col, row, num_cols, b, b_stride, c, c_stride,
-        [&scale_factor](const ValueType &x) { return scale_factor * x; });
-}
-
-
-}  // namespace
+#include "common/matrix/coo_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
@@ -258,7 +82,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Coo<ValueType, IndexType> *a,
           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
-    zero_array(c->get_num_stored_elements(), c->get_values());
+    components::fill_array(exec, c->get_values(), c->get_num_stored_elements(),
+                           zero<ValueType>());
 
     spmv2(exec, a, b, c);
 }
@@ -289,23 +114,23 @@ void spmv2(std::shared_ptr<const CudaExecutor> exec,
 {
     const auto nnz = a->get_num_stored_elements();
     const auto b_ncols = b->get_size()[1];
-    const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
+    const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
 
     if (nwarps > 0) {
         if (b_ncols < 4) {
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
-            int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
+            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
             abstract_spmv<<<coo_grid, coo_block>>>(
                 nnz, num_lines, as_cuda_type(a->get_const_values()),
                 a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()),
                 as_cuda_type(b->get_const_values()), b->get_stride(),
                 as_cuda_type(c->get_values()), c->get_stride());
         } else {
-            int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) *
-                            cuda_config::warp_size;
+            int num_elems =
+                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, cuda_config::warp_size));
+                                ceildiv(b_ncols, config::warp_size));
             abstract_spmm<<<coo_grid, coo_block>>>(
                 nnz, num_elems, as_cuda_type(a->get_const_values()),
                 a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()),
@@ -327,12 +152,12 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
 {
     const auto nnz = a->get_num_stored_elements();
     const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
-    const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
+    const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto b_ncols = b->get_size()[1];
 
     if (nwarps > 0) {
         if (b_ncols < 4) {
-            int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
+            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
             abstract_spmv<<<coo_grid, coo_block>>>(
                 nnz, num_lines, as_cuda_type(alpha->get_const_values()),
@@ -341,10 +166,10 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
                 as_cuda_type(b->get_const_values()), b->get_stride(),
                 as_cuda_type(c->get_values()), c->get_stride());
         } else {
-            int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) *
-                            cuda_config::warp_size;
+            int num_elems =
+                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, cuda_config::warp_size));
+                                ceildiv(b_ncols, config::warp_size));
             abstract_spmm<<<coo_grid, coo_block>>>(
                 nnz, num_elems, as_cuda_type(alpha->get_const_values()),
                 as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
@@ -358,31 +183,6 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
-namespace kernel {
-
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
-    const IndexType *__restrict__ idxs, size_type num_nonzeros,
-    IndexType *__restrict__ ptrs, size_type length)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tidx == 0) {
-        ptrs[0] = 0;
-        ptrs[length - 1] = num_nonzeros;
-    }
-
-    if (0 < tidx && tidx < num_nonzeros) {
-        if (idxs[tidx - 1] < idxs[tidx]) {
-            for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) {
-                ptrs[i] = tidx;
-            }
-        }
-    }
-}
-
-}  // namespace kernel
-
 
 template <typename IndexType>
 void convert_row_idxs_to_ptrs(std::shared_ptr<const CudaExecutor> exec,
@@ -398,8 +198,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -416,44 +216,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType>
-__global__
-    __launch_bounds__(cuda_config::max_block_size) void initialize_zero_dense(
-        size_type num_rows, size_type num_cols, size_type stride,
-        ValueType *__restrict__ result)
-{
-    const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
-    const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
-    if (tidx_x < num_cols && tidx_y < num_rows) {
-        result[tidx_y * stride + tidx_x] = zero<ValueType>();
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_dense(
-    size_type nnz, const IndexType *__restrict__ row_idxs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values, size_type stride,
-    ValueType *__restrict__ result)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (tidx < nnz) {
-        result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -461,9 +227,8 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const auto nnz = source->get_num_stored_elements();
 
-    const dim3 block_size(cuda_config::warp_size,
-                          cuda_config::max_block_size / cuda_config::warp_size,
-                          1);
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
     const dim3 init_grid_dim(ceildiv(stride, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index d0b46c83c51..f1781d2ce5c 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -45,19 +46,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/merging.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/segment_scan.cuh"
+#include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/zero_array.hpp"
 
 
 namespace gko {
@@ -73,9 +79,9 @@ namespace csr {
 
 constexpr int default_block_size = 512;
 constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * cuda_config::warp_size;
-constexpr int classical_block_size = 64;
-constexpr int wsize = cuda_config::warp_size;
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
+constexpr int wsize = config::warp_size;
+constexpr int classical_overweight = 32;  // factor on nwarps in classical_spmv
 
 
 /**
@@ -84,463 +90,14 @@ constexpr int wsize = cuda_config::warp_size;
  */
 using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
 
+using classical_kernels =  // subwarp sizes offered to select_classical_spmv
+    syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
 
-namespace kernel {
+using spgeam_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-template <typename T>
-__host__ __device__ __forceinline__ T ceildivT(T nom, T denom)
-{
-    return (nom + denom - 1ll) / denom;
-}
-
-
-template <typename ValueType, typename IndexType>
-__device__ __forceinline__ bool block_segment_scan_reverse(
-    const IndexType *__restrict__ ind, ValueType *__restrict__ val)
-{
-    bool last = true;
-    const auto reg_ind = ind[threadIdx.x];
-#pragma unroll
-    for (int i = 1; i < spmv_block_size; i <<= 1) {
-        if (i == 1 && threadIdx.x < spmv_block_size - 1 &&
-            reg_ind == ind[threadIdx.x + 1]) {
-            last = false;
-        }
-        auto temp = zero<ValueType>();
-        if (threadIdx.x >= i && reg_ind == ind[threadIdx.x - i]) {
-            temp = val[threadIdx.x - i];
-        }
-        group::this_thread_block().sync();
-        val[threadIdx.x] += temp;
-        group::this_thread_block().sync();
-    }
-
-    return last;
-}
-
-
-template <bool overflow, typename IndexType>
-__device__ __forceinline__ void find_next_row(
-    const IndexType num_rows, const IndexType data_size, const IndexType ind,
-    IndexType *__restrict__ row, IndexType *__restrict__ row_end,
-    const IndexType row_predict, const IndexType row_predict_end,
-    const IndexType *__restrict__ row_ptr)
-{
-    if (!overflow || ind < data_size) {
-        if (ind >= *row_end) {
-            *row = row_predict;
-            *row_end = row_predict_end;
-            for (; ind >= *row_end; *row_end = row_ptr[++*row + 1])
-                ;
-        }
-
-    } else {
-        *row = num_rows - 1;
-        *row_end = data_size;
-    }
-}
-
-
-template <size_type subwarp_size, typename ValueType, typename IndexType,
-          typename Closure>
-__device__ __forceinline__ void warp_atomic_add(
-    const group::thread_block_tile<subwarp_size> &group, bool force_write,
-    ValueType *__restrict__ val, const IndexType row, ValueType *__restrict__ c,
-    const size_type c_stride, const IndexType column_id, Closure scale)
-{
-    // do a local scan to avoid atomic collisions
-    const bool need_write = segment_scan(group, row, val);
-    if (need_write && force_write) {
-        atomic_add(&(c[row * c_stride + column_id]), scale(*val));
-    }
-    if (!need_write || force_write) {
-        *val = zero<ValueType>();
-    }
-}
-
-
-template <bool last, size_type subwarp_size, typename ValueType,
-          typename IndexType, typename Closure>
-__device__ __forceinline__ void process_window(
-    const group::thread_block_tile<subwarp_size> &group,
-    const IndexType num_rows, const IndexType data_size, const IndexType ind,
-    IndexType *__restrict__ row, IndexType *__restrict__ row_end,
-    IndexType *__restrict__ nrow, IndexType *__restrict__ nrow_end,
-    ValueType *__restrict__ temp_val, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b,
-    const size_type b_stride, ValueType *__restrict__ c,
-    const size_type c_stride, const IndexType column_id, Closure scale)
-{
-    const IndexType curr_row = *row;
-    find_next_row<last>(num_rows, data_size, ind, row, row_end, *nrow,
-                        *nrow_end, row_ptrs);
-    // segmented scan
-    if (group.any(curr_row != *row)) {
-        warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c,
-                        c_stride, column_id, scale);
-        *nrow = group.shfl(*row, subwarp_size - 1);
-        *nrow_end = group.shfl(*row_end, subwarp_size - 1);
-    }
-
-    if (!last || ind < data_size) {
-        const auto col = col_idxs[ind];
-        *temp_val += val[ind] * b[col * b_stride + column_id];
-    }
-}
-
-
-template <typename IndexType>
-__device__ __forceinline__ IndexType get_warp_start_idx(
-    const IndexType nwarps, const IndexType nnz, const IndexType warp_idx)
-{
-    const long long cache_lines = ceildivT<IndexType>(nnz, wsize);
-    return (warp_idx * cache_lines / nwarps) * wsize;
-}
-
-
-template <typename ValueType, typename IndexType, typename Closure>
-__device__ __forceinline__ void spmv_kernel(
-    const IndexType nwarps, const IndexType num_rows,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride, Closure scale)
-{
-    const IndexType warp_idx = blockIdx.x * warps_in_block + threadIdx.y;
-    const IndexType column_id = blockIdx.y;
-    if (warp_idx >= nwarps) {
-        return;
-    }
-    const IndexType data_size = row_ptrs[num_rows];
-    const IndexType start = get_warp_start_idx(nwarps, data_size, warp_idx);
-    const IndexType end =
-        min(get_warp_start_idx(nwarps, data_size, warp_idx + 1),
-            ceildivT<IndexType>(data_size, wsize) * wsize);
-    auto row = srow[warp_idx];
-    auto row_end = row_ptrs[row + 1];
-    auto nrow = row;
-    auto nrow_end = row_end;
-    ValueType temp_val = zero<ValueType>();
-    IndexType ind = start + threadIdx.x;
-    find_next_row<true>(num_rows, data_size, ind, &row, &row_end, nrow,
-                        nrow_end, row_ptrs);
-    const IndexType ind_end = end - wsize;
-    const auto tile_block =
-        group::tiled_partition<wsize>(group::this_thread_block());
-    for (; ind < ind_end; ind += wsize) {
-        process_window<false>(tile_block, num_rows, data_size, ind, &row,
-                              &row_end, &nrow, &nrow_end, &temp_val, val,
-                              col_idxs, row_ptrs, b, b_stride, c, c_stride,
-                              column_id, scale);
-    }
-    process_window<true>(tile_block, num_rows, data_size, ind, &row, &row_end,
-                         &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs,
-                         b, b_stride, c, c_stride, column_id, scale);
-    warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id,
-                    scale);
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
-    const IndexType nwarps, const IndexType num_rows,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c,
-                c_stride, [](const ValueType &x) { return x; });
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
-    const IndexType nwarps, const IndexType num_rows,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    ValueType scale_factor = alpha[0];
-    spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c,
-                c_stride, [&scale_factor](const ValueType &x) {
-                    return scale_factor * x;
-                });
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void set_zero(
-    const size_type nnz, ValueType *__restrict__ val)
-{
-    const auto ind =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    if (ind < nnz) {
-        val[ind] = zero<ValueType>();
-    }
-}
-
-
-template <typename IndexType>
-__forceinline__ __device__ void merge_path_search(
-    const IndexType diagonal, const IndexType a_len, const IndexType b_len,
-    const IndexType *__restrict__ a, const IndexType offset_b,
-    IndexType *__restrict__ x, IndexType *__restrict__ y)
-{
-    auto x_min = max(diagonal - b_len, zero<IndexType>());
-    auto x_max = min(diagonal, a_len);
-    while (x_min < x_max) {
-        auto pivot = (x_min + x_max) >> 1;
-        if (a[pivot] <= offset_b + diagonal - pivot - 1) {
-            x_min = pivot + 1;
-        } else {
-            x_max = pivot;
-        }
-    }
-
-    *x = min(x_min, a_len);
-    *y = diagonal - x_min;
-}
-
-
-template <typename ValueType, typename IndexType, typename Alpha_op>
-__device__ void reduce(const IndexType nwarps,
-                       const ValueType *__restrict__ last_val,
-                       const IndexType *__restrict__ last_row,
-                       ValueType *__restrict__ c, const size_type c_stride,
-                       Alpha_op alpha_op)
-{
-    const IndexType cache_lines = ceildivT<IndexType>(nwarps, spmv_block_size);
-    const IndexType tid = threadIdx.x;
-    const IndexType start = min(tid * cache_lines, nwarps);
-    const IndexType end = min((tid + 1) * cache_lines, nwarps);
-    ValueType value = zero<ValueType>();
-    IndexType row = last_row[nwarps - 1];
-    if (start < nwarps) {
-        value = last_val[start];
-        row = last_row[start];
-        for (IndexType i = start + 1; i < end; i++) {
-            if (last_row[i] != row) {
-                c[row * c_stride] += alpha_op(value);
-                row = last_row[i];
-                value = last_val[i];
-            } else {
-                value += last_val[i];
-            }
-        }
-    }
-    __shared__ UninitializedArray<IndexType, spmv_block_size> tmp_ind;
-    __shared__ UninitializedArray<ValueType, spmv_block_size> tmp_val;
-    tmp_val[threadIdx.x] = value;
-    tmp_ind[threadIdx.x] = row;
-    group::this_thread_block().sync();
-    bool last = block_segment_scan_reverse(static_cast<IndexType *>(tmp_ind),
-                                           static_cast<ValueType *>(tmp_val));
-    group::this_thread_block().sync();
-    if (last) {
-        c[row * c_stride] += alpha_op(tmp_val[threadIdx.x]);
-    }
-}
-
-
-template <int items_per_thread, typename ValueType, typename IndexType,
-          typename Alpha_op, typename Beta_op>
-__device__ void merge_path_spmv(
-    const IndexType num_rows, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride,
-    IndexType *__restrict__ row_out, ValueType *__restrict__ val_out,
-    Alpha_op alpha_op, Beta_op beta_op)
-{
-    const auto *row_end_ptrs = row_ptrs + 1;
-    const auto nnz = row_ptrs[num_rows];
-    const IndexType num_merge_items = num_rows + nnz;
-    const auto block_items = spmv_block_size * items_per_thread;
-    __shared__ IndexType shared_row_ptrs[block_items];
-    const IndexType diagonal =
-        min(static_cast<IndexType>(block_items * blockIdx.x), num_merge_items);
-    const IndexType diagonal_end = min(diagonal + block_items, num_merge_items);
-    IndexType block_start_x;
-    IndexType block_start_y;
-    IndexType end_x;
-    IndexType end_y;
-    merge_path_search(diagonal, num_rows, nnz, row_end_ptrs, zero<IndexType>(),
-                      &block_start_x, &block_start_y);
-    merge_path_search(diagonal_end, num_rows, nnz, row_end_ptrs,
-                      zero<IndexType>(), &end_x, &end_y);
-    const IndexType block_num_rows = end_x - block_start_x;
-    const IndexType block_num_nonzeros = end_y - block_start_y;
-    for (int i = threadIdx.x;
-         i < block_num_rows && block_start_x + i < num_rows;
-         i += spmv_block_size) {
-        shared_row_ptrs[i] = row_end_ptrs[block_start_x + i];
-    }
-    group::this_thread_block().sync();
-
-    IndexType start_x;
-    IndexType start_y;
-    merge_path_search(static_cast<IndexType>(items_per_thread * threadIdx.x),
-                      block_num_rows, block_num_nonzeros, shared_row_ptrs,
-                      block_start_y, &start_x, &start_y);
-
-
-    IndexType ind = block_start_y + start_y;
-    IndexType row_i = block_start_x + start_x;
-    ValueType value = zero<ValueType>();
-#pragma unroll
-    for (IndexType i = 0; i < items_per_thread; i++) {
-        if (row_i < num_rows) {
-            if (start_x == block_num_rows || ind < shared_row_ptrs[start_x]) {
-                value += val[ind] * b[col_idxs[ind] * b_stride];
-                ind++;
-            } else {
-                c[row_i * c_stride] =
-                    alpha_op(value) + beta_op(c[row_i * c_stride]);
-                start_x++;
-                row_i++;
-                value = zero<ValueType>();
-            }
-        }
-    }
-    group::this_thread_block().sync();
-    IndexType *tmp_ind = shared_row_ptrs;
-    ValueType *tmp_val =
-        reinterpret_cast<ValueType *>(shared_row_ptrs + spmv_block_size);
-    tmp_val[threadIdx.x] = value;
-    tmp_ind[threadIdx.x] = row_i;
-    group::this_thread_block().sync();
-    bool last = block_segment_scan_reverse(static_cast<IndexType *>(tmp_ind),
-                                           static_cast<ValueType *>(tmp_val));
-    if (threadIdx.x == spmv_block_size - 1) {
-        row_out[blockIdx.x] = min(end_x, num_rows - 1);
-        val_out[blockIdx.x] = tmp_val[threadIdx.x];
-    } else if (last) {
-        c[row_i * c_stride] += alpha_op(tmp_val[threadIdx.x]);
-    }
-}
-
-template <int items_per_thread, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv(
-    const IndexType num_rows, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride,
-    IndexType *__restrict__ row_out, ValueType *__restrict__ val_out)
-{
-    merge_path_spmv<items_per_thread>(
-        num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, c_stride,
-        row_out, val_out, [](ValueType &x) { return x; },
-        [](ValueType &x) { return zero<ValueType>(); });
-}
-
-
-template <int items_per_thread, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv(
-    const IndexType num_rows, const ValueType *__restrict__ alpha,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
-    const size_type c_stride, IndexType *__restrict__ row_out,
-    ValueType *__restrict__ val_out)
-{
-    const auto alpha_val = alpha[0];
-    const auto beta_val = beta[0];
-    merge_path_spmv<items_per_thread>(
-        num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, c_stride,
-        row_out, val_out, [&alpha_val](ValueType &x) { return alpha_val * x; },
-        [&beta_val](ValueType &x) { return beta_val * x; });
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_reduce(
-    const IndexType nwarps, const ValueType *__restrict__ last_val,
-    const IndexType *__restrict__ last_row, ValueType *__restrict__ c,
-    const size_type c_stride)
-{
-    reduce(nwarps, last_val, last_row, c, c_stride,
-           [](ValueType &x) { return x; });
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_reduce(
-    const IndexType nwarps, const ValueType *__restrict__ last_val,
-    const IndexType *__restrict__ last_row, const ValueType *__restrict__ alpha,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    const auto alpha_val = alpha[0];
-    reduce(nwarps, last_val, last_row, c, c_stride,
-           [&alpha_val](ValueType &x) { return alpha_val * x; });
-}
-
-
-template <typename ValueType, typename IndexType, typename Closure>
-__device__ void classical_spmv(const size_type num_rows,
-                               const ValueType *__restrict__ val,
-                               const IndexType *__restrict__ col_idxs,
-                               const IndexType *__restrict__ row_ptrs,
-                               const ValueType *__restrict__ b,
-                               const size_type b_stride,
-                               ValueType *__restrict__ c,
-                               const size_type c_stride, Closure scale)
-{
-    const auto tid =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    if (tid >= num_rows) {
-        return;
-    }
-    const auto column_id = blockIdx.y;
-    const auto ind_end = row_ptrs[tid + 1];
-    ValueType temp_value = zero<ValueType>();
-    for (auto ind = row_ptrs[tid]; ind < ind_end; ind++) {
-        temp_value += val[ind] * b[col_idxs[ind] * b_stride + column_id];
-    }
-    c[tid * c_stride + column_id] =
-        scale(temp_value, c[tid * c_stride + column_id]);
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(classical_block_size) void abstract_classical_spmv(
-    const size_type num_rows, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b,
-    const size_type b_stride, ValueType *__restrict__ c,
-    const size_type c_stride)
-{
-    classical_spmv(num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride,
-                   [](const ValueType &x, const ValueType &y) { return x; });
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(classical_block_size) void abstract_classical_spmv(
-    const size_type num_rows, const ValueType *__restrict__ alpha,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs,
-    const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b,
-    const size_type b_stride, const ValueType *__restrict__ beta,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    const auto alpha_val = alpha[0];
-    const auto beta_val = beta[0];
-    classical_spmv(
-        num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride,
-        [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
-            return alpha_val * x + beta_val * y;
-        });
-}
-
-
-}  // namespace kernel
+#include "common/matrix/csr_kernels.hpp.inc"
 
 
 namespace host_kernel {
@@ -614,8 +171,8 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
 template <typename ValueType, typename IndexType>
 int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
 {
-    const int version = exec->get_major_version()
-                        << 4 + exec->get_minor_version();
+    const int version =
+        (exec->get_major_version() << 4) + exec->get_minor_version();
     // The num_item is decided to make the occupancy 100%
     // TODO: Extend this list when new GPU is released
     //       Tune this parameter
@@ -640,7 +197,7 @@ int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
     case 0x37:
         num_item = 14;
     }
-    // Ensure that satisfy:
+    // Ensure that the following is satisfied:
     // sizeof(IndexType) + sizeof(ValueType)
     // <= items_per_thread * sizeof(IndexType)
     constexpr int minimal_num =
@@ -650,6 +207,79 @@ int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
 }
 
 
+/**
+ * Launches kernel::abstract_classical_spmv<subwarp_size> for the matrix `a`.
+ *
+ * If neither `alpha` nor `beta` is given, the kernel overload without scaling
+ * factors is launched; if both are given, they are forwarded to the overload
+ * that accepts them. The grid has
+ * min(ceildiv(rows of a, spmv_block_size / subwarp_size),
+ *     nwarps / warps_in_block)
+ * blocks in x, where nwarps is the product of the executor's warps per
+ * multiprocessor, its multiprocessor count and classical_overweight, and one
+ * block in y per column of `b`. No kernel is launched if the grid is empty,
+ * since CUDA rejects a launch with a zero-sized grid dimension.
+ *
+ * @tparam subwarp_size  forwarded to the kernel as template argument
+ *
+ * @param exec  the executor providing the warp and multiprocessor counts
+ * @param a  the CSR matrix whose values, column indices and row pointers are
+ *           passed to the kernel
+ * @param b  the dense matrix whose const values are passed to the kernel
+ * @param c  the dense matrix whose mutable values are passed to the kernel
+ * @param alpha  optional scaling factor, has to be set together with beta
+ * @param beta  optional scaling factor, has to be set together with alpha
+ *
+ * @note Raises GKO_KERNEL_NOT_FOUND if exactly one of alpha and beta is set.
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void classical_spmv(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const CudaExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Dense<ValueType> *b,
+                    matrix::Dense<ValueType> *c,
+                    const matrix::Dense<ValueType> *alpha = nullptr,
+                    const matrix::Dense<ValueType> *beta = nullptr)
+{
+    // The scaling factors are only supported as a pair.
+    if ((alpha == nullptr) != (beta == nullptr)) {
+        GKO_KERNEL_NOT_FOUND;
+    }
+
+    const auto nwarps = exec->get_num_warps_per_sm() *
+                        exec->get_num_multiprocessor() * classical_overweight;
+    const auto gridx =
+        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
+                 int64(nwarps / warps_in_block));
+    const dim3 grid(gridx, b->get_size()[1]);
+    const dim3 block(spmv_block_size);
+
+    // A matrix without rows or a right-hand side without columns yields an
+    // empty grid; launching it would be an invalid configuration in CUDA.
+    if (grid.x == 0 || grid.y == 0) {
+        return;
+    }
+
+    if (alpha == nullptr) {
+        kernel::abstract_classical_spmv<subwarp_size><<<grid, block, 0, 0>>>(
+            a->get_size()[0], as_cuda_type(a->get_const_values()),
+            a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()),
+            as_cuda_type(b->get_const_values()), b->get_stride(),
+            as_cuda_type(c->get_values()), c->get_stride());
+    } else {
+        kernel::abstract_classical_spmv<subwarp_size><<<grid, block, 0, 0>>>(
+            a->get_size()[0], as_cuda_type(alpha->get_const_values()),
+            as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
+            as_cuda_type(a->get_const_row_ptrs()),
+            as_cuda_type(b->get_const_values()), b->get_stride(),
+            as_cuda_type(beta->get_const_values()),
+            as_cuda_type(c->get_values()), c->get_stride());
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
+
+
 }  // namespace host_kernel
 
 
@@ -659,10 +256,11 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
     if (a->get_strategy()->get_name() == "load_balance") {
-        zero_array(c->get_num_stored_elements(), c->get_values());
+        components::fill_array(exec, c->get_values(),
+                               c->get_num_stored_elements(), zero<ValueType>());
         const IndexType nwarps = a->get_num_srow_elements();
         if (nwarps > 0) {
-            const dim3 csr_block(cuda_config::warp_size, warps_in_block, 1);
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
             const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
                                 b->get_size()[1]);
             kernel::abstract_spmv<<<csr_grid, csr_block>>>(
@@ -686,14 +284,26 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
             },
             syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
     } else if (a->get_strategy()->get_name() == "classical") {
-        const dim3 grid(ceildiv(a->get_size()[0], classical_block_size),
-                        b->get_size()[1]);
-        kernel::abstract_classical_spmv<<<grid, classical_block_size>>>(
-            a->get_size()[0], as_cuda_type(a->get_const_values()),
-            a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(c->get_values()), c->get_stride());
-    } else if (a->get_strategy()->get_name() == "cusparse") {
+        IndexType max_length_per_row = 0;
+        using Tcsr = matrix::Csr<ValueType, IndexType>;
+        if (auto strategy =
+                std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                    a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else if (auto strategy = std::dynamic_pointer_cast<
+                       const typename Tcsr::automatical>(a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else {
+            GKO_NOT_SUPPORTED(a->get_strategy());
+        }
+        host_kernel::select_classical_spmv(
+            classical_kernels(),
+            [&max_length_per_row](int compiled_info) {
+                return max_length_per_row >= compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+    } else if (a->get_strategy()->get_name() == "sparselib" ||
+               a->get_strategy()->get_name() == "cusparse") {
         if (cusparse::is_supported<ValueType, IndexType>::value) {
             // TODO: add implementation for int64 and multiple RHS
             auto handle = exec->get_cusparse_handle();
@@ -739,7 +349,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         const IndexType nwarps = a->get_num_srow_elements();
 
         if (nwarps > 0) {
-            const dim3 csr_block(cuda_config::warp_size, warps_in_block, 1);
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
             const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
                                 b->get_size()[1]);
             kernel::abstract_spmv<<<csr_grid, csr_block>>>(
@@ -751,8 +361,11 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
                 as_cuda_type(b->get_const_values()),
                 as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()),
                 as_cuda_type(c->get_stride()));
+        } else {
+            GKO_NOT_SUPPORTED(nwarps);
         }
-    } else if (a->get_strategy()->get_name() == "cusparse") {
+    } else if (a->get_strategy()->get_name() == "sparselib" ||
+               a->get_strategy()->get_name() == "cusparse") {
         if (cusparse::is_supported<ValueType, IndexType>::value) {
             // TODO: add implementation for int64 and multiple RHS
             auto descr = cusparse::create_mat_descr();
@@ -776,15 +389,25 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
             GKO_NOT_IMPLEMENTED;
         }
     } else if (a->get_strategy()->get_name() == "classical") {
-        const dim3 grid(ceildiv(a->get_size()[0], classical_block_size),
-                        b->get_size()[1]);
-        kernel::abstract_classical_spmv<<<grid, classical_block_size>>>(
-            a->get_size()[0], as_cuda_type(alpha->get_const_values()),
-            as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-            as_cuda_type(a->get_const_row_ptrs()),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(beta->get_const_values()),
-            as_cuda_type(c->get_values()), c->get_stride());
+        IndexType max_length_per_row = 0;
+        using Tcsr = matrix::Csr<ValueType, IndexType>;
+        if (auto strategy =
+                std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                    a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else if (auto strategy = std::dynamic_pointer_cast<
+                       const typename Tcsr::automatical>(a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else {
+            GKO_NOT_SUPPORTED(a->get_strategy());
+        }
+        host_kernel::select_classical_spmv(
+            classical_kernels(),
+            [&max_length_per_row](int compiled_info) {
+                return max_length_per_row >= compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
             host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
@@ -804,24 +427,228 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
-namespace kernel {
+template <typename ValueType, typename IndexType>
+void spgemm(std::shared_ptr<const CudaExecutor> exec,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{  // sparse product C = A * B through the cusparse::spgemm wrappers
+    if (cusparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_cusparse_handle();
+        cusparse::pointer_mode_guard pm_guard(handle);
+        auto a_descr = cusparse::create_mat_descr();
+        auto b_descr = cusparse::create_mat_descr();
+        auto c_descr = cusparse::create_mat_descr();
+        auto d_descr = cusparse::create_mat_descr();  // D is passed empty
+        auto info = cusparse::create_spgemm_info();
+
+        auto alpha = one<ValueType>();  // scalar handed over with A and B
+        auto a_nnz = IndexType(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = IndexType(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto zero_nnz = IndexType{};  // nnz reported for the empty D
+        auto m = IndexType(a->get_size()[0]);  // rows of A
+        auto n = IndexType(b->get_size()[1]);  // columns of B
+        auto k = IndexType(a->get_size()[1]);  // columns of A
+        auto c_row_ptrs = c->get_row_ptrs();
+        matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+        auto &c_col_idxs_array = c_builder.get_col_idx_array();
+        auto &c_vals_array = c_builder.get_value_array();
+
+        // query the workspace size, then allocate that many bytes on exec
+        size_type buffer_size{};
+        cusparse::spgemm_buffer_size(
+            handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            zero_nnz, null_index, null_index, info, buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz of C into c_nnz (c_row_ptrs is presumably filled here too)
+        IndexType c_nnz{};
+        cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
+                             a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
+                             d_descr, zero_nnz, null_index, null_index, c_descr,
+                             c_row_ptrs, &c_nnz, info, buffer);
+
+        // resize C to c_nnz entries, then compute column indices and values
+        c_col_idxs_array.resize_and_reset(c_nnz);
+        c_vals_array.resize_and_reset(c_nnz);
+        auto c_col_idxs = c_col_idxs_array.get_data();
+        auto c_vals = c_vals_array.get_data();
+        cusparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+                         a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                         b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
+                         null_value, null_index, null_index, c_descr, c_vals,
+                         c_row_ptrs, c_col_idxs, info, buffer);
+
+        cusparse::destroy(info);  // teardown in reverse creation order
+        cusparse::destroy(d_descr);  // NOTE(review): skipped if a call throws?
+        cusparse::destroy(c_descr);
+        cusparse::destroy(b_descr);
+        cusparse::destroy(a_descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;  // value/index types cuSPARSE does not support
+    }
+}
 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void convert_row_ptrs_to_idxs(
-    size_type num_rows, const IndexType *__restrict__ ptrs,
-    IndexType *__restrict__ idxs)
+
+template <typename ValueType, typename IndexType>
+void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
+                     const matrix::Dense<ValueType> *alpha,  // copied to host
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     const matrix::Dense<ValueType> *beta,  // copied to host
+                     const matrix::Csr<ValueType, IndexType> *d,
+                     matrix::Csr<ValueType, IndexType> *c)  // result
 {
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (tidx < num_rows) {
-        for (auto i = ptrs[tidx]; i < ptrs[tidx + 1]; i++) {
-            idxs[i] = tidx;
-        }
+    if (cusparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_cusparse_handle();
+        cusparse::pointer_mode_guard pm_guard(handle);
+        auto a_descr = cusparse::create_mat_descr();
+        auto b_descr = cusparse::create_mat_descr();
+        auto c_descr = cusparse::create_mat_descr();
+        auto d_descr = cusparse::create_mat_descr();
+        auto info = cusparse::create_spgemm_info();
+
+        auto valpha = exec->copy_val_to_host(alpha->get_const_values());
+        auto a_nnz = IndexType(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = IndexType(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto vbeta = exec->copy_val_to_host(beta->get_const_values());
+        auto d_nnz = IndexType(d->get_num_stored_elements());
+        auto d_vals = d->get_const_values();
+        auto d_row_ptrs = d->get_const_row_ptrs();
+        auto d_col_idxs = d->get_const_col_idxs();
+        auto m = IndexType(a->get_size()[0]);  // rows of A
+        auto n = IndexType(b->get_size()[1]);  // columns of B
+        auto k = IndexType(a->get_size()[1]);  // columns of A
+        auto c_row_ptrs = c->get_row_ptrs();
+        matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+        auto &c_col_idxs_array = c_builder.get_col_idx_array();
+        auto &c_vals_array = c_builder.get_value_array();
+
+        // query the workspace size, then allocate that many bytes on exec
+        size_type buffer_size{};
+        cusparse::spgemm_buffer_size(
+            handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz,
+            d_row_ptrs, d_col_idxs, info, buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz of C into c_nnz (c_row_ptrs is presumably filled here too)
+        IndexType c_nnz{};
+        cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
+                             a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
+                             d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr,
+                             c_row_ptrs, &c_nnz, info, buffer);
+
+        // fill C; presumably C = alpha * A * B + beta * D (confirm in wrapper)
+        c_col_idxs_array.resize_and_reset(c_nnz);
+        c_vals_array.resize_and_reset(c_nnz);
+        auto c_col_idxs = c_col_idxs_array.get_data();
+        auto c_vals = c_vals_array.get_data();
+        cusparse::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals,
+                         a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                         b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals,
+                         d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs,
+                         c_col_idxs, info, buffer);
+
+        cusparse::destroy(info);  // teardown in reverse creation order
+        cusparse::destroy(d_descr);  // NOTE(review): skipped if a call throws?
+        cusparse::destroy(c_descr);
+        cusparse::destroy(b_descr);
+        cusparse::destroy(a_descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;  // value/index types cuSPARSE does not support
     }
 }
 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const DefaultExecutor> exec, const ValueType *alpha,
+            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
+            const ValueType *a_vals, const ValueType *beta,
+            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
+            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
+{  // builds C = alpha * A + beta * B for one compiled subwarp size
+    auto m = static_cast<IndexType>(c->get_size()[0]);  // rows of C
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz of alpha * A + beta * B; counts presumably go to c_row_ptrs
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);  // review: 0 when m == 0?
+    kernel::spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
+
+    // build row pointers: prefix sum over the m + 1 entries of c_row_ptrs
+    components::prefix_sum(exec, c_row_ptrs, m + 1);
+
+    // size C by the last row pointer, then fill alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);  // total nnz of C
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    kernel::spgeam<subwarp_size><<<num_blocks, default_block_size>>>(
+        as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals),
+        as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m,
+        c_row_ptrs, c_col_idxs, as_cuda_type(c_vals));
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
 
-}  // namespace kernel
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const DefaultExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{  // dispatches C = alpha * A + beta * B to a subwarp-size specialization
+    // combined stored entries of A and B per row of A; 0 if A has no rows
+    auto total = a->get_num_stored_elements() + b->get_num_stored_elements();
+    auto nnz_per_row = a->get_size()[0] ? total / a->get_size()[0] : 0;
+    select_spgeam(
+        spgeam_kernels(),
+        [&](int compiled_subwarp_size) {  // predicate over compiled sizes
+            return compiled_subwarp_size >= nnz_per_row ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec,
+        alpha->get_const_values(), a->get_const_row_ptrs(),
+        a->get_const_col_idxs(), a->get_const_values(),
+        beta->get_const_values(), b->get_const_row_ptrs(),
+        b->get_const_col_idxs(), b->get_const_values(), c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename IndexType>
@@ -838,8 +665,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -853,46 +680,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType>
-__global__
-    __launch_bounds__(cuda_config::max_block_size) void initialize_zero_dense(
-        size_type num_rows, size_type num_cols, size_type stride,
-        ValueType *__restrict__ result)
-{
-    const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
-    const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
-    if (tidx_x < num_cols && tidx_y < num_rows) {
-        result[tidx_y * stride + tidx_x] = zero<ValueType>();
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_dense(
-    size_type num_rows, const IndexType *__restrict__ row_ptrs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values, size_type stride,
-    ValueType *__restrict__ result)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (tidx < num_rows) {
-        for (auto i = row_ptrs[tidx]; i < row_ptrs[tidx + 1]; i++) {
-            result[stride * tidx + col_idxs[i]] = values[i];
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -901,9 +692,8 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
     const auto col_idxs = source->get_const_col_idxs();
     const auto vals = source->get_const_values();
 
-    const dim3 block_size(cuda_config::warp_size,
-                          cuda_config::max_block_size / cuda_config::warp_size,
-                          1);
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
     const dim3 init_grid_dim(ceildiv(stride, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
@@ -919,98 +709,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void calculate_nnz_per_row(
-    size_type num_rows, const IndexType *__restrict__ row_ptrs,
-    size_type *__restrict__ nnz_per_row)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (tidx < num_rows) {
-        nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx];
-    }
-}
-
-
-__global__
-    __launch_bounds__(cuda_config::warp_size) void calculate_slice_lengths(
-        size_type num_rows, size_type slice_size, size_type stride_factor,
-        const size_type *__restrict__ nnz_per_row,
-        size_type *__restrict__ slice_lengths,
-        size_type *__restrict__ slice_sets)
-{
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto sliceid = blockIdx.x;
-    const auto tid_in_warp = threadIdx.x;
-
-    if (sliceid * slice_size + tid_in_warp < num_rows) {
-        size_type thread_result = 0;
-        for (auto i = tid_in_warp; i < slice_size; i += warp_size) {
-            thread_result =
-                (i + slice_size * sliceid < num_rows)
-                    ? max(thread_result, nnz_per_row[sliceid * slice_size + i])
-                    : thread_result;
-        }
-
-        auto warp_tile =
-            group::tiled_partition<warp_size>(group::this_thread_block());
-        auto warp_result = gko::kernels::cuda::reduce(
-            warp_tile, thread_result,
-            [](const size_type &a, const size_type &b) { return max(a, b); });
-
-        if (tid_in_warp == 0) {
-            auto slice_length =
-                ceildiv(warp_result, stride_factor) * stride_factor;
-            slice_lengths[sliceid] = slice_length;
-            slice_sets[sliceid] = slice_length;
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_sellp(
-    size_type num_rows, size_type slice_size,
-    const ValueType *__restrict__ source_values,
-    const IndexType *__restrict__ source_row_ptrs,
-    const IndexType *__restrict__ source_col_idxs,
-    size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets,
-    IndexType *__restrict__ result_col_idxs,
-    ValueType *__restrict__ result_values)
-{
-    const auto global_row = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto row = global_row % slice_size;
-    const auto sliceid = global_row / slice_size;
-
-    if (global_row < num_rows) {
-        size_type sellp_ind = slice_sets[sliceid] * slice_size + row;
-
-        for (size_type csr_ind = source_row_ptrs[global_row];
-             csr_ind < source_row_ptrs[global_row + 1]; csr_ind++) {
-            result_values[sellp_ind] = source_values[csr_ind];
-            result_col_idxs[sellp_ind] = source_col_idxs[csr_ind];
-            sellp_ind += slice_size;
-        }
-        for (size_type i = sellp_ind;
-             i <
-             (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row;
-             i += slice_size) {
-            result_col_idxs[i] = 0;
-            result_values[i] = zero<ValueType>();
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -1041,22 +743,12 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
 
     grid_dim = slice_num;
 
-    kernel::calculate_slice_lengths<<<grid_dim, cuda_config::warp_size>>>(
+    kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
         num_rows, slice_size, stride_factor,
         as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_const_data()));
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
@@ -1064,66 +756,16 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
         as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
         as_cuda_type(result_col_idxs), as_cuda_type(result_values));
-
-    nnz_per_row.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_zero_ell(
-    size_type max_nnz_per_row, size_type stride, ValueType *__restrict__ values,
-    IndexType *__restrict__ col_idxs)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tidx < stride * max_nnz_per_row) {
-        values[tidx] = zero<ValueType>();
-        col_idxs[tidx] = 0;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_ell(
-    size_type num_rows, size_type stride,
-    const ValueType *__restrict__ source_values,
-    const IndexType *__restrict__ source_row_ptrs,
-    const IndexType *__restrict__ source_col_idxs,
-    ValueType *__restrict__ result_values,
-    IndexType *__restrict__ result_col_idxs)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto row = tidx / warp_size;
-    const auto local_tidx = tidx % warp_size;
-
-    if (row < num_rows) {
-        for (size_type i = local_tidx;
-             i < source_row_ptrs[row + 1] - source_row_ptrs[row];
-             i += warp_size) {
-            const auto result_idx = row + stride * i;
-            const auto source_idx = i + source_row_ptrs[row];
-            result_values[result_idx] = source_values[source_idx];
-            result_col_idxs[result_idx] = source_col_idxs[source_idx];
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     const auto source_values = source->get_const_values();
     const auto source_row_ptrs = source->get_const_row_ptrs();
@@ -1144,7 +786,7 @@ void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(result_col_idxs));
 
     const auto grid_dim =
-        ceildiv(num_rows * cuda_config::warp_size, default_block_size);
+        ceildiv(num_rows * config::warp_size, default_block_size);
 
     kernel::fill_in_ell<<<grid_dim, default_block_size>>>(
         num_rows, stride, as_cuda_type(source_values),
@@ -1156,57 +798,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
-namespace kernel {
-
-
-__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice(
-    size_type num_rows, size_type slice_size, size_type stride_factor,
-    const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto warpid = tidx / warp_size;
-    const auto tid_in_warp = tidx % warp_size;
-    const auto slice_num = ceildiv(num_rows, slice_size);
-
-    size_type thread_result = 0;
-    for (auto i = tid_in_warp; i < slice_size; i += warp_size) {
-        if (warpid * slice_size + i < num_rows) {
-            thread_result =
-                max(thread_result, nnz_per_row[warpid * slice_size + i]);
-        }
-    }
-
-    auto warp_tile =
-        group::tiled_partition<warp_size>(group::this_thread_block());
-    auto warp_result = gko::kernels::cuda::reduce(
-        warp_tile, thread_result,
-        [](const size_type &a, const size_type &b) { return max(a, b); });
-
-    if (tid_in_warp == 0 && warpid < slice_num) {
-        result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor;
-    }
-}
-
-
-__global__ __launch_bounds__(default_block_size) void reduce_total_cols(
-    size_type num_slices, const size_type *__restrict__ max_nnz_per_slice,
-    size_type *__restrict__ result)
-{
-    extern __shared__ size_type block_result[];
-
-    reduce_array(num_slices, max_nnz_per_slice, block_result,
-                 [](const size_type &x, const size_type &y) { return x + y; });
-
-    if (threadIdx.x == 0) {
-        result[blockIdx.x] = block_result[0];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
                           const matrix::Csr<ValueType, IndexType> *source,
@@ -1223,7 +814,7 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
     kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
         num_rows, as_cuda_type(row_ptrs), as_cuda_type(nnz_per_row.get_data()));
 
-    grid_dim = ceildiv(slice_num * cuda_config::warp_size, default_block_size);
+    grid_dim = ceildiv(slice_num * config::warp_size, default_block_size);
     auto max_nnz_per_slice = Array<size_type>(exec, slice_num);
 
     kernel::reduce_max_nnz_per_slice<<<grid_dim, default_block_size>>>(
@@ -1234,25 +825,17 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
     grid_dim = ceildiv(slice_num, default_block_size);
     auto block_results = Array<size_type>(exec, grid_dim);
 
-    kernel::reduce_total_cols<<<grid_dim, default_block_size,
-                                default_block_size * sizeof(size_type)>>>(
+    kernel::reduce_total_cols<<<grid_dim, default_block_size>>>(
         slice_num, as_cuda_type(max_nnz_per_slice.get_const_data()),
         as_cuda_type(block_results.get_data()));
 
     auto d_result = Array<size_type>(exec, 1);
 
-    kernel::reduce_total_cols<<<1, default_block_size,
-                                default_block_size * sizeof(size_type)>>>(
+    kernel::reduce_total_cols<<<1, default_block_size>>>(
         grid_dim, as_cuda_type(block_results.get_const_data()),
         as_cuda_type(d_result.get_data()));
 
-    exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
-                                  result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
+    *result = exec->copy_val_to_host(d_result.get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1261,8 +844,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
@@ -1279,33 +862,13 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
-namespace {
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void conjugate_kernel(
-    size_type num_nonzeros, ValueType *__restrict__ val)
-{
-    const auto tidx =
-        static_cast<size_type>(blockIdx.x) * default_block_size + threadIdx.x;
-
-    if (tidx < num_nonzeros) {
-        val[tidx] = conj(val[tidx]);
-    }
-}
-
-
-}  //  namespace
-
-
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         const dim3 block_size(default_block_size, 1, 1);
@@ -1334,26 +897,48 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
-namespace kernel {
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const CudaExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
+    GKO_NOT_IMPLEMENTED;  // placeholder: no CUDA row permutation kernel yet
 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
-__global__ __launch_bounds__(default_block_size) void reduce_max_nnz(
-    size_type size, const size_type *__restrict__ nnz_per_row,
-    size_type *__restrict__ result)
-{
-    extern __shared__ size_type block_max[];
 
-    reduce_array(
-        size, nnz_per_row, block_max,
-        [](const size_type &x, const size_type &y) { return max(x, y); });
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const CudaExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
+    GKO_NOT_IMPLEMENTED;  // placeholder: no CUDA column permutation kernel yet
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
 
-    if (threadIdx.x == 0) {
-        result[blockIdx.x] = block_max[0];
-    }
-}
 
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
+    GKO_NOT_IMPLEMENTED;  // placeholder: no CUDA inverse row permutation yet
 
-}  // namespace kernel
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const CudaExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
+    GKO_NOT_IMPLEMENTED;  // placeholder: no CUDA inverse column permute yet
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1374,93 +959,25 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
 
     const auto n = ceildiv(num_rows, default_block_size);
     const auto reduce_dim = n <= default_block_size ? n : default_block_size;
-    kernel::reduce_max_nnz<<<reduce_dim, default_block_size,
-                             default_block_size * sizeof(size_type)>>>(
+    kernel::reduce_max_nnz<<<reduce_dim, default_block_size>>>(
         num_rows, as_cuda_type(nnz_per_row.get_const_data()),
         as_cuda_type(block_results.get_data()));
 
-    kernel::reduce_max_nnz<<<1, default_block_size,
-                             default_block_size * sizeof(size_type)>>>(
+    kernel::reduce_max_nnz<<<1, default_block_size>>>(
         reduce_dim, as_cuda_type(block_results.get_const_data()),
         as_cuda_type(d_result.get_data()));
 
-    exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
-                                  result);
-
-    nnz_per_row.clear();
-    block_results.clear();
-    d_result.clear();
+    *result = exec->copy_val_to_host(d_result.get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename IndexType>
-__global__
-    __launch_bounds__(default_block_size) void calculate_hybrid_coo_row_nnz(
-        size_type num_rows, size_type ell_max_nnz_per_row,
-        IndexType *__restrict__ csr_row_idxs,
-        size_type *__restrict__ coo_row_nnz)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (tidx < num_rows) {
-        const size_type csr_nnz = csr_row_idxs[tidx + 1] - csr_row_idxs[tidx];
-        coo_row_nnz[tidx] =
-            (csr_nnz > ell_max_nnz_per_row) * (csr_nnz - ell_max_nnz_per_row);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_hybrid(
-    size_type num_rows, size_type stride, size_type ell_max_nnz_per_row,
-    const ValueType *__restrict__ source_values,
-    const IndexType *__restrict__ source_row_ptrs,
-    const IndexType *__restrict__ source_col_idxs,
-    const size_type *__restrict__ coo_offset,
-    ValueType *__restrict__ result_ell_val,
-    IndexType *__restrict__ result_ell_col,
-    ValueType *__restrict__ result_coo_val,
-    IndexType *__restrict__ result_coo_col,
-    IndexType *__restrict__ result_coo_row)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto row = tidx / warp_size;
-    const auto local_tidx = tidx % warp_size;
-
-    if (row < num_rows) {
-        for (size_type i = local_tidx;
-             i < source_row_ptrs[row + 1] - source_row_ptrs[row];
-             i += warp_size) {
-            const auto source_idx = i + source_row_ptrs[row];
-            if (i < ell_max_nnz_per_row) {
-                const auto result_idx = row + stride * i;
-                result_ell_val[result_idx] = source_values[source_idx];
-                result_ell_col[result_idx] = source_col_idxs[source_idx];
-            } else {
-                const auto result_idx =
-                    coo_offset[row] + i - ell_max_nnz_per_row;
-                result_coo_val[result_idx] = source_values[source_idx];
-                result_coo_col[result_idx] = source_col_idxs[source_idx];
-                result_coo_row[result_idx] = row;
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto ell_val = result->get_ell_values();
     auto ell_col = result->get_ell_col_idxs();
@@ -1482,17 +999,9 @@ void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
         num_rows, max_nnz_per_row, as_cuda_type(source->get_const_row_ptrs()),
         as_cuda_type(coo_offset.get_data()));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(num_rows, default_block_size));
-    grid_dim = ceildiv(num_rows, default_block_size);
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(coo_offset.get_data()),
-        as_cuda_type(add_values.get_data()));
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(coo_offset.get_data()),
-        as_cuda_type(add_values.get_const_data()));
-
-    grid_dim = ceildiv(num_rows * cuda_config::warp_size, default_block_size);
+    components::prefix_sum(exec, coo_offset.get_data(), num_rows);
+
+    grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
     kernel::fill_in_hybrid<<<grid_dim, default_block_size>>>(
         num_rows, stride, max_nnz_per_row,
         as_cuda_type(source->get_const_values()),
@@ -1527,7 +1036,46 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,
                           matrix::Csr<ValueType, IndexType> *to_sort)
-    GKO_NOT_IMPLEMENTED;
+{
+    if (cusparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_cusparse_handle();
+        auto descr = cusparse::create_mat_descr();
+        auto m = IndexType(to_sort->get_size()[0]);
+        auto n = IndexType(to_sort->get_size()[1]);
+        auto nnz = IndexType(to_sort->get_num_stored_elements());
+        auto row_ptrs = to_sort->get_const_row_ptrs();
+        auto col_idxs = to_sort->get_col_idxs();
+        auto vals = to_sort->get_values();
+
+        // keep an unsorted copy of the values as the gather source below
+        Array<ValueType> tmp_vals_array(exec, nnz);
+        exec->copy(nnz, vals, tmp_vals_array.get_data());
+        auto tmp_vals = tmp_vals_array.get_const_data();
+
+        // init identity permutation with one entry per stored element
+        Array<IndexType> permutation_array(exec, nnz);
+        auto permutation = permutation_array.get_data();
+        cusparse::create_identity_permutation(handle, nnz, permutation);
+
+        // ask cusparse for the csrsort workspace size, then allocate it
+        size_type buffer_size{};
+        cusparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+                                      buffer_size);
+        Array<char> buffer_array{exec, buffer_size};
+        auto buffer = buffer_array.get_data();
+
+        // sort column indices; permutation and buffer are passed along
+        cusparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+                          permutation, buffer);
+
+        // sort values: gather the unsorted copy into vals via permutation
+        cusparse::gather(handle, nnz, tmp_vals, vals, permutation);
+
+        cusparse::destroy(descr);  // NOTE(review): skipped if above throws
+    } else {
+        GKO_NOT_IMPLEMENTED;  // type pair not supported by cusparse
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
@@ -1536,8 +1084,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
     std::shared_ptr<const CudaExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *to_check,
-    bool *is_sorted) GKO_NOT_IMPLEMENTED;
+    const matrix::Csr<ValueType, IndexType> *to_check, bool *is_sorted)
+{
+    *is_sorted = true;  // initial value, copied to the device array below
+    auto cpu_array = Array<bool>::view(exec->get_master(), 1, is_sorted);
+    auto gpu_array = Array<bool>{exec, cpu_array};  // flag copy on exec
+    auto block_size = default_block_size;
+    auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
+    auto num_blocks = ceildiv(num_rows, block_size);  // may be 0 if num_rows==0
+    kernel::check_unsorted<<<num_blocks, block_size>>>(
+        to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
+        num_rows, gpu_array.get_data());
+    cpu_array = gpu_array;  // result goes back through the host view
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index a74c431599a..b694ba6b42d 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,11 +42,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
 
@@ -64,6 +66,9 @@ namespace dense {
 constexpr auto default_block_size = 512;
 
 
+#include "common/matrix/dense_kernels.hpp.inc"
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const CudaExecutor> exec,
                   const matrix::Dense<ValueType> *a,
@@ -111,33 +116,6 @@ void apply(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
-namespace kernel {
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void scale(
-    size_type num_rows, size_type num_cols, size_type num_alpha_cols,
-    const ValueType *__restrict__ alpha, ValueType *__restrict__ x,
-    size_type stride_x)
-{
-    constexpr auto warps_per_block = block_size / cuda_config::warp_size;
-    const auto global_id =
-        thread::get_thread_id<cuda_config::warp_size, warps_per_block>();
-    const auto row_id = global_id / num_cols;
-    const auto col_id = global_id % num_cols;
-    const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id;
-    if (row_id < num_rows) {
-        x[row_id * stride_x + col_id] =
-            alpha[alpha_id] == zero<ValueType>()
-                ? zero<ValueType>()
-                : x[row_id * stride_x + col_id] * alpha[alpha_id];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const CudaExecutor> exec,
            const matrix::Dense<ValueType> *alpha, matrix::Dense<ValueType> *x)
@@ -151,8 +129,8 @@ void scale(std::shared_ptr<const CudaExecutor> exec,
         constexpr auto block_size = default_block_size;
         const dim3 grid_dim =
             ceildiv(x->get_size()[0] * x->get_size()[1], block_size);
-        const dim3 block_dim{cuda_config::warp_size, 1,
-                             block_size / cuda_config::warp_size};
+        const dim3 block_dim{config::warp_size, 1,
+                             block_size / config::warp_size};
         kernel::scale<block_size><<<grid_dim, block_dim>>>(
             x->get_size()[0], x->get_size()[1], alpha->get_size()[1],
             as_cuda_type(alpha->get_const_values()),
@@ -163,31 +141,6 @@ void scale(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
 
 
-namespace kernel {
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void add_scaled(
-    size_type num_rows, size_type num_cols, size_type num_alpha_cols,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ x,
-    size_type stride_x, ValueType *__restrict__ y, size_type stride_y)
-{
-    constexpr auto warps_per_block = block_size / cuda_config::warp_size;
-    const auto global_id =
-        thread::get_thread_id<cuda_config::warp_size, warps_per_block>();
-    const auto row_id = global_id / num_cols;
-    const auto col_id = global_id % num_cols;
-    const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id;
-    if (row_id < num_rows && alpha[alpha_id] != zero<ValueType>()) {
-        y[row_id * stride_y + col_id] +=
-            x[row_id * stride_x + col_id] * alpha[alpha_id];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType>
 void add_scaled(std::shared_ptr<const CudaExecutor> exec,
                 const matrix::Dense<ValueType> *alpha,
@@ -202,8 +155,8 @@ void add_scaled(std::shared_ptr<const CudaExecutor> exec,
         constexpr auto block_size = default_block_size;
         const dim3 grid_dim =
             ceildiv(x->get_size()[0] * x->get_size()[1], block_size);
-        const dim3 block_dim{cuda_config::warp_size, 1,
-                             block_size / cuda_config::warp_size};
+        const dim3 block_dim{config::warp_size, 1,
+                             block_size / config::warp_size};
         kernel::add_scaled<block_size><<<grid_dim, block_dim>>>(
             x->get_size()[0], x->get_size()[1], alpha->get_size()[1],
             as_cuda_type(alpha->get_const_values()),
@@ -215,63 +168,6 @@ void add_scaled(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
 
 
-namespace kernel {
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void compute_partial_dot(
-    size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
-    const ValueType *__restrict__ y, size_type stride_y,
-    ValueType *__restrict__ work)
-{
-    constexpr auto warps_per_block = block_size / cuda_config::warp_size;
-
-    const auto num_blocks = gridDim.x;
-    const auto local_id = thread::get_local_thread_id<cuda_config::warp_size>();
-    const auto global_id =
-        thread::get_thread_id<cuda_config::warp_size, warps_per_block>();
-
-    auto tmp = zero<ValueType>();
-    for (auto i = global_id; i < num_rows; i += block_size * num_blocks) {
-        tmp += x[i * stride_x] * y[i * stride_y];
-    }
-    __shared__ UninitializedArray<ValueType, block_size> tmp_work;
-    tmp_work[local_id] = tmp;
-
-    reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
-           [](const ValueType &x, const ValueType &y) { return x + y; });
-
-    if (local_id == 0) {
-        work[thread::get_block_id()] = tmp_work[0];
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void finalize_dot_computation(
-    size_type size, const ValueType *work, ValueType *result)
-{
-    const auto local_id = thread::get_local_thread_id<cuda_config::warp_size>();
-
-    ValueType tmp = zero<ValueType>();
-    for (auto i = local_id; i < size; i += block_size) {
-        tmp += work[i];
-    }
-    __shared__ UninitializedArray<ValueType, block_size> tmp_work;
-    tmp_work[local_id] = tmp;
-
-    reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
-           [](const ValueType &x, const ValueType &y) { return x + y; });
-
-    if (local_id == 0) {
-        *result = tmp_work[0];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType>
 void compute_dot(std::shared_ptr<const CudaExecutor> exec,
                  const matrix::Dense<ValueType> *x,
@@ -295,8 +191,8 @@ void compute_dot(std::shared_ptr<const CudaExecutor> exec,
 
         constexpr auto work_per_block = work_per_thread * block_size;
         const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
-        const dim3 block_dim{cuda_config::warp_size, 1,
-                             block_size / cuda_config::warp_size};
+        const dim3 block_dim{config::warp_size, 1,
+                             block_size / config::warp_size};
         Array<ValueType> work(exec, grid_dim.x);
         // TODO: write a kernel which does this more efficiently
         for (size_type col = 0; col < x->get_size()[1]; ++col) {
@@ -311,32 +207,13 @@ void compute_dot(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void compute_sqrt(
-    size_type num_cols, ValueType *__restrict__ work)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    if (tidx < num_cols) {
-        work[tidx] = sqrt(abs(work[tidx]));
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType> *x,
-                   matrix::Dense<ValueType> *result)
+                   matrix::Dense<remove_complex<ValueType>> *result)
 {
     if (cublas::is_supported<ValueType>::value) {
         for (size_type col = 0; col < x->get_size()[1]; ++col) {
@@ -345,51 +222,37 @@ void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
                           result->get_values() + col);
         }
     } else {
-        compute_dot(exec, x, x, result);
-        const dim3 block_size(default_block_size, 1, 1);
-        const dim3 grid_size(ceildiv(result->get_size()[1], block_size.x), 1,
-                             1);
-        kernel::compute_sqrt<<<grid_size, block_size, 0, 0>>>(
-            result->get_size()[1], as_cuda_type(result->get_values()));
-    }
-}
-
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-
-namespace kernel {
-
+        using norm_type = remove_complex<ValueType>;
+        // TODO: these are tuning parameters obtained experimentally, once
+        // we decide how to handle this uniformly, they should be modified
+        // appropriately
+        constexpr auto work_per_thread = 32;
+        constexpr auto block_size = 1024;
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_coo(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const size_type *__restrict__ row_ptrs,
-    const ValueType *__restrict__ source, IndexType *__restrict__ row_idxs,
-    IndexType *__restrict__ col_idxs, ValueType *__restrict__ values)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (tidx < num_rows) {
-        size_type write_to = row_ptrs[tidx];
-
-        for (size_type i = 0; i < num_cols; i++) {
-            if (source[stride * tidx + i] != zero<ValueType>()) {
-                values[write_to] = source[stride * tidx + i];
-                col_idxs[write_to] = i;
-                row_idxs[write_to] = tidx;
-                write_to++;
-            }
+        constexpr auto work_per_block = work_per_thread * block_size;
+        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
+        const dim3 block_dim{config::warp_size, 1,
+                             block_size / config::warp_size};
+        Array<norm_type> work(exec, grid_dim.x);
+        // TODO: write a kernel which does this more efficiently
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            kernel::compute_partial_norm2<block_size><<<grid_dim, block_dim>>>(
+                x->get_size()[0], as_cuda_type(x->get_const_values() + col),
+                x->get_stride(), as_cuda_type(work.get_data()));
+            kernel::finalize_norm2_computation<block_size><<<1, block_dim>>>(
+                grid_dim.x, as_cuda_type(work.get_const_data()),
+                as_cuda_type(result->get_values() + col));
         }
     }
 }
 
-
-}  // namespace kernel
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -403,88 +266,25 @@ void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
     auto nnz_prefix_sum = Array<size_type>(exec, num_rows);
     calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum);
 
-    const size_type grid_dim = ceildiv(num_rows, default_block_size);
-    auto add_values = Array<size_type>(exec, grid_dim);
+    components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows);
 
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(nnz_prefix_sum.get_data()),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(nnz_prefix_sum.get_data()),
-        as_cuda_type(add_values.get_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     kernel::fill_in_coo<<<grid_dim, default_block_size>>>(
         num_rows, num_cols, stride,
         as_cuda_type(nnz_prefix_sum.get_const_data()),
         as_cuda_type(source->get_const_values()), as_cuda_type(row_idxs),
         as_cuda_type(col_idxs), as_cuda_type(values));
-
-    nnz_prefix_sum.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ work, IndexType *__restrict__ result)
-{
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto row_idx = tidx / warp_size;
-
-    if (row_idx < num_rows) {
-        IndexType part_result{};
-        for (auto i = threadIdx.x % warp_size; i < num_cols; i += warp_size) {
-            if (work[stride * row_idx + i] != zero<ValueType>()) {
-                part_result += 1;
-            }
-        }
-
-        auto warp_tile =
-            group::tiled_partition<warp_size>(group::this_thread_block());
-        result[row_idx] = reduce(
-            warp_tile, part_result,
-            [](const size_type &a, const size_type &b) { return a + b; });
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_csr(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs,
-    IndexType *__restrict__ col_idxs, ValueType *__restrict__ values)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-
-    if (tidx < num_rows) {
-        auto write_to = row_ptrs[tidx];
-        for (auto i = 0; i < num_cols; i++) {
-            if (source[stride * tidx + i] != zero<ValueType>()) {
-                values[write_to] = source[stride * tidx + i];
-                col_idxs[write_to] = i;
-                write_to++;
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -495,77 +295,30 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
 
     auto stride = source->get_stride();
 
-    const auto rows_per_block =
-        ceildiv(default_block_size, cuda_config::warp_size);
+    const auto rows_per_block = ceildiv(default_block_size, config::warp_size);
     const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block);
 
     kernel::count_nnz_per_row<<<grid_dim_nnz, default_block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(source->get_const_values()),
         as_cuda_type(row_ptrs));
 
-    size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_dim);
-
-    start_prefix_sum<default_block_size>
-        <<<grid_dim, default_block_size>>>(num_rows + 1, as_cuda_type(row_ptrs),
-                                           as_cuda_type(add_values.get_data()));
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
 
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(source->get_const_values()),
         as_cuda_type(row_ptrs), as_cuda_type(col_idxs), as_cuda_type(values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_ell(
-    size_type num_rows, size_type num_cols, size_type source_stride,
-    const ValueType *__restrict__ source, size_type max_nnz_per_row,
-    size_type result_stride, IndexType *__restrict__ col_ptrs,
-    ValueType *__restrict__ values)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (tidx < num_rows) {
-        IndexType col_idx = 0;
-        for (size_type col = 0; col < num_cols; col++) {
-            if (source[tidx * source_stride + col] != zero<ValueType>()) {
-                col_ptrs[col_idx * result_stride + tidx] = col;
-                values[col_idx * result_stride + tidx] =
-                    source[tidx * source_stride + col];
-                col_idx++;
-            }
-        }
-        for (size_type j = col_idx; j < max_nnz_per_row; j++) {
-            col_ptrs[j * result_stride + tidx] = 0;
-            values[j * result_stride + tidx] = zero<ValueType>();
-        }
-    } else if (tidx < result_stride) {
-        for (size_type j = 0; j < max_nnz_per_row; j++) {
-            col_ptrs[j * result_stride + tidx] = 0;
-            values[j * result_stride + tidx] = zero<ValueType>();
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -590,93 +343,18 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
-namespace kernel {
-
-
-__global__
-    __launch_bounds__(cuda_config::warp_size) void calculate_slice_lengths(
-        size_type num_rows, size_type slice_size, int slice_num,
-        size_type stride_factor, const size_type *__restrict__ nnz_per_row,
-        size_type *__restrict__ slice_lengths,
-        size_type *__restrict__ slice_sets)
-{
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto sliceid = blockIdx.x;
-    const auto tid_in_warp = threadIdx.x;
-
-    if (sliceid * slice_size + tid_in_warp < num_rows) {
-        size_type thread_result = 0;
-        for (auto i = tid_in_warp; i < slice_size; i += warp_size) {
-            thread_result =
-                (i + slice_size * sliceid < num_rows)
-                    ? max(thread_result, nnz_per_row[sliceid * slice_size + i])
-                    : thread_result;
-        }
-
-        auto warp_tile =
-            group::tiled_partition<warp_size>(group::this_thread_block());
-        auto warp_result = reduce(
-            warp_tile, thread_result,
-            [](const size_type &a, const size_type &b) { return max(a, b); });
-
-        if (tid_in_warp == 0) {
-            auto slice_length =
-                ceildiv(warp_result, stride_factor) * stride_factor;
-            slice_lengths[sliceid] = slice_length;
-            slice_sets[sliceid] = slice_length;
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_sellp(
-    size_type num_rows, size_type num_cols, size_type slice_size,
-    size_type stride, const ValueType *__restrict__ source,
-    size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets,
-    IndexType *__restrict__ col_idxs, ValueType *__restrict__ vals)
-{
-    const auto global_row = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto row = global_row % slice_size;
-    const auto sliceid = global_row / slice_size;
-
-    if (global_row < num_rows) {
-        size_type sellp_ind = slice_sets[sliceid] * slice_size + row;
-
-        for (size_type col = 0; col < num_cols; col++) {
-            auto val = source[global_row * stride + col];
-            if (val != zero<ValueType>()) {
-                col_idxs[sellp_ind] = col;
-                vals[sellp_ind] = val;
-                sellp_ind += slice_size;
-            }
-        }
-        for (size_type i = sellp_ind;
-             i <
-             (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row;
-             i += slice_size) {
-            col_idxs[i] = 0;
-            vals[i] = zero<ValueType>();
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     const auto stride = source->get_stride();
     const auto num_rows = result->get_size()[0];
@@ -700,31 +378,18 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
 
     auto grid_dim = slice_num;
 
-    kernel::calculate_slice_lengths<<<grid_dim, cuda_config::warp_size>>>(
+    kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
         num_rows, slice_size, slice_num, stride_factor,
         as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_const_data()));
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
         num_rows, num_cols, slice_size, stride,
         as_cuda_type(source->get_const_values()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets), as_cuda_type(col_idxs), as_cuda_type(vals));
-
-    add_values.clear();
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -733,8 +398,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const CudaExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -751,34 +416,11 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL);
 
 
-namespace kernel {
-
-
-__global__ __launch_bounds__(default_block_size) void reduce_max_nnz(
-    size_type size, const size_type *__restrict__ nnz_per_row,
-    size_type *__restrict__ result)
-{
-    extern __shared__ size_type block_max[];
-
-    reduce_array(
-        size, nnz_per_row, block_max,
-        [](const size_type &x, const size_type &y) { return max(x, y); });
-
-    if (threadIdx.x == 0) {
-        result[blockIdx.x] = block_max[0];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType>
 void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
                                const matrix::Dense<ValueType> *source,
@@ -807,11 +449,7 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
         grid_dim, as_cuda_type(block_results.get_const_data()),
         as_cuda_type(d_result.get_data()));
 
-    exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
-                                  result);
-    d_result.clear();
-    block_results.clear();
-    nnz_per_row.clear();
+    *result = exec->copy_val_to_host(d_result.get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -824,7 +462,7 @@ void calculate_nonzeros_per_row(std::shared_ptr<const CudaExecutor> exec,
                                 Array<size_type> *result)
 {
     const dim3 block_size(default_block_size, 1, 1);
-    auto rows_per_block = ceildiv(default_block_size, cuda_config::warp_size);
+    auto rows_per_block = ceildiv(default_block_size, config::warp_size);
     const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block);
     const dim3 grid_size(grid_x, 1, 1);
     kernel::count_nnz_per_row<<<grid_size, block_size>>>(
@@ -837,57 +475,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL);
 
 
-namespace kernel {
-
-
-__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice(
-    size_type num_rows, size_type slice_size, size_type stride_factor,
-    const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto warpid = tidx / warp_size;
-    const auto tid_in_warp = tidx % warp_size;
-    const auto slice_num = ceildiv(num_rows, slice_size);
-
-    size_type thread_result = 0;
-    for (auto i = tid_in_warp; i < slice_size; i += warp_size) {
-        if (warpid * slice_size + i < num_rows) {
-            thread_result =
-                max(thread_result, nnz_per_row[warpid * slice_size + i]);
-        }
-    }
-
-    auto warp_tile =
-        group::tiled_partition<warp_size>(group::this_thread_block());
-    auto warp_result = reduce(
-        warp_tile, thread_result,
-        [](const size_type &a, const size_type &b) { return max(a, b); });
-
-    if (tid_in_warp == 0 && warpid < slice_num) {
-        result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor;
-    }
-}
-
-
-__global__ __launch_bounds__(default_block_size) void reduce_total_cols(
-    size_type num_slices, const size_type *__restrict__ max_nnz_per_slice,
-    size_type *__restrict__ result)
-{
-    extern __shared__ size_type block_result[];
-
-    reduce_array(num_slices, max_nnz_per_slice, block_result,
-                 [](const size_type &x, const size_type &y) { return x + y; });
-
-    if (threadIdx.x == 0) {
-        result[blockIdx.x] = block_result[0];
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType>
 void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
                           const matrix::Dense<ValueType> *source,
@@ -904,8 +491,7 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
 
     auto max_nnz_per_slice = Array<size_type>(exec, slice_num);
 
-    auto grid_dim =
-        ceildiv(slice_num * cuda_config::warp_size, default_block_size);
+    auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size);
 
     kernel::reduce_max_nnz_per_slice<<<grid_dim, default_block_size>>>(
         num_rows, slice_size, stride_factor,
@@ -927,13 +513,7 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
         grid_dim, as_cuda_type(block_results.get_const_data()),
         as_cuda_type(d_result.get_data()));
 
-    exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
-                                  result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
+    *result = exec->copy_val_to_host(d_result.get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -942,8 +522,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
     if (cublas::is_supported<ValueType>::value) {
         auto handle = exec->get_cublas_handle();
@@ -967,9 +547,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
-
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
     if (cublas::is_supported<ValueType>::value) {
         auto handle = exec->get_cublas_handle();
@@ -986,11 +565,96 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
     } else {
         GKO_NOT_IMPLEMENTED;
     }
-};
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const CudaExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted)
+{
+    constexpr auto block_size = default_block_size;
+    const dim3 grid_dim =
+        ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size);
+    const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size};
+    kernel::row_permute<block_size><<<grid_dim, block_dim>>>(
+        orig->get_size()[0], orig->get_size()[1],
+        as_cuda_type(permutation_indices->get_const_data()),
+        as_cuda_type(orig->get_const_values()), orig->get_stride(),
+        as_cuda_type(row_permuted->get_values()), row_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const CudaExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
+{
+    constexpr auto block_size = default_block_size;
+    const dim3 grid_dim =
+        ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size);
+    const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size};
+    kernel::column_permute<block_size><<<grid_dim, block_dim>>>(
+        orig->get_size()[0], orig->get_size()[1],
+        as_cuda_type(permutation_indices->get_const_data()),
+        as_cuda_type(orig->get_const_values()), orig->get_stride(),
+        as_cuda_type(column_permuted->get_values()),
+        column_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
+{
+    constexpr auto block_size = default_block_size;
+    const dim3 grid_dim =
+        ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size);
+    const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size};
+    kernel::inverse_row_permute<block_size><<<grid_dim, block_dim>>>(
+        orig->get_size()[0], orig->get_size()[1],
+        as_cuda_type(permutation_indices->get_const_data()),
+        as_cuda_type(orig->get_const_values()), orig->get_stride(),
+        as_cuda_type(row_permuted->get_values()), row_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const CudaExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
+{
+    constexpr auto block_size = default_block_size;
+    const dim3 grid_dim =
+        ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size);
+    const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size};
+    kernel::inverse_column_permute<block_size><<<grid_dim, block_dim>>>(
+        orig->get_size()[0], orig->get_size()[1],
+        as_cuda_type(permutation_indices->get_const_data()),
+        as_cuda_type(orig->get_const_values()), orig->get_stride(),
+        as_cuda_type(column_permuted->get_values()),
+        column_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
 }  // namespace dense
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 76abffe5859..aded7cb11ad 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,16 +43,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/format_conversion.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
-#include "cuda/components/zero_array.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -75,6 +77,8 @@ constexpr int default_block_size = 512;
  * `num_threads_per_core` threads assigned to each physical core.
  */
 constexpr int num_threads_per_core = 4;
+
+
 /**
  * ratio is the parameter to decide when to use threads to do reduction on each
  * row. (#cols/#rows > ratio)
@@ -82,128 +86,30 @@ constexpr int num_threads_per_core = 4;
 constexpr double ratio = 1e-2;
 
 
+/**
+ * max_thread_per_worker is the maximum number of threads per worker. The
+ * `compiled_kernels` must be a list <0, 1, 2, 4, ..., max_thread_per_worker>
+ */
+constexpr int max_thread_per_worker = 32;
+
+
 /**
  * A compile-time list of sub-warp sizes for which the spmv kernels should be
  * compiled.
  * 0 is a special case where it uses a sub-warp size of warp_size in
  * combination with atomic_adds.
  */
-using compiled_kernels =
-    syn::value_list<int, 0, 1, 2, 4, 8, 16, 32, cuda_config::warp_size>;
-
+using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
-namespace kernel {
-namespace {
 
-
-template <int subwarp_size, bool atomic, typename ValueType, typename IndexType,
-          typename Closure>
-__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row,
-                            const ValueType *__restrict__ val,
-                            const IndexType *__restrict__ col,
-                            const size_type stride,
-                            const size_type num_stored_elements_per_row,
-                            const ValueType *__restrict__ b,
-                            const size_type b_stride, ValueType *__restrict__ c,
-                            const size_type c_stride, Closure op)
-{
-    const auto tidx =
-        static_cast<IndexType>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const IndexType x = tidx / subwarp_size / nwarps_per_row;
-    const auto warp_id = tidx / subwarp_size % nwarps_per_row;
-    const auto y_start = tidx % subwarp_size +
-                         num_stored_elements_per_row * warp_id / nwarps_per_row;
-    const auto y_end =
-        num_stored_elements_per_row * (warp_id + 1) / nwarps_per_row;
-    if (x < num_rows) {
-        const auto tile_block =
-            group::tiled_partition<subwarp_size>(group::this_thread_block());
-        ValueType temp = zero<ValueType>();
-        const auto column_id = blockIdx.y;
-        for (IndexType idx = y_start; idx < y_end; idx += subwarp_size) {
-            const auto ind = x + idx * stride;
-            const auto col_idx = col[ind];
-            if (col_idx < idx) {
-                break;
-            } else {
-                temp += val[ind] * b[col_idx * b_stride + column_id];
-            }
-        }
-        const auto answer = reduce(
-            tile_block, temp, [](ValueType x, ValueType y) { return x + y; });
-        if (tile_block.thread_rank() == 0) {
-            if (atomic) {
-                atomic_add(&(c[x * c_stride + column_id]),
-                           op(answer, c[x * c_stride + column_id]));
-            } else {
-                c[x * c_stride + column_id] =
-                    op(answer, c[x * c_stride + column_id]);
-            }
-        }
-    }
-}
-
-
-template <int subwarp_size, bool atomic = false, typename ValueType,
-          typename IndexType>
-__global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int nwarps_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
-{
-    spmv_kernel<subwarp_size, atomic>(
-        num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row,
-        b, b_stride, c, c_stride,
-        [](const ValueType &x, const ValueType &y) { return x; });
-}
-
-
-template <int subwarp_size, bool atomic = false, typename ValueType,
-          typename IndexType>
-__global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int nwarps_per_row,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
-    const size_type c_stride)
-{
-    const ValueType alpha_val = alpha[0];
-    const ValueType beta_val = beta[0];
-    // Because the atomic operation changes the values of c during computation,
-    // it can not do the right alpha * a * b + beta * c operation.
-    // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
-    // operation.
-    if (atomic) {
-        spmv_kernel<subwarp_size, atomic>(
-            num_rows, nwarps_per_row, val, col, stride,
-            num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val](const ValueType &x, const ValueType &y) {
-                return alpha_val * x;
-            });
-    } else {
-        spmv_kernel<subwarp_size, atomic>(
-            num_rows, nwarps_per_row, val, col, stride,
-            num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
-                return alpha_val * x + beta_val * y;
-            });
-    }
-}
-
-
-}  // namespace
-}  // namespace kernel
+#include "common/matrix/ell_kernels.hpp.inc"
 
 
 namespace {
 
 
 template <int info, typename ValueType, typename IndexType>
-void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
+void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                    const matrix::Ell<ValueType, IndexType> *a,
                    const matrix::Dense<ValueType> *b,
                    matrix::Dense<ValueType> *c,
@@ -211,27 +117,31 @@ void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
                    const matrix::Dense<ValueType> *beta = nullptr)
 {
     const auto nrows = a->get_size()[0];
-    constexpr int subwarp_size = (info == 0) ? cuda_config::warp_size : info;
+    constexpr int num_thread_per_worker =
+        (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
-    const dim3 block_size(default_block_size, 1, 1);
-    const dim3 grid_size(
-        ceildiv(nrows * subwarp_size * nwarps_per_row, block_size.x),
-        b->get_size()[1], 1);
+    const dim3 block_size(default_block_size / num_thread_per_worker,
+                          num_thread_per_worker, 1);
+    const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
+                         b->get_size()[1], 1);
     if (alpha == nullptr && beta == nullptr) {
-        kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, nwarps_per_row, as_cuda_type(a->get_const_values()),
-            a->get_const_col_idxs(), a->get_stride(),
-            a->get_num_stored_elements_per_row(),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(c->get_values()), c->get_stride());
+        kernel::spmv<num_thread_per_worker, atomic>
+            <<<grid_size, block_size, 0, 0>>>(
+                nrows, num_worker_per_row, as_cuda_type(a->get_const_values()),
+                a->get_const_col_idxs(), a->get_stride(),
+                a->get_num_stored_elements_per_row(),
+                as_cuda_type(b->get_const_values()), b->get_stride(),
+                as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
-        kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()),
-            as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(beta->get_const_values()),
-            as_cuda_type(c->get_values()), c->get_stride());
+        kernel::spmv<num_thread_per_worker, atomic>
+            <<<grid_size, block_size, 0, 0>>>(
+                nrows, num_worker_per_row,
+                as_cuda_type(alpha->get_const_values()),
+                as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
+                a->get_stride(), a->get_num_stored_elements_per_row(),
+                as_cuda_type(b->get_const_values()), b->get_stride(),
+                as_cuda_type(beta->get_const_values()),
+                as_cuda_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -241,42 +151,43 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
 
 
 template <typename ValueType, typename IndexType>
-std::array<int, 3> compute_subwarp_size_and_atomicity(
+std::array<int, 3> compute_thread_worker_and_atomicity(
     std::shared_ptr<const CudaExecutor> exec,
     const matrix::Ell<ValueType, IndexType> *a)
 {
-    int subwarp_size = 1;
+    int num_thread_per_worker = 1;
     int atomic = 0;
-    int nwarps_per_row = 1;
+    int num_worker_per_row = 1;
 
     const auto nrows = a->get_size()[0];
     const auto ell_ncols = a->get_num_stored_elements_per_row();
-    const auto nwarps = exec->get_num_cores_per_sm() / cuda_config::warp_size *
+    // TODO: num_threads_per_core should be tuned for AMD GPUs
+    const auto nwarps = exec->get_num_warps_per_sm() *
                         exec->get_num_multiprocessor() * num_threads_per_core;
 
     // Use multithreads to perform the reduction on each row when the matrix is
     // wide.
     // To make every thread have computation, so pick the value which is the
-    // power of 2 less than warp_size and is less than or equal to ell_ncols. If
-    // the subwarp_size is warp_size and allow more than one warps to work on
-    // the same row, use atomic add to handle the warps write the value into the
-    // same position. The #warps is decided according to the number of warps
-    // allowed on GPU.
+    // power of 2 no greater than max_thread_per_worker and less than or equal
+    // to ell_ncols. If num_thread_per_worker equals max_thread_per_worker and
+    // more than one worker is allowed to work on the same row, use atomic add
+    // to handle the workers writing their values into the same position. The
+    // #workers is decided according to the number of workers allowed on GPU.
     if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (subwarp_size < cuda_config::warp_size &&
-               (subwarp_size << 1) <= ell_ncols) {
-            subwarp_size <<= 1;
+        while (num_thread_per_worker < max_thread_per_worker &&
+               (num_thread_per_worker << 1) <= ell_ncols) {
+            num_thread_per_worker <<= 1;
         }
-        if (subwarp_size == cuda_config::warp_size) {
-            nwarps_per_row =
-                std::min(ell_ncols / cuda_config::warp_size, nwarps / nrows);
-            nwarps_per_row = std::max(nwarps_per_row, 1);
+        if (num_thread_per_worker == max_thread_per_worker) {
+            num_worker_per_row =
+                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows);
+            num_worker_per_row = std::max(num_worker_per_row, 1);
         }
-        if (nwarps_per_row > 1) {
+        if (num_worker_per_row > 1) {
             atomic = 1;
         }
     }
-    return {subwarp_size, atomic, nwarps_per_row};
+    return {num_thread_per_worker, atomic, num_worker_per_row};
 }
 
 
@@ -288,24 +199,26 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Ell<ValueType, IndexType> *a,
           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
-    const auto data = compute_subwarp_size_and_atomicity(exec, a);
-    const int subwarp_size = std::get<0>(data);
+    const auto data = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = std::get<0>(data);
     const int atomic = std::get<1>(data);
-    const int nwarps_per_row = std::get<2>(data);
+    const int num_worker_per_row = std::get<2>(data);
 
     /**
      * info is the parameter for selecting the cuda kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
-    const int info = (!atomic) * subwarp_size;
+    const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
-        zero_array(c->get_num_stored_elements(), c->get_values());
+        components::fill_array(exec, c->get_values(),
+                               c->get_num_stored_elements(), zero<ValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
         [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), nwarps_per_row, a, b, c);
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b,
+        c);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
@@ -319,24 +232,24 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType> *beta,
                    matrix::Dense<ValueType> *c)
 {
-    const auto data = compute_subwarp_size_and_atomicity(exec, a);
-    const int subwarp_size = std::get<0>(data);
+    const auto data = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = std::get<0>(data);
     const int atomic = std::get<1>(data);
-    const int nwarps_per_row = std::get<2>(data);
+    const int num_worker_per_row = std::get<2>(data);
 
     /**
      * info is the parameter for selecting the cuda kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
-    const int info = (!atomic) * subwarp_size;
+    const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         dense::scale(exec, beta, c);
     }
     select_abstract_spmv(
         compiled_kernels(),
         [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), nwarps_per_row, a, b, c,
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b, c,
         alpha, beta);
 }
 
@@ -344,48 +257,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType>
-__global__
-    __launch_bounds__(cuda_config::max_block_size) void initialize_zero_dense(
-        size_type num_rows, size_type num_cols, size_type stride,
-        ValueType *__restrict__ result)
-{
-    const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
-    const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
-    if (tidx_x < num_cols && tidx_y < num_rows) {
-        result[tidx_y * stride + tidx_x] = zero<ValueType>();
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_dense(
-    size_type num_rows, size_type nnz, size_type source_stride,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values, size_type result_stride,
-    ValueType *__restrict__ result)
-{
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (tidx < num_rows) {
-        for (auto col = 0; col < nnz; col++) {
-            result[tidx * result_stride +
-                   col_idxs[tidx + col * source_stride]] +=
-                values[tidx + col * source_stride];
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -394,9 +269,8 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
     const auto vals = source->get_const_values();
     const auto source_stride = source->get_stride();
 
-    const dim3 block_size(cuda_config::warp_size,
-                          cuda_config::max_block_size / cuda_config::warp_size,
-                          1);
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
     const dim3 init_grid_dim(ceildiv(result_stride, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
@@ -413,68 +287,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
-    size_type num_rows, size_type max_nnz_per_row, size_type stride,
-    const ValueType *__restrict__ values, IndexType *__restrict__ result)
-{
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto row_idx = tidx / warp_size;
-
-    if (row_idx < num_rows) {
-        IndexType part_result{};
-        for (auto i = threadIdx.x % warp_size; i < max_nnz_per_row;
-             i += warp_size) {
-            if (values[stride * i + row_idx] != zero<ValueType>()) {
-                part_result += 1;
-            }
-        }
-
-        auto warp_tile =
-            group::tiled_partition<warp_size>(group::this_thread_block());
-        result[row_idx] = reduce(
-            warp_tile, part_result,
-            [](const size_type &a, const size_type &b) { return a + b; });
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_csr(
-    size_type num_rows, size_type max_nnz_per_row, size_type stride,
-    const ValueType *__restrict__ source_values,
-    const IndexType *__restrict__ source_col_idxs,
-    IndexType *__restrict__ result_row_ptrs,
-    IndexType *__restrict__ result_col_idxs,
-    ValueType *__restrict__ result_values)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-
-    if (tidx < num_rows) {
-        auto write_to = result_row_ptrs[tidx];
-        for (auto i = 0; i < max_nnz_per_row; i++) {
-            const auto source_idx = tidx + stride * i;
-            if (source_values[source_idx] != zero<ValueType>()) {
-                result_values[write_to] = source_values[source_idx];
-                result_col_idxs[write_to] = source_col_idxs[source_idx];
-                write_to++;
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -486,31 +302,22 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
     const auto max_nnz_per_row = source->get_num_stored_elements_per_row();
 
     constexpr auto rows_per_block =
-        ceildiv(default_block_size, cuda_config::warp_size);
+        ceildiv(default_block_size, config::warp_size);
     const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block);
 
     kernel::count_nnz_per_row<<<grid_dim_nnz, default_block_size>>>(
         num_rows, max_nnz_per_row, stride,
         as_cuda_type(source->get_const_values()), as_cuda_type(row_ptrs));
 
-    size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_dim);
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
 
-    start_prefix_sum<default_block_size>
-        <<<grid_dim, default_block_size>>>(num_rows + 1, as_cuda_type(row_ptrs),
-                                           as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
         num_rows, max_nnz_per_row, stride,
         as_cuda_type(source->get_const_values()),
         as_cuda_type(source->get_const_col_idxs()), as_cuda_type(row_ptrs),
         as_cuda_type(col_idxs), as_cuda_type(values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -528,7 +335,6 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -545,7 +351,7 @@ void calculate_nonzeros_per_row(std::shared_ptr<const CudaExecutor> exec,
     const auto stride = source->get_stride();
     const auto values = source->get_const_values();
 
-    const auto warp_size = cuda_config::warp_size;
+    const auto warp_size = config::warp_size;
     const auto grid_dim = ceildiv(num_rows * warp_size, default_block_size);
 
     kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu
index 92519dc02ba..7b731559672 100644
--- a/cuda/matrix/hybrid_kernels.cu
+++ b/cuda/matrix/hybrid_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -37,16 +37,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/format_conversion.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/segment_scan.cuh"
-#include "cuda/components/zero_array.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -64,131 +66,22 @@ constexpr int default_block_size = 512;
 constexpr int warps_in_block = 4;
 
 
-template <typename ValueType, typename IndexType>
-void convert_to_dense(
-    std::shared_ptr<const CudaExecutor> exec, matrix::Dense<ValueType> *result,
-    const matrix::Hybrid<ValueType, IndexType> *source) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL);
-
-
-namespace kernel {
-
-
-/**
- * The global function for counting the number of nonzeros per row of COO.
- * It is almost like COO spmv routine.
- * It performs is_nonzeros(Coo) times the vector whose values are one
- *
- * @param nnz  the number of nonzeros in the matrix
- * @param num_line  the maximum round of each warp
- * @param val  the value array of the matrix
- * @param row  the row index array of the matrix
- * @param nnz_per_row  the output nonzeros per row
- */
-template <int subwarp_size = cuda_config::warp_size, typename ValueType,
-          typename IndexType>
-__global__ __launch_bounds__(default_block_size) void count_coo_row_nnz(
-    const size_type nnz, const size_type num_lines,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ row,
-    IndexType *__restrict__ nnz_per_row)
-{
-    IndexType temp_val = 0;
-    const auto start = static_cast<size_type>(blockDim.x) * blockIdx.x *
-                           blockDim.y * num_lines +
-                       threadIdx.y * blockDim.x * num_lines;
-    size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size);
-    num = min(num, num_lines);
-    const IndexType ind_start = start + threadIdx.x;
-    const IndexType ind_end = ind_start + (num - 1) * subwarp_size;
-    IndexType ind = ind_start;
-    IndexType curr_row = (ind < nnz) ? row[ind] : 0;
-    const auto tile_block =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    for (; ind < ind_end; ind += subwarp_size) {
-        temp_val += ind < nnz && val[ind] != zero<ValueType>();
-        auto next_row =
-            (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1];
-        // segmented scan
-        if (tile_block.any(curr_row != next_row)) {
-            bool is_first_in_segment =
-                segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
-            if (is_first_in_segment) {
-                atomic_add(&(nnz_per_row[curr_row]), temp_val);
-            }
-            temp_val = 0;
-        }
-        curr_row = next_row;
-    }
-    if (num > 0) {
-        ind = ind_end;
-        temp_val += ind < nnz && val[ind] != zero<ValueType>();
-        // segmented scan
-
-        bool is_first_in_segment =
-            segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
-        if (is_first_in_segment) {
-            atomic_add(&(nnz_per_row[curr_row]), temp_val);
-        }
-    }
-}
+#include "common/matrix/hybrid_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_csr(
-    size_type num_rows, size_type max_nnz_per_row, size_type stride,
-    const ValueType *__restrict__ ell_val,
-    const IndexType *__restrict__ ell_col,
-    const ValueType *__restrict__ coo_val,
-    const IndexType *__restrict__ coo_col,
-    const IndexType *__restrict__ coo_offset,
-    IndexType *__restrict__ result_row_ptrs,
-    IndexType *__restrict__ result_col_idxs,
-    ValueType *__restrict__ result_values)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-
-    if (tidx < num_rows) {
-        auto write_to = result_row_ptrs[tidx];
-        for (auto i = 0; i < max_nnz_per_row; i++) {
-            const auto source_idx = tidx + stride * i;
-            if (ell_val[source_idx] != zero<ValueType>()) {
-                result_values[write_to] = ell_val[source_idx];
-                result_col_idxs[write_to] = ell_col[source_idx];
-                write_to++;
-            }
-        }
-        for (auto i = coo_offset[tidx]; i < coo_offset[tidx + 1]; i++) {
-            if (coo_val[i] != zero<ValueType>()) {
-                result_values[write_to] = coo_val[i];
-                result_col_idxs[write_to] = coo_col[i];
-                write_to++;
-            }
-        }
-    }
-}
-
-
-template <typename ValueType1, typename ValueType2>
-__global__ __launch_bounds__(default_block_size) void add(
-    size_type num, ValueType1 *__restrict__ val1,
-    const ValueType2 *__restrict__ val2)
-{
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (tidx < num) {
-        val1[tidx] += val2[tidx];
-    }
-}
-
+void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result) GKO_NOT_IMPLEMENTED;
 
-}  // namespace kernel
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     auto coo_offset = Array<IndexType>(exec, num_rows + 1);
@@ -211,20 +104,21 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
     auto row_ptrs = result->get_row_ptrs();
     auto coo_row_ptrs = Array<IndexType>(exec, num_rows);
 
-    zero_array(num_rows + 1, row_ptrs);
+    components::fill_array(exec, row_ptrs, num_rows + 1, zero<IndexType>());
     grid_num = ceildiv(num_rows, warps_in_block);
     ell::kernel::count_nnz_per_row<<<grid_num, default_block_size>>>(
         num_rows, max_nnz_per_row, stride, as_cuda_type(ell_val),
         as_cuda_type(row_ptrs));
 
-    zero_array(num_rows, coo_row_ptrs.get_data());
+    components::fill_array(exec, coo_row_ptrs.get_data(), num_rows,
+                           zero<IndexType>());
 
     auto nwarps =
         coo::host_kernel::calculate_nwarps(exec, coo_num_stored_elements);
     if (nwarps > 0) {
         int num_lines =
-            ceildiv(coo_num_stored_elements, nwarps * cuda_config::warp_size);
-        const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
+            ceildiv(coo_num_stored_elements, nwarps * config::warp_size);
+        const dim3 coo_block(config::warp_size, warps_in_block, 1);
         const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1);
 
         kernel::count_coo_row_nnz<<<coo_grid, coo_block>>>(
@@ -236,16 +130,7 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
         num_rows, as_cuda_type(row_ptrs),
         as_cuda_type(coo_row_ptrs.get_const_data()));
 
-    grid_num = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_num);
-
-    start_prefix_sum<default_block_size>
-        <<<grid_num, default_block_size>>>(num_rows + 1, as_cuda_type(row_ptrs),
-                                           as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_num, default_block_size>>>(
-        num_rows + 1, as_cuda_type(row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
 
     // Fill the value
     grid_num = ceildiv(num_rows, default_block_size);
@@ -273,12 +158,13 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
     auto nnz = source->get_coo_num_stored_elements();
     auto nwarps = coo::host_kernel::calculate_nwarps(exec, nnz);
     if (nwarps > 0) {
-        int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
-        const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
+        int num_lines = ceildiv(nnz, nwarps * config::warp_size);
+        const dim3 coo_block(config::warp_size, warps_in_block, 1);
         const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1);
         const auto num_rows = source->get_size()[0];
         auto nnz_per_row = Array<IndexType>(exec, num_rows);
-        zero_array(num_rows, nnz_per_row.get_data());
+        components::fill_array(exec, nnz_per_row.get_data(), num_rows,
+                               zero<IndexType>());
         kernel::count_coo_row_nnz<<<coo_grid, coo_block>>>(
             nnz, num_lines, as_cuda_type(source->get_coo()->get_const_values()),
             as_cuda_type(source->get_coo()->get_const_row_idxs()),
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index 1f5a058ac9c..175fb65e078 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,10 +40,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -57,37 +59,10 @@ namespace cuda {
 namespace sellp {
 
 
-namespace {
-
 constexpr auto default_block_size = 512;
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(matrix::default_slice_size) void spmv_kernel(
-    size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
-    size_type c_stride, const size_type *__restrict__ slice_lengths,
-    const size_type *__restrict__ slice_sets, const ValueType *__restrict__ a,
-    const IndexType *__restrict__ col, const ValueType *__restrict__ b,
-    ValueType *__restrict__ c)
-{
-    const auto slice_id = blockIdx.x;
-    const auto slice_size = blockDim.x;
-    const auto row_in_slice = threadIdx.x;
-    const auto global_row =
-        static_cast<size_type>(slice_size) * slice_id + row_in_slice;
-    const auto column_id = blockIdx.y;
-    ValueType val = 0;
-    IndexType ind = 0;
-    if (global_row < num_rows && column_id < num_right_hand_sides) {
-        for (size_type i = 0; i < slice_lengths[slice_id]; i++) {
-            ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size;
-            val += a[ind] * b[col[ind] * b_stride + column_id];
-        }
-        c[global_row * c_stride + column_id] = val;
-    }
-}
-
 
-}  // namespace
+#include "common/matrix/sellp_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
@@ -109,41 +84,6 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
-namespace {
-
-
-template <typename ValueType, typename IndexType>
-__global__
-    __launch_bounds__(matrix::default_slice_size) void advanced_spmv_kernel(
-        size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
-        size_type c_stride, const size_type *__restrict__ slice_lengths,
-        const size_type *__restrict__ slice_sets,
-        const ValueType *__restrict__ alpha, const ValueType *__restrict__ a,
-        const IndexType *__restrict__ col, const ValueType *__restrict__ b,
-        const ValueType *__restrict__ beta, ValueType *__restrict__ c)
-{
-    const auto slice_id = blockIdx.x;
-    const auto slice_size = blockDim.x;
-    const auto row_in_slice = threadIdx.x;
-    const auto global_row =
-        static_cast<size_type>(slice_size) * slice_id + row_in_slice;
-    const auto column_id = blockIdx.y;
-    ValueType val = 0;
-    IndexType ind = 0;
-    if (global_row < num_rows && column_id < num_right_hand_sides) {
-        for (size_type i = 0; i < slice_lengths[slice_id]; i++) {
-            ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size;
-            val += alpha[0] * a[ind] * b[col[ind] * b_stride + column_id];
-        }
-        c[global_row * c_stride + column_id] =
-            beta[0] * c[global_row * c_stride + column_id] + val;
-    }
-}
-
-
-}  // namespace
-
-
 template <typename ValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType> *alpha,
@@ -169,57 +109,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_zero_dense(
-    size_type num_rows, size_type num_cols, size_type stride,
-    ValueType *__restrict__ result)
-{
-    const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
-    const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
-    if (tidx_x < num_cols && tidx_y < num_rows) {
-        result[tidx_y * stride + tidx_x] = zero<ValueType>();
-    }
-}
-
-
-template <unsigned int threads_per_row, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_dense(
-    size_type num_rows, size_type num_cols, size_type stride,
-    size_type slice_size, const size_type *__restrict__ slice_lengths,
-    const size_type *__restrict__ slice_sets,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values, ValueType *__restrict__ result)
-{
-    const auto global_row =
-        (blockDim.x * blockIdx.x + threadIdx.x) / threads_per_row;
-    const auto row = global_row % slice_size;
-    const auto slice = global_row / slice_size;
-    const auto start_index = threadIdx.x % threads_per_row;
-
-    if (global_row < num_rows) {
-        for (auto i = start_index; i < slice_lengths[slice];
-             i += threads_per_row) {
-            if (values[(slice_sets[slice] + i) * slice_size + row] !=
-                zero<ValueType>()) {
-                result[global_row * stride +
-                       col_idxs[(slice_sets[slice] + i) * slice_size + row]] =
-                    values[(slice_sets[slice] + i) * slice_size + row];
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto num_cols = source->get_size()[1];
@@ -231,9 +124,8 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const auto slice_num = ceildiv(num_rows, slice_size);
 
-    const dim3 block_size(cuda_config::warp_size,
-                          cuda_config::max_block_size / cuda_config::warp_size,
-                          1);
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
     const dim3 init_grid_dim(ceildiv(result->get_stride(), block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
 
@@ -241,7 +133,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
         num_rows, num_cols, result->get_stride(),
         as_cuda_type(result->get_values()));
 
-    constexpr auto threads_per_row = cuda_config::warp_size;
+    constexpr auto threads_per_row = config::warp_size;
     const auto grid_dim =
         ceildiv(slice_size * slice_num * threads_per_row, default_block_size);
 
@@ -252,85 +144,14 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(result->get_values()));
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL);
 
 
-namespace kernel {
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
-    size_type num_rows, size_type slice_size,
-    const size_type *__restrict__ slice_sets,
-    const ValueType *__restrict__ values, IndexType *__restrict__ result)
-{
-    constexpr auto warp_size = cuda_config::warp_size;
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto row_idx = tidx / warp_size;
-    const auto slice_id = row_idx / slice_size;
-    const auto tid_in_warp = tidx % warp_size;
-    const auto row_in_slice = row_idx % slice_size;
-
-    if (row_idx < num_rows) {
-        IndexType part_result{};
-        for (size_type sellp_ind =
-                 (slice_sets[slice_id] + tid_in_warp) * slice_size +
-                 row_in_slice;
-             sellp_ind < slice_sets[slice_id + 1] * slice_size;
-             sellp_ind += warp_size * slice_size) {
-            if (values[sellp_ind] != zero<ValueType>()) {
-                part_result += 1;
-            }
-        }
-
-        auto warp_tile =
-            group::tiled_partition<warp_size>(group::this_thread_block());
-        result[row_idx] = reduce(
-            warp_tile, part_result,
-            [](const size_type &a, const size_type &b) { return a + b; });
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void fill_in_csr(
-    size_type num_rows, size_type slice_size,
-    const size_type *__restrict__ source_slice_sets,
-    const IndexType *__restrict__ source_col_idxs,
-    const ValueType *__restrict__ source_values,
-    IndexType *__restrict__ result_row_ptrs,
-    IndexType *__restrict__ result_col_idxs,
-    ValueType *__restrict__ result_values)
-{
-    const auto row = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto slice_id = row / slice_size;
-    const auto row_in_slice = row % slice_size;
-
-    if (row < num_rows) {
-        size_type csr_ind = result_row_ptrs[row];
-        for (size_type sellp_ind =
-                 source_slice_sets[slice_id] * slice_size + row_in_slice;
-             sellp_ind < source_slice_sets[slice_id + 1] * slice_size;
-             sellp_ind += slice_size) {
-            if (source_values[sellp_ind] != zero<ValueType>()) {
-                result_values[csr_ind] = source_values[sellp_ind];
-                result_col_idxs[csr_ind] = source_col_idxs[sellp_ind];
-                csr_ind++;
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto slice_size = source->get_slice_size();
@@ -345,8 +166,7 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
     auto result_col_idxs = result->get_col_idxs();
     auto result_row_ptrs = result->get_row_ptrs();
 
-    auto grid_dim =
-        ceildiv(num_rows * cuda_config::warp_size, default_block_size);
+    auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
 
     kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
         num_rows, slice_size, as_cuda_type(source_slice_sets),
@@ -355,13 +175,7 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
     grid_dim = ceildiv(num_rows + 1, default_block_size);
     auto add_values = Array<IndexType>(exec, grid_dim);
 
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(result_row_ptrs),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(result_row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    components::prefix_sum(exec, result_row_ptrs, num_rows + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
 
@@ -370,8 +184,6 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(source_col_idxs), as_cuda_type(source_values),
         as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs),
         as_cuda_type(result_values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -390,15 +202,13 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
 
     auto nnz_per_row = Array<size_type>(exec, num_rows);
 
-    auto grid_dim =
-        ceildiv(num_rows * cuda_config::warp_size, default_block_size);
+    auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
 
     kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
         num_rows, slice_size, as_cuda_type(slice_sets), as_cuda_type(values),
         as_cuda_type(nnz_per_row.get_data()));
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index 2bdb8372030..69d2e53fe37 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,26 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
 
-#include <algorithm>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/segment_scan.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/zero_array.hpp"
 
 
 namespace gko {
@@ -99,10 +80,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void remove_diagonal_elements(std::shared_ptr<const CudaExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
-                              const IndexType *row_ptrs,
-                              const IndexType *col_idxs) GKO_NOT_IMPLEMENTED;
+void remove_diagonal_elements(
+    std::shared_ptr<const CudaExecutor> exec, const IndexType *row_ptrs,
+    const IndexType *col_idxs,
+    matrix::SparsityCsr<ValueType, IndexType> *matrix) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
@@ -110,11 +91,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
     GKO_NOT_IMPLEMENTED;
 
-
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
new file mode 100644
index 00000000000..858c82584d5
--- /dev/null
+++ b/cuda/preconditioner/isai_kernels.cu
@@ -0,0 +1,215 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/isai_kernels.hpp"
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/merging.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Isai preconditioner namespace.
+ * @ref Isai
+ * @ingroup isai
+ */
+namespace isai {
+
+
+// Subwarp size the kernels below are instantiated with; row_size_limit is
+// declared outside of this file.
+constexpr int subwarp_size{row_size_limit};
+// Number of subwarps in each thread block.
+constexpr int subwarps_per_block{2};
+// Threads per block used for every kernel launch in this file.
+constexpr int default_block_size{subwarps_per_block * subwarp_size};
+
+
+#include "common/preconditioner/isai_kernels.hpp.inc"
+
+
+/**
+ * Launches kernel::generate_l_inverse (lower == true) or
+ * kernel::generate_u_inverse (lower == false) on the arrays of `input` and
+ * `inverse`, then applies components::prefix_sum to both excess arrays.
+ *
+ * @param exec  executor handed to components::prefix_sum
+ * @param input  CSR matrix whose row pointers, column indices and values are
+ *               read by the kernel
+ * @param inverse  CSR matrix whose arrays are passed to the kernel as mutable
+ *                 pointers
+ * @param excess_rhs_ptrs  array with num_rows + 1 entries, passed to the
+ *                         kernel and prefix-summed afterwards
+ * @param excess_nz_ptrs  array with num_rows + 1 entries, passed to the
+ *                        kernel and prefix-summed afterwards
+ * @param lower  selects the lower (true) or upper (false) triangular kernel
+ */
+template <typename ValueType, typename IndexType>
+void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Csr<ValueType, IndexType> *input,
+                          matrix::Csr<ValueType, IndexType> *inverse,
+                          IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs,
+                          bool lower)
+{
+    const auto num_rows = input->get_size()[0];
+
+    const dim3 block(default_block_size, 1, 1);
+    const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1);
+    // A grid with zero blocks is an invalid CUDA launch configuration, so the
+    // kernel is skipped for an empty matrix; the prefix sums still run.
+    if (grid.x > 0) {
+        if (lower) {
+            kernel::generate_l_inverse<subwarp_size, subwarps_per_block>
+                <<<grid, block>>>(
+                    static_cast<IndexType>(num_rows),
+                    input->get_const_row_ptrs(), input->get_const_col_idxs(),
+                    as_cuda_type(input->get_const_values()),
+                    inverse->get_row_ptrs(), inverse->get_col_idxs(),
+                    as_cuda_type(inverse->get_values()), excess_rhs_ptrs,
+                    excess_nz_ptrs);
+        } else {
+            kernel::generate_u_inverse<subwarp_size, subwarps_per_block>
+                <<<grid, block>>>(
+                    static_cast<IndexType>(num_rows),
+                    input->get_const_row_ptrs(), input->get_const_col_idxs(),
+                    as_cuda_type(input->get_const_values()),
+                    inverse->get_row_ptrs(), inverse->get_col_idxs(),
+                    as_cuda_type(inverse->get_values()), excess_rhs_ptrs,
+                    excess_nz_ptrs);
+        }
+    }
+    components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1);
+    components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
+
+
+/**
+ * Launches kernel::generate_excess_system with the arrays of `input`,
+ * `inverse`, `excess_system` and `excess_rhs`.
+ *
+ * @param exec  executor (not used by this function body)
+ * @param input  CSR matrix whose arrays are read by the kernel
+ * @param inverse  CSR matrix whose row pointers and column indices are read
+ * @param excess_rhs_ptrs  read-only pointer array forwarded to the kernel
+ * @param excess_nz_ptrs  read-only pointer array forwarded to the kernel
+ * @param excess_system  CSR matrix whose arrays are passed as mutable
+ *                       pointers
+ * @param excess_rhs  dense matrix whose values are passed as a mutable
+ *                    pointer
+ */
+template <typename ValueType, typename IndexType>
+void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Csr<ValueType, IndexType> *input,
+                            const matrix::Csr<ValueType, IndexType> *inverse,
+                            const IndexType *excess_rhs_ptrs,
+                            const IndexType *excess_nz_ptrs,
+                            matrix::Csr<ValueType, IndexType> *excess_system,
+                            matrix::Dense<ValueType> *excess_rhs)
+{
+    const auto num_rows = input->get_size()[0];
+
+    const dim3 block(default_block_size, 1, 1);
+    const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1);
+    // Skip the launch for an empty matrix: a grid with zero blocks is invalid.
+    if (grid.x > 0) {
+        kernel::generate_excess_system<subwarp_size><<<grid, block>>>(
+            static_cast<IndexType>(num_rows), input->get_const_row_ptrs(),
+            input->get_const_col_idxs(),
+            as_cuda_type(input->get_const_values()),
+            inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(),
+            excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(),
+            excess_system->get_col_idxs(),
+            as_cuda_type(excess_system->get_values()),
+            as_cuda_type(excess_rhs->get_values()));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+
+
+/**
+ * Launches kernel::copy_excess_solution with the values of `excess_solution`
+ * and the row pointers and values of `inverse`.
+ *
+ * @param exec  executor (not used by this function body)
+ * @param excess_rhs_ptrs  read-only pointer array forwarded to the kernel
+ * @param excess_solution  dense matrix whose values are read by the kernel
+ * @param inverse  CSR matrix whose values are passed as a mutable pointer
+ */
+template <typename ValueType, typename IndexType>
+void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
+                             const IndexType *excess_rhs_ptrs,
+                             const matrix::Dense<ValueType> *excess_solution,
+                             matrix::Csr<ValueType, IndexType> *inverse)
+{
+    const auto num_rows = inverse->get_size()[0];
+
+    const dim3 block(default_block_size, 1, 1);
+    const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1);
+    // Skip the launch for an empty matrix: a grid with zero blocks is invalid.
+    if (grid.x > 0) {
+        kernel::copy_excess_solution<subwarp_size><<<grid, block>>>(
+            static_cast<IndexType>(num_rows), inverse->get_const_row_ptrs(),
+            excess_rhs_ptrs,
+            as_cuda_type(excess_solution->get_const_values()),
+            as_cuda_type(inverse->get_values()));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu
index 6cc3005f3d9..2dc9aeaf23b 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
@@ -57,84 +58,9 @@ namespace cuda {
  * @ingroup jacobi
  */
 namespace jacobi {
-namespace kernel {
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size)
-    advanced_apply(const ValueType *__restrict__ blocks,
-                   preconditioner::block_interleaved_storage_scheme<IndexType>
-                       storage_scheme,
-                   const IndexType *__restrict__ block_ptrs,
-                   size_type num_blocks, const ValueType *__restrict__ alpha,
-                   const ValueType *__restrict__ b, int32 b_stride,
-                   ValueType *__restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = alpha[0] *
-            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    multiply_vec<max_block_size>(
-        subwarp, block_size, v,
-        blocks + storage_scheme.get_global_block_offset(block_id) +
-            subwarp.thread_rank(),
-        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-        x_stride,
-        [](ValueType &result, const ValueType &out) { result += out; });
-}
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size)
-    advanced_adaptive_apply(
-        const ValueType *__restrict__ blocks,
-        preconditioner::block_interleaved_storage_scheme<IndexType>
-            storage_scheme,
-        const precision_reduction *__restrict__ block_precisions,
-        const IndexType *__restrict__ block_ptrs, size_type num_blocks,
-        const ValueType *__restrict__ alpha, const ValueType *__restrict__ b,
-        int32 b_stride, ValueType *__restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    auto alpha_val = alpha == nullptr ? one<ValueType>() : alpha[0];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = alpha[0] *
-            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-        ValueType, block_precisions[block_id],
-        multiply_vec<max_block_size>(
-            subwarp, block_size, v,
-            reinterpret_cast<const resolved_precision *>(
-                blocks + storage_scheme.get_group_offset(block_id)) +
-                storage_scheme.get_block_offset(block_id) +
-                subwarp.thread_rank(),
-            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-            x_stride,
-            [](ValueType &result, const ValueType &out) { result += out; }));
-}
 
 
-}  // namespace kernel
+#include "common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
 
 
 namespace {
@@ -152,7 +78,7 @@ void advanced_apply(
     ValueType *x, size_type x_stride)
 {
     constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = cuda_config::warp_size / subwarp_size;
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
     const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
                          1, 1);
     const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
@@ -199,7 +125,7 @@ void apply(std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
             [&](int compiled_block_size) {
                 return max_block_size <= compiled_block_size;
             },
-            syn::value_list<int, cuda_config::min_warps_per_block>(),
+            syn::value_list<int, config::min_warps_per_block>(),
             syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
             block_pointers.get_const_data(), blocks.get_const_data(),
             storage_scheme, alpha->get_const_values(),
diff --git a/cuda/preconditioner/jacobi_common.hpp b/cuda/preconditioner/jacobi_common.hpp
index d224e4bdc15..3c76bb78388 100644
--- a/cuda/preconditioner/jacobi_common.hpp
+++ b/cuda/preconditioner/jacobi_common.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,11 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
 
+#include "cuda/base/config.hpp"
+
 namespace gko {
 namespace kernels {
 namespace cuda {
@@ -46,9 +47,10 @@ namespace jacobi {
  * kernels should be compiled.
  */
 #ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
-using compiled_kernels = syn::as_list<syn::range<1, 33, 1>>;
+using compiled_kernels = syn::as_list<syn::range<1, config::warp_size + 1, 1>>;
 #else
-using compiled_kernels = syn::value_list<int, 1, 2, 4, 8, 13, 16, 32>;
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 13, 16, 32, config::warp_size>;
 #endif
 
 
diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernel.cu
index a933a7b8398..0f1c52e9621 100644
--- a/cuda/preconditioner/jacobi_generate_kernel.cu
+++ b/cuda/preconditioner/jacobi_generate_kernel.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/extended_float.hpp"
+#include "core/components/fill_array.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
@@ -47,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 #include "cuda/components/warp_blas.cuh"
-#include "cuda/components/zero_array.hpp"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
@@ -60,185 +61,9 @@ namespace cuda {
  * @ingroup jacobi
  */
 namespace jacobi {
-namespace kernel {
 
 
-template <int max_block_size, typename ReducedType, typename Group,
-          typename ValueType, typename IndexType>
-__device__ __forceinline__ bool validate_precision_reduction_feasibility(
-    Group &__restrict__ group, IndexType block_size,
-    ValueType *__restrict__ row, ValueType *__restrict__ work, size_type stride)
-{
-    using gko::detail::float_traits;
-    // save original data and reduce precision
-    if (group.thread_rank() < block_size) {
-#pragma unroll
-        for (auto i = 0u; i < max_block_size; ++i) {
-            if (i >= block_size) {
-                break;
-            }
-            work[i * stride + group.thread_rank()] = row[i];
-            row[i] = static_cast<ValueType>(static_cast<ReducedType>(row[i]));
-        }
-    }
-
-    // compute the condition number
-    auto perm = group.thread_rank();
-    auto trans_perm = perm;
-    auto block_cond = compute_infinity_norm<max_block_size>(group, block_size,
-                                                            block_size, row);
-    auto succeeded =
-        invert_block<max_block_size>(group, block_size, row, perm, trans_perm);
-    block_cond *= compute_infinity_norm<max_block_size>(group, block_size,
-                                                        block_size, row);
-
-    // restore original data
-    if (group.thread_rank() < block_size) {
-#pragma unroll
-        for (auto i = 0u; i < max_block_size; ++i) {
-            if (i >= block_size) {
-                break;
-            }
-            row[i] = work[i * stride + group.thread_rank()];
-        }
-    }
-
-    return succeeded && block_cond >= 1.0 &&
-           block_cond * float_traits<remove_complex<ValueType>>::eps < 1e-3;
-}
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size)
-    generate(size_type num_rows, const IndexType *__restrict__ row_ptrs,
-             const IndexType *__restrict__ col_idxs,
-             const ValueType *__restrict__ values,
-             ValueType *__restrict__ block_data,
-             preconditioner::block_interleaved_storage_scheme<IndexType>
-                 storage_scheme,
-             const IndexType *__restrict__ block_ptrs, size_type num_blocks)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto block = group::this_thread_block();
-    ValueType row[max_block_size];
-    __shared__ UninitializedArray<ValueType, max_block_size * warps_per_block>
-        workspace;
-    csr::extract_transposed_diag_blocks<max_block_size, warps_per_block>(
-        block, cuda_config::warp_size / subwarp_size, row_ptrs, col_idxs,
-        values, block_ptrs, num_blocks, row, 1,
-        workspace + threadIdx.z * max_block_size);
-    const auto subwarp = group::tiled_partition<subwarp_size>(block);
-    if (block_id < num_blocks) {
-        const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-        auto perm = subwarp.thread_rank();
-        auto trans_perm = subwarp.thread_rank();
-        invert_block<max_block_size>(subwarp, block_size, row, perm,
-                                     trans_perm);
-        copy_matrix<max_block_size, and_transpose>(
-            subwarp, block_size, row, 1, perm, trans_perm,
-            block_data + storage_scheme.get_global_block_offset(block_id),
-            storage_scheme.get_stride());
-    }
-}
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void
-__launch_bounds__(warps_per_block *cuda_config::warp_size) adaptive_generate(
-    size_type num_rows, const IndexType *__restrict__ row_ptrs,
-    const IndexType *__restrict__ col_idxs,
-    const ValueType *__restrict__ values, remove_complex<ValueType> accuracy,
-    ValueType *__restrict__ block_data,
-    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
-    remove_complex<ValueType> *__restrict__ conditioning,
-    precision_reduction *__restrict__ block_precisions,
-    const IndexType *__restrict__ block_ptrs, size_type num_blocks)
-{
-    // extract blocks
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto block = group::this_thread_block();
-    ValueType row[max_block_size];
-    __shared__ UninitializedArray<ValueType, max_block_size * warps_per_block>
-        workspace;
-    csr::extract_transposed_diag_blocks<max_block_size, warps_per_block>(
-        block, cuda_config::warp_size / subwarp_size, row_ptrs, col_idxs,
-        values, block_ptrs, num_blocks, row, 1,
-        workspace + threadIdx.z * max_block_size);
-
-    // compute inverse and figure out the correct precision
-    const auto subwarp = group::tiled_partition<subwarp_size>(block);
-    const auto block_size =
-        block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id]
-                              : 0;
-    auto perm = subwarp.thread_rank();
-    auto trans_perm = subwarp.thread_rank();
-    auto prec_descriptor = ~uint32{};
-    if (block_id < num_blocks) {
-        auto block_cond = compute_infinity_norm<max_block_size>(
-            subwarp, block_size, block_size, row);
-        invert_block<max_block_size>(subwarp, block_size, row, perm,
-                                     trans_perm);
-        block_cond *= compute_infinity_norm<max_block_size>(subwarp, block_size,
-                                                            block_size, row);
-        conditioning[block_id] = block_cond;
-        const auto prec = block_precisions[block_id];
-        prec_descriptor =
-            preconditioner::detail::precision_reduction_descriptor::singleton(
-                prec);
-        if (prec == precision_reduction::autodetect()) {
-            using preconditioner::detail::get_supported_storage_reductions;
-            prec_descriptor = get_supported_storage_reductions<ValueType>(
-                accuracy, block_cond,
-                [&subwarp, &block_size, &row, &block_data, &storage_scheme,
-                 &block_id] {
-                    using target = reduce_precision<ValueType>;
-                    return validate_precision_reduction_feasibility<
-                        max_block_size, target>(
-                        subwarp, block_size, row,
-                        block_data +
-                            storage_scheme.get_global_block_offset(block_id),
-                        storage_scheme.get_stride());
-                },
-                [&subwarp, &block_size, &row, &block_data, &storage_scheme,
-                 &block_id] {
-                    using target =
-                        reduce_precision<reduce_precision<ValueType>>;
-                    return validate_precision_reduction_feasibility<
-                        max_block_size, target>(
-                        subwarp, block_size, row,
-                        block_data +
-                            storage_scheme.get_global_block_offset(block_id),
-                        storage_scheme.get_stride());
-                });
-        }
-    }
-
-    // make sure all blocks in the group have the same precision
-    const auto warp = group::tiled_partition<cuda_config::warp_size>(block);
-    const auto prec =
-        preconditioner::detail::get_optimal_storage_reduction(reduce(
-            warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; }));
-
-    // store the block back into memory
-    if (block_id < num_blocks) {
-        block_precisions[block_id] = prec;
-        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-            ValueType, prec,
-            copy_matrix<max_block_size, and_transpose>(
-                subwarp, block_size, row, 1, perm, trans_perm,
-                reinterpret_cast<resolved_precision *>(
-                    block_data + storage_scheme.get_group_offset(block_id)) +
-                    storage_scheme.get_block_offset(block_id),
-                storage_scheme.get_stride()));
-    }
-}
-
-
-}  // namespace kernel
+#include "common/preconditioner/jacobi_generate_kernel.hpp.inc"
 
 
 namespace {
@@ -256,7 +81,7 @@ void generate(syn::value_list<int, max_block_size>,
               const IndexType *block_ptrs, size_type num_blocks)
 {
     constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = cuda_config::warp_size / subwarp_size;
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
     const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
                          1, 1);
     const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
@@ -297,16 +122,17 @@ void generate(std::shared_ptr<const CudaExecutor> exec,
               Array<precision_reduction> &block_precisions,
               const Array<IndexType> &block_pointers, Array<ValueType> &blocks)
 {
-    zero_array(blocks.get_num_elems(), blocks.get_data());
-    select_generate(compiled_kernels(),
-                    [&](int compiled_block_size) {
-                        return max_block_size <= compiled_block_size;
-                    },
-                    syn::value_list<int, cuda_config::min_warps_per_block>(),
-                    syn::type_list<>(), system_matrix, accuracy,
-                    blocks.get_data(), storage_scheme, conditioning.get_data(),
-                    block_precisions.get_data(),
-                    block_pointers.get_const_data(), num_blocks);
+    components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(),
+                           zero<ValueType>());
+    select_generate(
+        compiled_kernels(),
+        [&](int compiled_block_size) {
+            return max_block_size <= compiled_block_size;
+        },
+        syn::value_list<int, config::min_warps_per_block>(), syn::type_list<>(),
+        system_matrix, accuracy, blocks.get_data(), storage_scheme,
+        conditioning.get_data(), block_precisions.get_data(),
+        block_pointers.get_const_data(), num_blocks);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
index 4f3bcb17f97..e0662499762 100644
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ b/cuda/preconditioner/jacobi_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -37,9 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
@@ -55,98 +60,13 @@ namespace {
 
 
 // a total of 32 warps (1024 threads)
-constexpr int default_block_size = 32;
+constexpr int default_num_warps = 32;
 // with current architectures, at most 32 warps can be scheduled per SM (and
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
 
 
-template <int warps_per_block>
-__global__
-__launch_bounds__(warps_per_block *cuda_config::warp_size) void duplicate_array(
-    const precision_reduction *__restrict__ source, size_type source_size,
-    precision_reduction *__restrict__ dest, size_type dest_size)
-{
-    auto grid = group::this_grid();
-    if (grid.thread_rank() >= dest_size) {
-        return;
-    }
-    for (auto i = grid.thread_rank(); i < dest_size; i += grid.size()) {
-        dest[i] = source[i % source_size];
-    }
-}
-
-
-template <typename IndexType>
-__global__ void compare_adjacent_rows(size_type num_rows, int32 max_block_size,
-                                      const IndexType *__restrict__ row_ptrs,
-                                      const IndexType *__restrict__ col_idx,
-                                      bool *__restrict__ matching_next_row)
-{
-    const auto global_tid = blockDim.x * blockIdx.x + threadIdx.x;
-    const auto local_tid = threadIdx.x % cuda_config::warp_size;
-    const auto warp_id = global_tid / cuda_config::warp_size;
-    const auto warp = group::tiled_partition<cuda_config::warp_size>(
-        group::this_thread_block());
-
-    if (warp_id >= num_rows - 1) {
-        return;
-    }
-
-    const auto curr_row_start = row_ptrs[warp_id];
-    const auto next_row_start = row_ptrs[warp_id + 1];
-    const auto next_row_end = row_ptrs[warp_id + 2];
-
-    const auto nz_this_row = next_row_end - next_row_start;
-    const auto nz_prev_row = next_row_start - curr_row_start;
-
-    if (nz_this_row != nz_prev_row) {
-        matching_next_row[warp_id] = false;
-        return;
-    }
-    size_type steps = ceildiv(nz_this_row, cuda_config::warp_size);
-    for (size_type i = 0; i < steps; i++) {
-        auto j = local_tid + i * cuda_config::warp_size;
-        auto prev_col = (curr_row_start + j < next_row_start)
-                            ? col_idx[curr_row_start + j]
-                            : 0;
-        auto this_col = (curr_row_start + j < next_row_start)
-                            ? col_idx[next_row_start + j]
-                            : 0;
-        if (warp.any(prev_col != this_col)) {
-            matching_next_row[warp_id] = false;
-            return;
-        }
-    }
-    matching_next_row[warp_id] = true;
-}
-
-
-template <typename IndexType>
-__global__ void generate_natural_block_pointer(
-    size_type num_rows, int32 max_block_size,
-    const bool *__restrict__ matching_next_row,
-    IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr)
-{
-    block_ptrs[0] = 0;
-    if (num_rows == 0) {
-        return;
-    }
-    size_type num_blocks = 1;
-    int32 current_block_size = 1;
-    for (size_type i = 0; i < num_rows - 1; ++i) {
-        if ((matching_next_row[i]) && (current_block_size < max_block_size)) {
-            ++current_block_size;
-        } else {
-            block_ptrs[num_blocks] =
-                block_ptrs[num_blocks - 1] + current_block_size;
-            ++num_blocks;
-            current_block_size = 1;
-        }
-    }
-    block_ptrs[num_blocks] = block_ptrs[num_blocks - 1] + current_block_size;
-    num_blocks_arr[0] = num_blocks;
-}
+#include "common/preconditioner/jacobi_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
@@ -159,10 +79,9 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
 
     Array<bool> matching_next_row(exec, mtx->get_size()[0] - 1);
 
-    const dim3 block_size(default_block_size, 1, 1);
+    const dim3 block_size(config::warp_size, 1, 1);
     const dim3 grid_size(
-        ceildiv(mtx->get_size()[0] * cuda_config::warp_size, block_size.x), 1,
-        1);
+        ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1);
     compare_adjacent_rows<<<grid_size, block_size, 0, 0>>>(
         mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(),
         mtx->get_const_col_idxs(), matching_next_row.get_data());
@@ -174,32 +93,6 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
 }
 
 
-template <typename IndexType>
-__global__ void agglomerate_supervariables_kernel(
-    int32 max_block_size, size_type num_natural_blocks,
-    IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr)
-{
-    num_blocks_arr[0] = 0;
-    if (num_natural_blocks == 0) {
-        return;
-    }
-    size_type num_blocks = 1;
-    int32 current_block_size = block_ptrs[1] - block_ptrs[0];
-    for (size_type i = 1; i < num_natural_blocks; ++i) {
-        const int32 block_size = block_ptrs[i + 1] - block_ptrs[i];
-        if (current_block_size + block_size <= max_block_size) {
-            current_block_size += block_size;
-        } else {
-            block_ptrs[num_blocks] = block_ptrs[i];
-            ++num_blocks;
-            current_block_size = block_size;
-        }
-    }
-    block_ptrs[num_blocks] = block_ptrs[num_natural_blocks];
-    num_blocks_arr[0] = num_blocks;
-}
-
-
 template <typename IndexType>
 inline size_type agglomerate_supervariables(
     std::shared_ptr<const CudaExecutor> exec, int32 max_block_size,
@@ -222,11 +115,11 @@ void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
                            const Array<precision_reduction> &source,
                            Array<precision_reduction> &precisions)
 {
-    const auto block_size = default_block_size * cuda_config::warp_size;
+    const auto block_size = default_num_warps * config::warp_size;
     const auto grid_size = min(
         default_grid_size,
         static_cast<int32>(ceildiv(precisions.get_num_elems(), block_size)));
-    duplicate_array<default_block_size><<<grid_size, block_size>>>(
+    duplicate_array<default_num_warps><<<grid_size, block_size>>>(
         source.get_const_data(), source.get_num_elems(), precisions.get_data(),
         precisions.get_num_elems());
 }
@@ -248,6 +141,93 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
+namespace {
+
+
+// Launches transpose_jacobi (adaptive_transpose_jacobi when block_precisions
+// is non-null) for num_blocks blocks; nothing is launched without blocks.
+template <bool conjugate, int warps_per_block, int max_block_size,
+          typename ValueType, typename IndexType>
+void transpose_jacobi(
+    syn::value_list<int, max_block_size>, size_type num_blocks,
+    const precision_reduction *block_precisions,
+    const IndexType *block_pointers, const ValueType *blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    ValueType *out_blocks)
+{
+    if (num_blocks == 0) return;  // a zero-sized grid is an invalid launch
+    constexpr int subwarp_size = get_larger_power(max_block_size);
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
+    const dim3 grid(ceildiv(num_blocks, warps_per_block * blocks_per_warp));
+    const dim3 block(subwarp_size, blocks_per_warp, warps_per_block);
+    if (block_precisions) {
+        adaptive_transpose_jacobi<conjugate, max_block_size, subwarp_size,
+                                  warps_per_block><<<grid, block, 0, 0>>>(
+            as_cuda_type(blocks), storage_scheme, block_precisions,
+            block_pointers, num_blocks, as_cuda_type(out_blocks));
+    } else {
+        transpose_jacobi<conjugate, max_block_size, subwarp_size,
+                         warps_per_block><<<grid, block, 0, 0>>>(
+            as_cuda_type(blocks), storage_scheme, block_pointers, num_blocks,
+            as_cuda_type(out_blocks));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi);
+
+
+}  // namespace
+
+
+// Selects the transpose_jacobi launcher compiled for a block size of at least
+// max_block_size, passing `false` as its conjugate flag; `exec` is unused.
+template <typename ValueType, typename IndexType>
+void transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    auto fits = [&](int kernel_size) { return max_block_size <= kernel_size; };
+    select_transpose_jacobi(
+        compiled_kernels(), fits,
+        syn::value_list<int, false, config::min_warps_per_block>(),
+        syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+        block_pointers.get_const_data(), blocks.get_const_data(),
+        storage_scheme, out_blocks.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+
+
+// Selects the transpose_jacobi launcher compiled for a block size of at least
+// max_block_size, passing `true` as its conjugate flag; `exec` is unused.
+template <typename ValueType, typename IndexType>
+void conj_transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    auto fits = [&](int kernel_size) { return max_block_size <= kernel_size; };
+    select_transpose_jacobi(
+        compiled_kernels(), fits,
+        syn::value_list<int, true, config::min_warps_per_block>(),
+        syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+        block_pointers.get_const_data(), blocks.get_const_data(),
+        storage_scheme, out_blocks.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_dense(
     std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernel.cu
index a5dfd71fda7..fb6721bbdca 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernel.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernel.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
@@ -57,79 +58,9 @@ namespace cuda {
  * @ingroup jacobi
  */
 namespace jacobi {
-namespace kernel {
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size)
-    apply(const ValueType *__restrict__ blocks,
-          preconditioner::block_interleaved_storage_scheme<IndexType>
-              storage_scheme,
-          const IndexType *__restrict__ block_ptrs, size_type num_blocks,
-          const ValueType *__restrict__ b, int32 b_stride,
-          ValueType *__restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    multiply_vec<max_block_size>(
-        subwarp, block_size, v,
-        blocks + storage_scheme.get_global_block_offset(block_id) +
-            subwarp.thread_rank(),
-        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-        x_stride,
-        [](ValueType &result, const ValueType &out) { result = out; });
-}
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size)
-    adaptive_apply(const ValueType *__restrict__ blocks,
-                   preconditioner::block_interleaved_storage_scheme<IndexType>
-                       storage_scheme,
-                   const precision_reduction *__restrict__ block_precisions,
-                   const IndexType *__restrict__ block_ptrs,
-                   size_type num_blocks, const ValueType *__restrict__ b,
-                   int32 b_stride, ValueType *__restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-        ValueType, block_precisions[block_id],
-        multiply_vec<max_block_size>(
-            subwarp, block_size, v,
-            reinterpret_cast<const resolved_precision *>(
-                blocks + storage_scheme.get_group_offset(block_id)) +
-                storage_scheme.get_block_offset(block_id) +
-                subwarp.thread_rank(),
-            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-            x_stride,
-            [](ValueType &result, const ValueType &out) { result = out; }));
-}
 
 
-}  // namespace kernel
+#include "common/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
 
 
 namespace {
@@ -146,7 +77,7 @@ void apply(syn::value_list<int, max_block_size>, size_type num_blocks,
            size_type x_stride)
 {
     constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = cuda_config::warp_size / subwarp_size;
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
     const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
                          1, 1);
     const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
@@ -184,16 +115,16 @@ void simple_apply(
 {
     // TODO: write a special kernel for multiple RHS
     for (size_type col = 0; col < b->get_size()[1]; ++col) {
-        select_apply(compiled_kernels(),
-                     [&](int compiled_block_size) {
-                         return max_block_size <= compiled_block_size;
-                     },
-                     syn::value_list<int, cuda_config::min_warps_per_block>(),
-                     syn::type_list<>(), num_blocks,
-                     block_precisions.get_const_data(),
-                     block_pointers.get_const_data(), blocks.get_const_data(),
-                     storage_scheme, b->get_const_values() + col,
-                     b->get_stride(), x->get_values() + col, x->get_stride());
+        select_apply(
+            compiled_kernels(),
+            [&](int compiled_block_size) {
+                return max_block_size <= compiled_block_size;
+            },
+            syn::value_list<int, config::min_warps_per_block>(),
+            syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+            block_pointers.get_const_data(), blocks.get_const_data(),
+            storage_scheme, b->get_const_values() + col, b->get_stride(),
+            x->get_values() + col, x->get_stride());
     }
 }
 
diff --git a/cuda/solver/bicg_kernels.cu b/cuda/solver/bicg_kernels.cu
new file mode 100644
index 00000000000..175198d26d9
--- /dev/null
+++ b/cuda/solver/bicg_kernels.cu
@@ -0,0 +1,144 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/bicg_kernels.hpp"
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "cuda/base/math.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The BICG solver namespace.
+ *
+ * @ingroup bicg
+ */
+namespace bicg {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/solver/bicg_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const CudaExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho, matrix::Dense<ValueType> *r2,
+                matrix::Dense<ValueType> *z2, matrix::Dense<ValueType> *p2,
+                matrix::Dense<ValueType> *q2,
+                Array<stopping_status> *stop_status)
+{
+    // Launch enough threads to cover all rows * stride stored entries of b.
+    const auto num_rows = b->get_size()[0];
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_rows * b->get_stride(), threads.x), 1, 1);
+    initialize_kernel<<<blocks, threads, 0, 0>>>(
+        num_rows, b->get_size()[1], b->get_stride(),
+        as_cuda_type(b->get_const_values()), as_cuda_type(r->get_values()),
+        as_cuda_type(z->get_values()), as_cuda_type(p->get_values()),
+        as_cuda_type(q->get_values()), as_cuda_type(r2->get_values()),
+        as_cuda_type(z2->get_values()), as_cuda_type(p2->get_values()),
+        as_cuda_type(q2->get_values()), as_cuda_type(prev_rho->get_values()),
+        as_cuda_type(rho->get_values()), as_cuda_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const CudaExecutor> exec,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
+            matrix::Dense<ValueType> *p2, const matrix::Dense<ValueType> *z2,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *prev_rho,
+            const Array<stopping_status> *stop_status)
+{
+    // Launch enough threads to cover all rows * stride stored entries of p.
+    const auto num_rows = p->get_size()[0];
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_rows * p->get_stride(), threads.x), 1, 1);
+    step_1_kernel<<<blocks, threads, 0, 0>>>(
+        num_rows, p->get_size()[1], p->get_stride(),
+        as_cuda_type(p->get_values()), as_cuda_type(z->get_const_values()),
+        as_cuda_type(p2->get_values()), as_cuda_type(z2->get_const_values()),
+        as_cuda_type(rho->get_const_values()),
+        as_cuda_type(prev_rho->get_const_values()),
+        as_cuda_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const CudaExecutor> exec,
+            matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
+            matrix::Dense<ValueType> *r2, const matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *q,
+            const matrix::Dense<ValueType> *q2,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho,
+            const Array<stopping_status> *stop_status)
+{
+    // Launch enough threads to cover all rows * stride stored entries of p.
+    const auto num_rows = p->get_size()[0];
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_rows * p->get_stride(), threads.x), 1, 1);
+    step_2_kernel<<<blocks, threads, 0, 0>>>(
+        num_rows, p->get_size()[1], p->get_stride(), x->get_stride(),
+        as_cuda_type(x->get_values()), as_cuda_type(r->get_values()),
+        as_cuda_type(r2->get_values()), as_cuda_type(p->get_const_values()),
+        as_cuda_type(q->get_const_values()),
+        as_cuda_type(q2->get_const_values()),
+        as_cuda_type(beta->get_const_values()),
+        as_cuda_type(rho->get_const_values()),
+        as_cuda_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+
+
+}  // namespace bicg
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/solver/bicgstab_kernels.cu b/cuda/solver/bicgstab_kernels.cu
index 1b3c4824019..a0e5376cf69 100644
--- a/cuda/solver/bicgstab_kernels.cu
+++ b/cuda/solver/bicgstab_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -55,42 +56,7 @@ namespace bicgstab {
 constexpr int default_block_size = 512;
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ b, ValueType *__restrict__ r,
-    ValueType *__restrict__ rr, ValueType *__restrict__ y,
-    ValueType *__restrict__ s, ValueType *__restrict__ t,
-    ValueType *__restrict__ z, ValueType *__restrict__ v,
-    ValueType *__restrict__ p, ValueType *__restrict__ prev_rho,
-    ValueType *__restrict__ rho, ValueType *__restrict__ alpha,
-    ValueType *__restrict__ beta, ValueType *__restrict__ gamma,
-    ValueType *__restrict__ omega, stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-
-    if (tidx < num_cols) {
-        prev_rho[tidx] = one<ValueType>();
-        rho[tidx] = one<ValueType>();
-        alpha[tidx] = one<ValueType>();
-        beta[tidx] = one<ValueType>();
-        gamma[tidx] = one<ValueType>();
-        omega[tidx] = one<ValueType>();
-        stop_status[tidx].reset();
-    }
-
-    if (tidx < num_rows * stride) {
-        r[tidx] = b[tidx];
-        rr[tidx] = zero<ValueType>();
-        y[tidx] = zero<ValueType>();
-        s[tidx] = zero<ValueType>();
-        t[tidx] = zero<ValueType>();
-        z[tidx] = zero<ValueType>();
-        v[tidx] = zero<ValueType>();
-        p[tidx] = zero<ValueType>();
-    }
-}
+#include "common/solver/bicgstab_kernels.hpp.inc"
 
 
 template <typename ValueType>
@@ -125,31 +91,6 @@ void initialize(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_1_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ r, ValueType *__restrict__ p,
-    const ValueType *__restrict__ v, const ValueType *__restrict__ rho,
-    const ValueType *__restrict__ prev_rho, const ValueType *__restrict__ alpha,
-    const ValueType *__restrict__ omega,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    auto res = r[tidx];
-    if (prev_rho[col] * omega[col] != zero<ValueType>()) {
-        const auto tmp = (rho[col] / prev_rho[col]) * (alpha[col] / omega[col]);
-        res += tmp * (p[tidx] - omega[col] * v[tidx]);
-    }
-    p[tidx] = res;
-}
-
-
 template <typename ValueType>
 void step_1(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *r, matrix::Dense<ValueType> *p,
@@ -178,32 +119,6 @@ void step_1(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_2_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ r, ValueType *__restrict__ s,
-    const ValueType *__restrict__ v, const ValueType *__restrict__ rho,
-    ValueType *__restrict__ alpha, const ValueType *__restrict__ beta,
-    const stopping_status *__restrict__ stop_status)
-{
-    const size_type tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const size_type col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    auto t_alpha = zero<ValueType>();
-    auto t_s = r[tidx];
-    if (beta[col] != zero<ValueType>()) {
-        t_alpha = rho[col] / beta[col];
-        t_s -= t_alpha * v[tidx];
-    }
-    alpha[col] = t_alpha;
-    s[tidx] = t_s;
-}
-
-
 template <typename ValueType>
 void step_2(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *r, matrix::Dense<ValueType> *s,
@@ -230,39 +145,6 @@ void step_2(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_3_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
-    const ValueType *__restrict__ s, const ValueType *__restrict__ t,
-    const ValueType *__restrict__ y, const ValueType *__restrict__ z,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ beta,
-    const ValueType *__restrict__ gamma, ValueType *__restrict__ omega,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto row = tidx / stride;
-    const auto col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    const auto x_pos = row * x_stride + col;
-    auto t_omega = zero<ValueType>();
-    auto t_x = x[x_pos] + alpha[col] * y[tidx];
-    auto t_r = s[tidx];
-    if (beta[col] != zero<ValueType>()) {
-        t_omega = gamma[col] / beta[col];
-        t_x += t_omega * z[tidx];
-        t_r -= t_omega * t[tidx];
-    }
-    omega[col] = t_omega;
-    x[x_pos] = t_x;
-    r[tidx] = t_r;
-}
-
-
 template <typename ValueType>
 void step_3(
     std::shared_ptr<const CudaExecutor> exec, matrix::Dense<ValueType> *x,
@@ -289,28 +171,8 @@ void step_3(
         as_cuda_type(omega->get_values()),
         as_cuda_type(stop_status->get_const_data()));
 }
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
-
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void finalize_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    size_type x_stride, ValueType *__restrict__ x,
-    const ValueType *__restrict__ y, const ValueType *__restrict__ alpha,
-    stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto row = tidx / stride;
-    const auto col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].is_finalized() || !stop_status[col].has_stopped()) {
-        return;
-    }
-    const auto x_pos = row * x_stride + col;
-    x[x_pos] = x[x_pos] + alpha[col] * y[tidx];
-    stop_status[col].finalize();
-}
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
 
 
 template <typename ValueType>
diff --git a/cuda/solver/cg_kernels.cu b/cuda/solver/cg_kernels.cu
index 2e39762e7f2..9adb589a9ea 100644
--- a/cuda/solver/cg_kernels.cu
+++ b/cuda/solver/cg_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -55,30 +56,7 @@ namespace cg {
 constexpr int default_block_size = 512;
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ b, ValueType *__restrict__ r,
-    ValueType *__restrict__ z, ValueType *__restrict__ p,
-    ValueType *__restrict__ q, ValueType *__restrict__ prev_rho,
-    ValueType *__restrict__ rho, stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-
-    if (tidx < num_cols) {
-        rho[tidx] = zero<ValueType>();
-        prev_rho[tidx] = one<ValueType>();
-        stop_status[tidx].reset();
-    }
-
-    if (tidx < num_rows * stride) {
-        r[tidx] = b[tidx];
-        z[tidx] = zero<ValueType>();
-        p[tidx] = zero<ValueType>();
-        q[tidx] = zero<ValueType>();
-    }
-}
+#include "common/solver/cg_kernels.hpp.inc"
 
 
 template <typename ValueType>
@@ -104,26 +82,6 @@ void initialize(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_1_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    ValueType *__restrict__ p, const ValueType *__restrict__ z,
-    const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    const auto tmp = rho[col] / prev_rho[col];
-    p[tidx] =
-        prev_rho[col] == zero<ValueType>() ? z[tidx] : z[tidx] + tmp * p[tidx];
-}
-
-
 template <typename ValueType>
 void step_1(std::shared_ptr<const CudaExecutor> exec,
             matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
@@ -146,31 +104,6 @@ void step_1(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_2_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
-    const ValueType *__restrict__ p, const ValueType *__restrict__ q,
-    const ValueType *__restrict__ beta, const ValueType *__restrict__ rho,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto row = tidx / stride;
-    const auto col = tidx % stride;
-
-    if (col >= num_cols || tidx >= num_rows * num_cols ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    if (beta[col] != zero<ValueType>()) {
-        const auto tmp = rho[col] / beta[col];
-        x[row * x_stride + col] += tmp * p[tidx];
-        r[tidx] -= tmp * q[tidx];
-    }
-}
-
-
 template <typename ValueType>
 void step_2(std::shared_ptr<const CudaExecutor> exec,
             matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
diff --git a/cuda/solver/cgs_kernels.cu b/cuda/solver/cgs_kernels.cu
index c36536d4b93..1c1b1af6b48 100644
--- a/cuda/solver/cgs_kernels.cu
+++ b/cuda/solver/cgs_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -55,41 +56,7 @@ namespace cgs {
 constexpr int default_block_size = 512;
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ b, ValueType *__restrict__ r,
-    ValueType *__restrict__ r_tld, ValueType *__restrict__ p,
-    ValueType *__restrict__ q, ValueType *__restrict__ u,
-    ValueType *__restrict__ u_hat, ValueType *__restrict__ v_hat,
-    ValueType *__restrict__ t, ValueType *__restrict__ alpha,
-    ValueType *__restrict__ beta, ValueType *__restrict__ gamma,
-    ValueType *__restrict__ rho_prev, ValueType *__restrict__ rho,
-    stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-
-    if (tidx < num_cols) {
-        rho[tidx] = zero<ValueType>();
-        alpha[tidx] = one<ValueType>();
-        beta[tidx] = one<ValueType>();
-        gamma[tidx] = one<ValueType>();
-        rho_prev[tidx] = one<ValueType>();
-        stop_status[tidx].reset();
-    }
-
-    if (tidx < num_rows * stride) {
-        r[tidx] = b[tidx];
-        r_tld[tidx] = b[tidx];
-        u[tidx] = zero<ValueType>();
-        p[tidx] = zero<ValueType>();
-        q[tidx] = zero<ValueType>();
-        u_hat[tidx] = zero<ValueType>();
-        v_hat[tidx] = zero<ValueType>();
-        t[tidx] = zero<ValueType>();
-    }
-}
+#include "common/solver/cgs_kernels.hpp.inc"
 
 
 template <typename ValueType>
@@ -124,31 +91,6 @@ void initialize(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_1_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ r, ValueType *__restrict__ u,
-    ValueType *__restrict__ p, const ValueType *__restrict__ q,
-    ValueType *__restrict__ beta, const ValueType *__restrict__ rho,
-    const ValueType *__restrict__ rho_prev,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto col = tidx % stride;
-
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    if (rho_prev[col] != zero<ValueType>()) {
-        beta[col] = rho[col] / rho_prev[col];
-        u[tidx] = r[tidx] + beta[col] * q[tidx];
-        p[tidx] = u[tidx] + beta[col] * (q[tidx] + beta[col] * p[tidx]);
-    }
-}
-
-
 template <typename ValueType>
 void step_1(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *r, matrix::Dense<ValueType> *u,
@@ -173,31 +115,6 @@ void step_1(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_2_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ u, const ValueType *__restrict__ v_hat,
-    ValueType *__restrict__ q, ValueType *__restrict__ t,
-    ValueType *__restrict__ alpha, const ValueType *__restrict__ rho,
-    const ValueType *__restrict__ gamma,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto col = tidx % stride;
-
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    if (gamma[col] != zero<ValueType>()) {
-        alpha[col] = rho[col] / gamma[col];
-        q[tidx] = u[tidx] - alpha[col] * v_hat[tidx];
-        t[tidx] = u[tidx] + q[tidx];
-    }
-}
-
-
 template <typename ValueType>
 void step_2(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *u,
@@ -224,30 +141,6 @@ void step_2(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_3_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    size_type x_stride, const ValueType *__restrict__ t,
-    const ValueType *__restrict__ v_hat, ValueType *__restrict__ r,
-    ValueType *__restrict__ x, const ValueType *__restrict__ alpha,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto row = tidx / stride;
-    const auto col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    const auto x_pos = row * x_stride + col;
-    auto t_x = x[x_pos] + alpha[col] * v_hat[tidx];
-    auto t_r = r[tidx] - alpha[col] * t[tidx];
-    x[x_pos] = t_x;
-    r[tidx] = t_r;
-}
-
-
 template <typename ValueType>
 void step_3(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *t,
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 3b1fb1f1fd2..f16be5ee0e1 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -49,13 +49,112 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/device_guard.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/types.hpp"
 
 
 namespace gko {
+namespace solver {
+
+
+struct SolveStruct {  // common base; solver::cuda::SolveStruct derives from it
+    virtual void dummy() {}  // virtual member makes the type polymorphic
+};
+
+
+namespace cuda {
+
+
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
+
+
+struct SolveStruct : gko::solver::SolveStruct {
+    int algorithm = 0;
+    csrsm2Info_t solve_info;
+    cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+    cusparseMatDescr_t factor_descr;
+    size_t factor_work_size;
+    void *factor_work_vec = nullptr;
+
+    // Creates a zero-based, general, non-unit-diagonal matrix descriptor
+    // and the csrsm2 info object.
+    SolveStruct()
+    {
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&solve_info));
+    }
+
+    // The destructor releases the cuSPARSE objects and the work buffer, so
+    // the type is neither copyable nor movable.
+    SolveStruct(const SolveStruct &) = delete;
+    SolveStruct(SolveStruct &&) = delete;
+    SolveStruct &operator=(const SolveStruct &) = delete;
+    SolveStruct &operator=(SolveStruct &&) = delete;
+
+    // Releases the descriptor, the info object and the work buffer.
+    ~SolveStruct()
+    {
+        cusparseDestroyMatDescr(factor_descr);
+        if (solve_info) {
+            cusparseDestroyCsrsm2Info(solve_info);
+        }
+        if (factor_work_vec != nullptr) {
+            cudaFree(factor_work_vec);
+            factor_work_vec = nullptr;
+        }
+    }
+};
+
+
+#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
+
+
+struct SolveStruct : gko::solver::SolveStruct {
+    cusparseSolveAnalysisInfo_t solve_info;
+    cusparseMatDescr_t factor_descr;
+    // Creates the analysis info and a zero-based, general, non-unit descriptor.
+    SolveStruct()
+    {
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseCreateSolveAnalysisInfo(&solve_info));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL));
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
+    }
+
+    // The destructor releases the cuSPARSE objects: not copyable or movable.
+    SolveStruct(const SolveStruct &) = delete;
+    SolveStruct(SolveStruct &&) = delete;
+    SolveStruct &operator=(const SolveStruct &) = delete;
+    SolveStruct &operator=(SolveStruct &&) = delete;
+
+    // Releases the matrix descriptor and the solve-analysis info object.
+    ~SolveStruct()
+    {
+        cusparseDestroyMatDescr(factor_descr);
+        cusparseDestroySolveAnalysisInfo(solve_info);
+    }
+};
+
+
+#endif
+
+
+}  // namespace cuda
+}  // namespace solver
+
+
 namespace kernels {
 namespace cuda {
 namespace {
@@ -83,7 +182,7 @@ void should_perform_transpose_kernel(std::shared_ptr<const CudaExecutor> exec,
 void init_struct_kernel(std::shared_ptr<const CudaExecutor> exec,
                         std::shared_ptr<solver::SolveStruct> &solve_struct)
 {
-    solve_struct = std::make_shared<solver::SolveStruct>();
+    solve_struct = std::make_shared<solver::cuda::SolveStruct>();
 }
 
 
@@ -94,65 +193,73 @@ void generate_kernel(std::shared_ptr<const CudaExecutor> exec,
                      const gko::size_type num_rhs, bool is_upper)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
-        if (is_upper) {
-            GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode(
-                solve_struct->factor_descr, CUSPARSE_FILL_MODE_UPPER));
-        }
+        if (auto cuda_solve_struct =
+                dynamic_cast<solver::cuda::SolveStruct *>(solve_struct)) {
+            auto handle = exec->get_cusparse_handle();
+            if (is_upper) {
+                GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode(
+                    cuda_solve_struct->factor_descr, CUSPARSE_FILL_MODE_UPPER));
+            }
 
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
 
 
-        ValueType one = 1.0;
+            ValueType one = 1.0;
 
-        {
-            cusparse::pointer_mode_guard pm_guard(handle);
-            cusparse::buffer_size_ext(
-                handle, solve_struct->algorithm,
-                CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
-                matrix->get_size()[0], num_rhs,
-                matrix->get_num_stored_elements(), &one,
-                solve_struct->factor_descr, matrix->get_const_values(),
-                matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
-                nullptr, num_rhs, solve_struct->solve_info,
-                solve_struct->policy, &solve_struct->factor_work_size);
-
-            // allocate workspace
-            if (solve_struct->factor_work_vec != nullptr) {
-                exec->free(solve_struct->factor_work_vec);
+            {
+                cusparse::pointer_mode_guard pm_guard(handle);
+                cusparse::buffer_size_ext(
+                    handle, cuda_solve_struct->algorithm,
+                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0],
+                    num_rhs, matrix->get_num_stored_elements(), &one,
+                    cuda_solve_struct->factor_descr, matrix->get_const_values(),
+                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+                    nullptr, num_rhs, cuda_solve_struct->solve_info,
+                    cuda_solve_struct->policy,
+                    &cuda_solve_struct->factor_work_size);
+
+                // allocate workspace
+                if (cuda_solve_struct->factor_work_vec != nullptr) {
+                    exec->free(cuda_solve_struct->factor_work_vec);
+                }
+                cuda_solve_struct->factor_work_vec =
+                    exec->alloc<void *>(cuda_solve_struct->factor_work_size);
+
+                cusparse::csrsm2_analysis(
+                    handle, cuda_solve_struct->algorithm,
+                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0],
+                    num_rhs, matrix->get_num_stored_elements(), &one,
+                    cuda_solve_struct->factor_descr, matrix->get_const_values(),
+                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+                    nullptr, num_rhs, cuda_solve_struct->solve_info,
+                    cuda_solve_struct->policy,
+                    cuda_solve_struct->factor_work_vec);
             }
-            solve_struct->factor_work_vec =
-                exec->alloc<void *>(solve_struct->factor_work_size);
-
-            cusparse::csrsm2_analysis(
-                handle, solve_struct->algorithm,
-                CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
-                matrix->get_size()[0], num_rhs,
-                matrix->get_num_stored_elements(), &one,
-                solve_struct->factor_descr, matrix->get_const_values(),
-                matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
-                nullptr, num_rhs, solve_struct->solve_info,
-                solve_struct->policy, solve_struct->factor_work_vec);
-        }
 
 
 #elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
 
 
-        {
-            cusparse::pointer_mode_guard pm_guard(handle);
-            cusparse::csrsm_analysis(
-                handle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0],
-                matrix->get_num_stored_elements(), solve_struct->factor_descr,
-                matrix->get_const_values(), matrix->get_const_row_ptrs(),
-                matrix->get_const_col_idxs(), solve_struct->solve_info);
-        }
+            {
+                cusparse::pointer_mode_guard pm_guard(handle);
+                cusparse::csrsm_analysis(
+                    handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    matrix->get_size()[0], matrix->get_num_stored_elements(),
+                    cuda_solve_struct->factor_descr, matrix->get_const_values(),
+                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+                    cuda_solve_struct->solve_info);
+            }
 
 
 #endif
 
 
+        } else {
+            GKO_NOT_SUPPORTED(solve_struct);
+        }
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -169,62 +276,72 @@ void solve_kernel(std::shared_ptr<const CudaExecutor> exec,
                   matrix::Dense<ValueType> *x)
 {
     using vec = matrix::Dense<ValueType>;
+
     if (cusparse::is_supported<ValueType, IndexType>::value) {
-        ValueType one = 1.0;
-        auto handle = exec->get_cusparse_handle();
+        if (auto cuda_solve_struct =
+                dynamic_cast<const solver::cuda::SolveStruct *>(solve_struct)) {
+            ValueType one = 1.0;
+            auto handle = exec->get_cusparse_handle();
 
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
 
 
-        x->copy_from(gko::lend(b));
-        {
-            cusparse::pointer_mode_guard pm_guard(handle);
-            cusparse::csrsm2_solve(
-                handle, solve_struct->algorithm,
-                CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
-                matrix->get_size()[0], b->get_stride(),
-                matrix->get_num_stored_elements(), &one,
-                solve_struct->factor_descr, matrix->get_const_values(),
-                matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
-                x->get_values(), b->get_stride(), solve_struct->solve_info,
-                solve_struct->policy, solve_struct->factor_work_vec);
-        }
+            x->copy_from(gko::lend(b));
+            {
+                cusparse::pointer_mode_guard pm_guard(handle);
+                cusparse::csrsm2_solve(
+                    handle, cuda_solve_struct->algorithm,
+                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0],
+                    b->get_stride(), matrix->get_num_stored_elements(), &one,
+                    cuda_solve_struct->factor_descr, matrix->get_const_values(),
+                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+                    x->get_values(), b->get_stride(),
+                    cuda_solve_struct->solve_info, cuda_solve_struct->policy,
+                    cuda_solve_struct->factor_work_vec);
+            }
 
 
 #elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
 
 
-        {
-            cusparse::pointer_mode_guard pm_guard(handle);
-            if (b->get_stride() == 1) {
-                auto temp_b = const_cast<ValueType *>(b->get_const_values());
-                cusparse::csrsm_solve(
-                    handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                    matrix->get_size()[0], b->get_stride(), &one,
-                    solve_struct->factor_descr, matrix->get_const_values(),
-                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
-                    solve_struct->solve_info, temp_b, b->get_size()[0],
-                    x->get_values(), x->get_size()[0]);
-            } else {
-                dense::transpose(exec, trans_b, b);
-                dense::transpose(exec, trans_x, x);
-                cusparse::csrsm_solve(
-                    handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                    matrix->get_size()[0], trans_b->get_size()[0], &one,
-                    solve_struct->factor_descr, matrix->get_const_values(),
-                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
-                    solve_struct->solve_info, trans_b->get_values(),
-                    trans_b->get_size()[1], trans_x->get_values(),
-                    trans_x->get_size()[1]);
-                dense::transpose(exec, x, trans_x);
+            {
+                cusparse::pointer_mode_guard pm_guard(handle);
+                if (b->get_stride() == 1) {
+                    cusparse::csrsm_solve(
+                        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        matrix->get_size()[0], b->get_stride(), &one,
+                        cuda_solve_struct->factor_descr,
+                        matrix->get_const_values(),
+                        matrix->get_const_row_ptrs(),
+                        matrix->get_const_col_idxs(),
+                        cuda_solve_struct->solve_info, b->get_const_values(),
+                        b->get_size()[0], x->get_values(), x->get_size()[0]);
+                } else {
+                    dense::transpose(exec, b, trans_b);
+                    dense::transpose(exec, x, trans_x);
+                    cusparse::csrsm_solve(
+                        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        matrix->get_size()[0], trans_b->get_size()[0], &one,
+                        cuda_solve_struct->factor_descr,
+                        matrix->get_const_values(),
+                        matrix->get_const_row_ptrs(),
+                        matrix->get_const_col_idxs(),
+                        cuda_solve_struct->solve_info, trans_b->get_values(),
+                        trans_b->get_size()[1], trans_x->get_values(),
+                        trans_x->get_size()[1]);
+                    dense::transpose(exec, trans_x, x);
+                }
             }
-        }
 
 
 #endif
 
 
+        } else {
+            GKO_NOT_SUPPORTED(solve_struct);
+        }
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -237,4 +354,4 @@ void solve_kernel(std::shared_ptr<const CudaExecutor> exec,
 }  // namespace gko
 
 
-#endif
+#endif  // GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_
diff --git a/cuda/solver/fcg_kernels.cu b/cuda/solver/fcg_kernels.cu
index b85c14cff91..ed92ca19120 100644
--- a/cuda/solver/fcg_kernels.cu
+++ b/cuda/solver/fcg_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -54,33 +55,8 @@ namespace fcg {
 
 constexpr int default_block_size = 512;
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    const ValueType *__restrict__ b, ValueType *__restrict__ r,
-    ValueType *__restrict__ z, ValueType *__restrict__ p,
-    ValueType *__restrict__ q, ValueType *__restrict__ t,
-    ValueType *__restrict__ prev_rho, ValueType *__restrict__ rho,
-    ValueType *__restrict__ rho_t, stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-
-    if (tidx < num_cols) {
-        rho[tidx] = zero<ValueType>();
-        prev_rho[tidx] = one<ValueType>();
-        rho_t[tidx] = one<ValueType>();
-        stop_status[tidx].reset();
-    }
-
-    if (tidx < num_rows * stride) {
-        r[tidx] = b[tidx];
-        z[tidx] = zero<ValueType>();
-        p[tidx] = zero<ValueType>();
-        q[tidx] = zero<ValueType>();
-        t[tidx] = b[tidx];
-    }
-}
+
+#include "common/solver/fcg_kernels.hpp.inc"
 
 
 template <typename ValueType>
@@ -109,26 +85,6 @@ void initialize(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_1_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    ValueType *__restrict__ p, const ValueType *__restrict__ z,
-    const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto col = tidx % stride;
-    if (col >= num_cols || tidx >= num_rows * stride ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    const auto tmp = rho[col] / prev_rho[col];
-    p[tidx] =
-        prev_rho[col] == zero<ValueType>() ? z[tidx] : z[tidx] + tmp * p[tidx];
-}
-
-
 template <typename ValueType>
 void step_1(std::shared_ptr<const CudaExecutor> exec,
             matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
@@ -151,34 +107,6 @@ void step_1(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_2_kernel(
-    size_type num_rows, size_type num_cols, size_type stride,
-    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
-    ValueType *__restrict__ t, const ValueType *__restrict__ p,
-    const ValueType *__restrict__ q, const ValueType *__restrict__ beta,
-    const ValueType *__restrict__ rho,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto row = tidx / stride;
-    const auto col = tidx % stride;
-
-    if (col >= num_cols || tidx >= num_rows * num_cols ||
-        stop_status[col].has_stopped()) {
-        return;
-    }
-    if (beta[col] != zero<ValueType>()) {
-        const auto tmp = rho[col] / beta[col];
-        const auto prev_r = r[tidx];
-        x[row * x_stride + col] += tmp * p[tidx];
-        r[tidx] -= tmp * q[tidx];
-        t[tidx] = r[tidx] - prev_r;
-    }
-}
-
-
 template <typename ValueType>
 void step_2(std::shared_ptr<const CudaExecutor> exec,
             matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index 56496cf5dd8..0ddddfc74f7 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,12 +42,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "cuda/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
 
@@ -63,49 +66,18 @@ namespace gmres {
 
 
 constexpr int default_block_size = 512;
-constexpr int default_dot_dim = cuda_config::warp_size;
+// default_dot_dim cannot be 64 in HIP because 64 * 64 exceeds the maximum
+// block size limit there.
+constexpr int default_dot_dim = 32;
 constexpr int default_dot_size = default_dot_dim * default_dot_dim;
 
 
-// Must be called with at least `max(stride_b * num_rows, krylov_dim *
-// num_cols)` threads in total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void initialize_1_kernel(
-    size_type num_rows, size_type num_cols, size_type krylov_dim,
-    const ValueType *__restrict__ b, size_type stride_b,
-    ValueType *__restrict__ residual, size_type stride_residual,
-    ValueType *__restrict__ givens_sin, size_type stride_sin,
-    ValueType *__restrict__ givens_cos, size_type stride_cos,
-    stopping_status *__restrict__ stop_status)
-{
-    const auto global_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    const auto row_idx = global_id / stride_b;
-    const auto col_idx = global_id % stride_b;
-
-    if (global_id < num_cols) {
-        stop_status[global_id].reset();
-    }
-
-    if (row_idx < num_rows && col_idx < num_cols) {
-        residual[row_idx * stride_residual + col_idx] =
-            b[row_idx * stride_b + col_idx];
-    }
-
-    if (global_id < krylov_dim * num_cols) {
-        const auto row_givens = global_id / num_cols;
-        const auto col_givens = global_id % num_cols;
-
-        givens_sin[row_givens * stride_sin + col_givens] = zero<ValueType>();
-        givens_cos[row_givens * stride_cos + col_givens] = zero<ValueType>();
-    }
-}
+#include "common/solver/gmres_kernels.hpp.inc"
 
 
 template <typename ValueType>
 void initialize_1(std::shared_ptr<const CudaExecutor> exec,
                   const matrix::Dense<ValueType> *b,
-                  matrix::Dense<ValueType> *b_norm,
                   matrix::Dense<ValueType> *residual,
                   matrix::Dense<ValueType> *givens_sin,
                   matrix::Dense<ValueType> *givens_cos,
@@ -117,7 +89,6 @@ void initialize_1(std::shared_ptr<const CudaExecutor> exec,
     const dim3 block_dim(default_block_size, 1, 1);
     constexpr auto block_size = default_block_size;
 
-    b->compute_norm2(b_norm);
     initialize_1_kernel<block_size><<<grid_dim, block_dim>>>(
         b->get_size()[0], b->get_size()[1], krylov_dim,
         as_cuda_type(b->get_const_values()), b->get_stride(),
@@ -130,60 +101,10 @@ void initialize_1(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL);
 
 
-// Must be called with at least `num_rows * stride_krylov` threads in total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void initialize_2_1_kernel(
-    size_type num_rows, size_type num_rhs, size_type krylov_dim,
-    ValueType *__restrict__ krylov_bases, size_type stride_krylov,
-    ValueType *__restrict__ residual_norm_collection,
-    size_type stride_residual_nc)
-{
-    const auto global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const auto row_idx = global_id / stride_krylov;
-    const auto col_idx = global_id % stride_krylov;
-
-    if (row_idx < num_rows && col_idx < (krylov_dim + 1) * num_rhs) {
-        krylov_bases[row_idx * stride_krylov + col_idx] = zero<ValueType>();
-    }
-
-    if (row_idx < krylov_dim + 1 && col_idx < num_rhs) {
-        residual_norm_collection[row_idx * stride_residual_nc + col_idx] =
-            zero<ValueType>();
-    }
-}
-
-
-// Must be called with at least `num_rows * num_rhs` threads in total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void initialize_2_2_kernel(
-    size_type num_rows, size_type num_rhs,
-    const ValueType *__restrict__ residual, size_type stride_residual,
-    const ValueType *__restrict__ residual_norm,
-    ValueType *__restrict__ residual_norm_collection,
-    ValueType *__restrict__ krylov_bases, size_type stride_krylov,
-    size_type *__restrict__ final_iter_nums)
-{
-    const auto global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const auto row_idx = global_id / num_rhs;
-    const auto col_idx = global_id % num_rhs;
-
-    if (global_id < num_rhs) {
-        residual_norm_collection[global_id] = residual_norm[global_id];
-        final_iter_nums[global_id] = 0;
-    }
-
-    if (row_idx < num_rows && col_idx < num_rhs) {
-        krylov_bases[row_idx * stride_krylov + col_idx] =
-            residual[row_idx * stride_residual + col_idx] /
-            residual_norm[col_idx];
-    }
-}
-
-
 template <typename ValueType>
 void initialize_2(std::shared_ptr<const CudaExecutor> exec,
                   const matrix::Dense<ValueType> *residual,
-                  matrix::Dense<ValueType> *residual_norm,
+                  matrix::Dense<remove_complex<ValueType>> *residual_norm,
                   matrix::Dense<ValueType> *residual_norm_collection,
                   matrix::Dense<ValueType> *krylov_bases,
                   Array<size_type> *final_iter_nums, size_type krylov_dim)
@@ -191,16 +112,12 @@ void initialize_2(std::shared_ptr<const CudaExecutor> exec,
     const auto num_rows = residual->get_size()[0];
     const auto num_rhs = residual->get_size()[1];
     const dim3 grid_dim_1(
-        ceildiv(num_rows * krylov_bases->get_stride(), default_block_size), 1,
-        1);
+        ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(),
+                default_block_size),
+        1, 1);
     const dim3 block_dim(default_block_size, 1, 1);
     constexpr auto block_size = default_block_size;
 
-    initialize_2_1_kernel<block_size><<<grid_dim_1, block_dim>>>(
-        residual->get_size()[0], residual->get_size()[1], krylov_dim,
-        as_cuda_type(krylov_bases->get_values()), krylov_bases->get_stride(),
-        as_cuda_type(residual_norm_collection->get_values()),
-        residual_norm_collection->get_stride());
     residual->compute_norm2(residual_norm);
 
     const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1,
@@ -217,200 +134,49 @@ void initialize_2(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL);
 
 
-__global__
-    __launch_bounds__(default_block_size) void increase_final_iteration_numbers_kernel(
-        size_type *__restrict__ final_iter_nums,
-        const stopping_status *__restrict__ stop_status, size_type total_number)
-{
-    const auto global_id = threadIdx.x + blockIdx.x * blockDim.x;
-    if (global_id < total_number) {
-        final_iter_nums[global_id] +=
-            (1 - stop_status[global_id].has_stopped());
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
-    size_type k, size_type num_rows, size_type num_cols,
-    const ValueType *__restrict__ next_krylov_basis,
-    size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
-    size_type stride_krylov, ValueType *__restrict__ hessenberg_iter,
-    size_type stride_hessenberg,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx = threadIdx.x;
-    const auto tidy = threadIdx.y;
-    const auto col_idx = blockIdx.x * default_dot_dim + tidx;
-    const auto num = ceildiv(num_rows, gridDim.y);
-    const auto start_row = blockIdx.y * num;
-    const auto end_row =
-        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
-    // Used that way to get around dynamic initialization warning and
-    // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__
-        UninitializedArray<ValueType, default_dot_dim *(default_dot_dim + 1)>
-            reduction_helper_array;
-    ValueType *__restrict__ reduction_helper = reduction_helper_array;
-
-    ValueType local_res = zero<ValueType>();
-    const auto krylov_col = k * num_cols + col_idx;
-    if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        for (size_type i = start_row + tidy; i < end_row;
-             i += default_dot_dim) {
-            const auto next_krylov_idx = i * stride_next_krylov + col_idx;
-            const auto krylov_idx = i * stride_krylov + krylov_col;
-            local_res +=
-                next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
-        }
-    }
-    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
-    __syncthreads();
-    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
-    const auto tile_block =
-        group::tiled_partition<default_dot_dim>(group::this_thread_block());
-    const auto sum =
-        reduce(tile_block, local_res,
-               [](const ValueType &a, const ValueType &b) { return a + b; });
-    const auto new_col_idx = blockIdx.x * default_dot_dim + tidy;
-    if (tidx == 0 && new_col_idx < num_cols &&
-        !stop_status[new_col_idx].has_stopped()) {
-        const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
-        atomic_add(hessenberg_iter + hessenberg_idx, sum);
-    }
-}
-
-
-// Must be called with at least `num_rows * stride_next_krylov` threads in
-// total.
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_next_krylov_kernel(
-    size_type k, size_type num_rows, size_type num_cols,
-    ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov,
-    const ValueType *__restrict__ krylov_bases, size_type stride_krylov,
-    const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const auto row_idx = global_id / stride_next_krylov;
-    const auto col_idx = global_id % stride_next_krylov;
-
-    if (row_idx < num_rows && col_idx < num_cols &&
-        !stop_status[col_idx].has_stopped()) {
-        const auto next_krylov_idx = row_idx * stride_next_krylov + col_idx;
-        const auto krylov_idx =
-            row_idx * stride_krylov + k * num_cols + col_idx;
-        const auto hessenberg_idx = k * stride_hessenberg + col_idx;
-
-        next_krylov_basis[next_krylov_idx] -=
-            hessenberg_iter[hessenberg_idx] * krylov_bases[krylov_idx];
-    }
-}
-
-
-// Must be called with at least `num_cols` blocks, each with `block_size`
-// threads. `block_size` must be a power of 2.
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_hessenberg_2_kernel(
-    size_type iter, size_type num_rows, size_type num_cols,
-    const ValueType *__restrict__ next_krylov_basis,
-    size_type stride_next_krylov, ValueType *__restrict__ hessenberg_iter,
-    size_type stride_hessenberg,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx = threadIdx.x;
-    const auto col_idx = blockIdx.x;
-
-    // Used that way to get around dynamic initialization warning and
-    // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
-    ValueType *__restrict__ reduction_helper = reduction_helper_array;
-
-    if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        ValueType local_res{};
-        for (size_type i = tidx; i < num_rows; i += block_size) {
-            const auto next_krylov_idx = i * stride_next_krylov + col_idx;
-            const auto next_krylov_value = next_krylov_basis[next_krylov_idx];
-
-            local_res += next_krylov_value * next_krylov_value;
-        }
-
-        reduction_helper[tidx] = local_res;
-
-        // Perform thread block reduction. Result is in reduction_helper[0]
-        reduce(group::this_thread_block(), reduction_helper,
-               [](const ValueType &a, const ValueType &b) { return a + b; });
-
-        if (tidx == 0) {
-            hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] =
-                sqrt(reduction_helper[0]);
-        }
-    }
-}
-
-
-// Must be called with at least `num_rows * stride_next_krylov` threads in
-// total.
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_krylov_next_krylov_kernel(
-    size_type iter, size_type num_rows, size_type num_cols,
-    ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov,
-    ValueType *__restrict__ krylov_bases, size_type stride_krylov,
-    const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto global_id = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto row_idx = global_id / stride_next_krylov;
-    const auto col_idx = global_id % stride_next_krylov;
-    const auto hessenberg =
-        hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx];
-
-    if (row_idx < num_rows && col_idx < num_cols &&
-        !stop_status[col_idx].has_stopped()) {
-        const auto next_krylov_idx = row_idx * stride_next_krylov + col_idx;
-        const auto krylov_idx =
-            row_idx * stride_krylov + num_cols * (iter + 1) + col_idx;
-
-        const auto next_krylov_value =
-            next_krylov_basis[next_krylov_idx] / hessenberg;
-
-        next_krylov_basis[next_krylov_idx] = next_krylov_value;
-        krylov_bases[krylov_idx] = next_krylov_value;
-    }
-}
-
-
 template <typename ValueType>
 void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Dense<ValueType> *next_krylov_basis,
-                    matrix::Dense<ValueType> *krylov_bases,
+                    size_type num_rows, matrix::Dense<ValueType> *krylov_bases,
                     matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
                     const stopping_status *stop_status)
 {
-    const auto stride_next_krylov = next_krylov_basis->get_stride();
     const auto stride_krylov = krylov_bases->get_stride();
     const auto stride_hessenberg = hessenberg_iter->get_stride();
-    const auto dim_size = next_krylov_basis->get_size();
     auto cublas_handle = exec->get_cublas_handle();
-    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
-                         exec->get_num_multiprocessor() * 2);
+    const dim3 grid_size(
+        ceildiv(hessenberg_iter->get_size()[1], default_dot_dim),
+        exec->get_num_multiprocessor() * 2);
     const dim3 block_size(default_dot_dim, default_dot_dim);
+    auto next_krylov_basis =
+        krylov_bases->get_values() +
+        (iter + 1) * num_rows * hessenberg_iter->get_size()[1];
     for (size_type k = 0; k < iter + 1; ++k) {
-        zero_array(dim_size[1],
-                   hessenberg_iter->get_values() + k * stride_hessenberg);
-        multidot_kernel<<<grid_size, block_size>>>(
-            k, dim_size[0], dim_size[1],
-            as_cuda_type(next_krylov_basis->get_const_values()),
-            stride_next_krylov, as_cuda_type(krylov_bases->get_const_values()),
-            stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
-            stride_hessenberg, as_cuda_type(stop_status));
+        const auto k_krylov_bases =
+            krylov_bases->get_const_values() +
+            k * num_rows * hessenberg_iter->get_size()[1];
+        if (hessenberg_iter->get_size()[1] > 1) {
+            // TODO: this condition should be tuned
+            // A single rhs uses the vendor's dot; otherwise, use our own
+            // multidot_kernel, which parallelizes over multiple rhs.
+            components::fill_array(
+                exec, hessenberg_iter->get_values() + k * stride_hessenberg,
+                hessenberg_iter->get_size()[1], zero<ValueType>());
+            multidot_kernel<<<grid_size, block_size>>>(
+                k, num_rows, hessenberg_iter->get_size()[1],
+                as_cuda_type(k_krylov_bases), as_cuda_type(next_krylov_basis),
+                stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
+                stride_hessenberg, as_cuda_type(stop_status));
+        } else {
+            cublas::dot(exec->get_cublas_handle(), num_rows, k_krylov_bases,
+                        stride_krylov, next_krylov_basis, stride_krylov,
+                        hessenberg_iter->get_values() + k * stride_hessenberg);
+        }
         update_next_krylov_kernel<default_block_size>
-            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+            <<<ceildiv(num_rows * stride_krylov, default_block_size),
                default_block_size>>>(
-                k, dim_size[0], dim_size[1],
-                as_cuda_type(next_krylov_basis->get_values()),
-                stride_next_krylov,
-                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
+                k, num_rows, hessenberg_iter->get_size()[1],
+                as_cuda_type(k_krylov_bases), as_cuda_type(next_krylov_basis),
+                stride_krylov,
                 as_cuda_type(hessenberg_iter->get_const_values()),
                 stride_hessenberg, as_cuda_type(stop_status));
     }
@@ -421,156 +187,32 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
 
 
     update_hessenberg_2_kernel<default_block_size>
-        <<<dim_size[1], default_block_size>>>(
-            iter, dim_size[0], dim_size[1],
-            as_cuda_type(next_krylov_basis->get_const_values()),
-            stride_next_krylov, as_cuda_type(hessenberg_iter->get_values()),
-            stride_hessenberg, as_cuda_type(stop_status));
-
-    update_krylov_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+        <<<hessenberg_iter->get_size()[1], default_block_size>>>(
+            iter, num_rows, hessenberg_iter->get_size()[1],
+            as_cuda_type(next_krylov_basis), stride_krylov,
+            as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
+            as_cuda_type(stop_status));
+
+    update_krylov_kernel<default_block_size>
+        <<<ceildiv(num_rows * stride_krylov, default_block_size),
            default_block_size>>>(
-            iter, dim_size[0], dim_size[1],
-            as_cuda_type(next_krylov_basis->get_values()), stride_next_krylov,
-            as_cuda_type(krylov_bases->get_values()), stride_krylov,
+            iter, num_rows, hessenberg_iter->get_size()[1],
+            as_cuda_type(next_krylov_basis), stride_krylov,
             as_cuda_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_cuda_type(stop_status));
     // next_krylov_basis /= hessenberg(iter, iter + 1)
-    // krylov_bases(:, iter + 1) = next_krylov_basis
     // End of arnoldi
 }
 
 
-template <typename ValueType>
-__device__ void calculate_sin_and_cos_kernel(
-    size_type col_idx, size_type num_cols, size_type iter,
-    const ValueType *hessenberg_iter, size_type stride_hessenberg,
-    ValueType *givens_sin, size_type stride_sin, ValueType *givens_cos,
-    size_type stride_cos)
-{
-    if (hessenberg_iter[iter * stride_hessenberg + col_idx] ==
-        zero<ValueType>()) {
-        givens_cos[iter * stride_cos + col_idx] = zero<ValueType>();
-        givens_sin[iter * stride_sin + col_idx] = one<ValueType>();
-    } else {
-        auto hypotenuse =
-            sqrt(hessenberg_iter[iter * stride_hessenberg + col_idx] *
-                     hessenberg_iter[iter * stride_hessenberg + col_idx] +
-                 hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] *
-                     hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]);
-        givens_cos[iter * stride_cos + col_idx] =
-            abs(hessenberg_iter[iter * stride_hessenberg + col_idx]) /
-            hypotenuse;
-        givens_sin[iter * stride_sin + col_idx] =
-            givens_cos[iter * stride_cos + col_idx] *
-            hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] /
-            hessenberg_iter[iter * stride_hessenberg + col_idx];
-    }
-}
-
-
-template <typename ValueType>
-__device__ void calculate_residual_norm_kernel(
-    size_type col_idx, size_type num_cols, size_type iter,
-    const ValueType *givens_sin, size_type stride_sin,
-    const ValueType *givens_cos, size_type stride_cos, ValueType *residual_norm,
-    ValueType *residual_norm_collection,
-    size_type stride_residual_norm_collection, const ValueType *b_norm)
-{
-    residual_norm_collection[(iter + 1) * stride_residual_norm_collection +
-                             col_idx] =
-        -givens_sin[iter * stride_sin + col_idx] *
-        residual_norm_collection[iter * stride_residual_norm_collection +
-                                 col_idx];
-    residual_norm_collection[iter * stride_residual_norm_collection + col_idx] =
-        givens_cos[iter * stride_cos + col_idx] *
-        residual_norm_collection[iter * stride_residual_norm_collection +
-                                 col_idx];
-    residual_norm[col_idx] =
-        abs(residual_norm_collection[(iter + 1) *
-                                         stride_residual_norm_collection +
-                                     col_idx]) /
-        b_norm[col_idx];
-}
-
-
-// Must be called with at least `num_cols` threads in total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void givens_rotation_kernel(
-    size_type num_rows, size_type num_cols, size_type iter,
-    ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
-    ValueType *__restrict__ givens_sin, size_type stride_sin,
-    ValueType *__restrict__ givens_cos, size_type stride_cos,
-    ValueType *__restrict__ residual_norm,
-    ValueType *__restrict__ residual_norm_collection,
-    size_type stride_residual_norm_collection,
-    const ValueType *__restrict__ b_norm,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto col_idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (col_idx >= num_cols || stop_status[col_idx].has_stopped()) {
-        return;
-    }
-
-    const auto current_thread_block = group::this_thread_block();
-
-    for (size_type i = 0; i < iter; ++i) {
-        const auto tmp =
-            givens_cos[i * stride_cos + col_idx] *
-                hessenberg_iter[i * stride_hessenberg + col_idx] +
-            givens_sin[i * stride_sin + col_idx] *
-                hessenberg_iter[(i + 1) * stride_hessenberg + col_idx];
-        current_thread_block.sync();
-        hessenberg_iter[(i + 1) * stride_hessenberg + col_idx] =
-            givens_cos[i * stride_cos + col_idx] *
-                hessenberg_iter[(i + 1) * stride_hessenberg + col_idx] -
-            givens_sin[i * stride_sin + col_idx] *
-                hessenberg_iter[i * stride_hessenberg + col_idx];
-        hessenberg_iter[i * stride_hessenberg + col_idx] = tmp;
-        current_thread_block.sync();
-    }
-    // for j in 1:iter - 1
-    //     temp             =  cos(j)*hessenberg(j) +
-    //                         sin(j)*hessenberg(j+1)
-    //     hessenberg(j+1)  = -sin(j)*hessenberg(j) +
-    //                         cos(j)*hessenberg(j+1)
-    //     hessenberg(j)    =  temp;
-    // end
-
-    calculate_sin_and_cos_kernel(col_idx, num_cols, iter, hessenberg_iter,
-                                 stride_hessenberg, givens_sin, stride_sin,
-                                 givens_cos, stride_cos);
-    // Calculate sin and cos
-
-    hessenberg_iter[iter * stride_hessenberg + col_idx] =
-        givens_cos[iter * stride_cos + col_idx] *
-            hessenberg_iter[iter * stride_hessenberg + col_idx] +
-        givens_sin[iter * stride_sin + col_idx] *
-            hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx];
-    hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] =
-        zero<ValueType>();
-    // hessenberg(iter)   = cos(iter)*hessenberg(iter) +
-    //                      sin(iter)*hessenberg(iter)
-    // hessenberg(iter+1) = 0
-
-    calculate_residual_norm_kernel(col_idx, num_cols, iter, givens_sin,
-                                   stride_sin, givens_cos, stride_cos,
-                                   residual_norm, residual_norm_collection,
-                                   stride_residual_norm_collection, b_norm);
-    // Calculate residual norm
-}
-
-
 template <typename ValueType>
 void givens_rotation(std::shared_ptr<const CudaExecutor> exec,
                      matrix::Dense<ValueType> *givens_sin,
                      matrix::Dense<ValueType> *givens_cos,
                      matrix::Dense<ValueType> *hessenberg_iter,
-                     matrix::Dense<ValueType> *residual_norm,
+                     matrix::Dense<remove_complex<ValueType>> *residual_norm,
                      matrix::Dense<ValueType> *residual_norm_collection,
-                     const matrix::Dense<ValueType> *b_norm, size_type iter,
-                     const Array<stopping_status> *stop_status)
+                     size_type iter, const Array<stopping_status> *stop_status)
 {
     // TODO: tune block_size for optimal performance
     constexpr auto block_size = default_block_size;
@@ -587,21 +229,18 @@ void givens_rotation(std::shared_ptr<const CudaExecutor> exec,
         givens_cos->get_stride(), as_cuda_type(residual_norm->get_values()),
         as_cuda_type(residual_norm_collection->get_values()),
         residual_norm_collection->get_stride(),
-        as_cuda_type(b_norm->get_const_values()),
         as_cuda_type(stop_status->get_const_data()));
 }
 
 
 template <typename ValueType>
-void step_1(std::shared_ptr<const CudaExecutor> exec,
-            matrix::Dense<ValueType> *next_krylov_basis,
+void step_1(std::shared_ptr<const CudaExecutor> exec, size_type num_rows,
             matrix::Dense<ValueType> *givens_sin,
             matrix::Dense<ValueType> *givens_cos,
-            matrix::Dense<ValueType> *residual_norm,
+            matrix::Dense<remove_complex<ValueType>> *residual_norm,
             matrix::Dense<ValueType> *residual_norm_collection,
             matrix::Dense<ValueType> *krylov_bases,
-            matrix::Dense<ValueType> *hessenberg_iter,
-            const matrix::Dense<ValueType> *b_norm, size_type iter,
+            matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
             Array<size_type> *final_iter_nums,
             const Array<stopping_status> *stop_status)
 {
@@ -611,75 +250,15 @@ void step_1(std::shared_ptr<const CudaExecutor> exec,
         default_block_size>>>(as_cuda_type(final_iter_nums->get_data()),
                               as_cuda_type(stop_status->get_const_data()),
                               final_iter_nums->get_num_elems());
-    finish_arnoldi(exec, next_krylov_basis, krylov_bases, hessenberg_iter, iter,
+    finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter,
                    stop_status->get_const_data());
     givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
-                    residual_norm, residual_norm_collection, b_norm, iter,
-                    stop_status);
+                    residual_norm, residual_norm_collection, iter, stop_status);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL);
 
 
-// Must be called with at least `num_rhs` threads in total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void solve_upper_triangular_kernel(
-    size_type num_cols, size_type num_rhs,
-    const ValueType *__restrict__ residual_norm_collection,
-    size_type stride_residual_norm_collection,
-    const ValueType *__restrict__ hessenberg, size_type stride_hessenberg,
-    ValueType *__restrict__ y, size_type stride_y,
-    const size_type *__restrict__ final_iter_nums)
-{
-    const auto col_idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (col_idx >= num_rhs) {
-        return;
-    }
-
-    for (int i = final_iter_nums[col_idx] - 1; i >= 0; --i) {
-        auto temp =
-            residual_norm_collection[i * stride_residual_norm_collection +
-                                     col_idx];
-        for (size_type j = i + 1; j < final_iter_nums[col_idx]; ++j) {
-            temp -= hessenberg[i * stride_hessenberg + j * num_rhs + col_idx] *
-                    y[j * stride_y + col_idx];
-        }
-
-        y[i * stride_y + col_idx] =
-            temp / hessenberg[i * stride_hessenberg + i * num_rhs + col_idx];
-    }
-    // Solve upper triangular.
-    // y = hessenberg \ residual_norm_collection
-}
-
-
-// Must be called with at least `stride_preconditioner * num_rows` threads in
-// total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void calculate_Qy_kernel(
-    size_type num_rows, size_type num_cols, size_type num_rhs,
-    const ValueType *__restrict__ krylov_bases, size_type stride_krylov,
-    const ValueType *__restrict__ y, size_type stride_y,
-    ValueType *__restrict__ before_preconditioner,
-    size_type stride_preconditioner,
-    const size_type *__restrict__ final_iter_nums)
-{
-    const auto global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const auto row_id = global_id / stride_preconditioner;
-    const auto col_id = global_id % stride_preconditioner;
-
-    if (row_id < num_rows && col_id < num_cols) {
-        before_preconditioner[global_id] = zero<ValueType>();
-        for (size_type j = 0; j < final_iter_nums[col_id]; ++j) {
-            before_preconditioner[global_id] +=
-                krylov_bases[row_id * stride_krylov + j * num_rhs + col_id] *
-                y[j * stride_y + col_id];
-        }
-    }
-}
-
-
 template <typename ValueType>
 void solve_upper_triangular(
     const matrix::Dense<ValueType> *residual_norm_collection,
diff --git a/cuda/solver/ir_kernels.cu b/cuda/solver/ir_kernels.cu
index e1f0dbcdc46..7b26ab3527f 100644
--- a/cuda/solver/ir_kernels.cu
+++ b/cuda/solver/ir_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "cuda/components/thread_ids.cuh"
+
+
 namespace gko {
 namespace kernels {
 namespace cuda {
@@ -50,16 +53,7 @@ namespace ir {
 constexpr int default_block_size = 512;
 
 
-__global__ __launch_bounds__(default_block_size) void initialize_kernel(
-    size_type num_cols, stopping_status *stop_status)
-{
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-
-    if (tidx < num_cols) {
-        stop_status[tidx].reset();
-    }
-}
+#include "common/solver/ir_kernels.hpp.inc"
 
 
 void initialize(std::shared_ptr<const CudaExecutor> exec,
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 5f6549884d9..1cd2764d481 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,7 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/lower_trs.hpp>
 
 
-#include "core/solver/lower_trs_kernels.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index 083db4a94b2..0518b11bed7 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,7 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/upper_trs.hpp>
 
 
-#include "core/solver/upper_trs_kernels.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu
index a274bd95021..390f96cb2f2 100644
--- a/cuda/stop/criterion_kernels.cu
+++ b/cuda/stop/criterion_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -60,8 +61,7 @@ __global__ __launch_bounds__(default_block_size) void set_all_statuses(
     size_type num_elems, uint8 stoppingId, bool setFinalized,
     stopping_status *stop_status)
 {
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_elems) {
         stop_status[tidx].stop(stoppingId, setFinalized);
     }
diff --git a/cuda/stop/residual_norm_reduction_kernels.cu b/cuda/stop/residual_norm_kernels.cu
similarity index 63%
rename from cuda/stop/residual_norm_reduction_kernels.cu
rename to cuda/stop/residual_norm_kernels.cu
index 189f2269152..45f2c2336d5 100644
--- a/cuda/stop/residual_norm_reduction_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,44 +30,44 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/stop/residual_norm_reduction_kernels.hpp"
+#include "core/stop/residual_norm_kernels.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/thread_ids.cuh"
+
 
 namespace gko {
 namespace kernels {
 namespace cuda {
 /**
- * @brief The Residual norm reduction stopping criterion namespace.
+ * @brief The Residual norm stopping criterion namespace.
  * @ref resnorm
  * @ingroup resnorm
  */
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 
 constexpr int default_block_size = 512;
 
 
 template <typename ValueType>
-__global__
-    __launch_bounds__(default_block_size) void residual_norm_reduction_kernel(
-        size_type num_cols, remove_complex<ValueType> rel_residual_goal,
-        const ValueType *__restrict__ tau,
-        const ValueType *__restrict__ orig_tau, uint8 stoppingId,
-        bool setFinalized, stopping_status *__restrict__ stop_status,
-        bool *__restrict__ device_storage)
+__global__ __launch_bounds__(default_block_size) void residual_norm_kernel(
+    size_type num_cols, ValueType rel_residual_goal,
+    const ValueType *__restrict__ tau, const ValueType *__restrict__ orig_tau,
+    uint8 stoppingId, bool setFinalized,
+    stopping_status *__restrict__ stop_status,
+    bool *__restrict__ device_storage)
 {
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {
-        if (abs(tau[tidx]) < rel_residual_goal * abs(orig_tau[tidx])) {
+        if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) {
             stop_status[tidx].converge(stoppingId, setFinalized);
             device_storage[1] = true;
         }
@@ -89,21 +89,22 @@ __global__ __launch_bounds__(1) void init_kernel(
 
 
 template <typename ValueType>
-void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
-                             const matrix::Dense<ValueType> *tau,
-                             const matrix::Dense<ValueType> *orig_tau,
-                             remove_complex<ValueType> rel_residual_goal,
-                             uint8 stoppingId, bool setFinalized,
-                             Array<stopping_status> *stop_status,
-                             Array<bool> *device_storage, bool *all_converged,
-                             bool *one_changed)
+void residual_norm(std::shared_ptr<const CudaExecutor> exec,
+                   const matrix::Dense<ValueType> *tau,
+                   const matrix::Dense<ValueType> *orig_tau,
+                   ValueType rel_residual_goal, uint8 stoppingId,
+                   bool setFinalized, Array<stopping_status> *stop_status,
+                   Array<bool> *device_storage, bool *all_converged,
+                   bool *one_changed)
 {
+    static_assert(is_complex_s<ValueType>::value == false,
+                  "ValueType must not be complex in this function!");
     init_kernel<<<1, 1>>>(as_cuda_type(device_storage->get_data()));
 
     const dim3 block_size(default_block_size, 1, 1);
     const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1);
 
-    residual_norm_reduction_kernel<<<grid_size, block_size>>>(
+    residual_norm_kernel<<<grid_size, block_size>>>(
         tau->get_size()[1], rel_residual_goal,
         as_cuda_type(tau->get_const_values()),
         as_cuda_type(orig_tau->get_const_values()), stoppingId, setFinalized,
@@ -111,17 +112,15 @@ void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(device_storage->get_data()));
 
     /* Represents all_converged, one_changed */
-    bool tmp[2] = {true, false};
-    exec->get_master()->copy_from(exec.get(), 2,
-                                  device_storage->get_const_data(), tmp);
-    *all_converged = tmp[0];
-    *one_changed = tmp[1];
+    *all_converged = exec->copy_val_to_host(device_storage->get_const_data());
+    *one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+    GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/test/CMakeLists.txt b/cuda/test/CMakeLists.txt
index 44d4194c226..5b180f32c11 100644
--- a/cuda/test/CMakeLists.txt
+++ b/cuda/test/CMakeLists.txt
@@ -1,4 +1,7 @@
+include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake)
+
 add_subdirectory(base)
+add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(matrix)
 add_subdirectory(preconditioner)
diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu
index fe40d1ee3aa..2bcf5961bbd 100644
--- a/cuda/test/base/cuda_executor.cu
+++ b/cuda/test/base/cuda_executor.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include <memory>
 #include <type_traits>
 
 
@@ -43,25 +44,36 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "cuda/test/utils.hpp"
+
+
 namespace {
 
 
 class ExampleOperation : public gko::Operation {
 public:
     explicit ExampleOperation(int &val) : value(val) {}
+
     void run(std::shared_ptr<const gko::OmpExecutor>) const override
     {
         value = -1;
     }
-    void run(std::shared_ptr<const gko::CudaExecutor> cuda) const override
-    {
-        cudaGetDevice(&value);
-    }
+
     void run(std::shared_ptr<const gko::ReferenceExecutor>) const override
     {
         value = -2;
     }
 
+    void run(std::shared_ptr<const gko::HipExecutor>) const override
+    {
+        value = -3;
+    }
+
+    void run(std::shared_ptr<const gko::CudaExecutor>) const override
+    {
+        cudaGetDevice(&value);
+    }
+
     int &value;
 };
 
@@ -107,7 +119,10 @@ TEST_F(CudaExecutor, MasterKnowsNumberOfDevices)
 {
     int count = 0;
     cudaGetDeviceCount(&count);
-    ASSERT_EQ(count, gko::CudaExecutor::get_num_devices());
+
+    auto num_devices = gko::CudaExecutor::get_num_devices();
+
+    ASSERT_EQ(count, num_devices);
 }
 
 
@@ -175,6 +190,7 @@ TEST_F(CudaExecutor, CopiesDataFromCuda)
     cuda->free(orig);
 }
 
+
 /* Properly checks if it works only when multiple GPUs exist */
 TEST_F(CudaExecutor, PreservesDeviceSettings)
 {
@@ -190,14 +206,18 @@ TEST_F(CudaExecutor, PreservesDeviceSettings)
     ASSERT_EQ(current_device, previous_device);
 }
 
+
 TEST_F(CudaExecutor, RunsOnProperDevice)
 {
     int value = -1;
+
     GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(0));
     cuda2->run(ExampleOperation(value));
+
     ASSERT_EQ(value, cuda2->get_device_id());
 }
 
+
 TEST_F(CudaExecutor, CopiesDataFromCudaToCuda)
 {
     int copy[2];
@@ -215,15 +235,15 @@ TEST_F(CudaExecutor, CopiesDataFromCudaToCuda)
     GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(0));
     cuda2->run(ExampleOperation(value));
     ASSERT_EQ(value, cuda2->get_device_id());
-
+    // Put the results on OpenMP and run CPU side assertions
     omp->copy_from(cuda2.get(), 2, copy_cuda2, copy);
-
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
     cuda->free(copy_cuda2);
     cuda->free(orig);
 }
 
+
 TEST_F(CudaExecutor, Synchronizes)
 {
     // Todo design a proper unit test once we support streams
diff --git a/cuda/test/base/exception_helpers.cu b/cuda/test/base/exception_helpers.cu
index 2d9e95ddf9b..1652594803a 100644
--- a/cuda/test/base/exception_helpers.cu
+++ b/cuda/test/base/exception_helpers.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,14 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include <gtest/gtest.h>
-
-
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
 
 
+#include <gtest/gtest.h>
+
+
 namespace {
 
 
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index a4d84e2b736..08deb9a29d9 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -47,50 +47,66 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
+namespace kernel {
 
 
-template <typename T>
-__global__ void test_real_isfinite(bool *result)
+template <typename T, typename FuncType>
+__device__ bool test_real_is_finite_function(FuncType isfin)
 {
-    constexpr T inf = INFINITY;
+    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T quiet_nan = NAN;
     bool test_true{};
     bool test_false{};
 
-    test_true =
-        gko::isfinite(T{0}) && gko::isfinite(-T{0}) && gko::isfinite(T{1});
-    test_false = gko::isfinite(inf) || gko::isfinite(-inf) ||
-                 gko::isfinite(NAN) || gko::isfinite(inf - inf) ||
-                 gko::isfinite(inf / inf) || gko::isfinite(inf * T{2}) ||
-                 gko::isfinite(T{1} / T{0}) || gko::isfinite(T{0} / T{0});
-    *result = test_true && !test_false;
+    test_true = isfin(T{0}) && isfin(-T{0}) && isfin(T{1});
+    test_false = isfin(inf) || isfin(-inf) || isfin(quiet_nan) ||
+                 isfin(inf - inf) || isfin(inf / inf) || isfin(inf * T{2}) ||
+                 isfin(T{1} / T{0}) || isfin(T{0} / T{0});
+    return test_true && !test_false;
 }
 
 
-template <typename ComplexType>
-__global__ void test_complex_isfinite(bool *result)
+template <typename ComplexType, typename FuncType>
+__device__ bool test_complex_is_finite_function(FuncType isfin)
 {
     static_assert(gko::is_complex_s<ComplexType>::value,
                   "Template type must be a complex type.");
     using T = gko::remove_complex<ComplexType>;
     using c_type = gko::kernels::cuda::cuda_type<ComplexType>;
-    constexpr T inf = INFINITY;
+    constexpr T inf = gko::device_numeric_limits<T>::inf;
     constexpr T quiet_nan = NAN;
     bool test_true{};
     bool test_false{};
 
-    test_true = gko::isfinite(c_type{T{0}, T{0}}) &&
-                gko::isfinite(c_type{-T{0}, -T{0}}) &&
-                gko::isfinite(c_type{T{1}, T{0}}) &&
-                gko::isfinite(c_type{T{0}, T{1}});
-    test_false =
-        gko::isfinite(c_type{inf, T{0}}) || gko::isfinite(c_type{-inf, T{0}}) ||
-        gko::isfinite(c_type{quiet_nan, T{0}}) ||
-        gko::isfinite(c_type{T{0}, inf}) || gko::isfinite(c_type{T{0}, -inf}) ||
-        gko::isfinite(c_type{T{0}, quiet_nan});
-    *result = test_true && !test_false;
+    test_true = isfin(c_type{T{0}, T{0}}) && isfin(c_type{-T{0}, -T{0}}) &&
+                isfin(c_type{T{1}, T{0}}) && isfin(c_type{T{0}, T{1}});
+    test_false = isfin(c_type{inf, T{0}}) || isfin(c_type{-inf, T{0}}) ||
+                 isfin(c_type{quiet_nan, T{0}}) || isfin(c_type{T{0}, inf}) ||
+                 isfin(c_type{T{0}, -inf}) || isfin(c_type{T{0}, quiet_nan});
+    return test_true && !test_false;
+}
+
+
+}  // namespace kernel
+
+
+template <typename T>
+__global__ void test_real_is_finite(bool *result)
+{
+    *result = kernel::test_real_is_finite_function<T>(
+        [](T val) { return gko::is_finite(val); });
+}
+
+
+template <typename ComplexType>
+__global__ void test_complex_is_finite(bool *result)
+{
+    *result = kernel::test_complex_is_finite_function<ComplexType>(
+        [](ComplexType val) { return gko::is_finite(val); });
 }
 
 
@@ -102,19 +118,19 @@ protected:
     {}
 
     template <typename T>
-    bool test_real_isfinite_kernel()
+    bool test_real_is_finite_kernel()
     {
         gko::Array<bool> result(cuda, 1);
-        test_real_isfinite<T><<<1, 1>>>(result.get_data());
+        test_real_is_finite<T><<<1, 1>>>(result.get_data());
         result.set_executor(ref);
         return *result.get_data();
     }
 
     template <typename T>
-    bool test_complex_isfinite_kernel()
+    bool test_complex_is_finite_kernel()
     {
         gko::Array<bool> result(cuda, 1);
-        test_complex_isfinite<T><<<1, 1>>>(result.get_data());
+        test_complex_is_finite<T><<<1, 1>>>(result.get_data());
         result.set_executor(ref);
         return *result.get_data();
     }
@@ -124,21 +140,21 @@ protected:
 };
 
 
-TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_isfinite_kernel<float>()); }
+TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel<float>()); }
 
 
-TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_isfinite_kernel<double>()); }
+TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel<double>()); }
 
 
 TEST_F(IsFinite, FloatComplex)
 {
-    ASSERT_TRUE(test_complex_isfinite_kernel<thrust::complex<float>>());
+    ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<float>>());
 }
 
 
 TEST_F(IsFinite, DoubleComplex)
 {
-    ASSERT_TRUE(test_complex_isfinite_kernel<thrust::complex<double>>());
+    ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<double>>());
 }
 
 
diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..154a39e963e
--- /dev/null
+++ b/cuda/test/components/CMakeLists.txt
@@ -0,0 +1,7 @@
+ginkgo_create_cuda_test(cooperative_groups_kernels)
+ginkgo_create_cuda_test(merging_kernels)
+ginkgo_create_cuda_test(searching_kernels)
+ginkgo_create_cuda_test(sorting_kernels)
+ginkgo_create_test(fill_array)
+ginkgo_create_test(precision_conversion)
+ginkgo_create_test(prefix_sum)
diff --git a/cuda/test/components/cooperative_groups_kernels.cu b/cuda/test/components/cooperative_groups_kernels.cu
new file mode 100644
index 00000000000..e565a6c9952
--- /dev/null
+++ b/cuda/test/components/cooperative_groups_kernels.cu
@@ -0,0 +1,262 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/components/cooperative_groups.cuh"
+
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "cuda/base/config.hpp"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+using namespace gko::kernels::cuda;
+
+
+class CooperativeGroups : public ::testing::Test {  // kernel-check fixture
+protected:
+    CooperativeGroups()
+        : ref(gko::ReferenceExecutor::create()),
+          cuda(gko::CudaExecutor::create(0, ref)),  // device 0, host = ref
+          result(ref, 1),
+          dresult(cuda)
+    {
+        *result.get_data() = true;  // initial state: no check failed yet
+        dresult = result;           // copy the initial flag to the device
+    }
+
+    template <typename Kernel>
+    void test(Kernel kernel)  // launch on one full warp, expect no failure
+    {
+        kernel<<<1, config::warp_size>>>(dresult.get_data());
+        result = dresult;  // bring the flag back to the host
+        auto success = *result.get_const_data();
+
+        ASSERT_TRUE(success);
+    }
+
+    template <typename Kernel>
+    void test_subwarp(Kernel kernel)  // like test(), but half a warp
+    {
+        kernel<<<1, config::warp_size / 2>>>(dresult.get_data());
+        result = dresult;  // bring the flag back to the host
+        auto success = *result.get_const_data();
+
+        ASSERT_TRUE(success);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+    gko::Array<bool> result;   // host-side copy of the pass/fail flag
+    gko::Array<bool> dresult;  // device-side flag handed to the kernels
+};
+
+
+constexpr static int subwarp_size = config::warp_size / 4;  // quarter warp
+
+
+__device__ void test_assert(bool *success, bool partial)
+{
+    if (!partial) {  // a failed check clears the flag; passing ones leave it
+        *success = false;
+    }
+}
+
+
+__global__ void cg_shuffle(bool *s)  // full-warp shfl_up/shfl_down/shfl
+{
+    auto group =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto i = int(group.thread_rank());
+    test_assert(s, group.shfl_up(i, 1) == max(0, i - 1));  // lane 0: own value
+    test_assert(s, group.shfl_down(i, 1) == min(i + 1, config::warp_size - 1));
+    test_assert(s, group.shfl(i, 0) == 0);  // every lane reads lane 0's rank
+}
+
+TEST_F(CooperativeGroups, Shuffle) { test(cg_shuffle); }
+
+
+__global__ void cg_all(bool *s)  // full-warp all() vote
+{
+    auto group =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    test_assert(s, group.all(true));
+    test_assert(s, !group.all(false));
+    test_assert(s, !group.all(threadIdx.x < 13));  // partial vote must fail
+}
+
+TEST_F(CooperativeGroups, All) { test(cg_all); }
+
+
+__global__ void cg_any(bool *s)  // full-warp any() vote
+{
+    auto group =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    test_assert(s, group.any(true));
+    test_assert(s, group.any(threadIdx.x == 0));  // one true vote suffices
+    test_assert(s, !group.any(false));
+}
+
+TEST_F(CooperativeGroups, Any) { test(cg_any); }
+
+
+__global__ void cg_ballot(bool *s)  // full-warp ballot() bit mask
+{
+    auto group =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    test_assert(s, group.ballot(false) == 0);
+    test_assert(s, group.ballot(true) == ~config::lane_mask_type{});  // all set
+    test_assert(s, group.ballot(threadIdx.x < 4) == 0xf);  // bits 0-3 only
+}
+
+TEST_F(CooperativeGroups, Ballot) { test(cg_ballot); }
+
+
+__global__ void cg_subwarp_shuffle(bool *s)  // shuffles within 1/4-warp tiles
+{
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto i = int(group.thread_rank());
+    test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0));
+    test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1));
+    auto group_base = threadIdx.x / subwarp_size * subwarp_size;  // tile start
+    test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base);
+    if (threadIdx.x / subwarp_size == 1) {  // tile 1 vs. all other tiles:
+        test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0));
+        test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1));
+        test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base);
+    } else {  // same three checks, issued in a different order
+        test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1));
+        test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base);
+        test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0));
+    }
+}
+
+TEST_F(CooperativeGroups, SubwarpShuffle) { test(cg_subwarp_shuffle); }
+
+TEST_F(CooperativeGroups, SubwarpShuffle2) { test_subwarp(cg_subwarp_shuffle); }
+
+
+__global__ void cg_subwarp_all(bool *s)  // all() vote within 1/4-warp tiles
+{
+    auto grp = threadIdx.x / subwarp_size;  // index of this thread's tile
+    bool test_grp = grp == 1;
+    auto i = threadIdx.x % subwarp_size;  // position inside the tile
+    // only test with test_grp, the other threads run 'interference'
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    test_assert(s, !test_grp || group.all(test_grp));
+    test_assert(s, !test_grp || !group.all(!test_grp));
+    test_assert(s, !test_grp || !group.all(i < subwarp_size - 3 || !test_grp));
+    if (test_grp) {  // the branches below vote in different orders
+        test_assert(s, group.all(true));
+        test_assert(s, !group.all(false));
+        test_assert(s, !group.all(i < subwarp_size - 3));  // last 3 vote false
+    } else {
+        test_assert(s, !group.all(false));
+        test_assert(s, !group.all(i < subwarp_size - 3));
+        test_assert(s, group.all(true));
+    }
+}
+
+TEST_F(CooperativeGroups, SubwarpAll) { test(cg_subwarp_all); }
+
+TEST_F(CooperativeGroups, SubwarpAll2) { test_subwarp(cg_subwarp_all); }
+
+
+__global__ void cg_subwarp_any(bool *s)  // any() vote within 1/4-warp tiles
+{
+    auto grp = threadIdx.x / subwarp_size;  // index of this thread's tile
+    bool test_grp = grp == 1;
+    // only test with test_grp, the other threads run 'interference'
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto i = group.thread_rank();
+    test_assert(s, !test_grp || group.any(test_grp));
+    test_assert(s, !test_grp || group.any(test_grp && i == 1));
+    test_assert(s, !test_grp || !group.any(!test_grp));
+    if (test_grp) {  // the branches below vote in different orders
+        test_assert(s, group.any(true));
+        test_assert(s, group.any(i == 1));  // a single true vote suffices
+        test_assert(s, !group.any(false));
+    } else {
+        test_assert(s, !group.any(false));
+        test_assert(s, group.any(true));
+        test_assert(s, group.any(i == 1));
+    }
+}
+
+TEST_F(CooperativeGroups, SubwarpAny) { test(cg_subwarp_any); }
+
+TEST_F(CooperativeGroups, SubwarpAny2) { test_subwarp(cg_subwarp_any); }
+
+
+__global__ void cg_subwarp_ballot(bool *s)  // ballot() within 1/4-warp tiles
+{
+    auto grp = threadIdx.x / subwarp_size;  // index of this thread's tile
+    bool test_grp = grp == 1;
+    auto full_mask = (config::lane_mask_type{1} << subwarp_size) - 1;
+    // only test with test_grp, the other threads run 'interference'
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto i = group.thread_rank();
+    test_assert(s, !test_grp || group.ballot(!test_grp) == 0);
+    test_assert(s, !test_grp || group.ballot(test_grp) == full_mask);
+    test_assert(s, !test_grp || group.ballot(i < 4 || !test_grp) == 0xf);
+    if (test_grp) {  // the branches below vote in different orders
+        test_assert(s, group.ballot(false) == 0);
+        test_assert(s, group.ballot(true) == full_mask);  // low tile bits set
+        test_assert(s, group.ballot(i < 4) == 0xf);  // tile ranks 0-3 only
+    } else {
+        test_assert(s, group.ballot(true) == full_mask);
+        test_assert(s, group.ballot(i < 4) == 0xf);
+        test_assert(s, group.ballot(false) == 0);
+    }
+}
+
+TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); }
+
+TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); }
+
+
+}  // namespace
diff --git a/cuda/test/components/fill_array.cpp b/cuda/test/components/fill_array.cpp
new file mode 100644
index 00000000000..f5a1f8734f1
--- /dev/null
+++ b/cuda/test/components/fill_array.cpp
@@ -0,0 +1,82 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/fill_array.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils/assertions.hpp"
+
+
+namespace {
+
+
+class FillArray : public ::testing::Test {  // host reference + device array
+protected:
+    using value_type = double;
+    FillArray()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::CudaExecutor::create(0, ref)),  // device 0, host = ref
+          total_size(6344),
+          vals(ref, total_size),
+          dvals(exec, total_size)
+    {
+        std::fill_n(vals.get_data(), total_size, 1234.0);  // expected values
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> exec;
+    gko::size_type total_size;
+    gko::Array<value_type> vals;   // host array holding the expected result
+    gko::Array<value_type> dvals;  // device array the kernel has to fill
+};
+
+
+TEST_F(FillArray, EqualsReference)
+{
+    gko::kernels::cuda::components::fill_array(exec, dvals.get_data(),
+                                               total_size, 1234.0);
+    GKO_ASSERT_ARRAY_EQ(vals, dvals);  // device fill must match std::fill_n
+}
+
+
+}  // namespace
diff --git a/cuda/test/components/merging_kernels.cu b/cuda/test/components/merging_kernels.cu
new file mode 100644
index 00000000000..abd135b4d65
--- /dev/null
+++ b/cuda/test/components/merging_kernels.cu
@@ -0,0 +1,295 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/components/merging.cuh"
+
+
+#include <algorithm>
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+using namespace gko::kernels::cuda;
+using namespace cooperative_groups;
+
+
+class Merging : public ::testing::Test {  // data fixture for the merge kernels
+protected:
+    Merging()
+        : ref(gko::ReferenceExecutor::create()),
+          cuda(gko::CudaExecutor::create(0, ref)),
+          rng(123456),
+          rng_runs{100},
+          max_size{1637},
+          sizes{0,  1,  2,   3,   4,   10,  15,   16,
+                31, 34, 102, 242, 534, 956, 1239, 1637},
+          data1(ref, max_size),
+          data2(ref, max_size),
+          idxs1(ref),
+          idxs2(ref),
+          idxs3(ref),
+          refidxs1(ref),
+          refidxs2(ref),
+          refidxs3(ref),
+          outdata(ref, 2 * max_size),  // listed in member declaration order
+          refdata(ref, 2 * max_size),
+          ddata1(cuda),
+          ddata2(cuda),
+          didxs1(cuda, 2 * max_size),
+          didxs2(cuda, 2 * max_size),
+          didxs3(cuda, 2 * max_size),
+          drefidxs1(cuda, 2 * max_size),
+          drefidxs2(cuda, 2 * max_size),
+          drefidxs3(cuda, 2 * max_size),
+          doutdata(cuda, 2 * max_size)
+    {}
+
+    void init_data(int rng_run)  // builds sorted inputs and uploads them
+    {
+        std::uniform_int_distribution<gko::int32> dist(0, max_size);
+        std::fill_n(data1.get_data(), max_size, 0);
+        std::fill_n(data2.get_data(), max_size, 0);
+        for (int i = 0; i < max_size; ++i) {
+            // here we also want to test some corner cases
+            // first two runs: zero data1
+            if (rng_run > 1) data1.get_data()[i] = dist(rng);
+            // first and third run: zero data2
+            if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng);
+        }
+        std::sort(data1.get_data(), data1.get_data() + max_size);
+        std::sort(data2.get_data(), data2.get_data() + max_size);
+
+        ddata1 = data1;  // device copies read by the kernels
+        ddata2 = data2;
+    }
+
+    void assert_eq_ref(int size, int eq_size)  // compare with a host merge
+    {
+        outdata = doutdata;  // download the kernel output
+        auto out_ptr = outdata.get_const_data();
+        auto out_end = out_ptr + eq_size;
+        auto ref_ptr = refdata.get_data();
+        std::copy_n(data1.get_const_data(), size, ref_ptr);
+        std::copy_n(data2.get_const_data(), size, ref_ptr + size);
+        std::sort(ref_ptr, ref_ptr + 2 * size);  // sorted concat == merge
+
+        ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr));  // first eq_size
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+    std::default_random_engine rng;
+
+    int rng_runs;            // number of init_data() rounds per test
+    int max_size;            // length of data1 / data2
+    std::vector<int> sizes;  // input lengths exercised by the full merges
+    gko::Array<gko::int32> data1;
+    gko::Array<gko::int32> data2;
+    gko::Array<gko::int32> idxs1;
+    gko::Array<gko::int32> idxs2;
+    gko::Array<gko::int32> idxs3;
+    gko::Array<gko::int32> refidxs1;
+    gko::Array<gko::int32> refidxs2;
+    gko::Array<gko::int32> refidxs3;
+    gko::Array<gko::int32> outdata;
+    gko::Array<gko::int32> refdata;
+    gko::Array<gko::int32> ddata1;  // d* members live on the CUDA executor
+    gko::Array<gko::int32> ddata2;
+    gko::Array<gko::int32> didxs1;
+    gko::Array<gko::int32> didxs2;
+    gko::Array<gko::int32> didxs3;
+    gko::Array<gko::int32> drefidxs1;
+    gko::Array<gko::int32> drefidxs2;
+    gko::Array<gko::int32> drefidxs3;
+    gko::Array<gko::int32> doutdata;
+};
+
+
+__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b,
+                                gko::int32 *c)
+{  // one warp-wide merge step: lane i feeds a[i], b[i] and writes c[i]
+    auto warp = tiled_partition<config::warp_size>(this_thread_block());
+    auto i = warp.thread_rank();
+    auto result = group_merge_step<config::warp_size>(a[i], b[i], warp);
+    c[i] = min(result.a_val, result.b_val);  // smaller candidate is output i
+}
+
+TEST_F(Merging, MergeStep)
+{
+    for (int i = 0; i < rng_runs; ++i) {
+        init_data(i);  // fresh inputs, including the all-zero corner cases
+        test_merge_step<<<1, config::warp_size>>>(ddata1.get_const_data(),
+                                                  ddata2.get_const_data(),
+                                                  doutdata.get_data());
+
+        assert_eq_ref(config::warp_size, config::warp_size);  // first warp
+    }
+}
+
+
+__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size,
+                           gko::int32 *c)
+{  // merges a[0, size) and b[0, size) into c using one warp-sized tile
+    auto warp = tiled_partition<config::warp_size>(this_thread_block());
+    group_merge<config::warp_size>(a, size, b, size, warp,
+                                   [&](int a_idx, gko::int32 a_val, int b_idx,
+                                       gko::int32 b_val, int i, bool valid) {
+                                       if (valid) {  // only then write c[i]
+                                           c[i] = min(a_val, b_val);
+                                       }
+                                       return true;
+                                   });
+}
+
+TEST_F(Merging, FullMerge)
+{
+    for (int i = 0; i < rng_runs; ++i) {
+        init_data(i);
+        for (auto size : sizes) {  // same data, growing input prefixes
+            test_merge<<<1, config::warp_size>>>(ddata1.get_const_data(),
+                                                 ddata2.get_const_data(), size,
+                                                 doutdata.get_data());
+
+            assert_eq_ref(size, 2 * size);  // all 2 * size outputs compared
+        }
+    }
+}
+
+
+__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b,
+                                      int size, gko::int32 *c)
+{  // every launched thread merges a[0, size) and b[0, size) into c by itself
+    sequential_merge(
+        a, size, b, size,
+        [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) {
+            c[i] = min(a_val, b_val);  // smaller candidate is output i
+            return true;
+        });
+}
+
+TEST_F(Merging, SequentialFullMerge)
+{
+    for (int i = 0; i < rng_runs; ++i) {
+        init_data(i);
+        for (auto size : sizes) {  // same data, growing input prefixes
+            test_sequential_merge<<<1, 1>>>(ddata1.get_const_data(),
+                                            ddata2.get_const_data(), size,
+                                            doutdata.get_data());
+
+            assert_eq_ref(size, 2 * size);  // launched with a single thread
+        }
+    }
+}
+
+
+__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b,
+                                int size, gko::int32 *c, gko::int32 *aidxs,
+                                gko::int32 *bidxs, gko::int32 *cidxs,
+                                gko::int32 *refaidxs, gko::int32 *refbidxs,
+                                gko::int32 *refcidxs)
+{  // records the callback indices of group_merge and of sequential_merge
+    if (threadIdx.x == 0) {  // thread 0 alone writes the reference indices
+        sequential_merge(a, size, b, size,
+                         [&](int a_idx, gko::int32 a_val, int b_idx,
+                             gko::int32 b_val, int i) {
+                             refaidxs[i] = a_idx;
+                             refbidxs[i] = b_idx;
+                             refcidxs[i] = i;
+                             return true;
+                         });
+    }
+    auto warp = tiled_partition<config::warp_size>(this_thread_block());
+    group_merge<config::warp_size>(a, size, b, size, warp,
+                                   [&](int a_idx, gko::int32 a_val, int b_idx,
+                                       gko::int32 b_val, int i, bool valid) {
+                                       if (valid) {  // only then record i
+                                           aidxs[i] = a_idx;
+                                           bidxs[i] = b_idx;
+                                           cidxs[i] = i;
+                                           c[i] = min(a_val, b_val);
+                                       }
+                                       return true;
+                                   });
+}
+
+TEST_F(Merging, FullMergeIdxs)
+{
+    for (int i = 0; i < rng_runs; ++i) {
+        init_data(i);
+        for (auto size : sizes) {
+            test_merge_idxs<<<1, config::warp_size>>>(
+                ddata1.get_const_data(), ddata2.get_const_data(), size,
+                doutdata.get_data(), didxs1.get_data(), didxs2.get_data(),
+                didxs3.get_data(), drefidxs1.get_data(), drefidxs2.get_data(),
+                drefidxs3.get_data());
+
+            assert_eq_ref(size, 2 * size);  // merged values first
+            idxs1 = didxs1;                 // indices seen by group_merge
+            idxs2 = didxs2;
+            idxs3 = didxs3;
+            refidxs1 = drefidxs1;  // indices seen by sequential_merge
+            refidxs2 = drefidxs2;
+            refidxs3 = drefidxs3;
+            auto idxs1_ptr = idxs1.get_const_data();
+            auto idxs2_ptr = idxs2.get_const_data();
+            auto idxs3_ptr = idxs3.get_const_data();
+            auto refidxs1_ptr = refidxs1.get_const_data();
+            auto refidxs2_ptr = refidxs2.get_const_data();
+            auto refidxs3_ptr = refidxs3.get_const_data();
+
+            ASSERT_TRUE(  // only the first 2 * size entries were written
+                std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr));
+            ASSERT_TRUE(
+                std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr));
+            ASSERT_TRUE(
+                std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr));
+        }
+    }
+}
+
+
+}  // namespace
diff --git a/cuda/test/components/precision_conversion.cpp b/cuda/test/components/precision_conversion.cpp
new file mode 100644
index 00000000000..73751dbc1d9
--- /dev/null
+++ b/cuda/test/components/precision_conversion.cpp
@@ -0,0 +1,173 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+class PrecisionConversion : public ::testing::Test {  // conversion test data
+protected:
+    PrecisionConversion()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::CudaExecutor::create(0, ref)),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),  // initializers follow declaration order
+          dvals(exec),
+          vals2(ref, 1),
+          dvals2(exec),
+          expected_float(ref, 1),
+          expected_double(ref, 1),
+          cvals(ref, total_size),
+          dcvals(exec)
+    {
+        auto maxval = 1e10f;
+        std::uniform_real_distribution<float> dist(-maxval, maxval);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+            cvals.get_data()[i] = {dist(rand), dist(rand)};
+        }
+        dvals = vals;  // device copies of the random inputs
+        dcvals = cvals;
+        gko::uint64 rawdouble{0x4218888000889111ULL};  // input double bits
+        gko::uint32 rawfloat{0x50c44400UL};  // bits expected after narrowing
+        gko::uint64 rawrounded{0x4218888000000000ULL};  // float widened back
+        std::memcpy(vals2.get_data(), &rawdouble, sizeof(double));
+        std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float));
+        std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double));
+        dvals2 = vals2;
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<float> vals;    // random real inputs (host)
+    gko::Array<float> dvals;   // same values on the CUDA executor
+    gko::Array<double> vals2;  // single value used by ConversionRounds
+    gko::Array<double> dvals2;
+    gko::Array<float> expected_float;
+    gko::Array<double> expected_double;
+    gko::Array<std::complex<float>> cvals;  // random complex inputs (host)
+    gko::Array<std::complex<float>> dcvals;
+};
+
+
+TEST_F(PrecisionConversion, ConvertsReal)
+{
+    gko::Array<double> dtmp;
+    gko::Array<float> dout;
+
+    dtmp = dvals;  // float -> double
+    dout = dtmp;   // double -> float
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);  // the round trip must be lossless
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealViaRef)
+{
+    gko::Array<double> tmp{ref};  // intermediate array on the host executor
+    gko::Array<float> dout;
+
+    tmp = dvals;  // device float -> host double
+    dout = tmp;   // host double -> float
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);  // the round trip must be lossless
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplex)
+{
+    gko::Array<std::complex<double>> dtmp;
+    gko::Array<std::complex<float>> dout;
+
+    dtmp = dcvals;  // complex<float> -> complex<double>
+    dout = dtmp;    // complex<double> -> complex<float>
+
+    GKO_ASSERT_ARRAY_EQ(dcvals, dout);  // the round trip must be lossless
+}
+
+
+TEST_F(PrecisionConversion, ConversionRounds)
+{
+    gko::Array<float> dtmp;
+    gko::Array<double> dout;
+
+    dtmp = dvals2;  // double -> float: low mantissa bits cannot be kept
+    dout = dtmp;    // float -> double: widening the rounded value
+
+    GKO_ASSERT_ARRAY_EQ(dtmp, expected_float);
+    GKO_ASSERT_ARRAY_EQ(dout, expected_double);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealFromRef)
+{
+    gko::Array<double> dtmp;
+    gko::Array<float> dout;
+
+    dtmp = vals;  // source is the host array: float -> double
+    dout = dtmp;  // double -> float
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);  // must equal the device copy of vals
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplexFromRef)
+{
+    gko::Array<std::complex<double>> dtmp;
+    gko::Array<std::complex<float>> dout;
+
+    dtmp = cvals;  // source is the host array: complex<float> -> <double>
+    dout = dtmp;   // complex<double> -> complex<float>
+
+    GKO_ASSERT_ARRAY_EQ(dcvals, dout);  // must equal the device copy of cvals
+}
+
+
+}  // namespace
diff --git a/cuda/test/components/prefix_sum.cpp b/cuda/test/components/prefix_sum.cpp
new file mode 100644
index 00000000000..6c3ad82f21e
--- /dev/null
+++ b/cuda/test/components/prefix_sum.cpp
@@ -0,0 +1,96 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+class PrefixSum : public ::testing::Test {  // random input on host and device
+protected:
+    using index_type = gko::int32;
+    PrefixSum()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::CudaExecutor::create(0, ref)),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec)
+    {
+        std::uniform_int_distribution<index_type> dist(0, 1000);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+        }
+        dvals = vals;  // identical starting data on the device
+    }
+
+    void test(gko::size_type size)  // scan the first `size` entries on both
+    {
+        gko::kernels::reference::components::prefix_sum(ref, vals.get_data(),
+                                                        size);
+        gko::kernels::cuda::components::prefix_sum(exec, dvals.get_data(),
+                                                   size);
+
+        GKO_ASSERT_ARRAY_EQ(vals, dvals);  // compares the complete arrays
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<index_type> vals;   // host data, scanned by the reference
+    gko::Array<index_type> dvals;  // device data, scanned by the CUDA kernel
+};
+
+
+TEST_F(PrefixSum, SmallEqualsReference) { test(100); }  // first 100 only
+
+
+TEST_F(PrefixSum, BigEqualsReference) { test(total_size); }  // all entries
+
+
+}  // namespace
diff --git a/cuda/test/components/searching_kernels.cu b/cuda/test/components/searching_kernels.cu
new file mode 100644
index 00000000000..d4f92099f4a
--- /dev/null
+++ b/cuda/test/components/searching_kernels.cu
@@ -0,0 +1,246 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/components/searching.cuh"
+
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+using namespace gko::kernels::cuda;
+using cooperative_groups::this_thread_block;
+
+
+class Searching : public ::testing::Test {  // device-side search helper tests
+protected:
+    Searching()
+        : ref(gko::ReferenceExecutor::create()),
+          cuda(gko::CudaExecutor::create(0, ref)),
+          result(ref, 1),
+          dresult(cuda),
+          sizes(14203)
+    {
+        std::iota(sizes.begin(), sizes.end(), 0);  // sizes = 0, 1, ..., 14202
+    }
+
+    template <typename Kernel>  // kernel(bool *success, int offset, int size)
+    void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1)
+    {
+        *result.get_data() = true;  // the flag starts out as "success"
+        dresult = result;
+        kernel<<<num_blocks, config::warp_size>>>(dresult.get_data(), offset,
+                                                  size);
+        ASSERT_EQ(cudaGetLastError(), cudaSuccess);  // else flag stays true
+        result = dresult;
+
+        ASSERT_TRUE(*result.get_const_data());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+    gko::Array<bool> result;   // host copy of the success flag
+    gko::Array<bool> dresult;  // device copy handed to the kernels
+    std::vector<int> sizes;
+};
+
+
+__device__ void test_assert(bool *success, bool predicate)
+{  // records a failed device-side check in *success
+    if (!predicate) {
+        *success = false;  // only ever cleared here, never set back to true
+    }
+}
+
+
+__global__ void test_binary_search(bool *success, int offset, int size)
+{
+    // test binary search on [offset, offset + size) for all partition points
+    const auto partition = threadIdx.x + offset;
+    auto result = binary_search(offset, size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= partition;
+    });
+    auto result2 = binary_search(offset, size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= partition + 1;
+    });
+    test_assert(success, result == partition);
+    test_assert(success, result2 == partition + 1);
+}
+
+TEST_F(Searching, BinaryNoOffset)
+{
+    run_test(test_binary_search, 0, config::warp_size);  // range starts at 0
+}
+
+TEST_F(Searching, BinaryOffset)
+{
+    run_test(test_binary_search, 5, config::warp_size);  // range starts at 5
+}
+
+
+__global__ void test_empty_binary_search(bool *success, int offset, int)
+{
+    auto result = binary_search(offset, 0, [&](int i) {
+        // the range is empty, so any call of the predicate is a failure
+        test_assert(success, false);
+        return false;
+    });
+    test_assert(success, result == offset);  // expects the range start back
+}
+
+TEST_F(Searching, BinaryEmptyNoOffset)
+{
+    run_test(test_empty_binary_search, 0, 0);  // kernel ignores the size
+}
+
+TEST_F(Searching, BinaryEmptyOffset)
+{
+    run_test(test_empty_binary_search, 5, 0);  // kernel ignores the size
+}
+
+
+__global__ void test_sync_binary_search(bool *success, int, int size)
+{
+    // test synchronous binary search on [0, size) for all partition points
+    const auto partition = threadIdx.x;
+    auto result = synchronous_binary_search(size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= 0 && i < size);
+        return i >= partition;
+    });
+    auto result2 = synchronous_binary_search(size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= 0 && i < size);
+        return i >= partition + 1;
+    });
+    test_assert(success, result == partition);
+    test_assert(success, result2 == partition + 1);
+}
+
+TEST_F(Searching, SyncBinary)
+{
+    run_test(test_sync_binary_search, 0, config::warp_size);  // offset unused
+}
+
+
+__global__ void test_empty_sync_binary_search(bool *success, int, int)
+{
+    auto result = synchronous_binary_search(0, [&](int i) {
+        // the range is empty, so any call of the predicate is a failure
+        test_assert(success, false);
+        return false;
+    });
+    test_assert(success, result == 0);  // expects 0 for the empty range
+}
+
+TEST_F(Searching, EmptySyncBinary)
+{
+    run_test(test_empty_sync_binary_search, 0, config::warp_size);  // unused
+}
+
+
+__global__ void test_warp_ary_search(bool *success, int offset, int size)
+{
+    // test group_ary_search on [offset, offset + size), one block per point
+    const auto partition = blockIdx.x + offset;
+    auto warp = group::tiled_partition<config::warp_size>(this_thread_block());
+    auto result = group_ary_search(offset, size, warp, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= partition;
+    });
+    test_assert(success, result == partition);
+}
+
+TEST_F(Searching, WarpAryNoOffset)
+{
+    for (const auto num_items : sizes) {  // one block per partition point
+        run_test(test_warp_ary_search, 0, num_items, num_items + 1);
+    }
+}
+
+TEST_F(Searching, WarpAryOffset)
+{
+    for (const auto num_items : sizes) {  // one block per partition point
+        run_test(test_warp_ary_search, 134, num_items, num_items + 1);
+    }
+}
+
+
+__global__ void test_warp_wide_search(bool *success, int offset, int size)
+{
+    // test group_wide_search on [offset, offset + size), one block per point
+    const auto partition = blockIdx.x + offset;
+    auto warp = group::tiled_partition<config::warp_size>(this_thread_block());
+    auto result = group_wide_search(offset, size, warp, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= partition;
+    });
+    test_assert(success, result == partition);
+}
+
+TEST_F(Searching, WarpWideNoOffset)
+{
+    for (const auto num_items : sizes) {  // one block per partition point
+        run_test(test_warp_wide_search, 0, num_items, num_items + 1);
+    }
+}
+
+TEST_F(Searching, WarpWideOffset)
+{
+    for (const auto num_items : sizes) {  // one block per partition point
+        run_test(test_warp_wide_search, 142, num_items, num_items + 1);
+    }
+}
+
+
+}  // namespace
diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu
new file mode 100644
index 00000000000..cc50281177b
--- /dev/null
+++ b/cuda/test/components/sorting_kernels.cu
@@ -0,0 +1,144 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/components/sorting.cuh"
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+using gko::kernels::cuda::bitonic_sort;
+using gko::kernels::cuda::config;
+
+
+constexpr auto num_elements = 2048;  // length of the test array
+constexpr auto num_local = 4;        // elements each thread keeps in `local`
+constexpr auto num_threads = num_elements / num_local;  // block size used
+
+
+__global__ void test_sort_shared(gko::int32 *data)
+{  // runs bitonic_sort over num_elements entries of `data`, in place
+    gko::int32 local[num_local];
+    __shared__ gko::int32 sh_local[num_elements];  // passed to bitonic_sort
+    for (int i = 0; i < num_local; ++i) {
+        local[i] = data[threadIdx.x * num_local + i];  // consecutive chunk
+    }
+    bitonic_sort<num_elements, num_local>(local, sh_local);
+    for (int i = 0; i < num_local; ++i) {
+        data[threadIdx.x * num_local + i] = local[i];  // write chunk back
+    }
+}
+
+
+__global__ void test_sort_warp(gko::int32 *data)
+{  // runs bitonic_sort over warp_size * num_local entries of `data`
+    gko::int32 local[num_local];
+    for (int i = 0; i < num_local; ++i) {
+        local[i] = data[threadIdx.x * num_local + i];  // consecutive chunk
+    }
+    bitonic_sort<config::warp_size * num_local, num_local>(
+        local, static_cast<gko::int32 *>(nullptr));  // no shared buffer given
+    for (int i = 0; i < num_local; ++i) {
+        data[threadIdx.x * num_local + i] = local[i];  // write chunk back
+    }
+}
+
+
+class Sorting : public ::testing::Test {  // expected results for bitonic_sort
+protected:
+    Sorting()
+        : ref(gko::ReferenceExecutor::create()),
+          cuda(gko::CudaExecutor::create(0, ref)),
+          rng(123456),
+          ref_shared(ref, num_elements),
+          ref_warp(ref),
+          ddata(cuda)
+    {
+        // we want some duplicate elements
+        std::uniform_int_distribution<gko::int32> dist(0, num_elements / 2);
+        for (auto i = 0; i < num_elements; ++i) {
+            ref_shared.get_data()[i] = dist(rng);
+        }
+        ddata = gko::Array<gko::int32>{cuda, ref_shared};  // still unsorted
+        ref_warp = ref_shared;  // copied before either std::sort runs
+        std::sort(ref_shared.get_data(), ref_shared.get_data() + num_elements);
+        std::sort(ref_warp.get_data(),
+                  ref_warp.get_data() + (config::warp_size * num_local));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+    std::default_random_engine rng;
+    gko::Array<gko::int32> ref_shared;  // input with all entries sorted
+    gko::Array<gko::int32> ref_warp;    // input with only a prefix sorted
+    gko::Array<gko::int32> ddata;       // unsorted input on the device
+};
+
+
+TEST_F(Sorting, CudaBitonicSortWarp)
+{
+    const auto sorted_count = num_local * config::warp_size;
+    test_sort_warp<<<1, config::warp_size>>>(ddata.get_data());
+    ddata.set_executor(ref);  // bring the result back to the host
+    auto sorted = ddata.get_const_data();
+    auto expected = ref_warp.get_const_data();
+
+    ASSERT_TRUE(std::equal(sorted, sorted + sorted_count, expected));
+}
+
+
+TEST_F(Sorting, CudaBitonicSortShared)
+{
+    test_sort_shared<<<1, num_threads>>>(ddata.get_data());
+    ddata.set_executor(ref);  // bring the result back to the host
+    auto sorted = ddata.get_const_data();
+    auto expected = ref_shared.get_const_data();
+
+    ASSERT_TRUE(std::equal(sorted, sorted + num_elements, expected));
+}
+
+
+}  // namespace
diff --git a/cuda/test/factorization/CMakeLists.txt b/cuda/test/factorization/CMakeLists.txt
index 36c21b93eea..5b494bf99b9 100644
--- a/cuda/test/factorization/CMakeLists.txt
+++ b/cuda/test/factorization/CMakeLists.txt
@@ -1 +1,4 @@
+ginkgo_create_test(ilu_kernels)
+ginkgo_create_test(par_ict_kernels)
 ginkgo_create_test(par_ilu_kernels)
+ginkgo_create_test(par_ilut_kernels)
diff --git a/cuda/test/factorization/ilu_kernels.cpp b/cuda/test/factorization/ilu_kernels.cpp
new file mode 100644
index 00000000000..4c1d356b0d0
--- /dev/null
+++ b/cuda/test/factorization/ilu_kernels.cpp
@@ -0,0 +1,121 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/ilu.hpp>
+
+
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/par_ilu.hpp>
+
+
+#include "cuda/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+class Ilu : public ::testing::Test {  // shared input for the CUDA Ilu tests
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+    std::shared_ptr<Csr> csr_ref;   // matrix read on the reference executor
+    std::shared_ptr<Csr> csr_cuda;  // copy of csr_ref on the CUDA executor
+
+    Ilu()
+        : ref(gko::ReferenceExecutor::create()),
+          cuda(gko::CudaExecutor::create(0, ref))
+    {}
+
+    void SetUp() override
+    {
+        const std::string mtx_path(gko::matrices::location_ani4_mtx);
+        std::ifstream mtx_stream(mtx_path, std::ios::in);
+        if (!mtx_stream) {
+            FAIL() << "Could not find the file \"" << mtx_path
+                   << "\", which is required for this test.\n";
+        }
+        csr_ref = gko::read<Csr>(mtx_stream, ref);
+        csr_cuda = Csr::create(cuda);
+        csr_cuda->copy_from(gko::lend(csr_ref));
+    }
+};
+
+
+TEST_F(Ilu, ComputeILUIsEquivalentToRef)
+{
+    auto ref_fact =  // NOTE(review): reference side is ParIlu - confirm intent
+        gko::factorization::ParIlu<>::build().on(ref)->generate(csr_ref);
+    auto cuda_fact =
+        gko::factorization::Ilu<>::build().on(cuda)->generate(csr_cuda);
+
+    GKO_ASSERT_MTX_NEAR(ref_fact->get_l_factor(), cuda_fact->get_l_factor(),
+                        1e-14);
+    GKO_ASSERT_MTX_NEAR(ref_fact->get_u_factor(), cuda_fact->get_u_factor(),
+                        1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_l_factor(),
+                               cuda_fact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_u_factor(),
+                               cuda_fact->get_u_factor());
+}
+
+
+TEST_F(Ilu, SetsCorrectStrategy)
+{  // the strategies given to the builder must show up on the L and U factors
+    auto cuda_fact =
+        gko::factorization::Ilu<>::build()
+            .with_l_strategy(std::make_shared<Csr::merge_path>())
+            .with_u_strategy(std::make_shared<Csr::load_balance>(cuda))
+            .on(cuda)
+            ->generate(csr_cuda);
+
+    ASSERT_EQ(cuda_fact->get_l_factor()->get_strategy()->get_name(),
+              "merge_path");
+    ASSERT_EQ(cuda_fact->get_u_factor()->get_strategy()->get_name(),
+              "load_balance");
+}
+
+
+}  // namespace
diff --git a/cuda/test/factorization/par_ict_kernels.cpp b/cuda/test/factorization/par_ict_kernels.cpp
new file mode 100644
index 00000000000..f052ac4bc85
--- /dev/null
+++ b/cuda/test/factorization/par_ict_kernels.cpp
@@ -0,0 +1,177 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "cuda/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+class ParIct : public ::testing::Test {  // inputs for the ParICT kernel tests
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    ParIct()  // initializers are listed in member declaration order
+        : ref(gko::ReferenceExecutor::create()),
+          cuda(gko::CudaExecutor::create(0, ref)),
+          mtx_size(436, 436),
+          rand_engine(45856)
+    {
+        mtx = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<>(10, mtx_size[1]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+
+        dmtx_ani = Csr::create(cuda);
+        dmtx_l_ani = Csr::create(cuda);
+        dmtx = Csr::create(cuda);
+        dmtx->copy_from(lend(mtx));
+        dmtx_l = Csr::create(cuda);
+        dmtx_l->copy_from(lend(mtx_l));
+    }
+
+    void SetUp() override
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        mtx_ani = gko::read<Csr>(input_file, ref);
+        mtx_ani->sort_by_column_index();
+
+        {  // build mtx_l_ani from mtx_ani with the reference kernels
+            mtx_l_ani = Csr::create(ref, mtx_ani->get_size());
+            gko::matrix::CsrBuilder<value_type, index_type> l_builder(
+                lend(mtx_l_ani));
+            gko::kernels::reference::factorization::initialize_row_ptrs_l(
+                ref, lend(mtx_ani), mtx_l_ani->get_row_ptrs());
+            auto l_nnz =
+                mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]];
+            l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+            l_builder.get_value_array().resize_and_reset(l_nnz);
+            gko::kernels::reference::factorization::initialize_l(
+                ref, lend(mtx_ani), lend(mtx_l_ani), true);
+        }
+        dmtx_ani->copy_from(lend(mtx_ani));
+        dmtx_l_ani->copy_from(lend(mtx_l_ani));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+
+    const gko::dim<2> mtx_size;
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx;        // random matrix on `ref`
+    std::unique_ptr<Csr> mtx_ani;    // matrix read from location_ani4_mtx
+    std::unique_ptr<Csr> mtx_l_ani;  // L factor initialized from mtx_ani
+    std::unique_ptr<Csr> mtx_l;      // random lower triangular matrix
+
+    std::unique_ptr<Csr> dmtx;  // d* members: copies on the CUDA executor
+    std::unique_ptr<Csr> dmtx_ani;
+    std::unique_ptr<Csr> dmtx_l_ani;
+    std::unique_ptr<Csr> dmtx_l;
+};
+
+
+TEST_F(ParIct, KernelAddCandidatesIsEquivalentToRef)
+{
+    auto mtx_llt = Csr::create(ref, mtx_size);
+    mtx_l->apply(lend(mtx_l->transpose()), lend(mtx_llt));  // L * L^T
+    auto dmtx_llt = Csr::create(cuda, mtx_size);
+    dmtx_llt->copy_from(lend(mtx_llt));  // device copy of the same product
+    auto res_mtx_l = Csr::create(ref, mtx_size);
+    auto dres_mtx_l = Csr::create(cuda, mtx_size);
+
+    gko::kernels::reference::par_ict_factorization::add_candidates(
+        ref, lend(mtx_llt), lend(mtx), lend(mtx_l), lend(res_mtx_l));
+    gko::kernels::cuda::par_ict_factorization::add_candidates(
+        cuda, lend(dmtx_llt), lend(dmtx), lend(dmtx_l), lend(dres_mtx_l));
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14);
+}
+
+
+TEST_F(ParIct, KernelComputeFactorIsEquivalentToRef)
+{
+    auto square_size = mtx_ani->get_size();
+    auto mtx_l_coo = Coo::create(ref, square_size);
+    mtx_l_ani->convert_to(lend(mtx_l_coo));  // COO view of the initial L
+    auto dmtx_l_coo = Coo::create(cuda, square_size);
+    dmtx_l_coo->copy_from(lend(mtx_l_coo));
+
+    gko::kernels::reference::par_ict_factorization::compute_factor(
+        ref, lend(mtx_ani), lend(mtx_l_ani), lend(mtx_l_coo));
+    for (int i = 0; i < 20; ++i) {  // NOTE(review): 20 runs vs. 1 on ref?
+        gko::kernels::cuda::par_ict_factorization::compute_factor(
+            cuda, lend(dmtx_ani), lend(dmtx_l_ani), lend(dmtx_l_coo));
+    }
+
+    GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2);
+}
+
+
+}  // namespace
diff --git a/cuda/test/factorization/par_ilu_kernels.cpp b/cuda/test/factorization/par_ilu_kernels.cpp
index ee28409223c..f3ae4150924 100644
--- a/cuda/test/factorization/par_ilu_kernels.cpp
+++ b/cuda/test/factorization/par_ilu_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 #include <fstream>
 #include <memory>
+#include <random>
 #include <string>
 
 
@@ -49,7 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils.hpp"
+#include "core/factorization/factorization_kernels.hpp"
+#include "cuda/test/utils.hpp"
 #include "matrices/config.hpp"
 
 
@@ -64,8 +66,15 @@ class ParIlu : public ::testing::Test {
     using Coo = gko::matrix::Coo<value_type, index_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
 
+    std::ranlux48 rand_engine;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+    std::shared_ptr<const Csr> csr_ref;
+    std::shared_ptr<const Csr> csr_cuda;
+
     ParIlu()
-        : ref(gko::ReferenceExecutor::create()),
+        : rand_engine(18),
+          ref(gko::ReferenceExecutor::create()),
           cuda(gko::CudaExecutor::create(0, ref)),
           csr_ref(nullptr),
           csr_cuda(nullptr)
@@ -79,25 +88,61 @@ class ParIlu : public ::testing::Test {
             FAIL() << "Could not find the file \"" << file_name
                    << "\", which is required for this test.\n";
         }
-        csr_ref = gko::read<Csr>(input_file, ref);
+        auto csr_ref_temp = gko::read<Csr>(input_file, ref);
         auto csr_cuda_temp = Csr::create(cuda);
-        csr_cuda_temp->copy_from(gko::lend(csr_ref));
+        csr_cuda_temp->copy_from(gko::lend(csr_ref_temp));
+        // Make sure there are diagonal elements present
+        gko::kernels::reference::factorization::add_diagonal_elements(
+            ref, gko::lend(csr_ref_temp), false);
+        gko::kernels::cuda::factorization::add_diagonal_elements(
+            cuda, gko::lend(csr_cuda_temp), false);
+        csr_ref = gko::give(csr_ref_temp);
         csr_cuda = gko::give(csr_cuda_temp);
     }
 
-    std::shared_ptr<gko::ReferenceExecutor> ref;
-    std::shared_ptr<gko::CudaExecutor> cuda;
-    std::shared_ptr<const Csr> csr_ref;
-    std::shared_ptr<const Csr> csr_cuda;
+    template <typename Mtx>
+    std::unique_ptr<Mtx> gen_mtx(index_type num_rows, index_type num_cols)
+    {  // random num_rows x num_cols matrix of type Mtx created on `ref`
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<index_type>(0, num_cols - 1),
+            std::normal_distribution<value_type>(0.0, 1.0), rand_engine, ref);
+    }
+
+    std::unique_ptr<Csr> gen_unsorted_mtx(index_type num_rows,
+                                          index_type num_cols)
+    {  // random CSR matrix whose entries are shuffled within each row
+        auto mtx = gen_mtx<Csr>(num_rows, num_cols);
+        auto values = mtx->get_values();
+        auto col_idxs = mtx->get_col_idxs();
+        const auto row_ptrs = mtx->get_const_row_ptrs();
+        for (int row = 0; row < num_rows; ++row) {
+            const auto row_start = row_ptrs[row];
+            const auto row_end = row_ptrs[row + 1];
+            const int num_row_elements = row_end - row_start;
+            // clamp: an empty row would request the invalid range [a, a - 1]
+            auto idx_dist = std::uniform_int_distribution<index_type>(
+                row_start, std::max(row_start, row_end - 1));
+            for (int i = 0; i < num_row_elements / 2; ++i) {
+                auto idx1 = idx_dist(rand_engine);
+                auto idx2 = idx_dist(rand_engine);
+                if (idx1 != idx2) {  // swap value and column index together
+                    std::swap(values[idx1], values[idx2]);
+                    std::swap(col_idxs[idx1], col_idxs[idx2]);
+                }
+            }
+        }
+        return mtx;
+    }
 
     void initialize_row_ptrs(index_type *l_row_ptrs_ref,
                              index_type *u_row_ptrs_ref,
                              index_type *l_row_ptrs_cuda,
                              index_type *u_row_ptrs_cuda)
     {
-        gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u(
+        gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
             ref, gko::lend(csr_ref), l_row_ptrs_ref, u_row_ptrs_ref);
-        gko::kernels::cuda::par_ilu_factorization::initialize_row_ptrs_l_u(
+        gko::kernels::cuda::factorization::initialize_row_ptrs_l_u(
             cuda, gko::lend(csr_cuda), l_row_ptrs_cuda, u_row_ptrs_cuda);
     }
 
@@ -124,18 +169,18 @@ class ParIlu : public ::testing::Test {
         *l_cuda = Csr::create(cuda, csr_cuda->get_size(), l_nnz);
         *u_cuda = Csr::create(cuda, csr_cuda->get_size(), u_nnz);
         // Copy the already initialized `row_ptrs` to the new matrices
-        ref->copy_from(gko::lend(ref), num_row_ptrs, l_row_ptrs_ref.get_data(),
-                       (*l_ref)->get_row_ptrs());
-        ref->copy_from(gko::lend(ref), num_row_ptrs, u_row_ptrs_ref.get_data(),
-                       (*u_ref)->get_row_ptrs());
-        cuda->copy_from(gko::lend(cuda), num_row_ptrs,
-                        l_row_ptrs_cuda.get_data(), (*l_cuda)->get_row_ptrs());
-        cuda->copy_from(gko::lend(cuda), num_row_ptrs,
-                        u_row_ptrs_cuda.get_data(), (*u_cuda)->get_row_ptrs());
-
-        gko::kernels::reference::par_ilu_factorization::initialize_l_u(
+        ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(),
+                  (*l_ref)->get_row_ptrs());
+        ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(),
+                  (*u_ref)->get_row_ptrs());
+        cuda->copy(num_row_ptrs, l_row_ptrs_cuda.get_data(),
+                   (*l_cuda)->get_row_ptrs());
+        cuda->copy(num_row_ptrs, u_row_ptrs_cuda.get_data(),
+                   (*u_cuda)->get_row_ptrs());
+
+        gko::kernels::reference::factorization::initialize_l_u(
             ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref));
-        gko::kernels::cuda::par_ilu_factorization::initialize_l_u(
+        gko::kernels::cuda::factorization::initialize_l_u(
             cuda, gko::lend(csr_cuda), gko::lend(*l_cuda), gko::lend(*u_cuda));
     }
 
@@ -176,6 +221,63 @@ class ParIlu : public ::testing::Test {
 };
 
 
+TEST_F(ParIlu, CudaKernelAddDiagonalElementsSortedEquivalentToRef)
+{
+    index_type num_rows{600};
+    index_type num_cols{600};
+    auto mtx_ref = gen_mtx<Csr>(num_rows, num_cols);
+    auto mtx_cuda = Csr::create(cuda);
+    mtx_cuda->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        ref, gko::lend(mtx_ref), true);  // presumably `is_sorted` - confirm
+    gko::kernels::cuda::factorization::add_diagonal_elements(
+        cuda, gko::lend(mtx_cuda), true);
+
+    ASSERT_TRUE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_cuda, 0.);  // exact match required
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_cuda);
+}
+
+
+TEST_F(ParIlu, CudaKernelAddDiagonalElementsUnsortedEquivalentToRef)
+{
+    index_type num_rows{600};
+    index_type num_cols{600};
+    auto mtx_ref = gen_unsorted_mtx(num_rows, num_cols);
+    auto mtx_cuda = Csr::create(cuda);
+    mtx_cuda->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        ref, gko::lend(mtx_ref), false);  // presumably `is_sorted` - confirm
+    gko::kernels::cuda::factorization::add_diagonal_elements(
+        cuda, gko::lend(mtx_cuda), false);
+
+    ASSERT_FALSE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_cuda, 0.);  // exact match required
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_cuda);
+}
+
+
+TEST_F(ParIlu, CudaKernelAddDiagonalElementsNonSquareEquivalentToRef)
+{
+    index_type num_rows{600};
+    index_type num_cols{500};  // fewer columns than rows
+    auto mtx_ref = gen_mtx<Csr>(num_rows, num_cols);
+    auto mtx_cuda = Csr::create(cuda);
+    mtx_cuda->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        ref, gko::lend(mtx_ref), true);  // presumably `is_sorted` - confirm
+    gko::kernels::cuda::factorization::add_diagonal_elements(
+        cuda, gko::lend(mtx_cuda), true);
+
+    ASSERT_TRUE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_cuda, 0.);  // exact match required
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_cuda);
+}
+
+
 TEST_F(ParIlu, KernelInitializeRowPtrsLUEquivalentToRef)
 {
     auto num_row_ptrs = csr_ref->get_size()[0] + 1;
@@ -188,8 +290,8 @@ TEST_F(ParIlu, KernelInitializeRowPtrsLUEquivalentToRef)
         l_row_ptrs_array_ref.get_data(), u_row_ptrs_array_ref.get_data(),
         l_row_ptrs_array_cuda.get_data(), u_row_ptrs_array_cuda.get_data());
 
-    GKO_ASSERT_ARRAY_EQ(&l_row_ptrs_array_ref, &l_row_ptrs_array_cuda);
-    GKO_ASSERT_ARRAY_EQ(&u_row_ptrs_array_ref, &u_row_ptrs_array_cuda);
+    GKO_ASSERT_ARRAY_EQ(l_row_ptrs_array_ref, l_row_ptrs_array_cuda);
+    GKO_ASSERT_ARRAY_EQ(u_row_ptrs_array_ref, u_row_ptrs_array_cuda);
 }
 
 
@@ -204,6 +306,8 @@ TEST_F(ParIlu, KernelInitializeParILUIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(l_ref, l_cuda, 1e-14);
     GKO_ASSERT_MTX_NEAR(u_ref, u_cuda, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_cuda);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_cuda);
 }
 
 
@@ -218,6 +322,8 @@ TEST_F(ParIlu, KernelComputeParILUIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(l_ref, l_cuda, 5e-2);
     GKO_ASSERT_MTX_NEAR(u_ref, u_cuda, 5e-2);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_cuda);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_cuda);
 }
 
 
@@ -233,6 +339,8 @@ TEST_F(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(l_ref, l_cuda, 1e-14);
     GKO_ASSERT_MTX_NEAR(u_ref, u_cuda, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_cuda);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_cuda);
 }
 
 
diff --git a/cuda/test/factorization/par_ilut_kernels.cpp b/cuda/test/factorization/par_ilut_kernels.cpp
new file mode 100644
index 00000000000..7a66ffe2ec0
--- /dev/null
+++ b/cuda/test/factorization/par_ilut_kernels.cpp
@@ -0,0 +1,541 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "cuda/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+class ParIlut : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Dense = gko::matrix::Dense<value_type>;
+    using ComplexDense = gko::matrix::Dense<std::complex<value_type>>;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using ComplexCsr = gko::matrix::Csr<std::complex<value_type>, index_type>;
+
+    ParIlut()  // creates the random host matrices and their CUDA copies
+        : ref(gko::ReferenceExecutor::create()),  // declaration order
+          cuda(gko::CudaExecutor::create(0, ref)),
+          mtx_size(532, 423),
+          rand_engine(1337)
+    {
+        mtx1 = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<>(10, mtx_size[1]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx2 = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<>(0, mtx_size[1]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_square = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[0],
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<>(10, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l2 = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], true,
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l_complex =
+            gko::test::generate_random_lower_triangular_matrix<ComplexCsr>(
+                mtx_size[0], mtx_size[0], false,
+                std::uniform_int_distribution<>(10, mtx_size[0]),
+                std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_u = gko::test::generate_random_upper_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<>(10, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_u_complex =
+            gko::test::generate_random_upper_triangular_matrix<ComplexCsr>(
+                mtx_size[0], mtx_size[0], false,
+                std::uniform_int_distribution<>(10, mtx_size[0]),
+                std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+
+        dmtx1 = Csr::create(cuda);
+        dmtx1->copy_from(mtx1.get());
+        dmtx2 = Csr::create(cuda);
+        dmtx2->copy_from(mtx2.get());
+        dmtx_square = Csr::create(cuda);
+        dmtx_square->copy_from(mtx_square.get());
+        dmtx_ani = Csr::create(cuda);     // filled in SetUp()
+        dmtx_l_ani = Csr::create(cuda);   // filled in SetUp()
+        dmtx_u_ani = Csr::create(cuda);   // filled in SetUp()
+        dmtx_ut_ani = Csr::create(cuda);  // filled in SetUp()
+        dmtx_l = Csr::create(cuda);
+        dmtx_l->copy_from(mtx_l.get());
+        dmtx_l2 = Csr::create(cuda);
+        dmtx_l2->copy_from(mtx_l2.get());
+        dmtx_u = Csr::create(cuda);
+        dmtx_u->copy_from(mtx_u.get());
+        dmtx_l_complex = ComplexCsr::create(cuda);
+        dmtx_l_complex->copy_from(mtx_l_complex.get());
+        dmtx_u_complex = ComplexCsr::create(cuda);
+        dmtx_u_complex->copy_from(mtx_u_complex.get());
+    }
+
+    void SetUp()  // reads ani4, derives mtx_l_ani, mtx_u_ani and mtx_ut_ani
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        mtx_ani = gko::read<Csr>(input_file, ref);
+        mtx_ani->sort_by_column_index();
+
+        {  // this scope ends both CsrBuilders before the device copies below
+            mtx_l_ani = Csr::create(ref, mtx_ani->get_size());
+            mtx_u_ani = Csr::create(ref, mtx_ani->get_size());
+            gko::matrix::CsrBuilder<value_type, index_type> l_builder(
+                mtx_l_ani.get());
+            gko::matrix::CsrBuilder<value_type, index_type> u_builder(
+                mtx_u_ani.get());
+            gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
+                ref, mtx_ani.get(), mtx_l_ani->get_row_ptrs(),
+                mtx_u_ani->get_row_ptrs());
+            auto l_nnz =  // last row pointer of the L part
+                mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]];
+            auto u_nnz =  // last row pointer of the U part
+                mtx_u_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]];
+            l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+            l_builder.get_value_array().resize_and_reset(l_nnz);
+            u_builder.get_col_idx_array().resize_and_reset(u_nnz);
+            u_builder.get_value_array().resize_and_reset(u_nnz);
+            gko::kernels::reference::factorization::initialize_l_u(
+                ref, mtx_ani.get(), mtx_l_ani.get(), mtx_u_ani.get());
+            mtx_ut_ani = Csr::create(ref, mtx_ani->get_size(),
+                                     mtx_u_ani->get_num_stored_elements());
+            gko::kernels::reference::csr::transpose(ref, mtx_u_ani.get(),
+                                                    mtx_ut_ani.get());
+        }
+        dmtx_ani->copy_from(mtx_ani.get());  // mirror everything on cuda
+        dmtx_l_ani->copy_from(mtx_l_ani.get());
+        dmtx_u_ani->copy_from(mtx_u_ani.get());
+        dmtx_ut_ani->copy_from(mtx_ut_ani.get());
+    }
+
+    template <typename Mtx>
+    void test_select(const std::unique_ptr<Mtx> &mtx,
+                     const std::unique_ptr<Mtx> &dmtx, index_type rank,
+                     value_type tolerance = 0.0)
+    {
+        // Runs threshold_select on ref and cuda; both results must agree.
+        using ValueType = typename Mtx::value_type;
+
+        gko::remove_complex<ValueType> res{};   // written by the ref kernel
+        gko::remove_complex<ValueType> dres{};  // written by the cuda kernel
+        gko::Array<ValueType> tmp(ref);
+        gko::Array<gko::remove_complex<ValueType>> tmp2(ref);
+        gko::Array<ValueType> dtmp(cuda);
+        gko::Array<gko::remove_complex<ValueType>> dtmp2(cuda);
+
+        gko::kernels::reference::par_ilut_factorization::threshold_select(
+            ref, mtx.get(), rank, tmp, tmp2, res);
+        gko::kernels::cuda::par_ilut_factorization::threshold_select(
+            cuda, dmtx.get(), rank, dtmp, dtmp2, dres);
+
+        ASSERT_NEAR(res, dres, tolerance);  // default tolerance: exact match
+    }
+
+    template <typename Mtx,
+              typename Coo = gko::matrix::Coo<typename Mtx::value_type,
+                                              typename Mtx::index_type>>
+    void test_filter(const std::unique_ptr<Mtx> &mtx,
+                     const std::unique_ptr<Mtx> &dmtx, value_type threshold,
+                     bool lower)  // false: the transposes get filtered
+    {
+        auto res = Mtx::create(ref, mtx_size);
+        auto dres = Mtx::create(cuda, mtx_size);
+        auto res_coo = Coo::create(ref, mtx_size);
+        auto dres_coo = Coo::create(cuda, mtx_size);
+        auto local_mtx = gko::as<Mtx>(lower ? mtx->clone() : mtx->transpose());
+        auto local_dmtx =
+            gko::as<Mtx>(lower ? dmtx->clone() : dmtx->transpose());
+
+        gko::kernels::reference::par_ilut_factorization::threshold_filter(
+            ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower);
+        gko::kernels::cuda::par_ilut_factorization::threshold_filter(
+            cuda, local_dmtx.get(), threshold, dres.get(), dres_coo.get(),
+            lower);
+
+        GKO_ASSERT_MTX_NEAR(res, dres, 0);  // ref result vs. cuda result
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+        GKO_ASSERT_MTX_NEAR(res, res_coo, 0);  // Csr output vs. Coo output
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo);
+        GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0);  // same check on cuda
+        GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo);
+    }
+
+    template <typename Mtx,
+              typename Coo = gko::matrix::Coo<typename Mtx::value_type,
+                                              typename Mtx::index_type>>
+    void test_filter_approx(const std::unique_ptr<Mtx> &mtx,
+                            const std::unique_ptr<Mtx> &dmtx, index_type rank,
+                            value_type tolerance = 0.0)
+    {
+        auto res = Mtx::create(ref, mtx_size);
+        auto dres = Mtx::create(cuda, mtx_size);
+        auto res_coo = Coo::create(ref, mtx_size);
+        auto dres_coo = Coo::create(cuda, mtx_size);
+        using ValueType = typename Mtx::value_type;
+
+        gko::Array<ValueType> tmp(ref);
+        gko::Array<ValueType> dtmp(cuda);
+        gko::remove_complex<ValueType> threshold{};   // set by the ref kernel
+        gko::remove_complex<ValueType> dthreshold{};  // set by the cuda kernel
+
+        gko::kernels::reference::par_ilut_factorization::
+            threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold,
+                                    res.get(), res_coo.get());
+        gko::kernels::cuda::par_ilut_factorization::threshold_filter_approx(
+            cuda, dmtx.get(), rank, dtmp, dthreshold, dres.get(),
+            dres_coo.get());
+
+        GKO_ASSERT_MTX_NEAR(res, dres, 0);  // ref result vs. cuda result
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+        GKO_ASSERT_MTX_NEAR(res, res_coo, 0);  // Csr output vs. Coo output
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo);
+        GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0);  // same check on cuda
+        GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo);
+        ASSERT_NEAR(threshold, dthreshold, tolerance);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> cuda;
+
+    const gko::dim<2> mtx_size;
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx1;
+    std::unique_ptr<Csr> mtx2;
+    std::unique_ptr<Csr> mtx_square;
+    std::unique_ptr<Csr> mtx_ani;
+    std::unique_ptr<Csr> mtx_l_ani;
+    std::unique_ptr<Csr> mtx_u_ani;
+    std::unique_ptr<Csr> mtx_ut_ani;
+    std::unique_ptr<Csr> mtx_l;
+    std::unique_ptr<Csr> mtx_l2;
+    std::unique_ptr<ComplexCsr> mtx_l_complex;
+    std::unique_ptr<Csr> mtx_u;
+    std::unique_ptr<ComplexCsr> mtx_u_complex;
+
+    std::unique_ptr<Csr> dmtx1;
+    std::unique_ptr<Csr> dmtx2;
+    std::unique_ptr<Csr> dmtx_square;
+    std::unique_ptr<Csr> dmtx_ani;
+    std::unique_ptr<Csr> dmtx_l_ani;
+    std::unique_ptr<Csr> dmtx_u_ani;
+    std::unique_ptr<Csr> dmtx_ut_ani;
+    std::unique_ptr<Csr> dmtx_l;
+    std::unique_ptr<Csr> dmtx_l2;
+    std::unique_ptr<ComplexCsr> dmtx_l_complex;
+    std::unique_ptr<Csr> dmtx_u;
+    std::unique_ptr<ComplexCsr> dmtx_u_complex;
+};
+
+
+TEST_F(ParIlut, KernelThresholdSelectIsEquivalentToRef)
+{
+    test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 3);
+}
+
+
+TEST_F(ParIlut, KernelThresholdSelectMinIsEquivalentToRef)
+{
+    test_select(mtx_l, dmtx_l, 0);  // rank 0, compared with tolerance 0.0
+}
+
+
+TEST_F(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef)
+{
+    test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdSelectIsEquivalentToRef)
+{
+    const auto rank = mtx_l_complex->get_num_stored_elements() / 3;
+    test_select(mtx_l_complex, dmtx_l_complex, rank, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdSelectMinIsEquivalentToRef)
+{
+    test_select(mtx_l_complex, dmtx_l_complex, 0, 1e-14);  // rank 0
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdSelectMaxLowerIsEquivalentToRef)
+{
+    const auto rank = mtx_l_complex->get_num_stored_elements() - 1;
+    test_select(mtx_l_complex, dmtx_l_complex, rank, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef)
+{
+    Coo *const no_coo = nullptr;  // passed in place of a Coo output matrix
+    auto dres = Csr::create(cuda, mtx_size);
+    auto res = Csr::create(ref, mtx_size);
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter(
+        ref, mtx_l.get(), 0.5, res.get(), no_coo, true);
+    gko::kernels::cuda::par_ilut_factorization::threshold_filter(
+        cuda, dmtx_l.get(), 0.5, dres.get(), no_coo, true);
+
+    GKO_ASSERT_MTX_NEAR(res, dres, 0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0.5, true);  // filters clones of the inputs
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0.5, false);  // filters the transposed inputs
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterNoneLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0, true);  // threshold 0, clones of the inputs
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterNoneUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0, false);  // threshold 0, transposed inputs
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterAllLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 1e6, true);  // threshold 1e6, cloned inputs
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterAllUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 1e6, false);  // threshold 1e6, transposed inputs
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0.5, true);  // on clones
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0.5, false);  // on transposes
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterNoneLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0, true);  // on clones
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterNoneUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0, false);  // on transposes
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterAllLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 1e6, true);  // on clones
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterAllUppererIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 1e6, false);  // on transposes
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
+{
+    // Only threshold_filter_approx is exercised; no Coo output is requested.
+    auto res = Csr::create(ref, mtx_size);
+    auto dres = Csr::create(cuda, mtx_size);
+    Coo *null_coo = nullptr;
+    gko::Array<value_type> tmp(ref);
+    gko::Array<value_type> dtmp(cuda);
+    gko::remove_complex<value_type> threshold{};
+    gko::remove_complex<value_type> dthreshold{};
+    index_type rank{};  // value-initialized, i.e. rank 0
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter_approx(
+        ref, mtx_l.get(), rank, tmp, threshold, res.get(), null_coo);
+    gko::kernels::cuda::par_ilut_factorization::threshold_filter_approx(
+        cuda, dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), null_coo);
+
+    GKO_ASSERT_MTX_NEAR(res, dres, 0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+    ASSERT_EQ(threshold, dthreshold);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 2);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l, dmtx_l, 0);  // rank 0, tolerance 0.0
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterApproxLowerIsEquivalentToRef)
+{
+    const auto rank = mtx_l_complex->get_num_stored_elements() / 2;
+    const auto tolerance = r<value_type>::value;
+    test_filter_approx(mtx_l_complex, dmtx_l_complex, rank, tolerance);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterApproxNoneLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l_complex, dmtx_l_complex, 0, r<value_type>::value);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterApproxAllLowerIsEquivalentToRef)
+{
+    const auto rank = mtx_l_complex->get_num_stored_elements() - 1;
+    const auto tolerance = r<value_type>::value;
+    test_filter_approx(mtx_l_complex, dmtx_l_complex, rank, tolerance);
+}
+
+
+TEST_F(ParIlut, KernelAddCandidatesIsEquivalentToRef)
+{
+    const auto size = mtx_square->get_size();
+    auto res_mtx_l = Csr::create(ref, size);
+    auto res_mtx_u = Csr::create(ref, size);
+    auto dres_mtx_l = Csr::create(cuda, size);
+    auto dres_mtx_u = Csr::create(cuda, size);
+    auto lu = Csr::create(ref, size);
+    mtx_l2->apply(mtx_u.get(), lu.get());  // mtx_l2 applied to mtx_u
+    auto dlu = Csr::create(cuda, size);
+    dlu->copy_from(lu.get());
+
+    gko::kernels::reference::par_ilut_factorization::add_candidates(
+        ref, lu.get(), mtx_square.get(), mtx_l2.get(), mtx_u.get(),
+        res_mtx_l.get(), res_mtx_u.get());
+    gko::kernels::cuda::par_ilut_factorization::add_candidates(
+        cuda, dlu.get(), dmtx_square.get(), dmtx_l2.get(), dmtx_u.get(),
+        dres_mtx_l.get(), dres_mtx_u.get());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, dres_mtx_u);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14);
+    GKO_ASSERT_MTX_NEAR(res_mtx_u, dres_mtx_u, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelComputeLUIsEquivalentToRef)
+{
+    const auto size = mtx_ani->get_size();
+    auto l_coo = Coo::create(ref, size);
+    auto u_coo = Coo::create(ref, size);
+    mtx_l_ani->convert_to(l_coo.get());
+    mtx_u_ani->convert_to(u_coo.get());
+    auto dl_coo = Coo::create(cuda, size);
+    auto du_coo = Coo::create(cuda, size);
+    dl_coo->copy_from(l_coo.get());
+    du_coo->copy_from(u_coo.get());
+
+    gko::kernels::reference::par_ilut_factorization::compute_l_u_factors(
+        ref, mtx_ani.get(), mtx_l_ani.get(), l_coo.get(), mtx_u_ani.get(),
+        u_coo.get(), mtx_ut_ani.get());
+    for (int sweep = 0; sweep < 20; ++sweep) {  // 20 cuda calls, 1 ref call
+        gko::kernels::cuda::par_ilut_factorization::compute_l_u_factors(
+            cuda, dmtx_ani.get(), dmtx_l_ani.get(), dl_coo.get(),
+            dmtx_u_ani.get(), du_coo.get(), dmtx_ut_ani.get());
+    }
+    auto dmtx_utt_ani = gko::as<Csr>(dmtx_ut_ani->transpose());
+
+    GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2);
+    GKO_ASSERT_MTX_NEAR(mtx_u_ani, dmtx_u_ani, 1e-2);
+    GKO_ASSERT_MTX_NEAR(dmtx_u_ani, dmtx_utt_ani, 0);
+}
+
+
+}  // namespace
diff --git a/cuda/test/matrix/coo_kernels.cpp b/cuda/test/matrix/coo_kernels.cpp
index d1974a64530..7d6051d9f63 100644
--- a/cuda/test/matrix/coo_kernels.cpp
+++ b/cuda/test/matrix/coo_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/coo_kernels.hpp"
+#include <ginkgo/core/matrix/coo.hpp>
 
 
 #include <random>
@@ -42,12 +42,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils.hpp"
+#include "core/matrix/coo_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
index fd08a070cdb..39608505cbd 100644
--- a/cuda/test/matrix/csr_kernels.cpp
+++ b/cuda/test/matrix/csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/csr_kernels.hpp"
+#include <ginkgo/core/matrix/csr.hpp>
 
 
 #include <random>
@@ -42,15 +42,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include "core/test/utils.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
@@ -63,7 +64,7 @@ class Csr : public ::testing::Test {
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
     using ComplexMtx = gko::matrix::Csr<std::complex<double>>;
 
-    Csr() : rand_engine(42) {}
+    Csr() : mtx_size(532, 231), rand_engine(42) {}
 
     void SetUp()
     {
@@ -93,13 +94,17 @@ class Csr : public ::testing::Test {
                            int num_vectors = 1)
     {
         mtx = Mtx::create(ref, strategy);
-        mtx->copy_from(gen_mtx<Vec>(532, 231, 1));
-        expected = gen_mtx<Vec>(532, num_vectors, 1);
-        y = gen_mtx<Vec>(231, num_vectors, 1);
+        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
+        square_mtx = Mtx::create(ref, strategy);
+        square_mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[0], 1));
+        expected = gen_mtx<Vec>(mtx_size[0], num_vectors, 1);
+        y = gen_mtx<Vec>(mtx_size[1], num_vectors, 1);
         alpha = gko::initialize<Vec>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
         dmtx = Mtx::create(cuda, strategy);
         dmtx->copy_from(mtx.get());
+        square_dmtx = Mtx::create(cuda, strategy);
+        square_dmtx->copy_from(square_mtx.get());
         dresult = Vec::create(cuda);
         dresult->copy_from(expected.get());
         dy = Vec::create(cuda);
@@ -114,18 +119,53 @@ class Csr : public ::testing::Test {
         std::shared_ptr<ComplexMtx::strategy_type> strategy)
     {
         complex_mtx = ComplexMtx::create(ref, strategy);
-        complex_mtx->copy_from(gen_mtx<ComplexVec>(532, 231, 1));
+        complex_mtx->copy_from(
+            gen_mtx<ComplexVec>(mtx_size[0], mtx_size[1], 1));
         complex_dmtx = ComplexMtx::create(cuda, strategy);
         complex_dmtx->copy_from(complex_mtx.get());
     }
 
+    struct matrix_pair {  // result type of gen_unsorted_mtx()
+        std::unique_ptr<Mtx> ref;   // matrix produced by gen_mtx
+        std::unique_ptr<Mtx> cuda;  // copy of it on the cuda executor
+    };
+
+    matrix_pair gen_unsorted_mtx()
+    {
+        // Builds a random matrix, shuffles the entries inside each row and
+        // returns it together with a copy on the cuda executor.
+        constexpr int min_row_nnz = 2;  // Must be at least 2
+        auto host_mtx = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], min_row_nnz);
+        for (size_t row = 0; row < mtx_size[0]; ++row) {
+            const auto ptrs = host_mtx->get_const_row_ptrs();
+            const auto begin = ptrs[row];
+            const auto row_nnz = ptrs[row + 1] - begin;
+            auto cols = host_mtx->get_col_idxs() + begin;
+            auto values = host_mtx->get_values() + begin;
+            std::uniform_int_distribution<> pick(0, row_nnz - 1);
+            // one random swap per two entries of the row
+            for (size_t done = 0; done < row_nnz; done += 2) {
+                const auto first = pick(rand_engine);
+                const auto second = pick(rand_engine);
+                std::swap(cols[first], cols[second]);
+                std::swap(values[first], values[second]);
+            }
+        }
+        auto device_mtx = Mtx::create(cuda);
+        device_mtx->copy_from(host_mtx.get());
+
+        return {std::move(host_mtx), std::move(device_mtx)};
+    }
+
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
+    const gko::dim<2> mtx_size;
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<ComplexMtx> complex_mtx;
+    std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
     std::unique_ptr<Vec> y;
     std::unique_ptr<Vec> alpha;
@@ -133,6 +173,7 @@ class Csr : public ::testing::Test {
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<ComplexMtx> complex_dmtx;
+    std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
     std::unique_ptr<Vec> dy;
     std::unique_ptr<Vec> dalpha;
@@ -142,7 +183,7 @@ class Csr : public ::testing::Test {
 
 TEST_F(Csr, StrategyAfterCopyIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::load_balance>(32));
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(cuda));
 
     ASSERT_EQ(mtx->get_strategy()->get_name(),
               dmtx->get_strategy()->get_name());
@@ -151,7 +192,7 @@ TEST_F(Csr, StrategyAfterCopyIsEquivalentToRef)
 
 TEST_F(Csr, SimpleApplyIsEquivalentToRefWithLoadBalance)
 {
-    set_up_apply_data(std::make_shared<Mtx::load_balance>(32));
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(cuda));
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -162,7 +203,7 @@ TEST_F(Csr, SimpleApplyIsEquivalentToRefWithLoadBalance)
 
 TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithLoadBalance)
 {
-    set_up_apply_data(std::make_shared<Mtx::load_balance>(32));
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(cuda));
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -173,7 +214,7 @@ TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithLoadBalance)
 
 TEST_F(Csr, SimpleApplyIsEquivalentToRefWithCusparse)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -184,7 +225,7 @@ TEST_F(Csr, SimpleApplyIsEquivalentToRefWithCusparse)
 
 TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithCusparse)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -239,7 +280,7 @@ TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithClassical)
 
 TEST_F(Csr, SimpleApplyIsEquivalentToRefWithAutomatical)
 {
-    set_up_apply_data(std::make_shared<Mtx::automatical>(32));
+    set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -250,7 +291,7 @@ TEST_F(Csr, SimpleApplyIsEquivalentToRefWithAutomatical)
 
 TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithLoadBalance)
 {
-    set_up_apply_data(std::make_shared<Mtx::load_balance>(32), 3);
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(cuda), 3);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -261,7 +302,7 @@ TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithLoadBalance)
 
 TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithLoadBalance)
 {
-    set_up_apply_data(std::make_shared<Mtx::load_balance>(32), 3);
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(cuda), 3);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -314,9 +355,61 @@ TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath)
 }
 
 
+TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(std::make_shared<Mtx::automatical>());
+    const auto rhs = mtx->transpose();
+    const auto d_rhs = dmtx->transpose();
+    // alpha/beta apply with the transpose as operand, on mtx and on dmtx
+    mtx->apply(alpha.get(), rhs.get(), beta.get(), square_mtx.get());
+    dmtx->apply(dalpha.get(), d_rhs.get(), dbeta.get(), square_dmtx.get());
+    // values and pattern must match; the dmtx result must also be sorted
+    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
+    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+}
+
+
+TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(std::make_shared<Mtx::automatical>());
+    const auto rhs = mtx->transpose();
+    const auto d_rhs = dmtx->transpose();
+    // two-argument apply with the transpose as operand, on mtx and on dmtx
+    mtx->apply(rhs.get(), square_mtx.get());
+    dmtx->apply(d_rhs.get(), square_dmtx.get());
+    // values and pattern must match; the dmtx result must also be sorted
+    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
+    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+}
+
+
+TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
+{
+    using Identity = gko::matrix::Identity<Mtx::value_type>;
+    set_up_apply_data(std::make_shared<Mtx::automatical>());
+    auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto da = Mtx::create(cuda);
+    da->copy_from(a.get());
+    auto db = Mtx::create(cuda);
+    db->copy_from(b.get());
+    auto id = Identity::create(ref, mtx_size[1]);
+    auto did = Identity::create(cuda, mtx_size[1]);
+
+    a->apply(alpha.get(), id.get(), beta.get(), b.get());
+    da->apply(dalpha.get(), did.get(), dbeta.get(), db.get());
+
+    GKO_ASSERT_MTX_NEAR(b, db, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(b, db);
+    ASSERT_TRUE(db->is_sorted_by_column_index());
+}
+
+
 TEST_F(Csr, TransposeIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::automatical>(32));
+    set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
 
     auto trans = mtx->transpose();
     auto d_trans = dmtx->transpose();
@@ -328,7 +421,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
 TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
 {
-    set_up_apply_complex_data(std::make_shared<ComplexMtx::automatical>(32));
+    set_up_apply_complex_data(std::make_shared<ComplexMtx::automatical>(cuda));
 
     auto trans = complex_mtx->conj_transpose();
     auto d_trans = complex_dmtx->conj_transpose();
@@ -340,7 +433,7 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
 
 TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto dense_mtx = gko::matrix::Dense<>::create(ref);
     auto ddense_mtx = gko::matrix::Dense<>::create(cuda);
 
@@ -353,7 +446,7 @@ TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
 
 TEST_F(Csr, MoveToDenseIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto dense_mtx = gko::matrix::Dense<>::create(ref);
     auto ddense_mtx = gko::matrix::Dense<>::create(cuda);
 
@@ -366,7 +459,7 @@ TEST_F(Csr, MoveToDenseIsEquivalentToRef)
 
 TEST_F(Csr, ConvertToEllIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto ell_mtx = gko::matrix::Ell<>::create(ref);
     auto dell_mtx = gko::matrix::Ell<>::create(cuda);
 
@@ -379,7 +472,7 @@ TEST_F(Csr, ConvertToEllIsEquivalentToRef)
 
 TEST_F(Csr, MoveToEllIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto ell_mtx = gko::matrix::Ell<>::create(ref);
     auto dell_mtx = gko::matrix::Ell<>::create(cuda);
 
@@ -389,9 +482,10 @@ TEST_F(Csr, MoveToEllIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14);
 }
 
+
 TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref);
     auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda);
 
@@ -404,7 +498,7 @@ TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef)
 
 TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref);
     auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda);
 
@@ -417,7 +511,7 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef)
 
 TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     gko::size_type max_nnz_per_row;
     gko::size_type dmax_nnz_per_row;
 
@@ -432,7 +526,7 @@ TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef)
 
 TEST_F(Csr, ConvertToCooIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto coo_mtx = gko::matrix::Coo<>::create(ref);
     auto dcoo_mtx = gko::matrix::Coo<>::create(cuda);
 
@@ -445,7 +539,7 @@ TEST_F(Csr, ConvertToCooIsEquivalentToRef)
 
 TEST_F(Csr, MoveToCooIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto coo_mtx = gko::matrix::Coo<>::create(ref);
     auto dcoo_mtx = gko::matrix::Coo<>::create(cuda);
 
@@ -458,7 +552,7 @@ TEST_F(Csr, MoveToCooIsEquivalentToRef)
 
 TEST_F(Csr, ConvertToSellpIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
     auto dsellp_mtx = gko::matrix::Sellp<>::create(cuda);
 
@@ -471,7 +565,7 @@ TEST_F(Csr, ConvertToSellpIsEquivalentToRef)
 
 TEST_F(Csr, MoveToSellpIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
     auto dsellp_mtx = gko::matrix::Sellp<>::create(cuda);
 
@@ -482,9 +576,21 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, ConvertsEmptyToSellp)
+{
+    auto d_source = Mtx::create(cuda);
+    auto d_target = gko::matrix::Sellp<>::create(cuda);
+    // convert a matrix that was created without any size or data argument
+    d_source->convert_to(d_target.get());
+    // first slice-set entry must be 0 and the size must evaluate to false
+    ASSERT_EQ(cuda->copy_val_to_host(d_target->get_const_slice_sets()), 0);
+    ASSERT_FALSE(d_target->get_size());
+}
+
+
 TEST_F(Csr, CalculateTotalColsIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     gko::size_type total_cols;
     gko::size_type dtotal_cols;
 
@@ -499,7 +605,7 @@ TEST_F(Csr, CalculateTotalColsIsEquivalentToRef)
 
 TEST_F(Csr, CalculatesNonzerosPerRow)
 {
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     gko::Array<gko::size_type> row_nnz(ref, mtx->get_size()[0]);
     gko::Array<gko::size_type> drow_nnz(cuda, dmtx->get_size()[0]);
 
@@ -508,14 +614,14 @@ TEST_F(Csr, CalculatesNonzerosPerRow)
     gko::kernels::cuda::csr::calculate_nonzeros_per_row(cuda, dmtx.get(),
                                                         &drow_nnz);
 
-    GKO_ASSERT_ARRAY_EQ(&row_nnz, &drow_nnz);
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
 }
 
 
 TEST_F(Csr, ConvertToHybridIsEquivalentToRef)
 {
     using Hybrid_type = gko::matrix::Hybrid<>;
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto hybrid_mtx = Hybrid_type::create(
         ref, std::make_shared<Hybrid_type::column_limit>(2));
     auto dhybrid_mtx = Hybrid_type::create(
@@ -531,7 +637,7 @@ TEST_F(Csr, ConvertToHybridIsEquivalentToRef)
 TEST_F(Csr, MoveToHybridIsEquivalentToRef)
 {
     using Hybrid_type = gko::matrix::Hybrid<>;
-    set_up_apply_data(std::make_shared<Mtx::cusparse>());
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
     auto hybrid_mtx = Hybrid_type::create(
         ref, std::make_shared<Hybrid_type::column_limit>(2));
     auto dhybrid_mtx = Hybrid_type::create(
@@ -544,4 +650,79 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(std::make_shared<Mtx::automatical>());
+
+    const bool sorted_on_ref = mtx->is_sorted_by_column_index();
+    const bool sorted_on_cuda = dmtx->is_sorted_by_column_index();
+
+    // the sortedness query on mtx and on dmtx has to yield the same result,
+    // whatever that result is
+    ASSERT_EQ(sorted_on_ref, sorted_on_cuda);
+}
+
+
+TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef)
+{
+    auto unsorted = gen_unsorted_mtx();
+
+    const bool sorted_on_ref = unsorted.ref->is_sorted_by_column_index();
+    const bool sorted_on_cuda = unsorted.cuda->is_sorted_by_column_index();
+
+    // the `ref` and `cuda` members of the generated pair have to give the
+    // same answer to the sortedness query
+    ASSERT_EQ(sorted_on_ref, sorted_on_cuda);
+}
+
+
+TEST_F(Csr, SortSortedMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(std::make_shared<Mtx::automatical>());
+    // run the column-index sort on both fixture matrices
+    mtx->sort_by_column_index();
+    dmtx->sort_by_column_index();
+
+    // Sorting must not alter any value, so the comparison uses tolerance `0`
+    GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0);
+}
+
+
+TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)
+{
+    auto unsorted = gen_unsorted_mtx();
+
+    unsorted.ref->sort_by_column_index();
+    unsorted.cuda->sort_by_column_index();
+
+    // Sorting must not alter any value, so the comparison uses tolerance `0`
+    GKO_ASSERT_MTX_NEAR(unsorted.ref, unsorted.cuda, 0);
+}
+
+
+TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
+{
+    auto strategy = std::make_shared<Mtx::automatical>();
+    auto row_len_limit =
+        std::max(strategy->nvidia_row_len_limit, strategy->amd_row_len_limit);
+    auto long_row_mtx = Mtx::create(ref);
+    long_row_mtx->copy_from(
+        gen_mtx<Vec>(1, row_len_limit + 1000, row_len_limit + 1));
+    auto small_mtx = Mtx::create(ref);
+    small_mtx->copy_from(gen_mtx<Vec>(50, 50, 1));
+    auto d_long_row_mtx = Mtx::create(cuda);
+    d_long_row_mtx->copy_from(long_row_mtx.get());
+    auto d_small_mtx = Mtx::create(cuda);
+    d_small_mtx->copy_from(small_mtx.get());
+    // the very same automatical object is handed to both device matrices
+    d_long_row_mtx->set_strategy(strategy);
+    d_small_mtx->set_strategy(strategy);
+
+    EXPECT_EQ("load_balance", d_long_row_mtx->get_strategy()->get_name());
+    EXPECT_EQ("classical", d_small_mtx->get_strategy()->get_name());
+    ASSERT_NE(d_long_row_mtx->get_strategy().get(),
+              d_small_mtx->get_strategy().get());
+}
+
+
 }  // namespace
diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp
index c153e1a5c74..34d2897641a 100644
--- a/cuda/test/matrix/dense_kernels.cpp
+++ b/cuda/test/matrix/dense_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/dense_kernels.hpp"
+#include <ginkgo/core/matrix/dense.hpp>
 
 
 #include <random>
@@ -40,14 +40,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "core/test/utils.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
@@ -55,8 +56,12 @@ namespace {
 
 class Dense : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using ComplexMtx = gko::matrix::Dense<std::complex<double>>;
+    using itype = int;
+    using vtype = double;
+    using Mtx = gko::matrix::Dense<vtype>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<vtype>>;
+    using Arr = gko::Array<itype>;
+    using ComplexMtx = gko::matrix::Dense<std::complex<vtype>>;
 
     Dense() : rand_engine(15) {}
 
@@ -123,6 +128,22 @@ class Dense : public ::testing::Test {
         dalpha->copy_from(alpha.get());
         dbeta = Mtx::create(cuda);
         dbeta->copy_from(beta.get());
+
+        std::vector<itype> tmp(x->get_size()[0], 0);
+        auto rng = std::default_random_engine{};
+        std::iota(tmp.begin(), tmp.end(), 0);
+        std::shuffle(tmp.begin(), tmp.end(), rng);
+        std::vector<itype> tmp2(x->get_size()[1], 0);
+        std::iota(tmp2.begin(), tmp2.end(), 0);
+        std::shuffle(tmp2.begin(), tmp2.end(), rng);
+        rpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp.begin(), tmp.end()});
+        drpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{cuda, tmp.begin(), tmp.end()});
+        cpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp2.begin(), tmp2.end()});
+        dcpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{cuda, tmp2.begin(), tmp2.end()});
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
@@ -142,6 +163,10 @@ class Dense : public ::testing::Test {
     std::unique_ptr<Mtx> dy;
     std::unique_ptr<Mtx> dalpha;
     std::unique_ptr<Mtx> dbeta;
+    std::unique_ptr<Arr> rpermute_idxs;
+    std::unique_ptr<Arr> drpermute_idxs;
+    std::unique_ptr<Arr> cpermute_idxs;
+    std::unique_ptr<Arr> dcpermute_idxs;
 };
 
 
@@ -238,11 +263,14 @@ TEST_F(Dense, MultipleVectorCudaComputeDotIsEquivalentToRef)
 TEST_F(Dense, CudaComputeNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->cuda, norm_size);
 
-    x->compute_norm2(expected.get());
-    dx->compute_norm2(dresult.get());
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14);
 }
 
 
@@ -400,6 +428,18 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ConvertsEmptyToSellp)
+{
+    auto d_source = Mtx::create(cuda);
+    auto d_target = gko::matrix::Sellp<>::create(cuda);
+    // convert a matrix that was created without any size or data argument
+    d_source->convert_to(d_target.get());
+    // first slice-set entry must be 0 and the size must evaluate to false
+    ASSERT_EQ(cuda->copy_val_to_host(d_target->get_const_slice_sets()), 0);
+    ASSERT_FALSE(d_target->get_size());
+}
+
+
 TEST_F(Dense, CountNNZIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -463,4 +503,52 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, IsRowPermutable)
+{
+    set_up_apply_data();
+    // permute x and dx with the same indices; results must match exactly
+    auto ref_out = x->row_permute(rpermute_idxs.get());
+    auto cuda_out = dx->row_permute(drpermute_idxs.get());
+    auto ref_dense = static_cast<Mtx *>(ref_out.get());
+    auto cuda_dense = static_cast<Mtx *>(cuda_out.get());
+    GKO_ASSERT_MTX_NEAR(ref_dense, cuda_dense, 0);
+}
+
+
+TEST_F(Dense, IsColPermutable)
+{
+    set_up_apply_data();
+    // permute x and dx with the same indices; results must match exactly
+    auto ref_out = x->column_permute(cpermute_idxs.get());
+    auto cuda_out = dx->column_permute(dcpermute_idxs.get());
+    auto ref_dense = static_cast<Mtx *>(ref_out.get());
+    auto cuda_dense = static_cast<Mtx *>(cuda_out.get());
+    GKO_ASSERT_MTX_NEAR(ref_dense, cuda_dense, 0);
+}
+
+
+TEST_F(Dense, IsInverseRowPermutable)
+{
+    set_up_apply_data();
+    // permute x and dx with the same indices; results must match exactly
+    auto ref_out = x->inverse_row_permute(rpermute_idxs.get());
+    auto cuda_out = dx->inverse_row_permute(drpermute_idxs.get());
+    auto ref_dense = static_cast<Mtx *>(ref_out.get());
+    auto cuda_dense = static_cast<Mtx *>(cuda_out.get());
+    GKO_ASSERT_MTX_NEAR(ref_dense, cuda_dense, 0);
+}
+
+
+TEST_F(Dense, IsInverseColPermutable)
+{
+    set_up_apply_data();
+    // permute x and dx with the same indices; results must match exactly
+    auto ref_out = x->inverse_column_permute(cpermute_idxs.get());
+    auto cuda_out = dx->inverse_column_permute(dcpermute_idxs.get());
+    auto ref_dense = static_cast<Mtx *>(ref_out.get());
+    auto cuda_dense = static_cast<Mtx *>(cuda_out.get());
+    GKO_ASSERT_MTX_NEAR(ref_dense, cuda_dense, 0);
+}
+
+
 }  // namespace
diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index ff4ae0b8b88..d913d80e722 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -48,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/matrix/ell_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/matrix/hybrid_kernels.cpp b/cuda/test/matrix/hybrid_kernels.cpp
index cb7ab693899..f3225882021 100644
--- a/cuda/test/matrix/hybrid_kernels.cpp
+++ b/cuda/test/matrix/hybrid_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/hybrid_kernels.hpp"
+#include <ginkgo/core/matrix/hybrid.hpp>
 
 
 #include <random>
@@ -39,12 +39,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
+
+
+#include "core/matrix/hybrid_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/matrix/sellp_kernels.cpp b/cuda/test/matrix/sellp_kernels.cpp
index b213c1655db..08b276374a7 100644
--- a/cuda/test/matrix/sellp_kernels.cpp
+++ b/cuda/test/matrix/sellp_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -48,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/matrix/sellp_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/preconditioner/CMakeLists.txt b/cuda/test/preconditioner/CMakeLists.txt
index a0ca5a2e38a..a4473684560 100644
--- a/cuda/test/preconditioner/CMakeLists.txt
+++ b/cuda/test/preconditioner/CMakeLists.txt
@@ -1 +1,2 @@
 ginkgo_create_test(jacobi_kernels)
+ginkgo_create_test_cpp_cuda_header(isai_kernels)
diff --git a/cuda/test/preconditioner/isai_kernels.cpp b/cuda/test/preconditioner/isai_kernels.cpp
new file mode 100644
index 00000000000..fb8947e9ae6
--- /dev/null
+++ b/cuda/test/preconditioner/isai_kernels.cpp
@@ -0,0 +1,326 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/preconditioner/isai_kernels.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+enum struct matrix_type { lower, upper };
+// Fixture that runs the ISAI kernels on matching reference and CUDA data.
+class Isai : public ::testing::Test {
+protected:
+    using value_type = double;
+    using index_type = gko::int32;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    Isai() : rand_engine(42) {}
+
+    void SetUp() override
+    {
+        ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        cuda = gko::CudaExecutor::create(0, ref);
+    }
+
+    // Clones csr_mtx with all values set to -1; returns nullptr if not on ref.
+    std::unique_ptr<Csr> clone_allocations(const Csr *csr_mtx)
+    {
+        if (csr_mtx->get_executor() != ref) {
+            return {nullptr};
+        }
+        const auto num_elems = csr_mtx->get_num_stored_elements();
+        auto sparsity = csr_mtx->clone();
+
+        // values are now filled with invalid data to catch potential errors
+        auto begin_values = sparsity->get_values();
+        auto end_values = begin_values + num_elems;
+        std::fill(begin_values, end_values, -gko::one<value_type>());
+        return sparsity;
+    }
+
+    // Fills mtx/inverse and their device copies from a random n x n matrix.
+    void initialize_data(matrix_type type, gko::size_type n,
+                         gko::size_type row_limit)
+    {
+        auto nz_dist = std::uniform_int_distribution<index_type>(1, row_limit);
+        auto val_dist = std::uniform_real_distribution<value_type>(-1., 1.);
+        mtx = gko::test::generate_random_triangular_matrix<Csr>(
+            n, n, true, type == matrix_type::lower, nz_dist, val_dist,
+            rand_engine, ref, gko::dim<2>{n, n});
+        inverse = clone_allocations(mtx.get());
+
+        d_mtx = Csr::create(cuda);
+        d_mtx->copy_from(mtx.get());
+        d_inverse = Csr::create(cuda);
+        d_inverse->copy_from(inverse.get());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::CudaExecutor> cuda;
+
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx;
+    std::unique_ptr<Csr> inverse;
+
+    std::unique_ptr<Csr> d_mtx;
+    std::unique_ptr<Csr> d_inverse;
+};
+
+
+TEST_F(Isai, CudaIsaiGenerateLinverseShortIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 536, 31);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(cuda, rows + 1);
+    gko::Array<index_type> da2(da1);
+    // same kernel on ref and CUDA; the flag is `true` for the lower case
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::kernels::cuda::isai::generate_tri_inverse(
+        cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        true);
+    // inverse and both index arrays must agree; last a1 entry is zero here
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_EQ(a1.get_const_data()[rows], 0);
+}
+
+
+TEST_F(Isai, CudaIsaiGenerateUinverseShortIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 615, 31);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(cuda, rows + 1);
+    gko::Array<index_type> da2(da1);
+    // same kernel on ref and CUDA; the flag is `false` for the upper case
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::kernels::cuda::isai::generate_tri_inverse(
+        cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        false);
+    // inverse and both index arrays must agree; last a1 entry is zero here
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_EQ(a1.get_const_data()[rows], 0);
+}
+
+
+TEST_F(Isai, CudaIsaiGenerateLinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 554, 64);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(cuda, rows + 1);
+    gko::Array<index_type> da2(da1);
+    // same kernel on ref and CUDA; the flag is `true` for the lower case
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::kernels::cuda::isai::generate_tri_inverse(
+        cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        true);
+    // inverse and both index arrays must agree; last a1 entry is positive
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_GT(a1.get_const_data()[rows], 0);
+}
+
+
+TEST_F(Isai, CudaIsaiGenerateUinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 695, 64);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(cuda, rows + 1);
+    gko::Array<index_type> da2(da1);
+    // same kernel on ref and CUDA; the flag is `false` for the upper case
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::kernels::cuda::isai::generate_tri_inverse(
+        cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        false);
+    // inverse and both index arrays must agree; last a1 entry is positive
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_GT(a1.get_const_data()[rows], 0);
+}
+
+
+TEST_F(Isai, CudaIsaiGenerateExcessLinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 518, 40);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::Array<index_type> da1(cuda, a1);
+    gko::Array<index_type> da2(cuda, a2);
+    const auto e_dim = a1.get_const_data()[rows];
+    const auto e_nnz = a2.get_const_data()[rows];
+    auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    auto dexcess = Csr::create(cuda, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto de_rhs = Dense::create(cuda, gko::dim<2>(e_dim, 1));
+    // build the excess system on the reference and on the CUDA executor
+    gko::kernels::reference::isai::generate_excess_system(
+        ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
+        excess.get(), e_rhs.get());
+    gko::kernels::cuda::isai::generate_excess_system(
+        cuda, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
+        da2.get_const_data(), dexcess.get(), de_rhs.get());
+    // excess matrix and right-hand side must match exactly (tolerance 0)
+    GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess);
+    GKO_ASSERT_MTX_NEAR(excess, dexcess, 0);
+    GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, CudaIsaiGenerateExcessUinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 673, 51);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::Array<index_type> da1(cuda, a1);
+    gko::Array<index_type> da2(cuda, a2);
+    const auto e_dim = a1.get_const_data()[rows];
+    const auto e_nnz = a2.get_const_data()[rows];
+    auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    auto dexcess = Csr::create(cuda, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto de_rhs = Dense::create(cuda, gko::dim<2>(e_dim, 1));
+    // build the excess system on the reference and on the CUDA executor
+    gko::kernels::reference::isai::generate_excess_system(
+        ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
+        excess.get(), e_rhs.get());
+    gko::kernels::cuda::isai::generate_excess_system(
+        cuda, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
+        da2.get_const_data(), dexcess.get(), de_rhs.get());
+    // excess matrix and right-hand side must match exactly (tolerance 0)
+    GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess);
+    GKO_ASSERT_MTX_NEAR(excess, dexcess, 0);
+    GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, CudaIsaiScatterExcessSolutionLIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 572, 52);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::Array<index_type> da1(cuda, a1);
+    const auto e_dim = a1.get_const_data()[rows];
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    std::fill_n(e_rhs->get_values(), e_dim, 123456);
+    auto de_rhs = Dense::create(cuda);
+    de_rhs->copy_from(e_rhs.get());
+    d_inverse->copy_from(inverse.get());
+    // scatter the constant-filled vector into the inverse on both executors
+    gko::kernels::reference::isai::scatter_excess_solution(
+        ref, a1.get_const_data(), e_rhs.get(), inverse.get());
+    gko::kernels::cuda::isai::scatter_excess_solution(
+        cuda, da1.get_const_data(), de_rhs.get(), d_inverse.get());
+    // exact match expected; e_dim must be positive for the test to matter
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, CudaIsaiScatterExcessSolutionUIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 702, 45);
+    const auto rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::Array<index_type> da1(cuda, a1);
+    const auto e_dim = a1.get_const_data()[rows];
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    std::fill_n(e_rhs->get_values(), e_dim, 123456);
+    auto de_rhs = Dense::create(cuda);
+    de_rhs->copy_from(e_rhs.get());
+    // replace the -1 placeholders in d_inverse by the computed inverse
+    d_inverse->copy_from(inverse.get());
+    // scatter the constant-filled vector into the inverse on both executors
+    gko::kernels::reference::isai::scatter_excess_solution(
+        ref, a1.get_const_data(), e_rhs.get(), inverse.get());
+    gko::kernels::cuda::isai::scatter_excess_solution(
+        cuda, da1.get_const_data(), de_rhs.get(), d_inverse.get());
+    // exact match expected; e_dim must be positive for the test to matter
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+}  // namespace
diff --git a/cuda/test/preconditioner/jacobi_kernels.cpp b/cuda/test/preconditioner/jacobi_kernels.cpp
index b98e61fc41c..05ea7d766e8 100644
--- a/cuda/test/preconditioner/jacobi_kernels.cpp
+++ b/cuda/test/preconditioner/jacobi_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,17 +33,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -323,6 +325,34 @@ TEST_F(Jacobi, CudaPreconditionerEquivalentToRefWithMPW)
 }
 
 
+TEST_F(Jacobi, CudaTransposedPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    // Build with both factories, then copy the first result into the second.
+    auto host_bj = bj_factory->generate(mtx);
+    auto dev_bj = d_bj_factory->generate(mtx);
+    dev_bj->copy_from(host_bj.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(dev_bj->transpose()),
+                        gko::as<Bj>(host_bj->transpose()), 1e-14);
+}
+
+
+TEST_F(Jacobi, CudaConjTransposedPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    // Build with both factories, then copy the first result into the second.
+    auto host_bj = bj_factory->generate(mtx);
+    auto dev_bj = d_bj_factory->generate(mtx);
+    dev_bj->copy_from(host_bj.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(dev_bj->conj_transpose()),
+                        gko::as<Bj>(host_bj->conj_transpose()), 1e-14);
+}
+
+
 TEST_F(Jacobi, CudaApplyEquivalentToRefWithBlockSize32)
 {
     initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111);
@@ -561,6 +591,37 @@ TEST_F(Jacobi, CudaPreconditionerEquivalentToRefWithAdaptivePrecision)
 }
 
 
+TEST_F(Jacobi, CudaTransposedPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97,
+                    99);
+    // Build with both factories, then copy the first result into the second.
+    auto host_bj = bj_factory->generate(mtx);
+    auto dev_bj = d_bj_factory->generate(mtx);
+    dev_bj->copy_from(host_bj.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(dev_bj->transpose()),
+                        gko::as<Bj>(host_bj->transpose()), 1e-14);
+}
+
+
+TEST_F(Jacobi,
+       CudaConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97,
+                    99);
+    // Build with both factories, then copy the first result into the second.
+    auto host_bj = bj_factory->generate(mtx);
+    auto dev_bj = d_bj_factory->generate(mtx);
+    dev_bj->copy_from(host_bj.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(dev_bj->conj_transpose()),
+                        gko::as<Bj>(host_bj->conj_transpose()), 1e-14);
+}
+
+
 TEST_F(Jacobi, CudaApplyEquivalentToRefWithFullPrecision)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
diff --git a/cuda/test/solver/CMakeLists.txt b/cuda/test/solver/CMakeLists.txt
index e2a017962a5..32dbb96fe61 100644
--- a/cuda/test/solver/CMakeLists.txt
+++ b/cuda/test/solver/CMakeLists.txt
@@ -1,8 +1,9 @@
+ginkgo_create_test(bicg_kernels)
 ginkgo_create_test(bicgstab_kernels)
 ginkgo_create_test(cg_kernels)
 ginkgo_create_test(cgs_kernels)
 ginkgo_create_test(fcg_kernels)
 ginkgo_create_test(gmres_kernels)
 ginkgo_create_test(ir_kernels)
-ginkgo_create_test(lower_trs_kernels)
-ginkgo_create_test(upper_trs_kernels)
+ginkgo_create_test_cpp_cuda_header(lower_trs_kernels)
+ginkgo_create_test_cpp_cuda_header(upper_trs_kernels)
diff --git a/cuda/test/solver/bicg_kernels.cpp b/cuda/test/solver/bicg_kernels.cpp
new file mode 100644
index 00000000000..e58eef7e68f
--- /dev/null
+++ b/cuda/test/solver/bicg_kernels.cpp
@@ -0,0 +1,357 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicg.hpp>
+
+
+#include <fstream>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/bicg_kernels.hpp"
+#include "cuda/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+// Fixture comparing reference and CUDA BiCG results on identical input data.
+class Bicg : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    Bicg() : rand_engine(30) {}  // fixed seed for repeatable random data
+    // Creates both executors, reads the ani1 matrix and copies it to `cuda`.
+    void SetUp() override
+    {
+        ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        cuda = gko::CudaExecutor::create(0, ref);
+
+        std::string file_name(gko::matrices::location_ani1_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        csr_ref = gko::read<Csr>(input_file, ref);
+        auto csr_cuda_temp = Csr::create(cuda);
+        csr_cuda_temp->copy_from(gko::lend(csr_ref));
+        csr_cuda = gko::give(csr_cuda_temp);
+    }
+    // Checks that synchronizing the CUDA executor (if created) does not throw.
+    void TearDown() override
+    {
+        if (cuda != nullptr) {
+            ASSERT_NO_THROW(cuda->synchronize());
+        }
+    }
+    // Builds a random num_rows x num_cols Dense matrix on the `ref` executor.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+    // Creates the host operands and an identical device copy of each of them.
+    void initialize_data()
+    {
+        int m = 597;  // rows of the full-size operands
+        int n = 43;   // columns; also the number of stopping_status entries
+        b = gen_mtx(m, n);
+        r = gen_mtx(m, n);
+        z = gen_mtx(m, n);
+        p = gen_mtx(m, n);
+        q = gen_mtx(m, n);
+        r2 = gen_mtx(m, n);
+        z2 = gen_mtx(m, n);
+        p2 = gen_mtx(m, n);
+        q2 = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        beta = gen_mtx(1, n);
+        prev_rho = gen_mtx(1, n);
+        rho = gen_mtx(1, n);
+        stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
+            new gko::Array<gko::stopping_status>(ref, n));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+        // Device side: each d_* object is a copy of its host counterpart.
+        d_b = Mtx::create(cuda);
+        d_b->copy_from(b.get());
+        d_r = Mtx::create(cuda);
+        d_r->copy_from(r.get());
+        d_z = Mtx::create(cuda);
+        d_z->copy_from(z.get());
+        d_p = Mtx::create(cuda);
+        d_p->copy_from(p.get());
+        d_q = Mtx::create(cuda);
+        d_q->copy_from(q.get());
+        d_r2 = Mtx::create(cuda);
+        d_r2->copy_from(r2.get());
+        d_z2 = Mtx::create(cuda);
+        d_z2->copy_from(z2.get());
+        d_p2 = Mtx::create(cuda);
+        d_p2->copy_from(p2.get());
+        d_q2 = Mtx::create(cuda);
+        d_q2->copy_from(q2.get());
+        d_x = Mtx::create(cuda);
+        d_x->copy_from(x.get());
+        d_beta = Mtx::create(cuda);
+        d_beta->copy_from(beta.get());
+        d_prev_rho = Mtx::create(cuda);
+        d_prev_rho->copy_from(prev_rho.get());
+        d_rho = Mtx::create(cuda);
+        d_rho->copy_from(rho.get());
+        d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
+            new gko::Array<gko::stopping_status>(cuda, n));
+        *d_stop_status = *stop_status;
+    }
+    // Copies the strictly lower triangle onto the upper one: (i, j) = (j, i).
+    void make_symmetric(Mtx *mtx)
+    {
+        for (gko::size_type i = 0; i < mtx->get_size()[0]; ++i) {
+            for (gko::size_type j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+    // Sets each diagonal entry to the sum of absolute values of its row.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (gko::size_type i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (gko::size_type j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+    // Symmetrizes first; make_diag_dominant then only rewrites the diagonal.
+    void make_spd(Mtx *mtx)
+    {
+        make_symmetric(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::CudaExecutor> cuda;
+
+    std::ranlux48 rand_engine;
+    // Host-side operands, created on `ref` by initialize_data().
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> z;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> q;
+    std::unique_ptr<Mtx> r2;
+    std::unique_ptr<Mtx> z2;
+    std::unique_ptr<Mtx> p2;
+    std::unique_ptr<Mtx> q2;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> prev_rho;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+    // Device copies (d_*), then the CSR matrix loaded by SetUp() on each side.
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_z;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_q;
+    std::unique_ptr<Mtx> d_r2;
+    std::unique_ptr<Mtx> d_z2;
+    std::unique_ptr<Mtx> d_p2;
+    std::unique_ptr<Mtx> d_q2;
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_prev_rho;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+    std::shared_ptr<const Csr> csr_ref;
+    std::shared_ptr<const Csr> csr_cuda;
+};
+
+
+TEST_F(Bicg, CudaBicgInitializeIsEquivalentToRef)
+{
+    namespace ref_bicg = gko::kernels::reference::bicg;
+    namespace cuda_bicg = gko::kernels::cuda::bicg;
+    initialize_data();  // host operands plus their device copies
+    ref_bicg::initialize(
+        ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(),
+        rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get());
+    cuda_bicg::initialize(
+        cuda, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
+        d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(),
+        d_q2.get(), d_stop_status.get());
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Bicg, CudaBicgStep1IsEquivalentToRef)
+{
+    namespace ref_bicg = gko::kernels::reference::bicg;
+    namespace cuda_bicg = gko::kernels::cuda::bicg;
+    initialize_data();
+    // Run step_1 once per executor, each on its own copy of the same data.
+    ref_bicg::step_1(ref, p.get(), z.get(), p2.get(), z2.get(), rho.get(),
+                     prev_rho.get(), stop_status.get());
+    cuda_bicg::step_1(cuda, d_p.get(), d_z.get(), d_p2.get(), d_z2.get(),
+                      d_rho.get(), d_prev_rho.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14);
+}
+
+
+TEST_F(Bicg, CudaBicgStep2IsEquivalentToRef)
+{
+    namespace ref_bicg = gko::kernels::reference::bicg;
+    namespace cuda_bicg = gko::kernels::cuda::bicg;
+    initialize_data();  // host operands plus their device copies
+    ref_bicg::step_2(ref, x.get(), r.get(), r2.get(), p.get(), q.get(),
+                     q2.get(), beta.get(), rho.get(), stop_status.get());
+    cuda_bicg::step_2(cuda, d_x.get(), d_r.get(), d_r2.get(), d_p.get(),
+                      d_q.get(), d_q2.get(), d_beta.get(), d_rho.get(),
+                      d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14);
+}
+
+
+TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
+{
+    // Identical criteria on both sides: max 50 iterations, 1e-14 reduction.
+    auto host_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto dev_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(cuda),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(cuda))
+            .on(cuda);
+    auto host_mtx = gen_mtx(50, 50);
+    make_spd(host_mtx.get());
+    auto host_x = gen_mtx(50, 3);
+    auto host_b = gen_mtx(50, 3);
+    auto dev_mtx = Mtx::create(cuda);
+    dev_mtx->copy_from(host_mtx.get());
+    auto dev_x = Mtx::create(cuda);
+    dev_x->copy_from(host_x.get());
+    auto dev_b = Mtx::create(cuda);
+    dev_b->copy_from(host_b.get());
+    auto host_solver = host_factory->generate(std::move(host_mtx));
+    auto dev_solver = dev_factory->generate(std::move(dev_mtx));
+
+    host_solver->apply(host_b.get(), host_x.get());
+    dev_solver->apply(dev_b.get(), dev_x.get());
+    GKO_ASSERT_MTX_NEAR(dev_x, host_x, 1e-14);
+}
+
+
+TEST_F(Bicg, ApplyWithSuiteSparseMatrixIsEquivalentToRef)
+{
+    // Identical criteria on both sides: max 50 iterations, 1e-14 reduction.
+    auto host_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto dev_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(cuda),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(cuda))
+            .on(cuda);
+    auto host_x = gen_mtx(36, 1);
+    auto host_b = gen_mtx(36, 1);
+    auto dev_x = Mtx::create(cuda);
+    dev_x->copy_from(host_x.get());
+    auto dev_b = Mtx::create(cuda);
+    dev_b->copy_from(host_b.get());
+    auto host_solver = host_factory->generate(std::move(csr_ref));
+    auto dev_solver = dev_factory->generate(std::move(csr_cuda));
+
+    host_solver->apply(host_b.get(), host_x.get());
+    dev_solver->apply(dev_b.get(), dev_x.get());
+    GKO_ASSERT_MTX_NEAR(dev_x, host_x, 1e-14);
+}
+
+
+}  // namespace
diff --git a/cuda/test/solver/bicgstab_kernels.cpp b/cuda/test/solver/bicgstab_kernels.cpp
index 0f1a0e190e5..c809ad6a17b 100644
--- a/cuda/test/solver/bicgstab_kernels.cpp
+++ b/cuda/test/solver/bicgstab_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,21 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/bicgstab_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/bicgstab_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
@@ -148,8 +150,6 @@ class Bicgstab : public ::testing::Test {
         d_omega = Mtx::create(cuda);
         d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
             new gko::Array<gko::stopping_status>(cuda));
-        d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
-            new gko::Array<gko::stopping_status>(cuda));
 
         d_x->copy_from(x.get());
         d_b->copy_from(b.get());
@@ -259,7 +259,7 @@ TEST_F(Bicgstab, CudaBicgstabInitializeIsEquivalentToRef)
     GKO_EXPECT_MTX_NEAR(d_beta, beta, 1e-14);
     GKO_EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14);
     GKO_EXPECT_MTX_NEAR(d_omega, omega, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/cuda/test/solver/cg_kernels.cpp b/cuda/test/solver/cg_kernels.cpp
index 3c40c3f59c8..65f8d78781f 100644
--- a/cuda/test/solver/cg_kernels.cpp
+++ b/cuda/test/solver/cg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/cg.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/cg_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/cg_kernels.hpp"
+#include "cuda/test/utils.hpp"
+
 
 namespace {
 
@@ -193,7 +196,7 @@ TEST_F(Cg, CudaCgInitializeIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/cuda/test/solver/cgs_kernels.cpp b/cuda/test/solver/cgs_kernels.cpp
index d5a73474147..3e49804ddab 100644
--- a/cuda/test/solver/cgs_kernels.cpp
+++ b/cuda/test/solver/cgs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/cgs.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/cgs_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/cgs_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
@@ -249,7 +251,7 @@ TEST_F(Cgs, CudaCgsInitializeIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_gamma, gamma, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/cuda/test/solver/fcg_kernels.cpp b/cuda/test/solver/fcg_kernels.cpp
index 22d7e5702a4..2b5f3ac5441 100644
--- a/cuda/test/solver/fcg_kernels.cpp
+++ b/cuda/test/solver/fcg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/fcg.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/fcg_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/fcg_kernels.hpp"
+#include "cuda/test/utils.hpp"
+
 
 namespace {
 
@@ -207,7 +210,7 @@ TEST_F(Fcg, CudaFcgInitializeIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_rho_t, rho_t, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/cuda/test/solver/gmres_kernels.cpp b/cuda/test/solver/gmres_kernels.cpp
index 9f731464dac..2dcd4d2653c 100644
--- a/cuda/test/solver/gmres_kernels.cpp
+++ b/cuda/test/solver/gmres_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/gmres.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/gmres_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/gmres_kernels.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
@@ -54,7 +57,14 @@ namespace {
 
 class Gmres : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using norm_type = gko::remove_complex<value_type>;
+    using NormVector = gko::matrix::Dense<norm_type>;
+    template <typename T>
+    using Dense = typename gko::matrix::Dense<T>;
+
     Gmres() : rand_engine(30) {}
 
     void SetUp()
@@ -71,41 +81,39 @@ class Gmres : public ::testing::Test {
         }
     }
 
-    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    template <typename ValueType = value_type, typename IndexType = index_type>
+    std::unique_ptr<Dense<ValueType>> gen_mtx(int num_rows, int num_cols)
     {
-        return gko::test::generate_random_matrix<Mtx>(
+        return gko::test::generate_random_matrix<Dense<ValueType>>(
             num_rows, num_cols,
-            std::uniform_int_distribution<>(num_cols, num_cols),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::uniform_int_distribution<IndexType>(num_cols, num_cols),
+            std::normal_distribution<ValueType>(-1.0, 1.0), rand_engine, ref);
     }
 
-    void initialize_data()
+    void initialize_data(int nrhs = 43)
     {
         int m = 597;
-        int n = 43;
-        x = gen_mtx(m, n);
-        y = gen_mtx(gko::solver::default_krylov_dim, n);
+        x = gen_mtx(m, nrhs);
+        y = gen_mtx(gko::solver::default_krylov_dim, nrhs);
         before_preconditioner = Mtx::create_with_config_of(x.get());
-        b = gen_mtx(m, n);
-        b_norm = gen_mtx(1, n);
-        krylov_bases = gen_mtx(m, (gko::solver::default_krylov_dim + 1) * n);
-        next_krylov_basis = gen_mtx(m, n);
+        b = gen_mtx(m, nrhs);
+        krylov_bases = gen_mtx(m * (gko::solver::default_krylov_dim + 1), nrhs);
         hessenberg = gen_mtx(gko::solver::default_krylov_dim + 1,
-                             gko::solver::default_krylov_dim * n);
-        hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, n);
-        residual = gen_mtx(m, n);
-        residual_norm = gen_mtx(1, n);
+                             gko::solver::default_krylov_dim * nrhs);
+        hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, nrhs);
+        residual = gen_mtx(m, nrhs);
+        residual_norm = gen_mtx<norm_type>(1, nrhs);
         residual_norm_collection =
-            gen_mtx(gko::solver::default_krylov_dim + 1, n);
-        givens_sin = gen_mtx(gko::solver::default_krylov_dim, n);
-        givens_cos = gen_mtx(gko::solver::default_krylov_dim, n);
+            gen_mtx(gko::solver::default_krylov_dim + 1, nrhs);
+        givens_sin = gen_mtx(gko::solver::default_krylov_dim, nrhs);
+        givens_cos = gen_mtx(gko::solver::default_krylov_dim, nrhs);
         stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
-            new gko::Array<gko::stopping_status>(ref, n));
+            new gko::Array<gko::stopping_status>(ref, nrhs));
         for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
             stop_status->get_data()[i].reset();
         }
         final_iter_nums = std::unique_ptr<gko::Array<gko::size_type>>(
-            new gko::Array<gko::size_type>(ref, n));
+            new gko::Array<gko::size_type>(ref, nrhs));
         for (size_t i = 0; i < final_iter_nums->get_num_elems(); ++i) {
             final_iter_nums->get_data()[i] = 5;
         }
@@ -117,19 +125,15 @@ class Gmres : public ::testing::Test {
         d_y->copy_from(y.get());
         d_b = Mtx::create(cuda);
         d_b->copy_from(b.get());
-        d_b_norm = Mtx::create(cuda);
-        d_b_norm->copy_from(b_norm.get());
         d_krylov_bases = Mtx::create(cuda);
         d_krylov_bases->copy_from(krylov_bases.get());
-        d_next_krylov_basis = Mtx::create(cuda);
-        d_next_krylov_basis->copy_from(next_krylov_basis.get());
         d_hessenberg = Mtx::create(cuda);
         d_hessenberg->copy_from(hessenberg.get());
         d_hessenberg_iter = Mtx::create(cuda);
         d_hessenberg_iter->copy_from(hessenberg_iter.get());
         d_residual = Mtx::create(cuda);
         d_residual->copy_from(residual.get());
-        d_residual_norm = Mtx::create(cuda);
+        d_residual_norm = NormVector::create(cuda);
         d_residual_norm->copy_from(residual_norm.get());
         d_residual_norm_collection = Mtx::create(cuda);
         d_residual_norm_collection->copy_from(residual_norm_collection.get());
@@ -138,10 +142,10 @@ class Gmres : public ::testing::Test {
         d_givens_cos = Mtx::create(cuda);
         d_givens_cos->copy_from(givens_cos.get());
         d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
-            new gko::Array<gko::stopping_status>(cuda, n));
+            new gko::Array<gko::stopping_status>(cuda, nrhs));
         *d_stop_status = *stop_status;
         d_final_iter_nums = std::unique_ptr<gko::Array<gko::size_type>>(
-            new gko::Array<gko::size_type>(cuda, n));
+            new gko::Array<gko::size_type>(cuda, nrhs));
         *d_final_iter_nums = *final_iter_nums;
     }
 
@@ -154,13 +158,11 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<Mtx> x;
     std::unique_ptr<Mtx> y;
     std::unique_ptr<Mtx> b;
-    std::unique_ptr<Mtx> b_norm;
     std::unique_ptr<Mtx> krylov_bases;
-    std::unique_ptr<Mtx> next_krylov_basis;
     std::unique_ptr<Mtx> hessenberg;
     std::unique_ptr<Mtx> hessenberg_iter;
     std::unique_ptr<Mtx> residual;
-    std::unique_ptr<Mtx> residual_norm;
+    std::unique_ptr<NormVector> residual_norm;
     std::unique_ptr<Mtx> residual_norm_collection;
     std::unique_ptr<Mtx> givens_sin;
     std::unique_ptr<Mtx> givens_cos;
@@ -171,13 +173,11 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<Mtx> d_before_preconditioner;
     std::unique_ptr<Mtx> d_y;
     std::unique_ptr<Mtx> d_b;
-    std::unique_ptr<Mtx> d_b_norm;
     std::unique_ptr<Mtx> d_krylov_bases;
-    std::unique_ptr<Mtx> d_next_krylov_basis;
     std::unique_ptr<Mtx> d_hessenberg;
     std::unique_ptr<Mtx> d_hessenberg_iter;
     std::unique_ptr<Mtx> d_residual;
-    std::unique_ptr<Mtx> d_residual_norm;
+    std::unique_ptr<NormVector> d_residual_norm;
     std::unique_ptr<Mtx> d_residual_norm_collection;
     std::unique_ptr<Mtx> d_givens_sin;
     std::unique_ptr<Mtx> d_givens_cos;
@@ -191,18 +191,17 @@ TEST_F(Gmres, CudaGmresInitialize1IsEquivalentToRef)
     initialize_data();
 
     gko::kernels::reference::gmres::initialize_1(
-        ref, b.get(), b_norm.get(), residual.get(), givens_sin.get(),
-        givens_cos.get(), stop_status.get(), gko::solver::default_krylov_dim);
+        ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(),
+        stop_status.get(), gko::solver::default_krylov_dim);
     gko::kernels::cuda::gmres::initialize_1(
-        cuda, d_b.get(), d_b_norm.get(), d_residual.get(), d_givens_sin.get(),
+        cuda, d_b.get(), d_residual.get(), d_givens_sin.get(),
         d_givens_cos.get(), d_stop_status.get(),
         gko::solver::default_krylov_dim);
 
-    GKO_ASSERT_MTX_NEAR(d_b_norm, b_norm, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_residual, residual, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
@@ -223,7 +222,7 @@ TEST_F(Gmres, CudaGmresInitialize2IsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection,
                         1e-14);
     GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
 }
 
 
@@ -233,17 +232,41 @@ TEST_F(Gmres, CudaGmresStep1IsEquivalentToRef)
     int iter = 5;
 
     gko::kernels::reference::gmres::step_1(
-        ref, next_krylov_basis.get(), givens_sin.get(), givens_cos.get(),
+        ref, x->get_size()[0], givens_sin.get(), givens_cos.get(),
+        residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(),
+        hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get());
+    gko::kernels::cuda::gmres::step_1(
+        cuda, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(),
+        d_residual_norm.get(), d_residual_norm_collection.get(),
+        d_krylov_bases.get(), d_hessenberg_iter.get(), iter,
+        d_final_iter_nums.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection,
+                        1e-14);
+    GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
+}
+
+
+TEST_F(Gmres, CudaGmresStep1OnSingleRHSIsEquivalentToRef)
+{
+    initialize_data(1);
+    int iter = 5;
+
+    gko::kernels::reference::gmres::step_1(
+        ref, x->get_size()[0], givens_sin.get(), givens_cos.get(),
         residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(),
-        hessenberg_iter.get(), b_norm.get(), iter, final_iter_nums.get(),
-        stop_status.get());
+        hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get());
     gko::kernels::cuda::gmres::step_1(
-        cuda, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(),
+        cuda, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(),
         d_residual_norm.get(), d_residual_norm_collection.get(),
-        d_krylov_bases.get(), d_hessenberg_iter.get(), d_b_norm.get(), iter,
+        d_krylov_bases.get(), d_hessenberg_iter.get(), iter,
         d_final_iter_nums.get(), d_stop_status.get());
 
-    GKO_ASSERT_MTX_NEAR(d_next_krylov_basis, next_krylov_basis, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14);
@@ -251,7 +274,7 @@ TEST_F(Gmres, CudaGmresStep1IsEquivalentToRef)
                         1e-14);
     GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
 }
 
 
diff --git a/cuda/test/solver/ir_kernels.cpp b/cuda/test/solver/ir_kernels.cpp
index 1265f637f76..35b844274b9 100644
--- a/cuda/test/solver/ir_kernels.cpp
+++ b/cuda/test/solver/ir_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,21 +33,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/ir.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/ir_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/solver/ir_kernels.hpp"
+#include "cuda/test/utils.hpp"
+
+
 namespace {
 
 
@@ -133,4 +136,124 @@ TEST_F(Ir, ApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef)
+{
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(cuda, mtx);
+    auto d_x = clone(cuda, x);
+    auto d_b = clone(cuda, b);
+    // Same setup on ref and cuda: IR (max. 2 its) around GMRES (max. 1 it).
+    auto ir_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(
+                gko::solver::Gmres<>::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(1u).on(
+                            ref))
+                    .on(ref))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .on(ref);
+    auto d_ir_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(
+                gko::solver::Gmres<>::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(1u).on(
+                            cuda))
+                    .on(cuda))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(cuda))
+            .on(cuda);
+    auto solver = ir_factory->generate(std::move(mtx));
+    auto d_solver = d_ir_factory->generate(std::move(d_mtx));
+    // Solve with the reference solver and with the CUDA solver.
+    solver->apply(lend(b), lend(x));
+    d_solver->apply(lend(d_b), lend(d_x));
+
+    // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres
+    // iteration gets amplified by the difference in IR.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+// IR with relaxation factor 0.9 and no with_solver() call: ref vs. cuda.
+TEST_F(Ir, RichardsonApplyIsEquivalentToRef)
+{
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(cuda, mtx);
+    auto d_x = clone(cuda, x);
+    auto d_b = clone(cuda, b);
+    // Forget about accuracy - Richardson is not going to converge for a random
+    // matrix, just check that a couple of iterations gives the same result on
+    // both executors
+    auto ir_factory =
+        gko::solver::Ir<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_relaxation_factor(0.9)
+            .on(ref);
+    auto d_ir_factory =
+        gko::solver::Ir<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(cuda))
+            .with_relaxation_factor(0.9)
+            .on(cuda);
+    auto solver = ir_factory->generate(std::move(mtx));
+    auto d_solver = d_ir_factory->generate(std::move(d_mtx));
+    // Solve with the reference solver and with the CUDA solver.
+    solver->apply(lend(b), lend(x));
+    d_solver->apply(lend(d_b), lend(d_x));
+    // The CUDA result is compared to the reference with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+// IR with relaxation factor 0.9 around a max-1-iteration GMRES: ref vs. cuda.
+TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef)
+{
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(cuda, mtx);
+    auto d_x = clone(cuda, x);
+    auto d_b = clone(cuda, b);
+    auto ir_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(
+                gko::solver::Gmres<>::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(1u).on(
+                            ref))
+                    .on(ref))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_relaxation_factor(0.9)
+            .on(ref);
+    auto d_ir_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(
+                gko::solver::Gmres<>::build()
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(1u).on(
+                            cuda))
+                    .on(cuda))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(cuda))
+            .with_relaxation_factor(0.9)
+            .on(cuda);
+    auto solver = ir_factory->generate(std::move(mtx));
+    auto d_solver = d_ir_factory->generate(std::move(d_mtx));
+    // Solve with the reference solver and with the CUDA solver.
+    solver->apply(lend(b), lend(x));
+    d_solver->apply(lend(d_b), lend(d_x));
+
+    // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres
+    // iteration gets amplified by the difference in IR.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
 }  // namespace
diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp
index c855d955635..a2cac176e8c 100644
--- a/cuda/test/solver/lower_trs_kernels.cpp
+++ b/cuda/test/solver/lower_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cuda.h>
+
+
 #include <gtest/gtest.h>
 
 
@@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/solver/lower_trs_kernels.hpp"
-#include "core/test/utils.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp
index cd5584ff452..92a76b1e47b 100644
--- a/cuda/test/solver/upper_trs_kernels.cpp
+++ b/cuda/test/solver/upper_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cuda.h>
+
+
 #include <gtest/gtest.h>
 
 
@@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/solver/upper_trs_kernels.hpp"
-#include "core/test/utils.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/stop/CMakeLists.txt b/cuda/test/stop/CMakeLists.txt
index 5e686b8fbb4..0ba0781e077 100644
--- a/cuda/test/stop/CMakeLists.txt
+++ b/cuda/test/stop/CMakeLists.txt
@@ -1,2 +1,2 @@
 ginkgo_create_test(criterion_kernels)
-ginkgo_create_test(residual_norm_reduction_kernels)
+ginkgo_create_test(residual_norm_kernels)
diff --git a/cuda/test/stop/criterion_kernels.cpp b/cuda/test/stop/criterion_kernels.cpp
index a690cb1fafd..8265ffea284 100644
--- a/cuda/test/stop/criterion_kernels.cpp
+++ b/cuda/test/stop/criterion_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,12 +31,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 #include <ginkgo/core/stop/criterion.hpp>
-#include <ginkgo/core/stop/iteration.hpp>
 
 
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/stop/iteration.hpp>
+
+
+#include "cuda/test/utils.hpp"
+
+
 namespace {
 
 
diff --git a/cuda/test/stop/residual_norm_kernels.cpp b/cuda/test/stop/residual_norm_kernels.cpp
new file mode 100644
index 00000000000..ec5dc3bf511
--- /dev/null
+++ b/cuda/test/stop/residual_norm_kernels.cpp
@@ -0,0 +1,369 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+constexpr double tol = 1.0e-14;
+
+// Fixture building a ResidualNormReduction factory on a CUDA executor.
+class ResidualNormReduction : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    // Creates ref_ and cuda_, then builds factory_ on cuda_ using tol.
+    ResidualNormReduction()
+    {
+        ref_ = gko::ReferenceExecutor::create();
+        cuda_ = gko::CudaExecutor::create(0, ref_);
+        factory_ = gko::stop::ResidualNormReduction<>::build()
+                       .with_reduction_factor(tol)
+                       .on(cuda_);
+    }
+
+    std::unique_ptr<gko::stop::ResidualNormReduction<>::Factory> factory_;
+    std::shared_ptr<const gko::CudaExecutor> cuda_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+};
+
+// check() must stay false until the residual value reaches 0.9 * tol * 100.
+TEST_F(ResidualNormReduction, WaitsTillResidualGoal)
+{
+    auto res = gko::initialize<Mtx>({100.0}, ref_);
+    auto d_res = Mtx::create(cuda_);
+    d_res->copy_from(res.get());
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({10.0}, ref_);
+    std::shared_ptr<gko::LinOp> d_rhs = Mtx::create(cuda_);
+    d_rhs->copy_from(rhs.get());
+    auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get());
+    bool one_changed{};
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(ref_, 1);
+    stop_status.get_data()[0].reset();
+    stop_status.set_executor(cuda_);
+    // Residual still holds its initial value 100: the criterion must not stop.
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    // 1.1 * tol * 100 (100 is the initial value of res): still no stop.
+    res->at(0) = tol * 1.1e+2;
+    d_res->copy_from(res.get());
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_FALSE(stop_status.get_data()[0].has_converged());
+    stop_status.set_executor(cuda_);
+    ASSERT_FALSE(one_changed);
+    // 0.9 * tol * 100: check() must return true and flag the entry converged.
+    res->at(0) = tol * 0.9e+2;
+    d_res->copy_from(res.get());
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[0].has_converged());
+    ASSERT_TRUE(one_changed);
+}
+
+// With two columns, check() must stay false until both columns reach the goal.
+TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
+{
+    auto res = gko::initialize<Mtx>({{100.0, 100.0}}, ref_);
+    auto d_res = Mtx::create(cuda_);
+    d_res->copy_from(res.get());
+    std::shared_ptr<gko::LinOp> rhs =
+        gko::initialize<Mtx>({{10.0, 10.0}}, ref_);
+    std::shared_ptr<gko::LinOp> d_rhs = Mtx::create(cuda_);
+    d_rhs->copy_from(rhs.get());
+    auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get());
+    bool one_changed{};
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(ref_, 2);
+    stop_status.get_data()[0].reset();
+    stop_status.get_data()[1].reset();
+    stop_status.set_executor(cuda_);
+    // Both columns still hold the initial value 100: nothing may stop.
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    // Only column 0 drops to 0.9 * tol * 100: check() is false, column 0 done.
+    res->at(0, 0) = tol * 0.9e+2;
+    d_res->copy_from(res.get());
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[0].has_converged());
+    stop_status.set_executor(cuda_);
+    ASSERT_TRUE(one_changed);
+    // Column 1 drops as well: check() must now return true.
+    res->at(0, 1) = tol * 0.9e+2;
+    d_res->copy_from(res.get());
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[1].has_converged());
+    ASSERT_TRUE(one_changed);
+}
+
+// Fixture building a RelativeResidualNorm factory on a CUDA executor.
+class RelativeResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    // Creates ref_ and cuda_, then builds factory_ on cuda_ with tolerance tol.
+    RelativeResidualNorm()
+    {
+        ref_ = gko::ReferenceExecutor::create();
+        cuda_ = gko::CudaExecutor::create(0, ref_);
+        factory_ =
+            gko::stop::RelativeResidualNorm<>::build().with_tolerance(tol).on(
+                cuda_);
+    }
+
+    std::unique_ptr<gko::stop::RelativeResidualNorm<>::Factory> factory_;
+    std::shared_ptr<const gko::CudaExecutor> cuda_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+};
+
+// check() must stay false until the residual value reaches 0.9 * tol * 10.
+TEST_F(RelativeResidualNorm, WaitsTillResidualGoal)
+{
+    auto res = gko::initialize<Mtx>({100.0}, ref_);
+    auto d_res = Mtx::create(cuda_);
+    d_res->copy_from(res.get());
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({10.0}, ref_);
+    std::shared_ptr<gko::LinOp> d_rhs = Mtx::create(cuda_);
+    d_rhs->copy_from(rhs.get());
+    auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get());
+    bool one_changed{};
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(ref_, 1);
+    stop_status.get_data()[0].reset();
+    stop_status.set_executor(cuda_);
+    // Residual still holds its initial value 100: the criterion must not stop.
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    // 1.1 * tol * 10 (10 is the value stored in rhs): still no stop.
+    res->at(0) = tol * 1.1e+1;
+    d_res->copy_from(res.get());
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_FALSE(stop_status.get_data()[0].has_converged());
+    stop_status.set_executor(cuda_);
+    ASSERT_FALSE(one_changed);
+    // 0.9 * tol * 10: check() must return true and flag the entry converged.
+    res->at(0) = tol * 0.9e+1;
+    d_res->copy_from(res.get());
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[0].has_converged());
+    ASSERT_TRUE(one_changed);
+}
+
+// With two columns, check() must stay false until both columns reach the goal.
+TEST_F(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    auto res = gko::initialize<Mtx>({{100.0, 100.0}}, ref_);
+    auto d_res = Mtx::create(cuda_);
+    d_res->copy_from(res.get());
+    std::shared_ptr<gko::LinOp> rhs =
+        gko::initialize<Mtx>({{10.0, 10.0}}, ref_);
+    std::shared_ptr<gko::LinOp> d_rhs = Mtx::create(cuda_);
+    d_rhs->copy_from(rhs.get());
+    auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get());
+    bool one_changed{};
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(ref_, 2);
+    stop_status.get_data()[0].reset();
+    stop_status.get_data()[1].reset();
+    stop_status.set_executor(cuda_);
+    // Both columns still hold the initial value 100: nothing may stop.
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    // Only column 0 drops to 0.9 * tol * 10: check() is false, column 0 done.
+    res->at(0, 0) = tol * 0.9e+1;
+    d_res->copy_from(res.get());
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[0].has_converged());
+    stop_status.set_executor(cuda_);
+    ASSERT_TRUE(one_changed);
+    // Column 1 drops as well: check() must now return true.
+    res->at(0, 1) = tol * 0.9e+1;
+    d_res->copy_from(res.get());
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[1].has_converged());
+    ASSERT_TRUE(one_changed);
+}
+
+// Fixture building an AbsoluteResidualNorm factory on a CUDA executor.
+class AbsoluteResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    // Creates ref_ and cuda_, then builds factory_ on cuda_ with tolerance tol.
+    AbsoluteResidualNorm()
+    {
+        ref_ = gko::ReferenceExecutor::create();
+        cuda_ = gko::CudaExecutor::create(0, ref_);
+        factory_ =
+            gko::stop::AbsoluteResidualNorm<>::build().with_tolerance(tol).on(
+                cuda_);
+    }
+
+    std::unique_ptr<gko::stop::AbsoluteResidualNorm<>::Factory> factory_;
+    std::shared_ptr<const gko::CudaExecutor> cuda_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+};
+
+// check() must stay false until the residual value reaches 0.9 * tol.
+TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoal)
+{
+    auto res = gko::initialize<Mtx>({100.0}, ref_);
+    auto d_res = Mtx::create(cuda_);
+    d_res->copy_from(res.get());
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({10.0}, ref_);
+    std::shared_ptr<gko::LinOp> d_rhs = Mtx::create(cuda_);
+    d_rhs->copy_from(rhs.get());
+    auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get());
+    bool one_changed{};
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(ref_, 1);
+    stop_status.get_data()[0].reset();
+    stop_status.set_executor(cuda_);
+    // Residual still holds its initial value 100: the criterion must not stop.
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    // 1.1 * tol, i.e. slightly above the configured tolerance: still no stop.
+    res->at(0) = tol * 1.1;
+    d_res->copy_from(res.get());
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_FALSE(stop_status.get_data()[0].has_converged());
+    stop_status.set_executor(cuda_);
+    ASSERT_FALSE(one_changed);
+    // 0.9 * tol: check() must return true and flag the entry converged.
+    res->at(0) = tol * 0.9;
+    d_res->copy_from(res.get());
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[0].has_converged());
+    ASSERT_TRUE(one_changed);
+}
+
+// With two columns, check() must stay false until both columns reach the goal.
+TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    auto res = gko::initialize<Mtx>({{100.0, 100.0}}, ref_);
+    auto d_res = Mtx::create(cuda_);
+    d_res->copy_from(res.get());
+    std::shared_ptr<gko::LinOp> rhs =
+        gko::initialize<Mtx>({{10.0, 10.0}}, ref_);
+    std::shared_ptr<gko::LinOp> d_rhs = Mtx::create(cuda_);
+    d_rhs->copy_from(rhs.get());
+    auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get());
+    bool one_changed{};
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(ref_, 2);
+    stop_status.get_data()[0].reset();
+    stop_status.get_data()[1].reset();
+    stop_status.set_executor(cuda_);
+    // Both columns still hold the initial value 100: nothing may stop.
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    // Only column 0 drops to 0.9 * tol: check() is false, column 0 is done.
+    res->at(0, 0) = tol * 0.9;
+    d_res->copy_from(res.get());
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[0].has_converged());
+    stop_status.set_executor(cuda_);
+    ASSERT_TRUE(one_changed);
+    // Column 1 drops as well: check() must now return true.
+    res->at(0, 1) = tol * 0.9;
+    d_res->copy_from(res.get());
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(d_res.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    stop_status.set_executor(ref_);
+    ASSERT_TRUE(stop_status.get_data()[1].has_converged());
+    ASSERT_TRUE(one_changed);
+}
+
+
+}  // namespace
diff --git a/cuda/test/stop/residual_norm_reduction_kernels.cpp b/cuda/test/stop/residual_norm_reduction_kernels.cpp
deleted file mode 100644
index 9190590ebd7..00000000000
--- a/cuda/test/stop/residual_norm_reduction_kernels.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
-
-
-#include <gtest/gtest.h>
-
-
-namespace {
-
-
-constexpr double reduction_factor = 1.0e-14;
-
-
-class ResidualNormReduction : public ::testing::Test {
-protected:
-    using Mtx = gko::matrix::Dense<>;
-
-    ResidualNormReduction()
-    {
-        ref_ = gko::ReferenceExecutor::create();
-        cuda_ = gko::CudaExecutor::create(0, ref_);
-        factory_ = gko::stop::ResidualNormReduction<>::build()
-                       .with_reduction_factor(reduction_factor)
-                       .on(cuda_);
-    }
-
-    std::unique_ptr<gko::stop::ResidualNormReduction<>::Factory> factory_;
-    std::shared_ptr<const gko::CudaExecutor> cuda_;
-    std::shared_ptr<gko::ReferenceExecutor> ref_;
-};
-
-
-TEST_F(ResidualNormReduction, WaitsTillResidualGoal)
-{
-    auto scalar = gko::initialize<Mtx>({1.0}, ref_);
-    auto d_scalar = Mtx::create(cuda_);
-    d_scalar->copy_from(scalar.get());
-    auto criterion =
-        factory_->generate(nullptr, nullptr, nullptr, d_scalar.get());
-    bool one_changed{};
-    constexpr gko::uint8 RelativeStoppingId{1};
-    gko::Array<gko::stopping_status> stop_status(ref_, 1);
-    stop_status.get_data()[0].reset();
-    stop_status.set_executor(cuda_);
-
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(d_scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-
-    scalar->at(0) = reduction_factor * 1.0e+2;
-    d_scalar->copy_from(scalar.get());
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(d_scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    stop_status.set_executor(ref_);
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
-    stop_status.set_executor(cuda_);
-    ASSERT_EQ(one_changed, false);
-
-    scalar->at(0) = reduction_factor * 1.0e-2;
-    d_scalar->copy_from(scalar.get());
-    ASSERT_TRUE(
-        criterion->update()
-            .residual_norm(d_scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    stop_status.set_executor(ref_);
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-}
-
-
-TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
-{
-    auto mtx = gko::initialize<Mtx>({{1.0, 1.0}}, ref_);
-    auto d_mtx = Mtx::create(cuda_);
-    d_mtx->copy_from(mtx.get());
-    auto criterion = factory_->generate(nullptr, nullptr, nullptr, d_mtx.get());
-    bool one_changed{};
-    constexpr gko::uint8 RelativeStoppingId{1};
-    gko::Array<gko::stopping_status> stop_status(ref_, 2);
-    stop_status.get_data()[0].reset();
-    stop_status.get_data()[1].reset();
-    stop_status.set_executor(cuda_);
-
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(d_mtx.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-
-    mtx->at(0, 0) = reduction_factor * 1.0e-2;
-    d_mtx->copy_from(mtx.get());
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(d_mtx.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    stop_status.set_executor(ref_);
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
-    stop_status.set_executor(cuda_);
-    ASSERT_EQ(one_changed, true);
-
-    mtx->at(0, 1) = reduction_factor * 1.0e-2;
-    d_mtx->copy_from(mtx.get());
-    ASSERT_TRUE(
-        criterion->update()
-            .residual_norm(d_mtx.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    stop_status.set_executor(ref_);
-    ASSERT_EQ(stop_status.get_data()[1].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-}
-
-
-}  // namespace
diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp
new file mode 100644
index 00000000000..903ed6a77c3
--- /dev/null
+++ b/cuda/test/utils.hpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_TEST_UTILS_HPP_
+#define GKO_CUDA_TEST_UTILS_HPP_
+
+
+#include "core/test/utils.hpp"
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace {
+
+
+// namespace-scope executor on device 0; prevents device reset after each test
+auto no_reset_exec =
+    gko::CudaExecutor::create(0, gko::ReferenceExecutor::create(), true);
+
+
+}  // namespace
+
+
+#endif  // GKO_CUDA_TEST_UTILS_HPP_
diff --git a/cuda/test/utils/assertions_test.cpp b/cuda/test/utils/assertions_test.cpp
index d5e385eea8a..71a2fb0109b 100644
--- a/cuda/test/utils/assertions_test.cpp
+++ b/cuda/test/utils/assertions_test.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
-#include <core/test/utils/assertions.hpp>
+#include "core/test/utils/assertions.hpp"
 
 
 #include <gtest/gtest.h>
@@ -41,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "cuda/test/utils.hpp"
+
+
 namespace {
 
 
@@ -70,10 +72,10 @@ TEST_F(MatricesNear, CanPassCudaMatrix)
     auto mtx = gko::initialize<gko::matrix::Dense<>>(
         {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, ref);
     // TODO: CUDA conversion Dense -> Csr not yet implemented
-    auto csr_omp = gko::matrix::Csr<>::create(ref);
-    csr_omp->copy_from(mtx.get());
+    auto csr_ref = gko::matrix::Csr<>::create(ref);
+    csr_ref->copy_from(mtx.get());
     auto csr_mtx = gko::matrix::Csr<>::create(cuda);
-    csr_mtx->copy_from(std::move(csr_omp));
+    csr_mtx->copy_from(std::move(csr_ref));
 
     GKO_EXPECT_MTX_NEAR(csr_mtx, mtx, 0.0);
     GKO_ASSERT_MTX_NEAR(csr_mtx, mtx, 0.0);
diff --git a/dev_tools/containers/README.md b/dev_tools/containers/README.md
deleted file mode 100644
index 7e3a1818f2d..00000000000
--- a/dev_tools/containers/README.md
+++ /dev/null
@@ -1,161 +0,0 @@
-# Purpose
-The purpose of this file is to explain how to create or use containers for Ginkgo. 
-
-Custom containers are used in Ginkgo in order to test the correct functionality
-of the library. As Ginkgo is a C++ CUDA-enabled library, it is important to test
-both a wide variety of compilers and CUDA versions as part of the development
-process. This allows to ensure Ginkgo is and stays compatible with the specified
-compilers and CUDA versions.
-# Tools used
-To create and deploy containers, we will use:
-+ [NVIDIA's container registry](https://ngc.nvidia.com/registry/nvidia-cuda)
-+ [NVIDIA HPC Container Maker (HPCCM)](https://github.com/NVIDIA/hpc-container-maker/)
-+ [nvidia-docker2](https://github.com/NVIDIA/nvidia-docker) should be installed and available
-+ A [local docker registry](https://docs.docker.com/registry/deploying/#run-a-local-registry) should be up and running
-+ docker and gitlab-runner
-# Ginkgo containers
-Creating container images is a tedious task. The [usual
-process](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/)
-requires writing what is called a `Dockerfile` which contains all commands
-needed to build an image.
-
-To facilitate building new docker images, it is advised to start with an already
-existing container image (such as an ubuntu image), and extend it with new
-functionalities to generate a new container. In our context this is what we will
-be doing. Nevertheless, to facilitate container generation we have decided to
-rely on NVIDIA's HPCCM.
-
-## Ginkgo HPCCM recipes
-HPCCM facilitates the container creation process significantly through a
-high-level interface. HPCCM uses 'recipes', python files containing base
-instructions, similar to a cookbook, tailored to generate Dockerfiles. Recipes
-can take in arguments which allows to reuse the same recipe for building a wide
-variety of containers. By default, HPCCM supports multiple packages and Linux
-distributions which increases the portability of the HPCCM recipes. 
-
-### Description
-Ginkgo provides two recipes for creating containers. They are :
-+ ginkgo-cuda-base.py: based on [NVIDIA's docker images](https://ngc.nvidia.com/registry/nvidia-cuda)
-+ ginkgo-nocuda-base.py: based on the basic ubuntu image
-
-There is minor differences, but all of Ginkgo's recipes install the following
-packages:
-+ GNU compilers
-+ LLVM/Clang
-+ Intel Compilers
-+ OpenMP
-+ Python 2 and 3
-+ cmake
-+ git, openssh, doxygen, curl (these are required for some synchronization or
-  documentation building jobs)
-+ valgrind, graphviz, jq (documentation and debugging)
-
-### CUDA recipes
-Every container is tailored to have matching CUDA, GNU Compilers and LLVM/Clang
-versions. The information for compatible versions can usually be found in
-[NVIDIA's
-documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html).
-
-+ CUDA is provided by default from nvidia-cuda, and requires no particular setup.
-+ GNU and Clang compilers should use the extra_packages argument in order to
-  have access to a repository providing all compiler versions (otherwise the
-  default limit is gcc 5.4).
-+ Arguments can be provided for CUDA, GNU and LLVM version.
-+ It is required to use `libomp-dev` library for Clang+OpenMP to work.
-+ hwloc is built and the server's topology is added to the container.
-+ Finally, `LIBRARY_PATH` and `LD_LIBRARY_PATH` are properly setup for the CUDA
-  library. For proper CMake detection of the GPUs, this should maybe be
-  extended.
-  
-  
-The dockerfiles and container images already generated are:
-+ CUDA 9.0, GNU 5.5, LLVM 3.9, no Intel
-+ CUDA 9.1, GNU 6, LLVM 4.0, Intel 2017 update 4
-+ CUDA 9.2, GNU 7, LLVM 5.0, Intel 2017 update 4
-+ CUDA 10.0, GNU 7, LLVM 6.0, Intel 2018 update 1
-+ CUDA 10.1, GNU 8, LLVM 7, Intel 2019 update 4
-
-### No CUDA recipe
-Because CUDA limits the versions of compilers it can work with, it is good
-practice to provide non-CUDA containers, particularly for the more recent
-compilers.
-
-The base image for this recipe is the same Ubuntu version as NVIDIA's to keep
-the systems as similar as possible. There is only one extra difference for this
-recipe: the image is very light and does not include the `make` command by
-default, so it is necessary to add the `build-essential` package to the
-requirements.
-
-In addition to the previous argument, an extra `papi` argument can be given.
-This argument can be set to `True` to indicate that the image should be built
-for papi support. In this case, if papi files can be found, the library perfmon
-(`libpfm4`) is added do the docker container and papi files are copied to the
-container from a folder named `papi/` with the following format:
-+ `papi/include`: papi include files
-+ `papi/lib`: papi pre-built library files
-+ `papi/bin`: papi pre-built binary files
-
-The dockerfiles and container images already generated are:
-+ GNU 9, LLVM 8 , Intel 2019 update 4.
-## Using HPCCM recipes and docker to create containers
-The following explains how to use recipes and docker to create new containers.
-### Generate the Dockerfile
-This is done with the NVIDIA's HPCCM tool. A base recipe should be given and the
- output should be written to a file, the dockerfile.
-```bash
- hpccm --recipe ginkgo-cuda-base.py --userarg cuda=10.0 gnu=8 llvm=6.0 > gko-cuda100-gnu8-llvm60.baseimage
-```
-### Using docker to build the container
-The command simply uses `docker build` the standard command for container
-generation. 
-```bash
-docker build -t localhost:5000/gko-cuda100-gnu8-llvm60 -f gko-cuda100-gnu8-llvm60.baseimage .
-```
-A name is given to the image through `-t tag`. It is required to append
-`localhost:5000/` to designate our server's local container registry.
-The base image (or dockerfile) is given through the `-f` argument.
-The path given here is `.`. This is important if building an image from the
-`gko-nocuda-base` base image. This indicates the path where the relevant papi
-pre-built files (`papi/include/...`, etc) files to be put into the container can
-be found.
-### Test the generated container
-The created container should be tested to ensure all supposed functionalities
-are present and properly working. Here is a standard procedure for this:
-```bash
-# get interactive access to a container
-docker run --rm --runtime=nvidia -ti localhost:5000/gko-cuda100-gnu8-llvm60 
-nvidia-smi
-g++ --version
-clang++ --version
-```
-
-In addition, it can be useful to test the CUDA and OpenMP functionality, for
-this purpose a short C-program can be created such as:
-```c++
-	 #include <cuda.h>
-	 #include <omp.h>
-	 #define SOME_LIMIT 10000
-
-	 int main()
-	 {
-		 cuInit(0); //whatever other CUDA standard API call
-
-		 int acc = 0;
-	 #pragma omp parallel for
-		 for (int i=0; i<SOME_LIMIT; i++)
-			 acc+=i*4+1; //whatever
-		 return 0;
-	 }
-```
-
-This should be compiled and tested with
-+ g++ test.cpp -lcuda -I/usr/local/cuda/include -fopenmp
-+ clang++ test.cpp -lcuda -I/usr/local/cuda/include -fopenmp
-
-### Push the container to the local registry
-Assuming the local registry is up and running, then with the previous steps
-properly done (i.e. appending localhost:5000/ to all docker image names), then
-pushing the image to the registry is:
-```bash
-docker push localhost:5000/gko-cuda100-gnu8-llvm60
-```
diff --git a/dev_tools/containers/bin/include-what-you-use b/dev_tools/containers/bin/include-what-you-use
deleted file mode 100755
index 5e221b31c91..00000000000
Binary files a/dev_tools/containers/bin/include-what-you-use and /dev/null differ
diff --git a/dev_tools/containers/build_all_containers.sh b/dev_tools/containers/build_all_containers.sh
deleted file mode 100755
index c58fc08eae0..00000000000
--- a/dev_tools/containers/build_all_containers.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-
-hpccm --recipe ginkgo-nocuda-base.py --userarg gnu=8 llvm=7 papi=True > gko-nocuda-gnu8-llvm70.baseimage
-list=('gko-nocuda-gnu8-llvm70.baseimage')
-hpccm --recipe ginkgo-nocuda-base.py --userarg gnu=9 llvm=8 papi=True > gko-nocuda-gnu9-llvm8.baseimage
-list+=('gko-nocuda-gnu9-llvm8.baseimage')
-if [ "$HOSTNAME" = "amdci" ]; then
-  list+=('gko-amd-gnu7-llvm60.baseimage')
-else
-  hpccm --recipe ginkgo-cuda-base.py --userarg cuda=10.1 gnu=8 llvm=7 > gko-cuda101-gnu8-llvm70.baseimage
-  hpccm --recipe ginkgo-cuda-base.py --userarg cuda=10.0 gnu=7 llvm=6.0 > gko-cuda100-gnu7-llvm60.baseimage
-  hpccm --recipe ginkgo-cuda-base.py --userarg cuda=9.2 gnu=7 llvm=5.0 > gko-cuda92-gnu7-llvm50.baseimage
-  hpccm --recipe ginkgo-cuda-base.py --userarg cuda=9.1 gnu=6 llvm=4.0 > gko-cuda91-gnu6-llvm40.baseimage
-  hpccm --recipe ginkgo-cuda-base.py --userarg cuda=9.0 gnu=5 llvm=3.9 > gko-cuda90-gnu5-llvm39.baseimage
-  list+=(gko-cuda*.baseimage)
-fi
-
-for i in "${list[@]}"
-do
-  name=$(echo $i | cut -d"." -f1)
-  docker build -t localhost:5000/$name -f $i .
-  docker push localhost:5000/$name
-done
diff --git a/dev_tools/containers/ginkgo-cuda-base.py b/dev_tools/containers/ginkgo-cuda-base.py
deleted file mode 100644
index 636ccd02015..00000000000
--- a/dev_tools/containers/ginkgo-cuda-base.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""
-Ginkgo Base image
-Contents:
-    CUDA version set by the user
-    GNU compilers version set by the user
-    LLVM/Clang clang-tidy version set by the user
-    Intel ICC and ICPC version set according to the CUDA version
-    OpenMP latest apt version for Clang+OpenMP
-    Python 2 and 3 (upstream)
-    cmake (upstream)
-    git, openssh, doxygen, curl, valgrind, graphviz, jq latest apt version
-    build-essential, automake, pkg-config, libtool, latest apt version
-    iwyu precompiled version 6.0
-    libthrust-dev latest apt version
-    gnupg-agent: latest apt version, for adding custom keys
-"""
-# pylint: disable=invalid-name, undefined-variable, used-before-assignment
-
-import os
-
-cuda_version = USERARG.get('cuda', '10.0')
-
-release_name = 'xenial'
-image = 'nvidia/cuda:{}-devel-ubuntu16.04'.format(cuda_version)
-Stage0.baseimage(image)
-
-
-# Correctly set the LIBRARY_PATH
-Stage0 += environment(variables={'CUDA_INSTALL_PATH': '/usr/local/cuda/'})
-Stage0 += environment(variables={'CUDA_PATH': '/usr/local/cuda/'})
-Stage0 += environment(variables={'CUDA_ROOT': '/usr/local/cuda/'})
-Stage0 += environment(variables={'CUDA_SDK': '/usr/local/cuda/'})
-Stage0 += environment(variables={'CUDA_INC_PATH': '/usr/local/cuda/include'})
-Stage0 += environment(variables={'PATH': '$PATH:/usr/local/cuda/bin'})
-Stage0 += environment(variables={'LIBRARY_PATH': '$LIBRARY_PATH:/usr/local/cuda/lib64/stubs'})
-Stage0 += environment(variables={'LD_LIBRARY_PATH': '$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs'})
-Stage0 += environment(variables={'LD_RUN_PATH': 'usr/local/cuda/lib64/stubs'})
-Stage0 += environment(variables={'INCLUDEPATH': '/usr/local/cuda/include'})
-Stage0 += environment(variables={'CPATH': '/usr/local/cuda/include'})
-Stage0 += environment(variables={'MANPATH': '/usr/local/cuda/doc/man'})
-
-
-# Setup extra tools
-Stage0 += python()
-Stage0 += cmake(eula=True, version='3.14.5')
-Stage0 += apt_get(ospackages=['git', 'openssh-client', 'doxygen', 'curl', 'valgrind', 'graphviz'])
-Stage0 += apt_get(ospackages=['jq', 'iwyu'])
-Stage0 += apt_get(ospackages=['build-essential', 'automake', 'pkg-config', 'libtool'])
-Stage0 += apt_get(ospackages=['libthrust-dev'])
-Stage0 += apt_get(ospackages=['gnupg-agent'])
-Stage0 += apt_get(ospackages=['ca-certificates']) # weird github certificates problem
-
-# GNU compilers
-gnu_version = USERARG.get('gnu', '7')
-Stage0 += gnu(version=gnu_version, extra_repository=True)
-gcov_update = ['update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-{} 90'.format(gnu_version)]
-Stage0 += shell(commands=gcov_update)
-
-# Clang compilers
-llvm_version = USERARG.get('llvm', '7')
-clang_ver = 'clang-{}'.format(llvm_version)
-repo_ver = ['deb http://apt.llvm.org/{}/ llvm-toolchain-{}-{} main'.format(release_name, release_name, llvm_version)]
-Stage0 += apt_get(ospackages=[clang_ver, 'libomp-dev'], repositories=repo_ver, keys=['https://apt.llvm.org/llvm-snapshot.gpg.key'])
-clang_update = 'update-alternatives --install /usr/bin/clang clang /usr/bin/clang-{} 90'.format(llvm_version)
-clangpp_update = 'update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang-{} 90'.format(llvm_version)
-Stage0 += shell(commands=[clang_update, clangpp_update])
-
-
-# clang-tidy
-clangtidy = ['clang-tidy-{}'.format(llvm_version)]
-Stage0 += packages(apt_ppas=['ppa:xorg-edgers/ppa'], apt=clangtidy)
-clangtidyln = ['ln -s /usr/bin/clang-tidy-{} /usr/bin/clang-tidy'.format(llvm_version)]
-Stage0 += shell(commands=clangtidyln)
-
-# IWYU
-if os.path.isdir('bin/'):
-    Stage0 += copy(src='bin/*', dest='/usr/bin/')
-
-if os.path.isdir('sonar-scanner/') and float(cuda_version) >= float(10.0):
-    Stage0 += copy(src='sonar-scanner/', dest='/')
-
-# hwloc
-if float(cuda_version) >= float(9.2):
-    Stage0 += shell(commands=['cd /var/tmp',
-                              'git clone https://github.com/open-mpi/hwloc.git hwloc'])
-    Stage0 += shell(commands=['cd /var/tmp/hwloc', './autogen.sh',
-                              './configure --prefix=/usr --disable-nvml', 'make -j10', 'make install'])
-    Stage0 += shell(commands=['rm -rf /var/tmp/hwloc'])
-
-    # upload valid FineCI topology and set it for hwloc
-    if os.path.isfile('topology/fineci.xml'):
-        Stage0 += copy(src='topology/fineci.xml', dest='/')
-        Stage0 += environment(variables={'HWLOC_XMLFILE': '/fineci.xml'})
-        Stage0 += environment(variables={'HWLOC_THISSYSTEM': '1'})
-
-
-# Convert from CUDA version to Intel Compiler years
-intel_versions = {'9.0' : '2017', '9.1' : '2017', '9.2' : '2017', '10.0' : '2018', '10.1' : '2019'}
-intel_path = 'intel/parallel_studio_xe_{}/compilers_and_libraries/linux/'.format(intel_versions.get(cuda_version))
-if os.path.isdir(intel_path):
-    Stage0 += copy(src=intel_path+'bin/intel64/', dest='/opt/intel/bin/')
-    Stage0 += copy(src=intel_path+'lib/intel64/', dest='/opt/intel/lib/')
-    Stage0 += copy(src=intel_path+'include/', dest='/opt/intel/include/')
-    Stage0 += environment(variables={'INTEL_LICENSE_FILE': '28518@scclic1.scc.kit.edu'})
-    Stage0 += environment(variables={'PATH': '$PATH:/opt/intel/bin'})
-    Stage0 += environment(variables={'LIBRARY_PATH': '$LIBRARY_PATH:/opt/intel/lib'})
-    Stage0 += environment(variables={'LD_LIBRARY_PATH': '$LD_LIBRARY_PATH:/opt/intel/lib'})
-    Stage0 += environment(variables={'LD_RUN_PATH': '$LD_RUN_PATH:/opt/intel/lib'})
-
-
-# HIP
-Stage0 += shell(commands=['cd /var/tmp',
-                          'git clone https://github.com/ROCm-Developer-Tools/HIP.git'])
-Stage0 += shell(commands=['cd /var/tmp/HIP', 'mkdir build', 'cd build',
-                          'cmake ..', 'make install'])
-Stage0 += shell(commands=['rm -rf /var/tmp/HIP'])
-Stage0 += shell(commands=['cd /var/tmp',
-                          'git clone https://github.com/tcojean/hipBLAS.git'])
-Stage0 += shell(commands=['cd /var/tmp/hipBLAS', 'mkdir build', 'cd build',
-                          'cmake ..', 'make install'])
-Stage0 += shell(commands=['rm -rf /var/tmp/hipBLAS'])
-Stage0 += shell(commands=['cd /var/tmp',
-                          'git clone https://github.com/tcojean/hipSPARSE.git'])
-Stage0 += shell(commands=['cd /var/tmp/hipSPARSE', 'mkdir build', 'cd build',
-                          'cmake -DBUILD_CUDA=on ..', 'make install'])
-Stage0 += shell(commands=['rm -rf /var/tmp/hipSPARSE'])
diff --git a/dev_tools/containers/ginkgo-nocuda-base.py b/dev_tools/containers/ginkgo-nocuda-base.py
deleted file mode 100644
index 34b6dd78eb6..00000000000
--- a/dev_tools/containers/ginkgo-nocuda-base.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""
-Ginkgo Base image
-Contents:
-    GNU compilers version set by the user
-    LLVM/Clang version set by the user
-    Intel ICC and ICPC version set to the latest available version
-    OpenMP latest apt version for Clang+OpenMP
-    Python 2 and 3 (upstream)
-    cmake (upstream)
-    build-essential, git, openssh, curl, valgrind latest apt version
-    jq, graphviz, ghostscript, latest apt version
-    bison, flex latest apt version, required for doxygen compilation
-    doxygen: install the latest release
-    texlive: install the latest release
-    clang-tidy, iwyu: latest apt version
-    hwloc, libhwloc-dev, pkg-config latest apt version
-    papi: adds package libpfm4, and copy precompiled papi headers and files
-          from a directory called 'papi'
-    gpg-agent: latest apt version, for adding custom keys
-"""
-# pylint: disable=invalid-name, undefined-variable, used-before-assignment
-
-import os
-
-Stage0.baseimage('ubuntu:18.04')
-release_name = 'bionic'
-
-# Setup extra tools
-Stage0 += python()
-Stage0 += cmake(eula=True)
-Stage0 += apt_get(ospackages=['build-essential', 'git', 'openssh-client', 'curl', 'valgrind'])
-Stage0 += apt_get(ospackages=['jq', 'graphviz', 'ghostscript'])
-Stage0 += apt_get(ospackages=['clang-tidy', 'iwyu'])
-Stage0 += apt_get(ospackages=['hwloc', 'libhwloc-dev', 'pkg-config'])
-Stage0 += apt_get(ospackages=['gpg-agent'])
-Stage0 += apt_get(ospackages=['ca-certificates']) # weird github certificates problem
-Stage0 += apt_get(ospackages=['bison', 'flex'])
-
-# GNU compilers
-gnu_version = USERARG.get('gnu', '9')
-Stage0 += gnu(version=gnu_version, extra_repository=True)
-
-# Clang compilers
-llvm_version = USERARG.get('llvm', '8')
-clang_ver = 'clang-{}'.format(llvm_version)
-repo_ver = ['deb http://apt.llvm.org/{}/ llvm-toolchain-{}-{} main'.format(release_name, release_name, llvm_version)]
-Stage0 += apt_get(ospackages=[clang_ver, 'libomp-dev'], repositories=repo_ver, keys=['https://apt.llvm.org/llvm-snapshot.gpg.key'])
-clang_update = 'update-alternatives --install /usr/bin/clang clang /usr/bin/clang-{} 90'.format(llvm_version)
-clangpp_update = 'update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang-{} 90'.format(llvm_version)
-Stage0 += shell(commands=[clang_update, clangpp_update])
-
-# Doxygen
-Stage0 += shell(commands=['cd /var/tmp', 'git clone https://github.com/doxygen/doxygen'])
-Stage0 += shell(commands=['cd /var/tmp/doxygen', 'git checkout Release_1_8_16',
-                          'mkdir build', 'cd build',
-                          'cmake ..', 'make -j10', 'make install'])
-Stage0 += shell(commands=['cd /var/tmp', 'rm -rf doxygen'])
-
-# Texlive
-if os.path.isdir('texlive/'):
-    Stage0 += copy(src='texlive/texlive.profile', dest='/var/tmp')
-    Stage0 += shell(commands=['cd /var/tmp', 'wget '
-                              'http://mirror.ctan.org/systems/texlive/tlnet/install-tl-unx.tar.gz',
-                              'tar -xvf install-tl-unx.tar.gz', 'cd install-tl-2*',
-                              './install-tl --profile=../texlive.profile'])
-    Stage0 += shell(commands=['cd /var/tmp', 'rm -rf install-tl*'])
-    Stage0 += shell(commands=['tlmgr install mathtools float xcolor varwidth '
-                              'fancyvrb multirow hanging adjustbox xkeyval '
-                              'collectbox stackengine etoolbox listofitems ulem '
-                              'wasysym sectsty tocloft newunicodechar caption etoc '
-                              'pgf ec helvetic courier wasy'])
-
-# Copy PAPI libs
-add_papi = USERARG.get('papi', 'False')
-if os.path.isdir('papi/') and add_papi == 'True':
-    Stage0 += apt_get(ospackages=['libpfm4'])
-    Stage0 += copy(src='papi/include/*', dest='/usr/include/')
-    Stage0 += copy(src='papi/lib/*', dest='/usr/lib/')
-    Stage0 += copy(src='papi/bin/*', dest='/usr/bin/')
-
-intel_path = 'intel/parallel_studio_xe_2019/compilers_and_libraries/linux/'
-if os.path.isdir(intel_path):
-    Stage0 += copy(src=intel_path+'bin/intel64/', dest='/opt/intel/bin/')
-    Stage0 += copy(src=intel_path+'lib/intel64/', dest='/opt/intel/lib/')
-    Stage0 += copy(src=intel_path+'include/', dest='/opt/intel/include/')
-    Stage0 += environment(variables={'INTEL_LICENSE_FILE': '28518@scclic1.scc.kit.edu'})
-    Stage0 += environment(variables={'PATH': '$PATH:/opt/intel/bin'})
-    Stage0 += environment(variables={'LIBRARY_PATH': '$LIBRARY_PATH:/opt/intel/lib'})
-    Stage0 += environment(variables={'LD_LIBRARY_PATH': '$LD_LIBRARY_PATH:/opt/intel/lib'})
-    Stage0 += environment(variables={'LD_RUN_PATH': '$LD_RUN_PATH:/opt/intel/lib'})
diff --git a/dev_tools/containers/gko-amd-gnu7-llvm60.baseimage b/dev_tools/containers/gko-amd-gnu7-llvm60.baseimage
deleted file mode 100644
index 018129dd792..00000000000
--- a/dev_tools/containers/gko-amd-gnu7-llvm60.baseimage
+++ /dev/null
@@ -1,75 +0,0 @@
-FROM ubuntu:16.04
-MAINTAINER Terry Cojean <terry.cojean@kit.edu>
-
-# Initialize the image
-# Modify to pre-install dev tools and ROCm packages
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl && \
-  curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add - && \
-  sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list' && \
-  apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-  libelf1 \
-  libnuma-dev \
-  build-essential \
-  git \
-  vim-nox \
-  cmake-curses-gui \
-  kmod \
-  rocm-dev \
-  hipsparse hipblas rocthrust && \
-  apt-get clean && \
-  rm -rf /var/lib/apt/lists/*
-
-
-RUN apt-get update -y && \
-    apt-get install -y --no-install-recommends \
-        wget && \
-    rm -rf /var/lib/apt/lists/*
-RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://cmake.org/files/v3.11/cmake-3.11.1-Linux-x86_64.sh && \
-    /bin/sh /var/tmp/cmake-3.11.1-Linux-x86_64.sh --prefix=/usr/local --skip-license && \
-    rm -rf /var/tmp/cmake-3.11.1-Linux-x86_64.sh
-
-
-# GNU compiler
-RUN apt-get update -y && \
-     apt-get install -y --no-install-recommends software-properties-common && \
-     apt-add-repository ppa:ubuntu-toolchain-r/test -y && \
-     apt-get update -y && \
-     apt-get install -y --no-install-recommends \
-     gcc-7 \
-     g++-7 \
-     gfortran-7 && \
-     rm -rf /var/lib/apt/lists/*
-
-RUN update-alternatives --install /usr/bin/gcc gcc $(which gcc-7) 30 && \
-    update-alternatives --install /usr/bin/g++ g++ $(which g++-7) 30 && \
-    update-alternatives --install /usr/bin/gfortran gfortran $(which gfortran-7) 30
-
-# LLVM compiler
-RUN apt-get update -y && \
-    apt-get install -y --no-install-recommends \
-        clang-6.0 && \
-    rm -rf /var/lib/apt/lists/*
-RUN update-alternatives --install /usr/bin/clang clang $(which clang-6.0) 30 && \
-    update-alternatives --install /usr/bin/clang++ clang++ $(which clang++-6.0) 30
-
-RUN apt-get update -y && \
-    apt-get install -y --no-install-recommends \
-        libomp-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-
-#    sed --in-place=.rocm-backup 's|^\(PATH=.*\)"$|\1:/opt/rocm/bin"|' /etc/environment
-
-USER root
-WORKDIR /root
-ENV PATH "${PATH}:/opt/rocm/bin"
-
-# The following are optional enhancements for the command-line experience
-# Uncomment the following to install a pre-configured vim environment based on http://vim.spf13.com/
-# 1.  Sets up an enhanced command line dev environment within VIM
-# 2.  Aliases GDB to enable TUI mode by default
-#RUN curl -sL https://j.mp/spf13-vim3 | bash && \
-#    echo "alias gdb='gdb --tui'\n" >> ~/.bashrc
-
-# Default to a login shell
-CMD ["bash", "-l"]
diff --git a/dev_tools/containers/texlive/texlive.profile b/dev_tools/containers/texlive/texlive.profile
deleted file mode 100644
index 3c0cbaabe68..00000000000
--- a/dev_tools/containers/texlive/texlive.profile
+++ /dev/null
@@ -1,30 +0,0 @@
-# texlive.profile written on Fri Oct 11 15:47:26 2019 UTC
-# It will NOT be updated and reflects only the
-# installation profile at installation time.
-selected_scheme scheme-basic
-TEXDIR /usr/local/texlive/2019
-TEXMFCONFIG ~/.texlive2019/texmf-config
-TEXMFHOME ~/texmf
-TEXMFLOCAL /usr/local/texlive/texmf-local
-TEXMFSYSCONFIG /usr/local/texlive/2019/texmf-config
-TEXMFSYSVAR /usr/local/texlive/2019/texmf-var
-TEXMFVAR ~/.texlive2019/texmf-var
-binary_x86_64-linux 1
-instopt_adjustpath 1
-instopt_adjustrepo 1
-instopt_letter 0
-instopt_portable 0
-instopt_write18_restricted 1
-tlpdbopt_autobackup 1
-tlpdbopt_backupdir tlpkg/backups
-tlpdbopt_create_formats 1
-tlpdbopt_desktop_integration 1
-tlpdbopt_file_assocs 1
-tlpdbopt_generate_updmap 0
-tlpdbopt_install_docfiles 1
-tlpdbopt_install_srcfiles 1
-tlpdbopt_post_code 1
-tlpdbopt_sys_bin /usr/bin
-tlpdbopt_sys_info /usr/info
-tlpdbopt_sys_man /usr/man
-tlpdbopt_w32_multi_user 1
\ No newline at end of file
diff --git a/dev_tools/scripts/add_license.ignore b/dev_tools/scripts/add_license.ignore
new file mode 100644
index 00000000000..cfcb6f4adaa
--- /dev/null
+++ b/dev_tools/scripts/add_license.ignore
@@ -0,0 +1,3 @@
+build/
+third_party/
+external-lib-interfacing.cpp
\ No newline at end of file
diff --git a/dev_tools/scripts/add_license.sh b/dev_tools/scripts/add_license.sh
index 85d73595ebd..34caf4146b9 100755
--- a/dev_tools/scripts/add_license.sh
+++ b/dev_tools/scripts/add_license.sh
@@ -53,9 +53,9 @@ echo -e "/*${GINKGO_LICENSE_BEACON}\n$(cat ${LICENSE_FILE})\n${GINKGO_LICENSE_BE
 
 # Does not work if a found file (including the path) contains a newline
 find "${GINKGO_ROOT_DIR}" \
-    ! \( -name "build" -prune -o -name "third_party" -prune -o -name "external-lib-interfacing.cpp" -prune \) \
-    \( -name '*.cuh' -o -name '*.hpp' -o -name '*.hpp.in' -o -name '*.cpp' -o -name '*.cu' \) \
+    \( -name '*.cuh' -o -name '*.hpp' -o -name '*.hpp.in' -o -name '*.cpp' -o -name '*.cu' -o -name '*.hpp.inc' \) \
     -type f -print \
+    | grep -F -v -f "${THIS_DIR}/add_license.ignore" \
     | \
     while IFS='' read -r i; do
         # `grep -F` is important here because the characters in the beacon should be matched against
diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config
new file mode 100644
index 00000000000..768f3693327
--- /dev/null
+++ b/dev_tools/scripts/config
@@ -0,0 +1,40 @@
+- "test_install|benchmark"
+    - FixInclude: "ginkgo/ginkgo.hpp"
+- "executor"
+    - FixInclude: "ginkgo/core/base/executor.hpp"
+- "hip/base/config.hip.hpp"
+    - FixInclude: "hip/hip_runtime.h"
+- "(cuda|hip|omp)/test/factorization/par_ilu_kernels"
+    - FixInclude: "core/factorization/par_ilu_kernels.hpp"
+- "(cuda|hip)/preconditioner/jacobi_"
+    - FixInclude: "core/preconditioner/jacobi_kernels.hpp"
+- "core/test/base/(extended_float|iterator_factory)"
+    - RemoveTest: "true"
+- "_builder\.cpp"
+    - RemoveTest: "true"
+- "_builder\.hpp"
+    - CoreSuffix: "_builder"
+- "components.*_kernels(\.hip)?\.(cu|cpp|hpp|cuh)"
+    - CoreSuffix: "_kernels"
+    - RemoveTest: "true"
+- "components"
+    - RemoveTest: "true"
+    - PathIgnore: "1"
+    - PathPrefix: "core"
+- "test/utils"
+    - CoreSuffix: "_test"
+    - PathIgnore: "1"
+    - PathPrefix: "core"
+- "core\/.*"
+    - CoreSuffix: "_kernels"
+    - PathPrefix: "ginkgo"
+    - PathIgnore: "0"
+    - RemoveTest: "true"
+- "/(test|base)/"
+    - CoreSuffix: "_kernels"
+    - PathPrefix: "ginkgo/core"
+    - PathIgnore: "1"
+    - RemoveTest: "true"
+- ".*"
+    - PathPrefix: "core"
+    - PathIgnore: "1"
diff --git a/dev_tools/scripts/create_new_algorithm.sh b/dev_tools/scripts/create_new_algorithm.sh
index 24b37b475fe..f6893f68c82 100755
--- a/dev_tools/scripts/create_new_algorithm.sh
+++ b/dev_tools/scripts/create_new_algorithm.sh
@@ -97,11 +97,13 @@ TEMPLATE_FILES=(
     "${name}_kernels.hpp"
     "${name}_kernels.cpp"
     "${name}_kernels.cpp"
-    "${name}_kernels.c*"
+    "${name}_*.[ch]*"
+    "${name}_kernels.hip.cpp"
     "${name}.cpp"
     "${name}_kernels.cpp"
     "${name}_kernels.cpp"
     "${name}_kernels.cpp"
+    "${name}_kernels.*"
 )
 CMAKE_FILES=(
     "core/CMakeLists.txt"
@@ -110,10 +112,12 @@ CMAKE_FILES=(
     "reference/CMakeLists.txt"
     "omp/CMakeLists.txt"
     "cuda/CMakeLists.txt"
+    "hip/CMakeLists.txt"
     "core/test/$source_type/CMakeLists.txt"
     "reference/test/$source_type/CMakeLists.txt"
     "omp/test/$source_type/CMakeLists.txt"
     "cuda/test/$source_type/CMakeLists.txt"
+    "hip/test/$source_type/CMakeLists.txt"
 )
 TEMPLATE_FILES_LOCATIONS=(
     "core/$source_type"
@@ -122,22 +126,26 @@ TEMPLATE_FILES_LOCATIONS=(
     "reference/$source_type"
     "omp/$source_type"
     "cuda/$source_type"
+    "hip/$source_type"
     "core/test/$source_type"
     "reference/test/$source_type"
     "omp/test/$source_type"
     "cuda/test/$source_type"
+    "hip/test/$source_type"
 )
 TEMPLATE_FILES_TYPES=(
     "$source_type file"
     "class header"
     "kernel header"
-    "kernel file"
-    "kernel file"
-    "kernel file"
-    "unit tests for ${name} $type"
+    "Reference kernel file"
+    "OpenMP kernel file"
+    "CUDA kernel file"
+    "HIP kernel file"
+    "unit tests for ${name} $source_type"
     "unit tests for ${name} reference kernels"
     "unit tests for ${name} OMP kernels"
     "unit tests for ${name} CUDA kernels"
+    "unit tests for ${name} HIP kernels"
 )
 TEMPLATE_FILES_DESCRIPTIONS=(
     "This is where the ${name} algorithm needs to be implemented."
@@ -146,131 +154,140 @@ TEMPLATE_FILES_DESCRIPTIONS=(
     "Reference kernels for ${name} need to be implemented here."
     "OMP kernels for ${name} need to be implemented here."
     "CUDA kernels for ${name} need to be implemented here."
-    ""
-    ""
-    ""
-    ""
+    "HIP kernels for ${name} need to be implemented here."
+    "This is where core related unit tests should be implemented, i.e. relating to the interface without executor usage."
+    "This is where tests with the Reference executor should be implemented. Usually, this means comparing against previously known values."
+    "This is where tests with the OpenMP executor should be implemented. Usually, this means comparing against a Reference execution."
+    "This is where tests with the CUDA executor should be implemented. Usually, this means comparing against a Reference execution."
+    "This is where tests with the HIP executor should be implemented. Usually, this means comparing against a Reference execution."
 )
 
 mkdir ${TMPDIR}
 
-# create folder for temporary files
-
-# copy files needed into temporary folder
 for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ ))
 do
     sourcename=$(echo ${TEMPLATE_FILES[$i-1]} | sed "s/${name}/${source_name}/" )
     sourcepath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${sourcename}
-    file=$(ls ${GINKGO_ROOT_DIR}/${sourcepath})
-    if [ -f "$file" ]
-    then
-        # We have evaluated the extension and found it
-        # Integrate it in the template list
-        filename=$(basename -- ${file})
-        source_path=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${filename}
-        TEMPLATE_FILES[$i-1]=$(echo "${filename}" | sed "s/${source_name}/${name}/")
-    else
-        echo "Warning: Source file $sourcepath was not found."
-    fi
 
-    destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
+    # create folder for temporary files
     mkdir -p ${TMPDIR}/${TEMPLATE_FILES_LOCATIONS[$i-1]}
-    cp ${GINKGO_ROOT_DIR}/$sourcepath ${TMPDIR}/$destpath
-done
 
-# search and replace keywords with new solver name
-echo -e "\nCreating temporary files:"
-for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ ))
-do
-    destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
-    perl -pi -e "s/${source_name}/$name/g" ${TMPDIR}/$destpath
-    perl -pi -e "s/${source_name^}/$Name/g" ${TMPDIR}/$destpath
-    perl -pi -e "s/${source_name^^}/$NAME/g" ${TMPDIR}/$destpath
+    # Evaluate the extension and try to find the matching files
+    for j in $(ls ${GINKGO_ROOT_DIR}/${sourcepath})
+    do
+        if [ -f "$j" ]
+        then
+            filename=$(basename -- ${j})
+            source_path=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${filename}
+            destname=$(echo "${filename}" | sed "s/${source_name}/${name}/")
+            destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/$destname
 
+            cp ${GINKGO_ROOT_DIR}/$source_path ${TMPDIR}/$destpath
 
-    # Comment all code
-    awk '/^{$/,/^}$/ { if ($0 == "{"){ print "GKO_NOT_IMPLEMENTED;"; print "//" $0; print "// TODO (script): change the code imported from '${source_type}'/'${source_name}' if needed"; next} else { print "//" $0; next }} 1' ${TMPDIR}/$destpath > tmp
-    mv tmp  ${TMPDIR}/$destpath
+            # Replace all instances of source_name by the user's requested name
+            perl -n -i -e "print unless m/.*common.*${source_name}_kernels.hpp.inc.*/" ${TMPDIR}/$destpath
+            perl -pi -e "s/${source_name}/$name/g" ${TMPDIR}/$destpath
+            perl -pi -e "s/${source_name^}/$Name/g" ${TMPDIR}/$destpath
+            perl -pi -e "s/${source_name^^}/$NAME/g" ${TMPDIR}/$destpath
 
-    ls ${TMPDIR}/$destpath
-done
+            # Comment all code
+            awk -v name=${name} '/^{$/,/^}$/ { if ($0 == "{"){ print "GKO_NOT_IMPLEMENTED;"; print "//" $0; print "// TODO (script:" name "): change the code imported from '${source_type}'/'${source_name}' if needed"; next} else { print "//" $0; next }} 1' ${TMPDIR}/$destpath > tmp
+            mv tmp ${TMPDIR}/$destpath
 
-if [ $execute == 1 ]
-then
-    echo -e "\nRenaming and distributing files"
-    # rename and distribute the files to the right location
-    # for each file, make sure it does not exist yet
-    for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ ))
-    do
-        sourcepath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
-        destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
-        if [ ! -f ${GINKGO_ROOT_DIR}/$destpath ]; then
-            cp ${TMPDIR}/${sourcepath} ${GINKGO_ROOT_DIR}/${destpath}
+            ls ${TMPDIR}/$destpath
+
+            if [ $execute == 1 ]
+            then
+                if [ ! -f ${GINKGO_ROOT_DIR}/$destpath ]; then
+                    cp ${TMPDIR}/${destpath} ${GINKGO_ROOT_DIR}/${destpath}
+                else
+                    echo -e "Error: file ${GINKGO_ROOT_DIR}/$destpath exists"
+                    echo -e "Remove file first if you want to replace it."
+                    read -p ""
+                fi
+            fi
         else
-            echo -e "Error: file ${GINKGO_ROOT_DIR}/$destpath exists"
-            echo -e "Remove file first if you want to replace it."
-            read -p ""
+            echo "Warning: Source file $sourcepath was not found."
         fi
     done
+done
 
-
-    echo -e "cleaning up temporary files."
-    rm -rf ${TMPDIR}
-
-
+if [ $execute == 1 ]
+then
     if [ $automatic_additions -eq 1 ]
     then
         ## Try to automatically add the files to CMakeLists
-        echo -e "Modifiying CMakeLists.txt and common_kernels.inc.cpp"
+        echo -e "Modifying CMakeLists.txt and common_kernels.inc.cpp"
         for ((i=1; i<=${#CMAKE_FILES[@]}; i++))
         do
-            destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
-            if [ ! -f ${GINKGO_ROOT_DIR}/${destpath} ];
-            then
-                continue
-            fi
-
-            cmake_file="${GINKGO_ROOT_DIR}/${CMAKE_FILES[$i-1]}"
-            if [[ $cmake_file == *"test/"* ]]
-            then
-                insert=$(grep -E "\(${source_name}[_\)]{1}" $cmake_file | sed "s/$source_name/$name/")
-                echo "$insert" >> $cmake_file
-                cat $cmake_file | sort > tmp
-                mv tmp $cmake_file
-            elif [[ $cmake_file != "${GINKGO_ROOT_DIR}/" ]]
-            then
-                ## Works only if we have something of the form:
-                ##target_sources(
-                ##     PRIVATE
-                ##         <lib1>
-                ##         ...
-                ##         <libn>)
-                list=( $(awk '/^target_sources/,/        .*\)/ {if ( match($0, "target_sources") == 0 && match($0, "PRIVATE") == 0 ) { print $0 }}' $cmake_file) )
-                last_elem=$((${#list[@]}-1))
-                list[$last_elem]=$(echo ${list[$last_elem]} | tr -d ')')
-                list+=( "$source_type/${TEMPLATE_FILES[$i-1]}" )
-                IFS=$'\n' sorted=($(sort <<<"${list[*]}"))
-                unset IFS
-                last_elem=$((${#sorted[@]}-1))
-                sorted[$last_elem]=$(echo ${sorted[$last_elem]}")")
-
-                ## find the correct position
-                insert_to=$(grep -n -m 1 "target_sources" $cmake_file | sed 's/:.*//')
-                insert_to=$((insert_to + 1)) # account for the "PRIVATE"
-
-                ## clear up the CMakeList.txt
-                awk '/^target_sources/,/        .*\)/ {if (match($0, "target_sources") != 0 || match($0, "PRIVATE") != 0){ print $0 }; next}1'  $cmake_file > tmp
-
-                mytmp=`mktemp`
-                head -n$insert_to tmp > $mytmp
-                for line in "${sorted[@]}"
-                do
-                    echo "        $line" >> $mytmp
-                done
-                tail -n +$((insert_to+1)) tmp >> $mytmp
-                mv $mytmp tmp
-                mv tmp $cmake_file
-            fi
+            sourcepath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
+            for j in $(ls ${GINKGO_ROOT_DIR}/${sourcepath})
+            do
+                filename=$(basename -- $j)
+                shortname=$(echo $filename | cut -d"." -f1)
+                sourcename=$(echo ${shortname} | sed "s/${name}/${source_name}/" )
+                if [[ ! -f ${j} || "${j}" == *".hpp" || "${j}" == *".cuh" ]];
+                then
+                    continue
+                fi
+
+                cmake_file="${GINKGO_ROOT_DIR}/${CMAKE_FILES[$i-1]}"
+                if [[ $cmake_file == *"test/"* ]]
+                then
+                    insert=$(grep -E "\(${sourcename}[_\)]{1}" $cmake_file | sed "s/$source_name/$name/")
+                    echo "$insert" >> $cmake_file
+                    cat $cmake_file | sort > tmp
+                    mv tmp $cmake_file
+                elif [[ $cmake_file != "${GINKGO_ROOT_DIR}/" ]]
+                then
+                    ## For most directories this works with something of the form:
+                    ##target_sources(
+                    ##     PRIVATE
+                    ##     <lib1>
+                    ##     ...
+                    ##     <libn>)
+                    ## For HIP:
+                    ##set(GINKGO_HIP_SOURCES
+                    ##    <lib1>
+                    ##    ...
+                    ##    <libn>)
+                    if [[ $cmake_file == *"hip/"* ]]
+                    then
+                        list=( $(awk '/^set\(GINKGO_HIP_SOURCES/,/    .*\)/ {if ( match($0, "GINKGO_HIP_SOURCES") == 0 ) { print $0 }}' $cmake_file) )
+                    else
+                        list=( $(awk '/^target_sources/,/    .*\)/ {if ( match($0, "target_sources") == 0 && match($0, "PRIVATE") == 0 ) { print $0 }}' $cmake_file) )
+                    fi
+
+                    last_elem=$((${#list[@]}-1))
+                    list[$last_elem]=$(echo ${list[$last_elem]} | tr -d ')')
+                    list+=( "$source_type/${filename}" )
+                    IFS=$'\n' sorted=($(sort <<<"${list[*]}"))
+                    unset IFS
+                    last_elem=$((${#sorted[@]}-1))
+                    sorted[$last_elem]=$(echo ${sorted[$last_elem]}")")
+
+                    ## find the correct position and clear up the CMakeList.txt
+                    if [[ $cmake_file == *"hip/"* ]]
+                    then
+                        insert_to=$(grep -n -m 1 "GINKGO_HIP_SOURCES" $cmake_file | sed 's/:.*//')
+                        awk '/^set\(GINKGO_HIP_SOURCES/,/    .*\)/ {if (match($0, "GINKGO_HIP_SOURCES") != 0 ){ print $0 }; next}1'  $cmake_file > tmp
+                    else
+                        insert_to=$(grep -n -m 1 "target_sources" $cmake_file | sed 's/:.*//')
+                        insert_to=$((insert_to + 1)) # account for the "PRIVATE"
+                        awk '/^target_sources/,/    .*\)/ {if (match($0, "target_sources") != 0 || match($0, "PRIVATE") != 0){ print $0 }; next}1'  $cmake_file > tmp
+                    fi
+
+                    mytmp=`mktemp`
+                    head -n$insert_to tmp > $mytmp
+                    for line in "${sorted[@]}"
+                    do
+                        echo "    $line" >> $mytmp
+                    done
+                    tail -n +$((insert_to+1)) tmp >> $mytmp
+                    mv $mytmp tmp
+                    mv tmp $cmake_file
+                fi
+            done
         done
 
 
@@ -305,7 +322,7 @@ then
 
         mytmp=`mktemp`
         head -n$old_code_block_end $common_kernels_file > $mytmp
-        echo -e "\n\n// TODO (script): adapt this block as needed" >> $mytmp
+        echo -e "\n\n// TODO (script:${name}): adapt this block as needed" >> $mytmp
         for line in "${old_code_block[@]}"
         do
             echo -e "$line" | sed "s/${source_name^^}/$NAME/g" | sed "s/${source_name}/$name/g" >> $mytmp
@@ -313,6 +330,9 @@ then
         tail -n +$((old_code_block_end+1)) $common_kernels_file >> $mytmp
         mv $mytmp $common_kernels_file
     fi
+
+    echo -e "cleaning up temporary files."
+    rm -rf ${TMPDIR}
 else
     echo -e "\nNo file was copied because --dry-run was used"
     echo -e "You can inspect the generated solver files in ${TMPDIR}."
@@ -322,7 +342,7 @@ if [ -f todo_${name}.txt ]; then
     rm todo_${name}.txt
 fi
 
-echo -e "\nSummary:"                                                                 | tee -a todo_${name}.txt
+echo -e "\n###Summary:"                                                                 | tee -a todo_${name}.txt
 for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ ))
 do
     destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]}
@@ -342,38 +362,35 @@ then
     do
         if [[ "${CMAKE_FILES[$i-1]}" != "" ]]
         then
-            echo "Modified ${CMAKE_FILES[$i-1]}"                 | tee -a todo_${name}.txt
+            echo "Modified ${CMAKE_FILES[$i-1]}"              | tee -a todo_${name}.txt
         fi
     done
-    echo "Modified core/device_hooks/common_kernels.inc.cpp"     | tee -a todo_${name}.txt
+    echo "Modified core/device_hooks/common_kernels.inc.cpp"  | tee -a todo_${name}.txt
 fi
 
-echo -e "In all of the previous files ${source_name} was automatically replaced into ${name}. Ensure there is no inconsistency."                               | tee -a todo_${name}.txt
-echo -e ""                                                       | tee -a todo_${name}.txt
-echo -e "All the imported code was commented and TODO items were generated in the new files." | tee -a todo_${name}.txt
-echo -e "Check all the modified files for '// TODO (script):' items"| tee -a todo_${name}.txt
-echo -e "e.g. by using grep -HR '// TODO (script):' ${GINKGO_ROOT_DIR}"| tee -a todo_${name}.txt
-echo ""                                                          | tee -a todo_${name}.txt
-
 if [ $automatic_additions -eq 0 ]
 then
-    echo ""                                                                         | tee -a todo_${name}.txt
-    echo "The following CMakeLists have to be modified manually:"                   | tee -a todo_${name}.txt
-    echo "core/CMakeLists.txt"                                                      | tee -a todo_${name}.txt
-    echo "core/test/${source_type}/CMakeLists.txt"                                          | tee -a todo_${name}.txt
-    echo ""                                                                         | tee -a todo_${name}.txt
-    echo "reference/CMakeLists.txt"                                                 | tee -a todo_${name}.txt
-    echo "reference/test/${source_type}/CMakeLists.txt"                                     | tee -a todo_${name}.txt
-    echo ""                                                                         | tee -a todo_${name}.txt
-    echo "omp/CMakeLists.txt"                                                       | tee -a todo_${name}.txt
-    echo "omp/test/${source_type}/CMakeLists.txt"                                           | tee -a todo_${name}.txt
-    echo ""                                                                         | tee -a todo_${name}.txt
-    echo "cuda/CMakeLists.txt"                                                       | tee -a todo_${name}.txt
-    echo "cuda/test/${source_type}/CMakeLists.txt"                                           | tee -a todo_${name}.txt
-    echo ""                                                                         | tee -a todo_${name}.txt
-    echo ""                                                                         | tee -a todo_${name}.txt
-    echo "The following header file has to modified:"                               | tee -a todo_${name}.txt
-    echo "core/device_hooks/common_kernels.inc.cpp"                                 | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo "The following CMakeLists have to be modified manually:"| tee -a todo_${name}.txt
+    echo "core/CMakeLists.txt"                                | tee -a todo_${name}.txt
+    echo "core/test/${source_type}/CMakeLists.txt"            | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo "reference/CMakeLists.txt"                           | tee -a todo_${name}.txt
+    echo "reference/test/${source_type}/CMakeLists.txt"       | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo "omp/CMakeLists.txt"                                 | tee -a todo_${name}.txt
+    echo "omp/test/${source_type}/CMakeLists.txt"             | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo "cuda/CMakeLists.txt"                                | tee -a todo_${name}.txt
+    echo "cuda/test/${source_type}/CMakeLists.txt"            | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo "hip/CMakeLists.txt"                                 | tee -a todo_${name}.txt
+    echo "hip/test/${source_type}/CMakeLists.txt"             | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo ""                                                   | tee -a todo_${name}.txt
+    echo "The following header file has to be modified:"      | tee -a todo_${name}.txt
+    echo "core/device_hooks/common_kernels.inc.cpp"           | tee -a todo_${name}.txt
     echo "Equivalent to the other solvers, the following part has to be appended:"  | tee -a todo_${name}.txt
     echo "##################################################" | tee -a todo_${name}.txt
     echo "#include #include \"core/solver/test_kernels.hpp\"" | tee -a todo_${name}.txt
@@ -394,5 +411,17 @@ then
     echo ""                                                   | tee -a todo_${name}.txt
     echo ""                                                   | tee -a todo_${name}.txt
 fi
+
+echo -e "\n\n\n### TODO:"                                         | tee -a todo_${name}.txt
+echo -e "In all of the previous files ${source_name} was automatically replaced into ${name}. Ensure there is no inconsistency."                              | tee -a todo_${name}.txt
+echo -e ""                                                    | tee -a todo_${name}.txt
+echo -e "All the imported code was commented and TODO items were generated in the new files." | tee -a todo_${name}.txt
+echo -e "Check all the modified files for \"// TODO (script:${name}):\" items"| tee -a todo_${name}.txt
+echo -e "e.g. by using  grep -nR \"// TODO (script:${name}):\" ${GINKGO_ROOT_DIR} | grep -v \"create_new_algorithm.sh\" | grep -v \"todo_${name}.txt\"." | tee -a todo_${name}.txt
+echo ""                                                       | tee -a todo_${name}.txt
+echo "A tentative list of relevant TODO items follows:"       | tee -a todo_${name}.txt
+grep -nR "// TODO (script:${name}):" ${GINKGO_ROOT_DIR} | grep -v "create_new_algorithm.sh" | grep -v "todo_${name}.txt" | tee -a todo_${name}.txt
+
+
 echo "A summary of the required next steps has been written to:"
 echo "todo_${name}.txt"
diff --git a/dev_tools/scripts/cuda2hip.sh b/dev_tools/scripts/cuda2hip.sh
new file mode 100755
index 00000000000..c372e0869b2
--- /dev/null
+++ b/dev_tools/scripts/cuda2hip.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Convert a CUDA file into the corresponding HIP file using hipify-perl plus sed fixups.
+HIPIFY=/opt/rocm/hip/bin/hipify-perl
+# For some reasons, hipify from apt does not add HIP_KERNEL_NAME.
+
+if [ "$0" != "dev_tools/scripts/cuda2hip.sh" ]; then
+    echo "You are only allowed to run dev_tools/scripts/cuda2hip.sh in the ginkgo source folder."
+    exit 1
+fi
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 path/to/cuda/file"
+    exit 2
+fi
+
+ORIGIN_FILE="$1"
+echo "CUDA: ${ORIGIN_FILE}"
+NEW_FILE=$(echo "${ORIGIN_FILE}" | sed -E "s/^cuda/hip/g;s/(cuh|hpp)$/hip\.hpp/g;s/(cpp|cu)$/hip\.cpp/g")
+echo "HIP: ${NEW_FILE}"
+[ "${NEW_FILE}" != "${ORIGIN_FILE}" ] || { echo "Error: ${ORIGIN_FILE} would be overwritten."; exit 3; }  # '>' truncates before hipify reads
+${HIPIFY} "${ORIGIN_FILE}" > "${NEW_FILE}"
+
+# String replacement, starting with the header file names
+REG="s/(cuda[a-z\/_]*)(\.hpp|\.cuh)/\1.hip.hpp/g"
+# cuda -> hip
+REG="${REG};s/cuda/hip/g;s/Cuda/Hip/g;s/CUDA/HIP/g"
+# cublas -> hipblas
+REG="${REG};s/cublas/hipblas/g;s/Cublas/Hipblas/g;s/CUBLAS/HIPBLAS/g"
+# cusparse -> hipsparse
+REG="${REG};s/cusparse/hipsparse/g;s/Cusparse/Hipsparse/g;s/CUSPARSE/HIPSPARSE/g"
+# culibs -> hiplibs
+REG="${REG};s/culibs/hiplibs/g"
+# header definition
+REG="${REG};s/(CUH_|HPP_)$/HIP_HPP_/g"
+
+sed -i -E "${REG}" "${NEW_FILE}"
+
+# Move the namespace into correct place.
+# {namespace}::hipLaunchKernelGGL( to hipLaunchKernelGGL({namespace}::
+sed -i -E "s/(.*)::hipLaunchKernelGGL\(/hipLaunchKernelGGL\(\1::/g" "${NEW_FILE}"
+# {namespace}::HIP_KERNEL_NAME( to HIP_KERNEL_NAME({namespace}::
+sed -i -E "s/(.*)::HIP_KERNEL_NAME\(/HIP_KERNEL_NAME\(\1::/g" "${NEW_FILE}"
diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh
new file mode 100755
index 00000000000..21b92419ccd
--- /dev/null
+++ b/dev_tools/scripts/format_header.sh
@@ -0,0 +1,313 @@
+#!/usr/bin/env bash
+
+convert_header () {
+    # Rewrite an #include line: files that exist relative to the working directory
+    # (except ginkgo*) and matrices/config.hpp use "", the rest <>. Other text is echoed as is.
+    local regex="^(#include )(<|\")(.*)(\"|>)$"
+    if [[ ! "$*" =~ ${regex} ]]; then
+        echo "$*"
+        return
+    fi
+    # local: keep the parsed path out of the caller's global namespace
+    local header_file="${BASH_REMATCH[3]}"
+    local quoted="false"
+    if [ "${header_file}" = "matrices/config.hpp" ]; then
+        quoted="true"
+    elif [ -f "${header_file}" ] && [[ ! "${header_file}" =~ ^ginkgo ]]; then
+        quoted="true"
+    fi
+    if [ "${quoted}" = "true" ]; then echo "#include \"${header_file}\""; return; fi
+    echo "#include <${header_file}>"
+}
+
+get_header_def () {
+    local header_regex="\.(hpp|cuh)"
+    local guard=""
+    if [[ "$*" =~ ${header_regex} ]]; then
+        # "include/ginkgo/" is dropped, then every '/' and '.' becomes '_'
+        guard=$(echo "$*" | sed -E "s~include/ginkgo/~~g;s~/|\.~_~g")
+        guard="GKO_${guard^^}_"
+    fi
+    echo "${guard}"  # empty for anything that is not a .hpp/.cuh path
+}
+
+add_regroup () {  # append dev_tools/scripts/regroup to .clang-format, keeping a backup copy
+    cp .clang-format .clang-format.temp  # backup that remove_regroup moves back
+    sed -i "s~\.\.\.~~g" .clang-format  # delete every "..." occurrence before appending
+    cat dev_tools/scripts/regroup >> .clang-format
+    echo "..." >> .clang-format  # put a single "..." line back at the very end
+}
+
+remove_regroup () {  # restore the .clang-format copy saved as .clang-format.temp
+    mv .clang-format.temp .clang-format
+}
+
+# It reads "dev_tools/scripts/config" to generate the regex matching the corresponding main header
+# The format of a setting:
+# - "file_regex"
+#   - CoreSuffix: "core_suffix_regex"           (default "")
+#   - PathPrefix: "path_prefix_regex"           (default "")
+#   - PathIgnore: "path_ignore_number"          (default "0")
+#   - RemoveTest: "false/true"                  (default "false")
+#   - FixInclude: "the specific main header"    (default "")
+# Only "file_regex" without any setting is fine, and it means finding the same name with a header suffix
+# For example, /path/to/file.cpp will change to /path/to/file.hpp
+# file_regex : selects the files to which this rule applies
+# CoreSuffix : removes the pattern matching "core_suffix_regex" from the end of the file name
+# PathPrefix : adds "path_prefix_regex" before the path, and the position depends on PathIgnore
+# PathIgnore : ignores the first "path_ignore_number" folders from the top level, and then adds "path_prefix_regex" to the path
+# RemoveTest : decides whether to drop test/ from the path
+# FixInclude : specifies the main header. If it is set, all other settings are ignored
+# Note: This script picks the first fitting "file_regex" rule according to the ordering in config
+get_include_regex () {
+    # $1: path of the file being formatted; $2: name of the variable receiving the regex
+    local file="$1"
+    declare -n local_output=$2
+    local core_suffix="" path_prefix="" fix_include=""
+    local path_ignore="0"
+    local remove_test="false"
+    local item_regex="^-\ +\"(.*)\""
+    local path_prefix_regex="PathPrefix:\ +\"(.*)\""
+    local core_suffix_regex="CoreSuffix:\ +\"(.*)\""
+    local path_ignore_regex="PathIgnore:\ +\"(.*)\""
+    local fix_include_regex="FixInclude:\ +\"(.*)\""
+    local remove_test_regex="RemoveTest:\ +\"(.*)\""
+    # line/file_regex are local so they cannot clobber same-named variables of the caller
+    local match="false" line="" file_regex=""
+    while IFS='' read -r line; do
+        if [[ "$line" =~ $item_regex ]]; then
+            file_regex="${BASH_REMATCH[1]}"
+            if [[ "$match" = "true" ]]; then
+                break
+            elif [[ $file =~ $file_regex ]]; then
+                match="true"
+            fi
+        elif [ "$match" = "true" ]; then
+            if [[ "$line" =~ $path_prefix_regex ]]; then
+                path_prefix="${BASH_REMATCH[1]}"
+            elif [[ "$line" =~ $core_suffix_regex ]]; then
+                core_suffix="${BASH_REMATCH[1]}"
+            elif [[ "$line" =~ $path_ignore_regex ]]; then
+                path_ignore="${BASH_REMATCH[1]}"
+            elif [[ "$line" =~ $fix_include_regex ]]; then
+                fix_include="${BASH_REMATCH[1]}"
+            elif [[ "$line" =~ $remove_test_regex ]]; then
+                remove_test="${BASH_REMATCH[1]}"
+            else
+                echo "Ignore unknown setting: \"${file_regex}\" - ${line}"
+            fi
+        fi
+    done < "dev_tools/scripts/config"
+    local_output=""
+    if [ -z "${fix_include}" ]; then
+        local path_regex="([a-zA-Z_]*\/){${path_ignore}}(.*)\.(cpp|hpp|cu|cuh)"
+        if [ -n "${path_prefix}" ]; then
+            path_prefix="${path_prefix}/"
+        fi
+        local_output=$(echo "${file}" | sed -E "s~\.hip~~g;s~$path_regex~$path_prefix\2~g")
+        local_output=$(echo "${local_output}" | sed -E "s~$core_suffix$~~g")
+        local_output="#include (<|\")$local_output\.(hpp|hip\.hpp|cuh)(\"|>)"
+        if [ "${remove_test}" = "true" ]; then
+            local_output=$(echo "${local_output}" | sed -E "s~test/~~g")
+        fi
+    else
+        local_output="#include (<|\")$fix_include(\"|>)"
+    fi
+}
+
+GINKGO_LICENSE_BEACON="******************************<GINKGO LICENSE>******************************"
+
+CONTENT="content.cpp" # Scratch file in the working directory storing the residual part of the file
+BEFORE="before.cpp" # Scratch file storing the lines matching MAIN_PART_MATCH (the main header)
+HAS_HIP_RUNTIME="false"
+DURING_LICENSE="false"
+INCLUDE_REGEX="^#include.*"
+INCLUDE_INC="\.inc"
+MAIN_PART_MATCH=""
+
+# FORCE_TOP_ON/OFF is only valid before other #include
+FORCE_TOP_ON="// force-top: on"
+FORCE_TOP_OFF="// force-top: off"
+FORCE_TOP="force_top"  # scratch file collecting the lines between the two markers
+DURING_FORCE_TOP="false"
+
+get_include_regex $1 MAIN_PART_MATCH  # fills MAIN_PART_MATCH through a nameref
+HEADER_DEF=$(get_header_def $1)  # GKO_..._ guard name, empty unless $1 contains .hpp/.cuh
+
+IFNDEF=""
+DEFINE=""
+IFNDEF_REGEX="^#ifndef GKO_"
+DEFINE_REGEX="^#define GKO_"
+HEADER_REGEX="\.(hpp|cuh)"
+SKIP="true"  # stays "true" until the first line is written to CONTENT
+START_BLOCK_REX="^(#if| *\/\*)"
+END_BLOCK_REX="^#endif|\*\/$"
+ENDIF_REX="^#endif"
+IN_BLOCK=0  # nesting depth of #if and /* blocks
+KEEP_LINES=0  # number of consecutive empty lines seen most recently
+LAST_NONEMPTY=""
+ALARM=""
+COMMENT_REGEX="^ *\/\/"
+CONSIDER_REGEX="${START_BLOCK_REX}|${END_BLOCK_REX}|${COMMENT_REGEX}|${INCLUDE_REGEX}"
+
+# This part captures the main header and collects the information needed to warn about a possibly incorrect arrangement
+while IFS='' read -r line || [ -n "$line" ]; do
+    if [ "${line}" = '#include "hip/hip_runtime.h"' ] && [ "${SKIP}" = "true" ]; then
+        HAS_HIP_RUNTIME="true"  # the line itself is not stored; only the flag is kept
+    elif [ "${line}" = "/*${GINKGO_LICENSE_BEACON}" ] || [ "${DURING_LICENSE}" = "true" ]; then
+        DURING_LICENSE="true"  # license lines are dropped
+        if [ "${line}" = "${GINKGO_LICENSE_BEACON}*/" ]; then
+            DURING_LICENSE="false"
+        fi
+    elif [ "${SKIP}" = "true" ] && ([ "$line" = "${FORCE_TOP_ON}" ] || [ "${DURING_FORCE_TOP}" = "true" ]); then
+        DURING_FORCE_TOP="true"
+        if [ "$line" = "${FORCE_TOP_OFF}" ]; then
+            DURING_FORCE_TOP="false"
+        fi
+        if [[ "${line}" =~ $INCLUDE_REGEX ]]; then
+            line="$(convert_header ${line})"
+        fi
+        echo "$line" >> "${FORCE_TOP}"  # both marker lines are stored as well
+    elif [ -z "${line}" ] && [ "${SKIP}" = "true" ]; then
+    # Ignore all empty lines between LICENSE and Header
+        :
+    else
+        if [[ "${line}" =~ $INCLUDE_REGEX ]]; then
+            line="$(convert_header ${line})"
+        fi
+        if [ -z "${line}" ]; then
+            KEEP_LINES=$((KEEP_LINES+1))
+        else
+            LAST_NONEMPTY="${line}"
+            KEEP_LINES=0
+        fi
+        if [[ $1 =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${IFNDEF_REGEX} ]] && [ "${SKIP}" = "true" ] && [ -z "${DEFINE}" ]; then
+            IFNDEF="${line}"
+        elif [[ $1 =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${DEFINE_REGEX} ]] && [ "${SKIP}" = "true" ] && [ ! -z "${IFNDEF}" ]; then
+            DEFINE="${line}"
+        elif [ -z "${MAIN_PART_MATCH}" ] || [[ ! "${line}" =~ ${MAIN_PART_MATCH} ]] || [[ "${IN_BLOCK}" -gt 0 ]]; then
+            echo "${line}" >> "${CONTENT}"
+            SKIP="false"
+            if [[ "${line}" =~ $START_BLOCK_REX ]]; then
+                # keep everything in #if block and /* block
+                IN_BLOCK=$((IN_BLOCK+1))
+                if [ -z "${ALARM}" ]; then
+                    ALARM="set"  # a block started before any line outside CONSIDER_REGEX was seen
+                fi
+            fi
+            if [[ "${IN_BLOCK}" = "0" ]] && [ ! -z "${line}" ] && [[ ! "${line}" =~ ${CONSIDER_REGEX} ]]; then
+                if [ "${ALARM}" = "set" ]; then
+                    ALARM="true"  # triggers the warning printed after the loop
+                elif [ -z "${ALARM}" ]; then
+                    ALARM="false"
+                fi
+            fi
+            if [[ "${line}" =~ $END_BLOCK_REX ]]; then
+                IN_BLOCK=$((IN_BLOCK-1))
+            fi
+        else
+            echo "${line}" >> ${BEFORE}
+        fi
+    fi
+done < $1
+if [ "${ALARM}" = "true" ]; then
+    echo "Warning $1: sorting is probably incorrect"
+fi
+
+# Write the license; this truncates $1, which is rebuilt from here on
+echo "/*${GINKGO_LICENSE_BEACON}" > $1
+cat LICENSE >> $1
+echo "${GINKGO_LICENSE_BEACON}*/" >> $1
+echo "" >> $1
+
+# Write the definition of the header guard according to the path
+if [ ! -z "${IFNDEF}" ] && [ ! -z "${DEFINE}" ]; then
+    IFNDEF="#ifndef ${HEADER_DEF}"
+    DEFINE="#define ${HEADER_DEF}"
+elif [ -z "${IFNDEF}" ] && [ -z "${DEFINE}" ]; then
+    :
+else
+    echo "Warning $1: only #ifndef GKO_ or #define GKO_ is in the header"
+fi
+if [ ! -z "${IFNDEF}" ]; then
+    echo "${IFNDEF}" >> $1
+fi
+if [ ! -z "${DEFINE}" ]; then
+    echo "${DEFINE}" >> $1
+    echo "" >> $1
+    echo "" >> $1
+fi
+
+# Write the force-top header
+if [ -f "${FORCE_TOP}" ]; then
+    cat "${FORCE_TOP}" >> $1
+    echo "" >> $1
+    echo "" >> $1
+    rm "${FORCE_TOP}"
+fi
+
+# Write the main header and give a warning if there are multiple matches
+if [ -f "${BEFORE}" ]; then
+    # sort or remove the duplication
+    clang-format -i -style=file ${BEFORE}
+    if [ $(wc -l < ${BEFORE}) -gt "1" ]; then
+        echo "Warning $1: there are multiple main header matchings"
+    fi
+    cat ${BEFORE} >> $1
+    if [ -f "${CONTENT}" ]; then
+        echo "" >> $1
+        echo "" >> $1
+    fi
+    rm "${BEFORE}"
+fi
+
+# Arrange the remaining content and append it to the output file
+if [ -f "${CONTENT}" ]; then
+    add_regroup
+    rm -f temp  # a stale scratch file from an aborted run must not leak into the output
+    if [ "${HAS_HIP_RUNTIME}" = "true" ]; then
+        echo "#include <hip/hip_runtime.h>" > temp
+    fi
+    head -n -${KEEP_LINES} ${CONTENT} >> temp  # drops the trailing empty lines
+    if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then
+        # Ignore the last line #endif
+        if [[ "${LAST_NONEMPTY}" =~ $ENDIF_REX ]]; then
+            head -n -1 temp > ${CONTENT}
+            echo "#endif  // $HEADER_DEF" >> ${CONTENT}
+        else
+            echo "Warning $1: Found the begin header_def but did not find the end of header_def"
+            cat temp > ${CONTENT}
+        fi
+    else
+        cat temp > "${CONTENT}"
+    fi
+    clang-format -i -style=file "${CONTENT}"
+    rm temp
+    remove_regroup
+    PREV_INC=0
+    SKIP="true"
+    while IFS='' read -r line; do
+        # Skip the empty line in the beginning
+        if [ "${SKIP}" = "true" ] && [[ -z "${line}" ]]; then
+            continue
+        else
+            SKIP="false"
+        fi
+        # Insert content with correct number empty lines
+        if [[ ${line} =~ ${INCLUDE_REGEX} ]] && [[ ! ${line} =~ ${INCLUDE_INC} ]]; then
+            # exactly one empty line before this include: emit one more
+            if [[ ${PREV_INC} == 1 ]]; then
+                echo "" >> $1
+            fi
+            PREV_INC=0
+        else
+            if [ -z "${line}" ]; then
+                PREV_INC=$((PREV_INC+1))
+            else
+                # To keep the original lines
+                PREV_INC=-3
+            fi
+        fi
+        echo "${line}" >> $1
+    done < "${CONTENT}"
+    rm "${CONTENT}"
+fi
diff --git a/dev_tools/scripts/gdb-ginkgo.py b/dev_tools/scripts/gdb-ginkgo.py
new file mode 100644
index 00000000000..f20f0597b31
--- /dev/null
+++ b/dev_tools/scripts/gdb-ginkgo.py
@@ -0,0 +1,130 @@
+# Pretty-printers for Ginkgo
+# Based on the pretty-printers for libstdc++.
+
+# Copyright (C) 2008-2020 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import gdb
+import itertools
+import sys
+import re
+
+if sys.version_info[0] > 2:
+    ### Python 3 stuff
+    Iterator = object
+    # Python 3 folds these into the normal functions.
+    imap = map
+    izip = zip
+    # Also, int subsumes long
+    long = int
+else:
+    ### Python 2 stuff
+    class Iterator:
+        """Compatibility mixin for iterators
+
+        Instead of writing next() methods for iterators, write
+        __next__() methods and use this mixin to make them work in
+        Python 2 as well as Python 3.
+
+        Idea stolen from the "six" documentation:
+        <http://pythonhosted.org/six/#six.Iterator>
+        """
+
+        def next(self):
+            return self.__next__()
+
+    # In Python 2, we still need these from itertools
+    from itertools import imap, izip
+
+_versioned_namespace = '__8::'
+
+def is_specialization_of(x, template_name):
+    "Test if a type (gdb.Type or tag string) is a given std template instantiation."
+    if type(x) is gdb.Type:
+        x = x.tag
+    if x is None:  # types without a tag can never match a template name
+        return False
+    ns = '(%s)?' % _versioned_namespace if _versioned_namespace else ''
+    return re.match('^std::%s%s<.*>$' % (ns, template_name), x) is not None
+
+
+def get_unique_ptr_data_ptr(val):
+    impl_type = val.type.fields()[0].type.strip_typedefs().tag  # tag is None on a typedef
+    # Newer layouts wrap the tuple in an impl type; the plain tuple is the older one:
+    if is_specialization_of(impl_type, '__uniq_ptr_data') \
+        or is_specialization_of(impl_type, '__uniq_ptr_impl'):
+        tuple_member = val['_M_t']['_M_t']
+    elif is_specialization_of(impl_type, 'tuple'):
+        tuple_member = val['_M_t']
+    else:
+        raise ValueError("Unsupported implementation for unique_ptr: %s" % impl_type)
+    tuple_impl_type = tuple_member.type.fields()[0].type # _Tuple_impl
+    tuple_head_type = tuple_impl_type.fields()[1].type   # _Head_base
+    head_field = tuple_head_type.fields()[0]
+    if head_field.name == '_M_head_impl':
+        return tuple_member['_M_head_impl']
+    elif head_field.is_base_class:
+        return tuple_member.cast(head_field.type)
+    else:
+        raise ValueError("Unsupported implementation for tuple in unique_ptr: %s" % impl_type)
+
+
+class GkoArrayPrinter:
+    "Print a gko::Array: a summary line, plus elements on Reference/Omp executors"
+
+    class _iterator(Iterator):
+        def __init__(self, start, size):
+            self.item = start      # gdb.Value pointing at the next element
+            self.size = int(size)  # plain int, so each step compares Python ints
+            self.count = 0
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            count = self.count
+            self.count = self.count + 1
+            if self.count > self.size:
+                raise StopIteration
+            elt = self.item.dereference()
+            self.item = self.item + 1
+            return ('[%d]' % count, elt)
+
+    def __init__(self, val):
+        self.val = val
+        self.execname = str(self.val['exec_']['_M_ptr'].dereference().dynamic_type)
+        self.pointer = get_unique_ptr_data_ptr(self.val['data_'])
+        self.is_cpu = re.match('gko::(Reference|Omp)Executor', self.execname) is not None
+
+    def children(self):
+        if self.is_cpu:
+            return self._iterator(self.pointer, self.val['num_elems_'])
+        return []  # other executors: elements are not dereferenced
+
+    def to_string(self):
+        return ('%s of length %d on %s (%s)' % (str(self.val.type), int(self.val['num_elems_']), self.execname, self.pointer))
+
+    def display_hint(self):
+        return 'array'
+
+def lookup_type(val):
+    # A pointer type also prints as 'gko::Array<...> *'; leave pointers to gdb's
+    # default printer so that only Array objects themselves get GkoArrayPrinter.
+    is_pointer = val.type.strip_typedefs().code == gdb.TYPE_CODE_PTR
+    if not is_pointer and str(val.type).startswith('gko::Array'):
+        return GkoArrayPrinter(val)
+    return None
+
+gdb.pretty_printers.append(lookup_type)
diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup
new file mode 100644
index 00000000000..036d5d81588
--- /dev/null
+++ b/dev_tools/scripts/regroup
@@ -0,0 +1,12 @@
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex: '^<(rapidjson|gflags|gtest|papi).*'
+    Priority: 3
+  - Regex: '^<(omp|cu|hip|thrust).*'
+    Priority: 2
+  - Regex: '^<ginkgo.*'
+    Priority: 5
+  - Regex: '^".*'
+    Priority: 6
+  - Regex: '.*'
+    Priority: 1
diff --git a/dev_tools/scripts/update_ginkgo_header.sh b/dev_tools/scripts/update_ginkgo_header.sh
index df78ba7794e..ace94846f1e 100755
--- a/dev_tools/scripts/update_ginkgo_header.sh
+++ b/dev_tools/scripts/update_ginkgo_header.sh
@@ -44,7 +44,8 @@ fi
 
 # Put all header files as a list (separated by newlines) in the file ${HEADER_LIST}
 # Requires detected files (including the path) to not contain newlines
-find "${TOP_HEADER_FOLDER}" -name '*.hpp' -type f -print > "${HEADER_LIST}"
+find "${TOP_HEADER_FOLDER}" -name '*.hpp' -type f -print | \
+        grep -v 'residual_norm_reduction.hpp' > "${HEADER_LIST}"
 
 if [ ${?} -ne 0 ]; then
     echo "${WARNING_PREFIX} "'The `find` command returned with an error!' 1>&2
@@ -76,7 +77,7 @@ if [[ "$(file "${GINKGO_HEADER_TEMPLATE_FILE}")" == *"CRLF"* ]]; then
 fi
 
 # Generate a new, temporary ginkgo header file.
-# It will get compared at the end to the existing file in order to prevent 
+# It will get compared at the end to the existing file in order to prevent
 # the rebuilding of targets which depend on the global header
 # (e.g. benchmarks and examples)
 GINKGO_HEADER_TMP="${GINKGO_HEADER_FILE}.tmp"
@@ -109,12 +110,12 @@ while IFS='' read -r line; do
         while IFS='' read -r prefixed_file; do
             # Remove the include directory from the file name
             file="${prefixed_file#${TOP_HEADER_FOLDER}/}"
-            
+
             # Do not include yourself
             if [ "${file}" == "${GINKGO_HEADER_FILE}" ]; then
                 continue
             fi
-            
+
             CURRENT_FOLDER="$(dirname ${file})"
             # add newline between different include folder
             if [ "${READING_FIRST_LINE}" != true ] && \
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 2c624dc4fd0..d416149638a 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -6,7 +6,7 @@ option(GINKGO_DOC_GENERATE_PDF "Generate PDF documentation" OFF)
 option(GINKGO_DOC_GENERATE_DEV "Generate internal documentation" OFF)
 option(GINKGO_DOC_GENERATE_EXAMPLES "Generate example documentation" ON)
 if(GINKGO_DOC_GENERATE_EXAMPLES)
-  add_subdirectory(examples)
+    add_subdirectory(examples)
 endif()
 
 if (GINKGO_DOC_GENERATE_PDF)
diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml
index 4c25a288a38..268f8348145 100644
--- a/doc/DoxygenLayout.xml
+++ b/doc/DoxygenLayout.xml
@@ -5,6 +5,8 @@
     <tab type="mainpage" visible="yes" title=""/>
     <tab type="user" visible="yes" title="Tutorial" url="https://github.com/ginkgo-project/ginkgo/wiki/Tutorial:-Building-a-Poisson-Solver" />
     <tab type="user" visible="yes" title="Examples" url="@ref Examples" />
+    <tab type="user" visible="yes" title="Citing Ginkgo" url="@ref citing_ginkgo" />
+    <tab type="user" visible="yes" title="Contributing To Ginkgo" url="@ref contributing_guidelines" />
     <tab type="user" visible="yes" title="Using Ginkgo" url="@ref install_ginkgo">
       <tab type="user" visible="yes" title="Installing Ginkgo" url="@ref install_ginkgo" />
       <tab type="user" visible="yes" title="Testing Ginkgo" url="@ref testing_ginkgo" />
diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt
index be0224278da..f227048dad8 100644
--- a/doc/examples/CMakeLists.txt
+++ b/doc/examples/CMakeLists.txt
@@ -1,41 +1,41 @@
 # Collect all of the directory names for the examples programs
 FILE(GLOB _ginkgo_examples
-  ${CMAKE_SOURCE_DIR}/examples/*
-  )
+    ${CMAKE_SOURCE_DIR}/examples/*
+    )
 LIST(REMOVE_ITEM _ginkgo_examples "${CMAKE_SOURCE_DIR}/examples/CMakeLists.txt")
 
 ADD_CUSTOM_TARGET(examples)
 
 file(GLOB _ginkgo_examples_tooltip
-  ${CMAKE_SOURCE_DIR}/examples/*/doc/tooltip
-  )
+    ${CMAKE_SOURCE_DIR}/examples/*/doc/tooltip
+    )
 
 file(GLOB _ginkgo_examples_kind
-  ${CMAKE_SOURCE_DIR}/examples/*/doc/kind
-  )
+    ${CMAKE_SOURCE_DIR}/examples/*/doc/kind
+    )
 file(GLOB _ginkgo_examples_buildson
-  ${CMAKE_SOURCE_DIR}/examples/*/doc/builds-on
-  )
+    ${CMAKE_SOURCE_DIR}/examples/*/doc/builds-on
+    )
 
 ADD_CUSTOM_COMMAND(
-  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp
-  COMMAND ${PERL_EXECUTABLE}
-  ARGS
-  ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl
-  ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in
-  ${_ginkgo_examples}
-  > ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp
-  DEPENDS
-  ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl
-  ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in
-  ${_ginkgo_examples_tooltip}
-  ${_ginkgo_examples_kind}
-  ${_ginkgo_examples_buildson}
-  )
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp
+    COMMAND ${PERL_EXECUTABLE}
+    ARGS
+    ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in
+    ${_ginkgo_examples}
+    > ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp
+    DEPENDS
+    ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in
+    ${_ginkgo_examples_tooltip}
+    ${_ginkgo_examples_kind}
+    ${_ginkgo_examples_buildson}
+    )
 ADD_CUSTOM_TARGET(build_examples_hpp
-  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp
-  COMMENT
-  "Building examples.hpp")
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp
+    COMMENT
+    "Building examples.hpp")
 ADD_DEPENDENCIES(examples build_examples_hpp)
 
 #
@@ -43,46 +43,46 @@ ADD_DEPENDENCIES(examples build_examples_hpp)
 #
 
 FOREACH(example ${_ginkgo_examples})
-  GET_FILENAME_COMPONENT(example "${example}" NAME)
+    GET_FILENAME_COMPONENT(example "${example}" NAME)
 
-  ADD_CUSTOM_COMMAND(
-    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp
-    COMMAND ${PERL_EXECUTABLE}
-    ARGS
-      ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain
-      < ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp
-      > ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp
-    DEPENDS
-      ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain
-      ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp
-    VERBATIM
-    )
+    ADD_CUSTOM_COMMAND(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp
+        COMMAND ${PERL_EXECUTABLE}
+        ARGS
+        ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain
+        < ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp
+        > ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp
+        DEPENDS
+        ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain
+        ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp
+        VERBATIM
+        )
 
-  ADD_CUSTOM_COMMAND(
-    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp
-    COMMAND ${PERL_EXECUTABLE}
-    ARGS
-      ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl
-      ${example} ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR}
-      > ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl
-      ${CMAKE_SOURCE_DIR}/doc/scripts/intro2toc
-      ${CMAKE_SOURCE_DIR}/doc/scripts/create_anchors
-      ${CMAKE_SOURCE_DIR}/doc/scripts/program2doxygen
-      ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp
-      ${CMAKE_SOURCE_DIR}/examples/${example}/doc/intro.dox
-      ${CMAKE_SOURCE_DIR}/examples/${example}/doc/results.dox
-    )
+    ADD_CUSTOM_COMMAND(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp
+        COMMAND ${PERL_EXECUTABLE}
+        ARGS
+        ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl
+        ${example} ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR}
+        > ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp
+        WORKING_DIRECTORY
+        ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS
+        ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl
+        ${CMAKE_SOURCE_DIR}/doc/scripts/intro2toc
+        ${CMAKE_SOURCE_DIR}/doc/scripts/create_anchors
+        ${CMAKE_SOURCE_DIR}/doc/scripts/program2doxygen
+        ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp
+        ${CMAKE_SOURCE_DIR}/examples/${example}/doc/intro.dox
+        ${CMAKE_SOURCE_DIR}/examples/${example}/doc/results.dox
+        )
 
-  ADD_CUSTOM_TARGET(examples_${example}
-    DEPENDS
-      ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp
-      ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp
-      COMMENT
+    ADD_CUSTOM_TARGET(examples_${example}
+        DEPENDS
+        ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp
+        ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp
+        COMMENT
         "Building doxygen input file for examples program <${example}>"
-    )
-  ADD_DEPENDENCIES(examples examples_${example})
+        )
+    ADD_DEPENDENCIES(examples examples_${example})
 ENDFOREACH()
diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in
index 6dcff03ab3c..cf0b01f9a04 100644
--- a/doc/examples/examples.hpp.in
+++ b/doc/examples/examples.hpp.in
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -128,8 +128,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  *   <tr valign="top">
  *       <td>@ref twentyseven_pt_stencil_solver</td>
- *       <td> Using a twentyseven point 3D stencil to solve the poisson equation with
- *            array views.
+ *       <td> Using a twentyseven point 3D stencil to solve the poisson equation
+ *            with array views.
  *       </td></tr>
  *
  *   <tr valign="top">
diff --git a/doc/headers/cuda_executor.hpp b/doc/headers/cuda_executor.hpp
index d42eb6bc197..52b9307357a 100644
--- a/doc/headers/cuda_executor.hpp
+++ b/doc/headers/cuda_executor.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/executors.hpp b/doc/headers/executors.hpp
index d12e9c804da..002f64230e9 100644
--- a/doc/headers/executors.hpp
+++ b/doc/headers/executors.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -51,6 +51,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * CPU);
  * +    @ref exec_cuda specifies that the data should be stored and the
  *      operations executed on the NVIDIA GPU accelerator;
+ * +    @ref exec_hip uses the HIP library to compile code for either NVIDIA or
+ *      AMD GPU accelerator;
  * +    @ref exec_ref executes a non-optimized reference implementation,
  *      which can be used to debug the library.
  */
diff --git a/doc/headers/factor.hpp b/doc/headers/factor.hpp
index 324f9657235..320668cabae 100644
--- a/doc/headers/factor.hpp
+++ b/doc/headers/factor.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/hip_executor.hpp b/doc/headers/hip_executor.hpp
new file mode 100644
index 00000000000..4805cfb4b87
--- /dev/null
+++ b/doc/headers/hip_executor.hpp
@@ -0,0 +1,40 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @defgroup exec_hip HIP Executor
+ *
+ * @brief A module dedicated to the implementation and usage of the HIP
+ * executor in Ginkgo.
+ *
+ * @ingroup Executor
+ */
diff --git a/doc/headers/jacobi.hpp b/doc/headers/jacobi.hpp
index 8e406d75fea..875efa2c4d0 100644
--- a/doc/headers/jacobi.hpp
+++ b/doc/headers/jacobi.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/linop.hpp b/doc/headers/linop.hpp
index e208fd9391c..12fc582eb9d 100644
--- a/doc/headers/linop.hpp
+++ b/doc/headers/linop.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/logging.hpp b/doc/headers/logging.hpp
index e30edb61e72..e9563469b87 100644
--- a/doc/headers/logging.hpp
+++ b/doc/headers/logging.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/matrix_formats.hpp b/doc/headers/matrix_formats.hpp
index b085eba5bc8..641cb98bc13 100644
--- a/doc/headers/matrix_formats.hpp
+++ b/doc/headers/matrix_formats.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/modules.dot b/doc/headers/modules.dot
deleted file mode 100644
index 6978fdf670b..00000000000
--- a/doc/headers/modules.dot
+++ /dev/null
@@ -1,37 +0,0 @@
- digraph G
-{
-  graph[rankdir="TB",bgcolor="transparent"];
-
-  edge [fontname="Times-Roman",fontsize=15,labelfontname="Times-Roman",labelfontsize=14];
-  node [fontname="Times-Roman",fontsize=15,
-        shape=record,height=0.2,width=0.4,
-        color="black", fillcolor="white", style="filled"];
-
-  exec [label="Executors",URL="\ref Executor"];
-  omp [label="OpenMP Executor",URL="\ref exec_omp "];
-  ref [label="Reference Executor",URL="\ref exec_ref"];
-  cuda [label="CUDA Executor",URL="\ref exec_cuda"];
-  linop [label="Linear Operators",URL="\ref LinOp"];
-  solvers [label="Solvers",URL="\ref solvers"];
-  precond [label="Preconditioners",URL="\ref precond"];
-  factor [label="Factorizations",URL="\ref factor"];
-  matformat [label="Matrix Formats",URL="\ref mat_formats "];
-  // log [label="Logging",URL="\ref log"];
-  stop [label="Stopping Criteria",URL="\ref stop"];
-
-  // log -> exec [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  exec -> ref [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  exec -> cuda[color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  exec -> omp [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  omp -> linop [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  cuda -> linop [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  ref -> linop [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  omp -> stop [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  cuda -> stop [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  ref -> stop [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  linop -> matformat [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  linop -> solvers [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  linop -> precond [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  linop -> factor [color="black",fontsize=14,style="solid",fontname="Times-Roman"];
-  stop -> solvers [color="black",fontsize=14,style="dashed",fontname="Times-Roman"];
-}
diff --git a/doc/headers/omp_executor.hpp b/doc/headers/omp_executor.hpp
index 0f6c5e7e140..83df1f5b292 100644
--- a/doc/headers/omp_executor.hpp
+++ b/doc/headers/omp_executor.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/preconditioners.hpp b/doc/headers/preconditioners.hpp
index 1da0d147f4b..b7797b92d1b 100644
--- a/doc/headers/preconditioners.hpp
+++ b/doc/headers/preconditioners.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/ref_executor.hpp b/doc/headers/ref_executor.hpp
index dd7b4ea0940..c4faf61e2c4 100644
--- a/doc/headers/ref_executor.hpp
+++ b/doc/headers/ref_executor.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/solvers.hpp b/doc/headers/solvers.hpp
index 481120c36a7..ac0f797cf02 100644
--- a/doc/headers/solvers.hpp
+++ b/doc/headers/solvers.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/headers/stop.hpp b/doc/headers/stop.hpp
index 2439d7d9162..16ce4487601 100644
--- a/doc/headers/stop.hpp
+++ b/doc/headers/stop.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/doc/helpers.cmake b/doc/helpers.cmake
index cbda187a9a7..3cb4c636bc1 100644
--- a/doc/helpers.cmake
+++ b/doc/helpers.cmake
@@ -7,11 +7,11 @@ function(ginkgo_configure_to_string in variable)
 endfunction()
 
 macro(ginkgo_to_string variable)
-  set(${variable} "")
-  foreach(var  ${ARGN})
-    set(${variable} "${${variable}} ${var}")
-  endforeach()
-  string(STRIP "${${variable}}" ${variable})
+    set(${variable} "")
+    foreach(var  ${ARGN})
+        set(${variable} "${${variable}} ${var}")
+    endforeach()
+    string(STRIP "${${variable}}" ${variable})
 endmacro()
 
 # writes the concatenated configured files <in1,2>
@@ -47,65 +47,66 @@ function(ginkgo_doc_gen name in pdf mainpage-in)
     set(doxyfile "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile-${name}")
     set(layout "${DOC_BASE}/DoxygenLayout.xml")
     ginkgo_file_concat("${DOC_BASE}/pages"
-      "${mainpage-in}" BASE_DOC.md "${MAINPAGE}"
-      )
+        "${mainpage-in}" BASE_DOC.md "${MAINPAGE}"
+        )
     set(doxygen_base_input
-      "${DOC_BASE}/headers/"
-      )
+        "${DOC_BASE}/headers/"
+        )
     list(APPEND doxygen_base_input
-      ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp
-      ${DIR_BASE}/include
-      ${MAINPAGE}
-      )
-    if(GINKGO_DOC_GENERATE_EXAMPLES)
-      list(APPEND doxygen_base_input
-        ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp
+        ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp
+        ${DIR_BASE}/include
+        ${MAINPAGE}
         )
+    if(GINKGO_DOC_GENERATE_EXAMPLES)
+        list(APPEND doxygen_base_input
+            ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp
+            )
     endif()
     set(doxygen_dev_input
-      "${DIR_BASE}/core"
-      )
+        "${DIR_BASE}/core"
+        )
     list(APPEND doxygen_dev_input
-      ${DIR_BASE}/omp
-      ${DIR_BASE}/cuda
-      ${DIR_BASE}/reference
-      )
+        ${DIR_BASE}/omp
+        ${DIR_BASE}/cuda
+        ${DIR_BASE}/hip
+        ${DIR_BASE}/reference
+        )
     set(doxygen_image_path "${CMAKE_SOURCE_DIR}/doc/images/")
     file(GLOB doxygen_depend
-      ${DOC_BASE}/headers/*.hpp
-      ${DIR_BASE}/include/ginkgo/**/*.hpp
-      )
-    list(APPEND doxygen_depend
-      ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp
-      )
-    if(GINKGO_DOC_GENERATE_EXAMPLES)
-      list(APPEND doxygen_depend
-        ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp
+        ${DOC_BASE}/headers/*.hpp
+        ${DIR_BASE}/include/ginkgo/**/*.hpp
         )
-      FILE(GLOB _ginkgo_examples
-        ${DIR_BASE}/examples/*
+    list(APPEND doxygen_depend
+        ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp
         )
-      LIST(REMOVE_ITEM _ginkgo_examples "${DIR_BASE}/examples/CMakeLists.txt")
-      FOREACH(_ex ${_ginkgo_examples})
-        GET_FILENAME_COMPONENT(_ex "${_ex}" NAME)
-        LIST(APPEND doxygen_depend
-          ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp
-          )
-        LIST(APPEND doxygen_base_input
-          ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp
-          )
-      ENDFOREACH()
+    if(GINKGO_DOC_GENERATE_EXAMPLES)
+        list(APPEND doxygen_depend
+            ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp
+            )
+        FILE(GLOB _ginkgo_examples
+            ${DIR_BASE}/examples/*
+            )
+        LIST(REMOVE_ITEM _ginkgo_examples "${DIR_BASE}/examples/CMakeLists.txt")
+        FOREACH(_ex ${_ginkgo_examples})
+            GET_FILENAME_COMPONENT(_ex "${_ex}" NAME)
+            LIST(APPEND doxygen_depend
+                ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp
+                )
+            LIST(APPEND doxygen_base_input
+                ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp
+                )
+        ENDFOREACH()
     endif()
     list(APPEND doxygen_dev_input
-      ${doxygen_base_input}
-      )
+        ${doxygen_base_input}
+        )
     # pick some markdown files we want as pages
-    set(doxygen_markdown_files "../../INSTALL.md ../../TESTING.md ../../BENCHMARKING.md")
+    set(doxygen_markdown_files "../../INSTALL.md ../../TESTING.md ../../BENCHMARKING.md ../../CONTRIBUTING.md ../../CITING.md")
     ginkgo_to_string(doxygen_base_input_str ${doxygen_base_input} )
     ginkgo_to_string(doxygen_dev_input_str ${doxygen_dev_input} )
     ginkgo_to_string(doxygen_image_path_str ${doxygen_image_path} )
     add_custom_target("${name}" ALL
-      #DEPEND "${doxyfile}.stamp" Doxyfile.in ${in} ${in2}
+        #DEPEND "${doxyfile}.stamp" Doxyfile.in ${in} ${in2}
         COMMAND "${DOXYGEN_EXECUTABLE}" ${doxyfile}
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 77d1117667a..1ea92e19886 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,13 +3,17 @@ add_subdirectory(custom-logger)
 add_subdirectory(custom-matrix-format)
 add_subdirectory(custom-stopping-criterion)
 if(GINKGO_BUILD_EXTLIB_EXAMPLE)
-  add_subdirectory(external-lib-interfacing)
+    add_subdirectory(external-lib-interfacing)
 endif()
+add_subdirectory(adaptiveprecision-blockjacobi)
 add_subdirectory(ginkgo-overhead)
 add_subdirectory(ginkgo-ranges)
 add_subdirectory(ilu-preconditioned-solver)
+add_subdirectory(ir-ilu-preconditioned-solver)
 add_subdirectory(inverse-iteration)
+add_subdirectory(iterative-refinement)
 add_subdirectory(minimal-cuda-solver)
+add_subdirectory(mixed-precision-ir)
 add_subdirectory(nine-pt-stencil-solver)
 add_subdirectory(papi-logging)
 add_subdirectory(performance-debugging)
@@ -19,4 +23,3 @@ add_subdirectory(simple-solver)
 add_subdirectory(simple-solver-logging)
 add_subdirectory(three-pt-stencil-solver)
 add_subdirectory(twentyseven-pt-stencil-solver)
-
diff --git a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
new file mode 100644
index 00000000000..d3188aaca12
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(adaptiveprecision-blockjacobi adaptiveprecision-blockjacobi.cpp)
+target_link_libraries(adaptiveprecision-blockjacobi PRIVATE ginkgo)  # explicit scope keyword
+target_include_directories(adaptiveprecision-blockjacobi PRIVATE ${PROJECT_SOURCE_DIR})
+configure_file(data/A.mtx data/A.mtx COPYONLY)  # copy the input matrix into the build tree
diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
new file mode 100644
index 00000000000..3b58d78e37a
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
@@ -0,0 +1,157 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <chrono>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <string>
+
+
+int main(int argc, char *argv[])
+{
+    // Some shortcuts
+    using ValueType = double;
+    using IndexType = int;
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
+
+    // Print version information
+    std::cout << gko::version_info::get() << std::endl;
+
+    // Figure out where to run the code: the optional first command line
+    // argument names the executor, the reference executor is the default
+    std::shared_ptr<gko::Executor> exec;
+    if (argc == 1 || std::string(argv[1]) == "reference") {
+        exec = gko::ReferenceExecutor::create();
+    } else if (argc == 2 && std::string(argv[1]) == "omp") {
+        exec = gko::OmpExecutor::create();
+    } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
+               gko::CudaExecutor::get_num_devices() > 0) {
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else {
+        std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
+        std::exit(-1);
+    }
+
+    // Read data
+    auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
+    // Create RHS and initial guess as 1. The values are filled in on the
+    // master executor and then copied to the executor chosen above.
+    const gko::size_type size = A->get_size()[0];
+    auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));
+    for (gko::size_type i = 0; i < size; i++) {
+        host_x->at(i, 0) = 1.;
+    }
+    auto x = vec::create(exec);
+    auto b = vec::create(exec);
+    x->copy_from(host_x.get());
+    b->copy_from(host_x.get());
+
+    // Calculate initial residual by overwriting b (b = A * x - b)
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto neg_one = gko::initialize<vec>({-1.0}, exec);
+    auto initres = gko::initialize<vec>({0.0}, exec);
+    A->apply(lend(one), lend(x), lend(neg_one), lend(b));
+    b->compute_norm2(lend(initres));
+
+    // Restore b, which was overwritten by the residual computation
+    b->copy_from(host_x.get());
+
+    // Stopping criteria: an iteration limit and a residual norm reduction
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
+    auto iter_stop =
+        gko::stop::Iteration::build().with_max_iters(10000u).on(exec);
+    auto tol_stop = gko::stop::ResidualNormReduction<ValueType>::build()
+                        .with_reduction_factor(reduction_factor)
+                        .on(exec);
+
+    // Attach a convergence logger to both criteria; it is queried for the
+    // iteration count after the solve
+    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
+        gko::log::Convergence<ValueType>::create(exec);
+    iter_stop->add_logger(logger);
+    tol_stop->add_logger(logger);
+
+    // Create solver factory
+    auto solver_gen =
+        cg::build()
+            .with_criteria(gko::share(iter_stop), gko::share(tol_stop))
+            // Add the block-Jacobi preconditioner; the storage optimization
+            // setting is what this example is about: autodetect leaves the
+            // choice of the storage precision to the preconditioner
+            .with_preconditioner(bj::build()
+                                     .with_max_block_size(16u)
+                                     .with_storage_optimization(
+                                         gko::precision_reduction::autodetect())
+                                     .on(exec))
+            .on(exec);
+    // Create solver
+    auto solver = solver_gen->generate(A);
+
+    // Solve system. The executor is synchronized right before each clock
+    // reading so that the measured interval covers exactly the apply call.
+    exec->synchronize();
+    const auto tic = std::chrono::steady_clock::now();
+    solver->apply(lend(b), lend(x));
+    exec->synchronize();
+    const auto toc = std::chrono::steady_clock::now();
+    const auto time =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+
+    // Calculate residual (b is overwritten with A * x - b once more)
+    auto res = gko::initialize<vec>({0.0}, exec);
+    A->apply(lend(one), lend(x), lend(neg_one), lend(b));
+    b->compute_norm2(lend(res));
+
+    std::cout << "Initial residual norm sqrt(r^T r): \n";
+    write(std::cout, lend(initres));
+    std::cout << "Final residual norm sqrt(r^T r): \n";
+    write(std::cout, lend(res));
+
+    // Print solver statistics
+    std::cout << "CG iteration count:     " << logger->get_num_iterations()
+              << std::endl;
+    std::cout << "CG execution time [ms]: "
+              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
+}
diff --git a/examples/adaptiveprecision-blockjacobi/build.sh b/examples/adaptiveprecision-blockjacobi/build.sh
new file mode 100755
index 00000000000..fba046ccb94
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/build.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Builds this example without CMake against an existing Ginkgo build tree.
+# Usage: build.sh GINKGO_BUILD_DIRECTORY
+
+# set up script
+if [ $# -ne 1 ]; then
+    echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY"
+    exit 1
+fi
+BUILD_DIR=$1
+THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
+
+# copy libraries next to this script; combinations that do not exist in the
+# build tree make cp fail, and those errors are discarded
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
+SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
+for prefix in ${LIBRARY_DIRS}; do
+    for name in ${LIBRARY_NAMES}; do
+        for suffix in ${SUFFIXES}; do
+            cp "${BUILD_DIR}/${prefix}/lib${name}${suffix}" \
+                "${THIS_DIR}/lib${name}${suffix}" 2>/dev/null
+        done
+    done
+done
+
+# figure out correct compiler flags: use the unsuffixed library names if a
+# "libginkgo.*" file was copied, otherwise the "d"-suffixed names
+if ls "${THIS_DIR}" | grep -F "libginkgo." >/dev/null; then
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
+else
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
+fi
+if [ -z "${CXX}" ]; then
+    CXX="c++"
+fi
+
+# build; CXX and LINK_FLAGS stay unquoted on purpose so that they are split
+# into separate words, all paths are quoted to tolerate spaces
+${CXX} -std=c++11 -o "${THIS_DIR}/adaptiveprecision-blockjacobi" \
+    "${THIS_DIR}/adaptiveprecision-blockjacobi.cpp" \
+    -I"${THIS_DIR}/../../include" -I"${BUILD_DIR}/include" \
+    -L"${THIS_DIR}" ${LINK_FLAGS}
diff --git a/examples/adaptiveprecision-blockjacobi/data/A.mtx b/examples/adaptiveprecision-blockjacobi/data/A.mtx
new file mode 100644
index 00000000000..c67437da567
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/data/A.mtx
@@ -0,0 +1,114 @@
+%%MatrixMarket matrix coordinate integer symmetric
+%-------------------------------------------------------------------------------
+% UF Sparse Matrix Collection, Tim Davis
+% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b
+% name: JGD_Trefethen/Trefethen_20b
+% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.]
+% id: 2203
+% date: 2008
+% author: N. Trefethen
+% ed: J.-G. Dumas
+% fields: name title A id date author ed kind notes
+% kind: combinatorial problem
+%-------------------------------------------------------------------------------
+% notes:
+% Diagonal matrices with primes, Nick Trefethen, Oxford Univ.          
+% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection,         
+% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html            
+%                                                                      
+% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems,   
+% SIAM News, vol 35, no. 1.                                            
+%                                                                      
+% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero        
+% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the
+% main diagonal and the number 1 in all the positions A(i,j) with      
+% |i-j| = 1,2,4,8, . . . ,16384.  What is the (1,1) entry of inv(A)?   
+%                                                                      
+% http://www.siam.org/news/news.php?id=388                             
+%                                                                      
+% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms     
+%-------------------------------------------------------------------------------
+19 19 83
+1 1 3
+2 1 1
+3 1 1
+5 1 1
+9 1 1
+17 1 1
+2 2 5
+3 2 1
+4 2 1
+6 2 1
+10 2 1
+18 2 1
+3 3 7
+4 3 1
+5 3 1
+7 3 1
+11 3 1
+19 3 1
+4 4 11
+5 4 1
+6 4 1
+8 4 1
+12 4 1
+5 5 13
+6 5 1
+7 5 1
+9 5 1
+13 5 1
+6 6 17
+7 6 1
+8 6 1
+10 6 1
+14 6 1
+7 7 19
+8 7 1
+9 7 1
+11 7 1
+15 7 1
+8 8 23
+9 8 1
+10 8 1
+12 8 1
+16 8 1
+9 9 29
+10 9 1
+11 9 1
+13 9 1
+17 9 1
+10 10 31
+11 10 1
+12 10 1
+14 10 1
+18 10 1
+11 11 37
+12 11 1
+13 11 1
+15 11 1
+19 11 1
+12 12 41
+13 12 1
+14 12 1
+16 12 1
+13 13 43
+14 13 1
+15 13 1
+17 13 1
+14 14 47
+15 14 1
+16 14 1
+18 14 1
+15 15 53
+16 15 1
+17 15 1
+19 15 1
+16 16 59
+17 16 1
+18 16 1
+17 17 61
+18 17 1
+19 17 1
+18 18 67
+19 18 1
+19 19 71
diff --git a/examples/adaptiveprecision-blockjacobi/doc/builds-on b/examples/adaptiveprecision-blockjacobi/doc/builds-on
new file mode 100644
index 00000000000..9b64c9bfd28
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/doc/builds-on
@@ -0,0 +1 @@
+preconditioned-solver
diff --git a/examples/adaptiveprecision-blockjacobi/doc/intro.dox b/examples/adaptiveprecision-blockjacobi/doc/intro.dox
new file mode 100644
index 00000000000..410f698f261
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/doc/intro.dox
@@ -0,0 +1,10 @@
+<a name="Adaptive Precision block-Jacobi"></a>
+<h1>This example shows how to use the adaptive precision block-Jacobi 
+preconditioner.</h1>
+
+<h3> In this example, we first read in a matrix from file, then generate a
+right-hand side and an initial guess. The CG solver is preconditioned with a
+block-Jacobi preconditioner that adapts the storage precision of each
+inverted diagonal block to its numerical requirements. The example reports
+the residual norms, the iteration count and the runtime of the CG solver.</h3>
+
diff --git a/examples/adaptiveprecision-blockjacobi/doc/kind b/examples/adaptiveprecision-blockjacobi/doc/kind
new file mode 100644
index 00000000000..53a96d5771f
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/doc/kind
@@ -0,0 +1 @@
+preconditioners
diff --git a/examples/adaptiveprecision-blockjacobi/doc/results.dox b/examples/adaptiveprecision-blockjacobi/doc/results.dox
new file mode 100644
index 00000000000..87c5b74c60c
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/doc/results.dox
@@ -0,0 +1,19 @@
+<h1>Results</h1>
+This is the expected output:
+
+@code{.cpp}
+
+Initial residual norm sqrt(r^T r): 
+%%MatrixMarket matrix array real general
+1 1
+194.679
+Final residual norm sqrt(r^T r): 
+%%MatrixMarket matrix array real general
+1 1
+2.8994e-11
+CG iteration count:     8
+CG execution time [ms]: 4.10581
+
+@endcode
+
+<h3> Comments about programming and debugging </h3>
diff --git a/examples/adaptiveprecision-blockjacobi/doc/short-intro b/examples/adaptiveprecision-blockjacobi/doc/short-intro
new file mode 100644
index 00000000000..7aa3396bae6
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/doc/short-intro
@@ -0,0 +1 @@
+The adaptive precision block-Jacobi example.
diff --git a/examples/adaptiveprecision-blockjacobi/doc/tooltip b/examples/adaptiveprecision-blockjacobi/doc/tooltip
new file mode 100644
index 00000000000..6458f7fb3e5
--- /dev/null
+++ b/examples/adaptiveprecision-blockjacobi/doc/tooltip
@@ -0,0 +1 @@
+Use an adaptive precision block-Jacobi preconditioner in Ginkgo. Solve a linear system.
diff --git a/examples/custom-logger/build.sh b/examples/custom-logger/build.sh
index 5cd278063a8..67587b6aa90 100755
--- a/examples/custom-logger/build.sh
+++ b/examples/custom-logger/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp
index d7ecde9b576..d5ded538df7 100644
--- a/examples/custom-logger/custom-logger.cpp
+++ b/examples/custom-logger/custom-logger.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -51,18 +51,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // Utility function which gets the scalar value of a Ginkgo gko::matrix::Dense
 // matrix representing the norm of a vector.
 template <typename ValueType>
-double get_norm(const gko::matrix::Dense<ValueType> *norm)
+gko::remove_complex<ValueType> get_norm(
+    const gko::matrix::Dense<ValueType> *norm)
 {
     // Put the value on CPU thanks to the master executor
     auto cpu_norm = clone(norm->get_executor()->get_master(), norm);
     // Return the scalar value contained at position (0, 0)
-    return cpu_norm->at(0, 0);
+    return std::real(cpu_norm->at(0, 0));
 }
 
 // Utility function which computes the norm of a Ginkgo gko::matrix::Dense
 // vector.
 template <typename ValueType>
-double compute_norm(const gko::matrix::Dense<ValueType> *b)
+gko::remove_complex<ValueType> compute_norm(
+    const gko::matrix::Dense<ValueType> *b)
 {
     // Get the executor of the vector
     auto exec = b->get_executor();
@@ -83,10 +85,10 @@ struct ResidualLogger : gko::log::Logger {
     void write() const
     {
         // Print a header for the table
-        std::cout << "Recurrent vs real residual norm:" << std::endl;
+        std::cout << "Recurrent vs true residual norm:" << std::endl;
         std::cout << '|' << std::setw(10) << "Iteration" << '|' << std::setw(25)
                   << "Recurrent Residual Norm" << '|' << std::setw(25)
-                  << "Real Residual Norm" << '|' << std::endl;
+                  << "True Residual Norm" << '|' << std::endl;
         // Print a separation line. Note that for creating `10` characters
         // `std::setw()` should be set to `11`.
         std::cout << '|' << std::setfill('-') << std::setw(11) << '|'
@@ -188,14 +190,16 @@ int main(int argc, char *argv[])
     // with one column/one row. The advantage of this concept is that using
     // multiple vectors is a now a natural extension of adding columns/rows are
     // necessary.
-    using vec = gko::matrix::Dense<>;
+    using ValueType = double;
+    using IndexType = int;
+    using vec = gko::matrix::Dense<ValueType>;
     // The gko::matrix::Csr class is used here, but any other matrix class such
     // as gko::matrix::Coo, gko::matrix::Hybrid, gko::matrix::Ell or
     // gko::matrix::Sellp could also be used.
-    using mtx = gko::matrix::Csr<>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
     // The gko::solver::Cg is used here, but any other solver class can also be
     // used.
-    using cg = gko::solver::Cg<>;
+    using cg = gko::solver::Cg<ValueType>;
 
     // Print the ginkgo version information.
     std::cout << gko::version_info::get() << std::endl;
@@ -217,7 +221,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create());
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
@@ -234,6 +241,7 @@ int main(int argc, char *argv[])
     auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
     auto b = gko::read<vec>(std::ifstream("data/b.mtx"), exec);
     auto x = gko::read<vec>(std::ifstream("data/x0.mtx"), exec);
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
 
     // @sect3{Creating the solver}
     // Generate the gko::solver factory. Ginkgo uses the concept of Factories to
@@ -248,14 +256,14 @@ int main(int argc, char *argv[])
         cg::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-15)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             .on(exec);
 
     // Instantiate a ResidualLogger logger.
-    auto logger = std::make_shared<ResidualLogger<double>>(exec, gko::lend(A),
-                                                           gko::lend(b));
+    auto logger = std::make_shared<ResidualLogger<ValueType>>(
+        exec, gko::lend(A), gko::lend(b));
 
     // Add the previously created logger to the solver factory. The logger will
     // be automatically propagated to all solvers created from this factory.
diff --git a/examples/custom-logger/doc/intro.dox b/examples/custom-logger/doc/intro.dox
index a81e16f2111..5d135a65013 100644
--- a/examples/custom-logger/doc/intro.dox
+++ b/examples/custom-logger/doc/intro.dox
@@ -9,13 +9,13 @@ In this example, a simple logger is implemented to track the solver's recurrent
 <h3> About the example </h3>
 Each example has the following sections:
 <ol>
-  <li> <b>Introduction:</b>This gives an overview of the example and mentions
-  any interesting aspects in the example that might help the reader.
-  <li> <b>The commented program:</b> This section is intended for you to
-  understand the details of the example so that you can play with it and understand
-  Ginkgo and its features better.
-  <li> <b>Results:</b> This section shows the results of the code when run. Though the
-  results may not be completely the same, you can expect the behaviour to be similar. 
-  <li> <b>The plain program:</b> This is the complete code without any comments to have
-  an complete overview of the code.
-  </ol>
+    <li> <b>Introduction:</b> This gives an overview of the example and mentions
+    any interesting aspects in the example that might help the reader.
+    <li> <b>The commented program:</b> This section is intended for you to
+    understand the details of the example so that you can play with it and understand
+    Ginkgo and its features better.
+    <li> <b>Results:</b> This section shows the results of the code when run. Though the
+    results may not be completely the same, you can expect the behaviour to be similar.
+    <li> <b>The plain program:</b> This is the complete code without any comments to have
+    a complete overview of the code.
+</ol>
diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt
index 383ec42c735..d9633e1ab11 100644
--- a/examples/custom-matrix-format/CMakeLists.txt
+++ b/examples/custom-matrix-format/CMakeLists.txt
@@ -7,5 +7,7 @@ if (GINKGO_BUILD_CUDA AND GINKGO_BUILD_OMP)
         stencil_kernel.cu)
     target_link_libraries(custom-matrix-format ginkgo)
     target_include_directories(custom-matrix-format PRIVATE
-        ${PROJECT_SOURCE_DIR})
+        ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+    # workaround for clang-cuda/g++ interaction
+    set_target_properties(custom-matrix-format PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
diff --git a/examples/custom-matrix-format/build.sh b/examples/custom-matrix-format/build.sh
index 6471c90fbfe..dbb2a67d72b 100755
--- a/examples/custom-matrix-format/build.sh
+++ b/examples/custom-matrix-format/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 
 CXX="nvcc"
diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp
index 5c4e750f7f9..2ed33b59bea 100644
--- a/examples/custom-matrix-format/custom-matrix-format.cpp
+++ b/examples/custom-matrix-format/custom-matrix-format.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,8 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // A CUDA kernel implementing the stencil, which will be used if running on the
 // CUDA executor. Unfortunately, NVCC has serious problems interpreting some
 // parts of Ginkgo's code, so the kernel has to be compiled separately.
-extern void stencil_kernel(std::size_t size, const double *coefs,
-                           const double *b, double *x);
+template <typename ValueType>
+void stencil_kernel(std::size_t size, const ValueType *coefs,
+                    const ValueType *b, ValueType *x);
 
 
 // A stencil matrix class representing the 3pt stencil linear operator.
@@ -57,21 +58,22 @@ extern void stencil_kernel(std::size_t size, const double *coefs,
 // implementation of the static create method. This method will forward all its
 // arguments to the constructor to create the object, and return an
 // std::unique_ptr to the created object.
-class StencilMatrix : public gko::EnableLinOp<StencilMatrix>,
-                      public gko::EnableCreateMethod<StencilMatrix> {
+template <typename ValueType>
+class StencilMatrix : public gko::EnableLinOp<StencilMatrix<ValueType>>,
+                      public gko::EnableCreateMethod<StencilMatrix<ValueType>> {
 public:
     // This constructor will be called by the create method. Here we initialize
     // the coefficients of the stencil.
     StencilMatrix(std::shared_ptr<const gko::Executor> exec,
-                  gko::size_type size = 0, double left = -1.0,
-                  double center = 2.0, double right = -1.0)
+                  gko::size_type size = 0, ValueType left = -1.0,
+                  ValueType center = 2.0, ValueType right = -1.0)
         : gko::EnableLinOp<StencilMatrix>(exec, gko::dim<2>{size}),
           coefficients(exec, {left, center, right})
     {}
 
 protected:
-    using vec = gko::matrix::Dense<>;
-    using coef_type = gko::Array<double>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using coef_type = gko::Array<ValueType>;
 
     // Here we implement the application of the linear operator, x = A * b.
     // apply_impl will be called by the apply method, after the arguments have
@@ -156,14 +158,15 @@ class StencilMatrix : public gko::EnableLinOp<StencilMatrix>,
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
-void generate_stencil_matrix(gko::matrix::Csr<> *matrix)
+template <typename ValueType, typename IndexType>
+void generate_stencil_matrix(gko::matrix::Csr<ValueType, IndexType> *matrix)
 {
     const auto discretization_points = matrix->get_size()[0];
     auto row_ptrs = matrix->get_row_ptrs();
     auto col_idxs = matrix->get_col_idxs();
     auto values = matrix->get_values();
-    int pos = 0;
-    const double coefs[] = {-1, 2, -1};
+    IndexType pos = 0;
+    const ValueType coefs[] = {-1, 2, -1};
     row_ptrs[0] = pos;
     for (int i = 0; i < discretization_points; ++i) {
         for (auto ofs : {-1, 0, 1}) {
@@ -179,14 +182,15 @@ void generate_stencil_matrix(gko::matrix::Csr<> *matrix)
 
 
 // Generates the RHS vector given `f` and the boundary conditions.
-template <typename Closure>
-void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs)
+template <typename Closure, typename ValueType>
+void generate_rhs(Closure f, ValueType u0, ValueType u1,
+                  gko::matrix::Dense<ValueType> *rhs)
 {
     const auto discretization_points = rhs->get_size()[0];
     auto values = rhs->get_values();
-    const auto h = 1.0 / (discretization_points + 1);
+    const ValueType h = 1.0 / (discretization_points + 1);
     for (int i = 0; i < discretization_points; ++i) {
-        const auto xi = (i + 1) * h;
+        const ValueType xi = ValueType(i + 1) * h;
         values[i] = -f(xi) * h * h;
     }
     values[0] += u0;
@@ -195,7 +199,9 @@ void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs)
 
 
 // Prints the solution `u`.
-void print_solution(double u0, double u1, const gko::matrix::Dense<> *u)
+template <typename ValueType>
+void print_solution(ValueType u0, ValueType u1,
+                    const gko::matrix::Dense<ValueType> *u)
 {
     std::cout << u0 << '\n';
     for (int i = 0; i < u->get_size()[0]; ++i) {
@@ -207,8 +213,9 @@ void print_solution(double u0, double u1, const gko::matrix::Dense<> *u)
 
 // Computes the 1-norm of the error given the computed `u` and the correct
 // solution function `correct_u`.
-template <typename Closure>
-double calculate_error(int discretization_points, const gko::matrix::Dense<> *u,
+template <typename Closure, typename ValueType>
+double calculate_error(int discretization_points,
+                       const gko::matrix::Dense<ValueType> *u,
                        Closure correct_u)
 {
     const auto h = 1.0 / (discretization_points + 1);
@@ -226,9 +233,12 @@ double calculate_error(int discretization_points, const gko::matrix::Dense<> *u,
 int main(int argc, char *argv[])
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<double>;
-    using mtx = gko::matrix::Csr<double, int>;
-    using cg = gko::solver::Cg<double>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
 
     if (argc < 2) {
         std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]"
@@ -245,7 +255,8 @@ int main(int argc, char *argv[])
     const auto omp = gko::OmpExecutor::create();
     std::map<std::string, std::shared_ptr<gko::Executor>> exec_map{
         {"omp", omp},
-        {"cuda", gko::CudaExecutor::create(0, omp)},
+        {"cuda", gko::CudaExecutor::create(0, omp, true)},
+        {"hip", gko::HipExecutor::create(0, omp, true)},
         {"reference", gko::ReferenceExecutor::create()}};
 
     // executor where Ginkgo will perform the computation
@@ -254,8 +265,8 @@ int main(int argc, char *argv[])
     const auto app_exec = exec_map["omp"];
 
     // problem:
-    auto correct_u = [](double x) { return x * x * x; };
-    auto f = [](double x) { return 6 * x; };
+    auto correct_u = [](ValueType x) { return x * x * x; };
+    auto f = [](ValueType x) { return ValueType(6) * x; };
     auto u0 = correct_u(0);
     auto u1 = correct_u(1);
 
@@ -267,19 +278,20 @@ int main(int argc, char *argv[])
         u->get_values()[i] = 0.0;
     }
 
+    const ValueType reduction_factor = 1e-7;
     // Generate solver and solve the system
     cg::build()
         .with_criteria(gko::stop::Iteration::build()
                            .with_max_iters(discretization_points)
                            .on(exec),
-                       gko::stop::ResidualNormReduction<>::build()
-                           .with_reduction_factor(1e-6)
+                       gko::stop::ResidualNormReduction<ValueType>::build()
+                           .with_reduction_factor(reduction_factor)
                            .on(exec))
         .on(exec)
         // notice how our custom StencilMatrix can be used in the same way as
         // any built-in type
-        ->generate(
-            StencilMatrix::create(exec, discretization_points, -1, 2, -1))
+        ->generate(StencilMatrix<ValueType>::create(exec, discretization_points,
+                                                    -1, 2, -1))
         ->apply(lend(rhs), lend(u));
 
     print_solution(u0, u1, lend(u));
diff --git a/examples/custom-matrix-format/stencil_kernel.cu b/examples/custom-matrix-format/stencil_kernel.cu
index 66a471ad65e..fdd04d1aa3d 100644
--- a/examples/custom-matrix-format/stencil_kernel.cu
+++ b/examples/custom-matrix-format/stencil_kernel.cu
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -32,13 +32,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <cstdlib>
 
+#include <ginkgo/ginkgo.hpp>
+
+
+#define INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \
+    template _macro(float);                     \
+    template _macro(double);
+
+
+#define STENCIL_KERNEL(_type)                                                 \
+    void stencil_kernel(std::size_t size, const _type *coefs, const _type *b, \
+                        _type *x);
+
 
 namespace {
 
 
 // a parallel CUDA kernel that computes the application of a 3 point stencil
-__global__ void stencil_kernel_impl(std::size_t size, const double *coefs,
-                                    const double *b, double *x)
+template <typename ValueType>
+__global__ void stencil_kernel_impl(std::size_t size, const ValueType *coefs,
+                                    const ValueType *b, ValueType *x)
 {
     const auto thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     if (thread_id >= size) {
@@ -58,10 +71,13 @@ __global__ void stencil_kernel_impl(std::size_t size, const double *coefs,
 }  // namespace
 
 
-void stencil_kernel(std::size_t size, const double *coefs, const double *b,
-                    double *x)
+template <typename ValueType>
+void stencil_kernel(std::size_t size, const ValueType *coefs,
+                    const ValueType *b, ValueType *x)
 {
     constexpr auto block_size = 512;
     const auto grid_size = (size + block_size - 1) / block_size;
     stencil_kernel_impl<<<grid_size, block_size>>>(size, coefs, b, x);
 }
+
+INSTANTIATE_FOR_EACH_VALUE_TYPE(STENCIL_KERNEL);
diff --git a/examples/custom-stopping-criterion/build.sh b/examples/custom-stopping-criterion/build.sh
index 8f4ba176c25..410f3e3c9cb 100755
--- a/examples/custom-stopping-criterion/build.sh
+++ b/examples/custom-stopping-criterion/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lpthread -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lpthread -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lpthread -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lpthread -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
index ebfa2789eb6..975e20f3a6c 100644
--- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
+++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -93,9 +93,12 @@ void run_solver(volatile bool *stop_iteration_process,
                 std::shared_ptr<gko::Executor> exec)
 {
     // Some shortcuts
-    using mtx = gko::matrix::Csr<>;
-    using vec = gko::matrix::Dense<>;
-    using bicg = gko::solver::Bicgstab<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using bicg = gko::solver::Bicgstab<ValueType>;
 
     // Read Data
     auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
@@ -110,7 +113,7 @@ void run_solver(volatile bool *stop_iteration_process,
                                          .on(exec))
                       .on(exec)
                       ->generate(A);
-    solver->add_logger(gko::log::Stream<>::create(
+    solver->add_logger(gko::log::Stream<ValueType>::create(
         exec, gko::log::Logger::iteration_complete_mask, std::cout, true));
     solver->apply(lend(b), lend(x));
 
@@ -145,7 +148,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
diff --git a/examples/external-lib-interfacing/CMakeLists.txt b/examples/external-lib-interfacing/CMakeLists.txt
index 2a12a9aa02f..0858b5e40d7 100644
--- a/examples/external-lib-interfacing/CMakeLists.txt
+++ b/examples/external-lib-interfacing/CMakeLists.txt
@@ -1,29 +1,29 @@
 if(GINKGO_BUILD_EXTLIB_EXAMPLE)
-# This is just an example of the CMakeLists.txt file that can be used after the
-# correct version of deal.ii has been installed.
-cmake_minimum_required(VERSION 3.8)
-project(DEAL_II_EXAMPLE LANGUAGES CXX)
+    # This is just an example of a CMakeLists.txt file that can be used once a
+    # suitable deal.II installation (9.0.0 is requested below) is available.
+    cmake_minimum_required(VERSION 3.8)
+    project(DEAL_II_EXAMPLE LANGUAGES CXX)
 
-find_package(MPI REQUIRED)
+    find_package(MPI REQUIRED)
 
-set(deal.II_DIR "/path/to/deal.ii/installation")
-find_package(deal.II 9.0.0 REQUIRED
-  HINTS ${deal.II_DIR} ${DEAL_II_DIR})
-DEAL_II_INITIALIZE_CACHED_VARIABLES()
+    set(deal.II_DIR "/path/to/deal.ii/installation")
+    find_package(deal.II 9.0.0 REQUIRED
+        HINTS ${deal.II_DIR} ${DEAL_II_DIR})
+    DEAL_II_INITIALIZE_CACHED_VARIABLES()
 
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
 
-add_executable(${PROJECT_NAME} "")
-target_sources(${PROJECT_NAME} PRIVATE external-lib-interfacing.cpp)
-target_compile_options(${PROJECT_NAME} PRIVATE -g -Wall)
-target_compile_definitions(${PROJECT_NAME} PRIVATE OMPI_SKIP_MPICXX)
+    add_executable(${PROJECT_NAME} "")
+    target_sources(${PROJECT_NAME} PRIVATE external-lib-interfacing.cpp)
+    target_compile_options(${PROJECT_NAME} PRIVATE -g -Wall)
+    target_compile_definitions(${PROJECT_NAME} PRIVATE OMPI_SKIP_MPICXX)
 
-target_link_libraries(${PROJECT_NAME}
-    ${MPI_C_LIBRARIES} Ginkgo::ginkgo)
+    target_link_libraries(${PROJECT_NAME}
+        ${MPI_C_LIBRARIES} Ginkgo::ginkgo)
 
-target_include_directories(${PROJECT_NAME}
-     PRIVATE ${MPI_C_INCLUDE_PATH} ${GINKGO_INC_DIR} ${GINKGO_LIB_DIR} )
+    target_include_directories(${PROJECT_NAME}
+        PRIVATE ${MPI_C_INCLUDE_PATH} ${GINKGO_INC_DIR} ${GINKGO_LIB_DIR} )
 
-DEAL_II_SETUP_TARGET(${PROJECT_NAME})
+    DEAL_II_SETUP_TARGET(${PROJECT_NAME})
 endif()
diff --git a/examples/ginkgo-overhead/build.sh b/examples/ginkgo-overhead/build.sh
index 47c10e673d9..9c3fd902cfa 100755
--- a/examples/ginkgo-overhead/build.sh
+++ b/examples/ginkgo-overhead/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/ginkgo-overhead/ginkgo-overhead.cpp b/examples/ginkgo-overhead/ginkgo-overhead.cpp
index 2e73149c73f..b8bc7acc2b1 100644
--- a/examples/ginkgo-overhead/ginkgo-overhead.cpp
+++ b/examples/ginkgo-overhead/ginkgo-overhead.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -47,9 +47,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 int main(int argc, char *argv[])
 {
-    using vec = gko::matrix::Dense<>;
-    using mtx = gko::matrix::Dense<>;
-    using cg = gko::solver::Cg<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Dense<ValueType>;
+    using cg = gko::solver::Cg<ValueType>;
 
     long unsigned num_iters = 1000000;
     if (argc > 2) {
@@ -87,8 +90,11 @@ int main(int argc, char *argv[])
     auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(tac - tic);
     std::cout << "Running " << num_iters
               << " iterations of the CG solver took a total of "
-              << 1.0 * time.count() / std::nano::den << " seconds." << std::endl
+              << static_cast<double>(time.count()) /
+                     static_cast<double>(std::nano::den)
+              << " seconds." << std::endl
               << "\tAverage library overhead:     "
-              << 1.0 * time.count() / num_iters << " [nanoseconds / iteration]"
-              << std::endl;
+              << static_cast<double>(time.count()) /
+                     static_cast<double>(num_iters)
+              << " [nanoseconds / iteration]" << std::endl;
 }
diff --git a/examples/ginkgo-ranges/build.sh b/examples/ginkgo-ranges/build.sh
index d53d6287c24..012cf07c9ba 100755
--- a/examples/ginkgo-ranges/build.sh
+++ b/examples/ginkgo-ranges/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/ginkgo-ranges/ginkgo-ranges.cpp b/examples/ginkgo-ranges/ginkgo-ranges.cpp
index 0e5f5d37f30..c471f967d60 100644
--- a/examples/ginkgo-ranges/ginkgo-ranges.cpp
+++ b/examples/ginkgo-ranges/ginkgo-ranges.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -82,13 +82,16 @@ void print_lu(const gko::range<Accessor> &A)
 
 int main(int argc, char *argv[])
 {
+    using ValueType = double;
+    using IndexType = int;
+
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
 
     // Create some test data, add some padding just to demonstrate how to use it
     // with ranges.
     // clang-format off
-    double data[] = {
+    ValueType data[] = {
         2.,  4.,  5., -1.0,
         4., 11., 12., -1.0,
         6., 24., 24., -1.0
@@ -97,7 +100,8 @@ int main(int argc, char *argv[])
 
     // Create a 3-by-3 range, with a 2D row-major accessor using data as the
     // underlying storage. Set the stride (a.k.a. "LDA") to 4.
-    auto A = gko::range<gko::accessor::row_major<double, 2>>(data, 3u, 3u, 4u);
+    auto A =
+        gko::range<gko::accessor::row_major<ValueType, 2>>(data, 3u, 3u, 4u);
 
     // use the LU factorization routine defined above to factorize the matrix
     factorize(A);
diff --git a/examples/ilu-preconditioned-solver/build.sh b/examples/ilu-preconditioned-solver/build.sh
index e8135b95328..a21f2e37584 100755
--- a/examples/ilu-preconditioned-solver/build.sh
+++ b/examples/ilu-preconditioned-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
index d616a68e6fb..3d61d61d5c8 100644
--- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
+++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,9 +43,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int main(int argc, char *argv[])
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<>;
-    using mtx = gko::matrix::Csr<>;
-    using gmres = gko::solver::Gmres<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using gmres = gko::solver::Gmres<ValueType>;
 
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
@@ -58,7 +61,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
@@ -70,15 +76,17 @@ int main(int argc, char *argv[])
     auto x = gko::read<vec>(std::ifstream("data/x0.mtx"), exec);
 
     // Generate incomplete factors using ParILU
-    auto par_ilu_fact = gko::factorization::ParIlu<>::build().on(exec);
+    auto par_ilu_fact =
+        gko::factorization::ParIlu<ValueType, IndexType>::build().on(exec);
     // Generate concrete factorization for input matrix
     auto par_ilu = par_ilu_fact->generate(A);
 
     // Generate an ILU preconditioner factory by setting lower and upper
     // triangular solver - in this case the exact triangular solves
     auto ilu_pre_factory =
-        gko::preconditioner::Ilu<gko::solver::LowerTrs<>,
-                                 gko::solver::UpperTrs<>, false>::build()
+        gko::preconditioner::Ilu<gko::solver::LowerTrs<ValueType, IndexType>,
+                                 gko::solver::UpperTrs<ValueType, IndexType>,
+                                 false>::build()
             .on(exec);
 
     // Use incomplete factors to generate ILU preconditioner
@@ -88,12 +96,13 @@ int main(int argc, char *argv[])
     // Generating a solver factory tied to a specific preconditioner makes sense
     // if there are several very similar systems to solve, and the same
     // solver+preconditioner combination is expected to be effective.
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     auto ilu_gmres_factory =
-        gko::solver::Gmres<>::build()
+        gmres::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(1000u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-15)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             .with_generated_preconditioner(gko::share(ilu_preconditioner))
             .on(exec);
diff --git a/examples/inverse-iteration/build.sh b/examples/inverse-iteration/build.sh
index 7b47813df38..628f7260a01 100755
--- a/examples/inverse-iteration/build.sh
+++ b/examples/inverse-iteration/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp
index 8eb68728bae..856483bfbd6 100644
--- a/examples/inverse-iteration/inverse-iteration.cpp
+++ b/examples/inverse-iteration/inverse-iteration.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -66,7 +66,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
diff --git a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
new file mode 100644
index 00000000000..dd77e163e59
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(ir-ilu-preconditioned-solver ir-ilu-preconditioned-solver.cpp)
+target_link_libraries(ir-ilu-preconditioned-solver PRIVATE ginkgo)
+target_include_directories(ir-ilu-preconditioned-solver PRIVATE "${PROJECT_SOURCE_DIR}")
+configure_file(data/A.mtx data/A.mtx COPYONLY)
diff --git a/examples/ir-ilu-preconditioned-solver/build.sh b/examples/ir-ilu-preconditioned-solver/build.sh
new file mode 100755
index 00000000000..e3e8b513daa
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/build.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# set up script: the Ginkgo build directory must be the only argument
+if [ $# -ne 1 ]; then
+    echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY"
+    exit 1
+fi
+BUILD_DIR="$1"
+THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
+
+# copy libraries next to this script; combinations that do not exist are skipped
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
+SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
+for prefix in ${LIBRARY_DIRS}; do
+    for name in ${LIBRARY_NAMES}; do
+        for suffix in ${SUFFIXES}; do
+            cp "${BUILD_DIR}/${prefix}/lib${name}${suffix}" \
+                "${THIS_DIR}/lib${name}${suffix}" 2>/dev/null
+        done
+    done
+done
+
+# figure out correct compiler flags: plain names if found, else d-suffixed ones
+if ls "${THIS_DIR}" | grep -F "libginkgo." >/dev/null; then
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
+else
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
+fi
+if [ -z "${CXX}" ]; then
+    CXX="c++"
+fi
+
+# build (CXX and LINK_FLAGS stay unquoted on purpose: they rely on word splitting)
+${CXX} -std=c++11 -o "${THIS_DIR}/ir-ilu-preconditioned-solver" \
+    "${THIS_DIR}/ir-ilu-preconditioned-solver.cpp" \
+    -I"${THIS_DIR}/../../include" -I"${BUILD_DIR}/include" \
+    -L"${THIS_DIR}" ${LINK_FLAGS}
diff --git a/examples/ir-ilu-preconditioned-solver/data/A.mtx b/examples/ir-ilu-preconditioned-solver/data/A.mtx
new file mode 100644
index 00000000000..c67437da567
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/data/A.mtx
@@ -0,0 +1,114 @@
+%%MatrixMarket matrix coordinate integer symmetric
+%-------------------------------------------------------------------------------
+% UF Sparse Matrix Collection, Tim Davis
+% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b
+% name: JGD_Trefethen/Trefethen_20b
+% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.]
+% id: 2203
+% date: 2008
+% author: N. Trefethen
+% ed: J.-G. Dumas
+% fields: name title A id date author ed kind notes
+% kind: combinatorial problem
+%-------------------------------------------------------------------------------
+% notes:
+% Diagonal matrices with primes, Nick Trefethen, Oxford Univ.          
+% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection,         
+% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html            
+%                                                                      
+% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems,   
+% SIAM News, vol 35, no. 1.                                            
+%                                                                      
+% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero        
+% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the
+% main diagonal and the number 1 in all the positions A(i,j) with      
+% |i-j| = 1,2,4,8, . . . ,16384.  What is the (1,1) entry of inv(A)?   
+%                                                                      
+% http://www.siam.org/news/news.php?id=388                             
+%                                                                      
+% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms     
+%-------------------------------------------------------------------------------
+19 19 83
+1 1 3
+2 1 1
+3 1 1
+5 1 1
+9 1 1
+17 1 1
+2 2 5
+3 2 1
+4 2 1
+6 2 1
+10 2 1
+18 2 1
+3 3 7
+4 3 1
+5 3 1
+7 3 1
+11 3 1
+19 3 1
+4 4 11
+5 4 1
+6 4 1
+8 4 1
+12 4 1
+5 5 13
+6 5 1
+7 5 1
+9 5 1
+13 5 1
+6 6 17
+7 6 1
+8 6 1
+10 6 1
+14 6 1
+7 7 19
+8 7 1
+9 7 1
+11 7 1
+15 7 1
+8 8 23
+9 8 1
+10 8 1
+12 8 1
+16 8 1
+9 9 29
+10 9 1
+11 9 1
+13 9 1
+17 9 1
+10 10 31
+11 10 1
+12 10 1
+14 10 1
+18 10 1
+11 11 37
+12 11 1
+13 11 1
+15 11 1
+19 11 1
+12 12 41
+13 12 1
+14 12 1
+16 12 1
+13 13 43
+14 13 1
+15 13 1
+17 13 1
+14 14 47
+15 14 1
+16 14 1
+18 14 1
+15 15 53
+16 15 1
+17 15 1
+19 15 1
+16 16 59
+17 16 1
+18 16 1
+17 17 61
+18 17 1
+19 17 1
+18 18 67
+19 18 1
+19 19 71
diff --git a/examples/ir-ilu-preconditioned-solver/doc/builds-on b/examples/ir-ilu-preconditioned-solver/doc/builds-on
new file mode 100644
index 00000000000..7c236123b46
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/doc/builds-on
@@ -0,0 +1 @@
+ilu-preconditioned-solver iterative-refinement
diff --git a/examples/ir-ilu-preconditioned-solver/doc/intro.dox b/examples/ir-ilu-preconditioned-solver/doc/intro.dox
new file mode 100644
index 00000000000..64e3322a219
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/doc/intro.dox
@@ -0,0 +1,15 @@
+<a name="Intro"></a>
+<h1>Introduction</h1>
+
+<h3> About the example </h3>
+This example shows how to combine iterative refinement with the adaptive
+precision block-Jacobi preconditioner in order to approximately solve the
+triangular systems occurring in ILU preconditioning. Using an adaptive precision
+block-Jacobi preconditioner matrix as inner solver for the iterative refinement
+method is equivalent to doing adaptive precision block-Jacobi relaxation in the
+triangular solves. This example roughly approximates the triangular solves with
+five adaptive precision block-Jacobi sweeps with a maximum block size of 16.
+
+This example is motivated by "Multiprecision block-Jacobi for Iterative
+Triangular Solves" (Göbel, Anzt, Cojean, Flegar, Quintana-Ortí, Euro-Par 2020).
+The theory and a detailed analysis can be found there.
diff --git a/examples/ir-ilu-preconditioned-solver/doc/kind b/examples/ir-ilu-preconditioned-solver/doc/kind
new file mode 100644
index 00000000000..53a96d5771f
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/doc/kind
@@ -0,0 +1 @@
+preconditioners
diff --git a/examples/ir-ilu-preconditioned-solver/doc/results.dox b/examples/ir-ilu-preconditioned-solver/doc/results.dox
new file mode 100644
index 00000000000..eaaaa5758cd
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/doc/results.dox
@@ -0,0 +1,37 @@
+<h1>Results</h1>
+This is the expected output:
+
+@code{.cpp}
+Using 5 block-Jacobi sweeps.
+Solution (x):
+%%MatrixMarket matrix array real general
+19 1
+0.252218
+0.108645
+0.0662811
+0.0630433
+0.0384088
+0.0396536
+0.0402648
+0.0338935
+0.0193098
+0.0234653
+0.0211499
+0.0196413
+0.0199151
+0.0181674
+0.0162722
+0.0150714
+0.0107016
+0.0121141
+0.0123025
+GMRES iteration count:     7
+GMRES execution time [ms]: 2.64993
+Residual norm sqrt(r^T r):
+%%MatrixMarket matrix array real general
+1 1
+2.23805e-10
+
+@endcode
+
+<h3> Comments about programming and debugging </h3>
diff --git a/examples/ir-ilu-preconditioned-solver/doc/short-intro b/examples/ir-ilu-preconditioned-solver/doc/short-intro
new file mode 100644
index 00000000000..3f8cd5ad813
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/doc/short-intro
@@ -0,0 +1 @@
+The IR-ILU preconditioned solver example.
diff --git a/examples/ir-ilu-preconditioned-solver/doc/tooltip b/examples/ir-ilu-preconditioned-solver/doc/tooltip
new file mode 100644
index 00000000000..7b7208257aa
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/doc/tooltip
@@ -0,0 +1,5 @@
+Generate an incomplete factorization.
+Generate an ILU preconditioner from a factorization.
+Use an iterative solver to solve the triangular systems in the preconditioner.
+Use an ILU preconditioner in an iterative solver.
+Solve a linear system.
diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
new file mode 100644
index 00000000000..620e755d490
--- /dev/null
+++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
@@ -0,0 +1,185 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+
+int main(int argc, char *argv[])
+{
+    // Some shortcuts
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using gmres = gko::solver::Gmres<ValueType>;
+    using ir = gko::solver::Ir<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
+
+    // Print version information
+    std::cout << gko::version_info::get() << std::endl;
+
+    // Figure out where to run the code and how many block-Jacobi sweeps to use
+    std::shared_ptr<gko::Executor> exec;
+    if (argc == 1 || std::string(argv[1]) == "reference") {
+        exec = gko::ReferenceExecutor::create();
+    } else if ((argc == 2 || argc == 3) && std::string(argv[1]) == "omp") {
+        exec = gko::OmpExecutor::create();
+    } else if ((argc == 2 || argc == 3) && std::string(argv[1]) == "cuda" &&
+               gko::CudaExecutor::get_num_devices() > 0) {
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if ((argc == 2 || argc == 3) && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else {
+        std::cerr << "Usage: " << argv[0] << " [executor] [sweeps]"
+                  << std::endl;
+        std::exit(-1);
+    }
+    unsigned int sweeps = (argc == 3) ? std::atoi(argv[2]) : 5u;
+
+    // Read data
+    auto A = gko::share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
+    // Create RHS and initial guess as 1
+    gko::size_type num_rows = A->get_size()[0];
+    auto host_x = vec::create(exec->get_master(), gko::dim<2>(num_rows, 1));
+    for (gko::size_type i = 0; i < num_rows; i++) {
+        host_x->at(i, 0) = 1.;
+    }
+    auto x = vec::create(exec);
+    auto b = vec::create(exec);
+    x->copy_from(host_x.get());
+    b->copy_from(host_x.get());
+    auto clone_x = gko::clone(x);  // initial guess, restored before each run
+
+    // Generate incomplete factors using ParILU
+    auto par_ilu_fact =
+        gko::factorization::ParIlu<ValueType, IndexType>::build().on(exec);
+    // Generate concrete factorization for input matrix
+    auto par_ilu = par_ilu_fact->generate(A);
+
+    // Generate an iterative refinement factory to be used as a triangular
+    // solver in the preconditioner application. The generated method performs
+    // `sweeps` block-Jacobi sweeps with a maximum block size of 16.
+    auto bj_factory =
+        bj::build()
+            .with_max_block_size(16u)
+            .with_storage_optimization(gko::precision_reduction::autodetect())
+            .on(exec);
+
+    auto trisolve_factory =
+        ir::build()
+            .with_solver(share(bj_factory))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(sweeps).on(exec))
+            .on(exec);
+
+    // Generate an ILU preconditioner factory by setting lower and upper
+    // triangular solver - in this case the previously defined iterative
+    // refinement method.
+    auto ilu_pre_factory =
+        gko::preconditioner::Ilu<ir, ir>::build()
+            .with_l_solver_factory(gko::clone(trisolve_factory))
+            .with_u_solver_factory(gko::clone(trisolve_factory))
+            .on(exec);
+
+    // Use incomplete factors to generate ILU preconditioner
+    auto ilu_preconditioner = ilu_pre_factory->generate(gko::share(par_ilu));
+
+    // Create stopping criteria for Gmres
+    const gko::remove_complex<ValueType> reduction_factor = 1e-12;
+    auto iter_stop =
+        gko::stop::Iteration::build().with_max_iters(1000u).on(exec);
+    auto tol_stop = gko::stop::ResidualNormReduction<ValueType>::build()
+                        .with_reduction_factor(reduction_factor)
+                        .on(exec);
+
+    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
+        gko::log::Convergence<ValueType>::create(exec);
+    iter_stop->add_logger(logger);
+    tol_stop->add_logger(logger);
+
+    // Use preconditioner inside GMRES solver factory
+    // Generating a solver factory tied to a specific preconditioner makes sense
+    // if there are several very similar systems to solve, and the same
+    // solver+preconditioner combination is expected to be effective.
+    auto ilu_gmres_factory =
+        gmres::build()
+            .with_criteria(gko::share(iter_stop), gko::share(tol_stop))
+            .with_generated_preconditioner(gko::share(ilu_preconditioner))
+            .on(exec);
+
+    // Generate preconditioned solver for a specific target system
+    auto ilu_gmres = ilu_gmres_factory->generate(A);
+
+    // Warmup run
+    ilu_gmres->apply(lend(b), lend(x));
+
+    // Solve 100 times and average; synchronize so in-flight work is timed.
+    std::chrono::nanoseconds time(0);
+    for (int i = 0; i < 100; i++) {
+        x->copy_from(lend(clone_x));
+        exec->synchronize();
+        auto tic = std::chrono::high_resolution_clock::now();
+        ilu_gmres->apply(lend(b), lend(x));
+        exec->synchronize();
+        auto toc = std::chrono::high_resolution_clock::now();
+        time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+    }
+
+    std::cout << "Using " << sweeps << " block-Jacobi sweeps. \n";
+
+    // Print solution
+    std::cout << "Solution (x): \n";
+    write(std::cout, gko::lend(x));
+
+    // Calculate residual
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto neg_one = gko::initialize<vec>({-1.0}, exec);
+    auto res = gko::initialize<vec>({0.0}, exec);
+    A->apply(gko::lend(one), gko::lend(x), gko::lend(neg_one), gko::lend(b));
+    b->compute_norm2(gko::lend(res));
+
+    std::cout << "GMRES iteration count:     " << logger->get_num_iterations()
+              << "\n";
+    std::cout << "GMRES execution time [ms]: "
+              << static_cast<double>(time.count()) / 100000000.0 << "\n";
+    std::cout << "Residual norm sqrt(r^T r): \n";
+    write(std::cout, gko::lend(res));
+}
diff --git a/examples/iterative-refinement/CMakeLists.txt b/examples/iterative-refinement/CMakeLists.txt
new file mode 100644
index 00000000000..a21b54d2a96
--- /dev/null
+++ b/examples/iterative-refinement/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(iterative-refinement iterative-refinement.cpp)
+target_link_libraries(iterative-refinement PRIVATE ginkgo)
+target_include_directories(iterative-refinement PRIVATE "${PROJECT_SOURCE_DIR}")
+configure_file(data/A.mtx data/A.mtx COPYONLY)  # copy matrix into build tree
diff --git a/examples/iterative-refinement/build.sh b/examples/iterative-refinement/build.sh
new file mode 100755
index 00000000000..06f7d201f1b
--- /dev/null
+++ b/examples/iterative-refinement/build.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# set up script: the Ginkgo build directory is the only (required) argument
+if [ $# -ne 1 ]; then
+    echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY"
+    exit 1
+fi
+BUILD_DIR=$1
+THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
+
+# copy libraries (combinations that do not exist are skipped silently)
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
+SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
+for prefix in ${LIBRARY_DIRS}; do
+    for name in ${LIBRARY_NAMES}; do
+        for suffix in ${SUFFIXES}; do
+            cp "${BUILD_DIR}/${prefix}/lib${name}${suffix}" \
+                "${THIS_DIR}/lib${name}${suffix}" 2>/dev/null
+        done
+    done
+done
+
+# figure out correct compiler flags (plain names vs. the d-suffixed variants)
+if ls "${THIS_DIR}" | grep -F "libginkgo." >/dev/null; then
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
+else
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
+fi
+if [ -z "${CXX}" ]; then
+    CXX="c++"
+fi
+
+# build (CXX and LINK_FLAGS stay unquoted so they split into separate words)
+${CXX} -std=c++11 -o "${THIS_DIR}/iterative-refinement" \
+    "${THIS_DIR}/iterative-refinement.cpp" \
+    "-I${THIS_DIR}/../../include" "-I${BUILD_DIR}/include" \
+    "-L${THIS_DIR}" ${LINK_FLAGS}
diff --git a/examples/iterative-refinement/data/A.mtx b/examples/iterative-refinement/data/A.mtx
new file mode 100644
index 00000000000..c67437da567
--- /dev/null
+++ b/examples/iterative-refinement/data/A.mtx
@@ -0,0 +1,114 @@
+%%MatrixMarket matrix coordinate integer symmetric
+%-------------------------------------------------------------------------------
+% UF Sparse Matrix Collection, Tim Davis
+% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b
+% name: JGD_Trefethen/Trefethen_20b
+% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.]
+% id: 2203
+% date: 2008
+% author: N. Trefethen
+% ed: J.-G. Dumas
+% fields: name title A id date author ed kind notes
+% kind: combinatorial problem
+%-------------------------------------------------------------------------------
+% notes:
+% Diagonal matrices with primes, Nick Trefethen, Oxford Univ.          
+% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection,         
+% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html            
+%                                                                      
+% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems,   
+% SIAM News, vol 35, no. 1.                                            
+%                                                                      
+% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero        
+% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the
+% main diagonal and the number 1 in all the positions A(i,j) with      
+% |i-j| = 1,2,4,8, . . . ,16384.  What is the (1,1) entry of inv(A)?   
+%                                                                      
+% http://www.siam.org/news/news.php?id=388                             
+%                                                                      
+% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms     
+%-------------------------------------------------------------------------------
+19 19 83
+1 1 3
+2 1 1
+3 1 1
+5 1 1
+9 1 1
+17 1 1
+2 2 5
+3 2 1
+4 2 1
+6 2 1
+10 2 1
+18 2 1
+3 3 7
+4 3 1
+5 3 1
+7 3 1
+11 3 1
+19 3 1
+4 4 11
+5 4 1
+6 4 1
+8 4 1
+12 4 1
+5 5 13
+6 5 1
+7 5 1
+9 5 1
+13 5 1
+6 6 17
+7 6 1
+8 6 1
+10 6 1
+14 6 1
+7 7 19
+8 7 1
+9 7 1
+11 7 1
+15 7 1
+8 8 23
+9 8 1
+10 8 1
+12 8 1
+16 8 1
+9 9 29
+10 9 1
+11 9 1
+13 9 1
+17 9 1
+10 10 31
+11 10 1
+12 10 1
+14 10 1
+18 10 1
+11 11 37
+12 11 1
+13 11 1
+15 11 1
+19 11 1
+12 12 41
+13 12 1
+14 12 1
+16 12 1
+13 13 43
+14 13 1
+15 13 1
+17 13 1
+14 14 47
+15 14 1
+16 14 1
+18 14 1
+15 15 53
+16 15 1
+17 15 1
+19 15 1
+16 16 59
+17 16 1
+18 16 1
+17 17 61
+18 17 1
+19 17 1
+18 18 67
+19 18 1
+19 19 71
diff --git a/examples/iterative-refinement/doc/builds-on b/examples/iterative-refinement/doc/builds-on
new file mode 100644
index 00000000000..369aa997770
--- /dev/null
+++ b/examples/iterative-refinement/doc/builds-on
@@ -0,0 +1 @@
+simple-solver
diff --git a/examples/iterative-refinement/doc/intro.dox b/examples/iterative-refinement/doc/intro.dox
new file mode 100644
index 00000000000..049c0f24cc7
--- /dev/null
+++ b/examples/iterative-refinement/doc/intro.dox
@@ -0,0 +1,8 @@
+<a name="Iterative Refinement"></a>
+<h1>This example shows how to use the iterative refinement solver.</h1>
+
+<h3> In this example, we first read in a matrix from file, then generate a
+right-hand side and an initial guess. An inaccurate CG solver is used as the
+inner solver to an iterative refinement (IR) method which solves a linear
+system. The example features the iteration count and runtime of the IR solver.
+</h3>
diff --git a/examples/iterative-refinement/doc/kind b/examples/iterative-refinement/doc/kind
new file mode 100644
index 00000000000..15a13db4511
--- /dev/null
+++ b/examples/iterative-refinement/doc/kind
@@ -0,0 +1 @@
+basic
diff --git a/examples/iterative-refinement/doc/results.dox b/examples/iterative-refinement/doc/results.dox
new file mode 100644
index 00000000000..1ee878f6e02
--- /dev/null
+++ b/examples/iterative-refinement/doc/results.dox
@@ -0,0 +1,19 @@
+<h1>Results</h1>
+This is the expected output:
+
+@code{.cpp}
+
+Initial residual norm sqrt(r^T r):
+%%MatrixMarket matrix array real general
+1 1
+194.679
+Final residual norm sqrt(r^T r):
+%%MatrixMarket matrix array real general
+1 1
+4.23821e-11
+IR iteration count:     24
+IR execution time [ms]: 18.0692
+
+@endcode
+
+<h3> Comments about programming and debugging </h3>
diff --git a/examples/iterative-refinement/doc/short-intro b/examples/iterative-refinement/doc/short-intro
new file mode 100644
index 00000000000..a91594e87a5
--- /dev/null
+++ b/examples/iterative-refinement/doc/short-intro
@@ -0,0 +1 @@
+The iterative refinement solver example.
diff --git a/examples/iterative-refinement/doc/tooltip b/examples/iterative-refinement/doc/tooltip
new file mode 100644
index 00000000000..852c8b02e65
--- /dev/null
+++ b/examples/iterative-refinement/doc/tooltip
@@ -0,0 +1 @@
+Use an iterative refinement method in Ginkgo. Solve a linear system.
diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp
new file mode 100644
index 00000000000..b0463db125d
--- /dev/null
+++ b/examples/iterative-refinement/iterative-refinement.cpp
@@ -0,0 +1,149 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+
+int main(int argc, char *argv[])
+{
+    // Some shortcuts
+    using ValueType = double;
+    using IndexType = int;
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using ir = gko::solver::Ir<ValueType>;
+
+    // Print version information
+    std::cout << gko::version_info::get() << std::endl;
+
+    // Figure out where to run the code
+    std::shared_ptr<gko::Executor> exec;
+    if (argc == 1 || std::string(argv[1]) == "reference") {
+        exec = gko::ReferenceExecutor::create();
+    } else if (argc == 2 && std::string(argv[1]) == "omp") {
+        exec = gko::OmpExecutor::create();
+    } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
+               gko::CudaExecutor::get_num_devices() > 0) {
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else {
+        std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
+        std::exit(-1);
+    }
+
+    // Read data
+    auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
+    // Create RHS and initial guess as 1
+    gko::size_type size = A->get_size()[0];
+    auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));
+    for (gko::size_type i = 0; i < size; i++) {
+        host_x->at(i, 0) = 1.;
+    }
+    auto x = vec::create(exec);
+    auto b = vec::create(exec);
+    x->copy_from(host_x.get());
+    b->copy_from(host_x.get());
+
+    // Calculate initial residual by overwriting b
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto neg_one = gko::initialize<vec>({-1.0}, exec);
+    auto initres = gko::initialize<vec>({0.0}, exec);
+    A->apply(lend(one), lend(x), lend(neg_one), lend(b));
+    b->compute_norm2(lend(initres));
+
+    // copy b again
+    b->copy_from(host_x.get());
+    gko::size_type max_iters = 10000u;
+    gko::remove_complex<ValueType> outer_reduction_factor = 1e-12;
+    auto iter_stop =
+        gko::stop::Iteration::build().with_max_iters(max_iters).on(exec);
+    auto tol_stop = gko::stop::ResidualNormReduction<ValueType>::build()
+                        .with_reduction_factor(outer_reduction_factor)
+                        .on(exec);
+
+    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
+        gko::log::Convergence<ValueType>::create(exec);
+    iter_stop->add_logger(logger);
+    tol_stop->add_logger(logger);
+
+    // Create solver factory
+    gko::remove_complex<ValueType> inner_reduction_factor = 1e-2;
+    auto solver_gen =
+        ir::build()
+            .with_solver(
+                cg::build()
+                    .with_criteria(
+                        gko::stop::ResidualNormReduction<ValueType>::build()
+                            .with_reduction_factor(inner_reduction_factor)
+                            .on(exec))
+                    .on(exec))
+            .with_criteria(gko::share(iter_stop), gko::share(tol_stop))
+            .on(exec);
+    // Create solver
+    auto solver = solver_gen->generate(A);
+
+    // Solve system. Synchronize the executor on both sides of the solve so the
+    // measurement does not miss executor work that may still be in flight.
+    exec->synchronize();
+    std::chrono::nanoseconds time(0);
+    auto tic = std::chrono::steady_clock::now();
+    solver->apply(lend(b), lend(x));
+    exec->synchronize();
+    auto toc = std::chrono::steady_clock::now();
+    time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+
+    // Calculate residual
+    auto res = gko::initialize<vec>({0.0}, exec);
+    A->apply(lend(one), lend(x), lend(neg_one), lend(b));
+    b->compute_norm2(lend(res));
+
+    std::cout << "Initial residual norm sqrt(r^T r): \n";
+    write(std::cout, lend(initres));
+    std::cout << "Final residual norm sqrt(r^T r): \n";
+    write(std::cout, lend(res));
+
+    // Print solver statistics
+    std::cout << "IR iteration count:     " << logger->get_num_iterations()
+              << std::endl;
+    std::cout << "IR execution time [ms]: "
+              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
+}
diff --git a/examples/minimal-cuda-solver/build.sh b/examples/minimal-cuda-solver/build.sh
index 0c75e6c1ef3..422db49149b 100755
--- a/examples/minimal-cuda-solver/build.sh
+++ b/examples/minimal-cuda-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
index 8cabf27db58..1b47f712766 100644
--- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
+++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int main()
 {
     // Instantiate a CUDA executor
-    auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
     // Read data
     auto A = gko::read<gko::matrix::Csr<>>(std::cin, gpu);
     auto b = gko::read<gko::matrix::Dense<>>(std::cin, gpu);
diff --git a/examples/mixed-precision-ir/CMakeLists.txt b/examples/mixed-precision-ir/CMakeLists.txt
new file mode 100644
index 00000000000..e4f81ef2c55
--- /dev/null
+++ b/examples/mixed-precision-ir/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(mixed-precision-ir mixed-precision-ir.cpp)
+target_link_libraries(mixed-precision-ir PRIVATE ginkgo)
+target_include_directories(mixed-precision-ir PRIVATE "${PROJECT_SOURCE_DIR}")
+configure_file(data/A.mtx data/A.mtx COPYONLY)  # copy matrix into build tree
diff --git a/examples/mixed-precision-ir/build.sh b/examples/mixed-precision-ir/build.sh
new file mode 100755
index 00000000000..a73ea3cde18
--- /dev/null
+++ b/examples/mixed-precision-ir/build.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# set up script: the Ginkgo build directory is the only (required) argument
+if [ $# -ne 1 ]; then
+    echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY"
+    exit 1
+fi
+BUILD_DIR=$1
+THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
+
+# copy libraries (combinations that do not exist are skipped silently)
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
+SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
+for prefix in ${LIBRARY_DIRS}; do
+    for name in ${LIBRARY_NAMES}; do
+        for suffix in ${SUFFIXES}; do
+            cp "${BUILD_DIR}/${prefix}/lib${name}${suffix}" \
+                "${THIS_DIR}/lib${name}${suffix}" 2>/dev/null
+        done
+    done
+done
+
+# figure out correct compiler flags (plain names vs. the d-suffixed variants)
+if ls "${THIS_DIR}" | grep -F "libginkgo." >/dev/null; then
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
+else
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
+fi
+if [ -z "${CXX}" ]; then
+    CXX="c++"
+fi
+
+# build (CXX and LINK_FLAGS stay unquoted so they split into separate words)
+${CXX} -std=c++11 -o "${THIS_DIR}/mixed-precision-ir" \
+    "${THIS_DIR}/mixed-precision-ir.cpp" \
+    "-I${THIS_DIR}/../../include" "-I${BUILD_DIR}/include" \
+    "-L${THIS_DIR}" ${LINK_FLAGS}
diff --git a/examples/mixed-precision-ir/data/A.mtx b/examples/mixed-precision-ir/data/A.mtx
new file mode 100644
index 00000000000..c67437da567
--- /dev/null
+++ b/examples/mixed-precision-ir/data/A.mtx
@@ -0,0 +1,114 @@
+%%MatrixMarket matrix coordinate integer symmetric
+%-------------------------------------------------------------------------------
+% UF Sparse Matrix Collection, Tim Davis
+% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b
+% name: JGD_Trefethen/Trefethen_20b
+% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.]
+% id: 2203
+% date: 2008
+% author: N. Trefethen
+% ed: J.-G. Dumas
+% fields: name title A id date author ed kind notes
+% kind: combinatorial problem
+%-------------------------------------------------------------------------------
+% notes:
+% Diagonal matrices with primes, Nick Trefethen, Oxford Univ.          
+% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection,         
+% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html            
+%                                                                      
+% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems,   
+% SIAM News, vol 35, no. 1.                                            
+%                                                                      
+% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero        
+% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the
+% main diagonal and the number 1 in all the positions A(i,j) with      
+% |i-j| = 1,2,4,8, . . . ,16384.  What is the (1,1) entry of inv(A)?   
+%                                                                      
+% http://www.siam.org/news/news.php?id=388                             
+%                                                                      
+% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms     
+%-------------------------------------------------------------------------------
+19 19 83
+1 1 3
+2 1 1
+3 1 1
+5 1 1
+9 1 1
+17 1 1
+2 2 5
+3 2 1
+4 2 1
+6 2 1
+10 2 1
+18 2 1
+3 3 7
+4 3 1
+5 3 1
+7 3 1
+11 3 1
+19 3 1
+4 4 11
+5 4 1
+6 4 1
+8 4 1
+12 4 1
+5 5 13
+6 5 1
+7 5 1
+9 5 1
+13 5 1
+6 6 17
+7 6 1
+8 6 1
+10 6 1
+14 6 1
+7 7 19
+8 7 1
+9 7 1
+11 7 1
+15 7 1
+8 8 23
+9 8 1
+10 8 1
+12 8 1
+16 8 1
+9 9 29
+10 9 1
+11 9 1
+13 9 1
+17 9 1
+10 10 31
+11 10 1
+12 10 1
+14 10 1
+18 10 1
+11 11 37
+12 11 1
+13 11 1
+15 11 1
+19 11 1
+12 12 41
+13 12 1
+14 12 1
+16 12 1
+13 13 43
+14 13 1
+15 13 1
+17 13 1
+14 14 47
+15 14 1
+16 14 1
+18 14 1
+15 15 53
+16 15 1
+17 15 1
+19 15 1
+16 16 59
+17 16 1
+18 16 1
+17 17 61
+18 17 1
+19 17 1
+18 18 67
+19 18 1
+19 19 71
diff --git a/examples/mixed-precision-ir/doc/builds-on b/examples/mixed-precision-ir/doc/builds-on
new file mode 100644
index 00000000000..732380a55b6
--- /dev/null
+++ b/examples/mixed-precision-ir/doc/builds-on
@@ -0,0 +1 @@
+iterative-refinement
diff --git a/examples/mixed-precision-ir/doc/intro.dox b/examples/mixed-precision-ir/doc/intro.dox
new file mode 100644
index 00000000000..167972f3768
--- /dev/null
+++ b/examples/mixed-precision-ir/doc/intro.dox
@@ -0,0 +1,8 @@
+<a name="Mixed Precision Iterative Refinement (MPIR)"></a>
+<h1>This example manually implements a Mixed Precision Iterative Refinement (MPIR) solver.</h1>
+
+<h3> In this example, we first read in a matrix from file, then generate a
+right-hand side and an initial guess. An inaccurate CG solver in single precision
+is used as the inner solver to an iterative refinement (IR) method in double
+precision, which solves a linear system.
+</h3>
diff --git a/examples/mixed-precision-ir/doc/kind b/examples/mixed-precision-ir/doc/kind
new file mode 100644
index 00000000000..c1d9154931a
--- /dev/null
+++ b/examples/mixed-precision-ir/doc/kind
@@ -0,0 +1 @@
+techniques
diff --git a/examples/mixed-precision-ir/doc/results.dox b/examples/mixed-precision-ir/doc/results.dox
new file mode 100644
index 00000000000..93abb0f7519
--- /dev/null
+++ b/examples/mixed-precision-ir/doc/results.dox
@@ -0,0 +1,19 @@
+<h1>Results</h1>
+This is the expected output:
+
+@code{.cpp}
+
+Initial residual norm sqrt(r^T r): 
+%%MatrixMarket matrix array real general
+1 1
+194.679
+Final residual norm sqrt(r^T r): 
+%%MatrixMarket matrix array real general
+1 1
+1.22728e-10
+MPIR iteration count:     25
+MPIR execution time [ms]: 18.0933
+
+@endcode
+
+<h3> Comments about programming and debugging </h3>
diff --git a/examples/mixed-precision-ir/doc/short-intro b/examples/mixed-precision-ir/doc/short-intro
new file mode 100644
index 00000000000..df19909cc80
--- /dev/null
+++ b/examples/mixed-precision-ir/doc/short-intro
@@ -0,0 +1 @@
+The Mixed Precision Iterative Refinement (MPIR) solver example.
diff --git a/examples/mixed-precision-ir/doc/tooltip b/examples/mixed-precision-ir/doc/tooltip
new file mode 100644
index 00000000000..b0cce88707b
--- /dev/null
+++ b/examples/mixed-precision-ir/doc/tooltip
@@ -0,0 +1 @@
+Manually implement a Mixed Precision Iterative Refinement (MPIR) method in Ginkgo. Solve a linear system.
diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp
new file mode 100644
index 00000000000..395b553f6c9
--- /dev/null
+++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp
@@ -0,0 +1,177 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+
+int main(int argc, char *argv[])
+{
+    // Type shortcuts: double is the outer precision, float the inner one
+    using ValueType = double;
+    using SolverType = float;
+    using IndexType = int;
+    using vec = gko::matrix::Dense<ValueType>;
+    using solver_vec = gko::matrix::Dense<SolverType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using solver_mtx = gko::matrix::Csr<SolverType, IndexType>;
+    using cg = gko::solver::Cg<SolverType>;
+
+    gko::size_type max_outer_iters = 100u;  // limit on refinement steps
+    gko::size_type max_inner_iters = 100u;  // limit on CG steps per solve
+    gko::remove_complex<ValueType> outer_reduction_factor = 1e-12;
+    gko::remove_complex<SolverType> inner_reduction_factor = 1e-2;
+
+    // Print version information
+    std::cout << gko::version_info::get() << std::endl;
+
+    // Figure out where to run the code
+    std::shared_ptr<gko::Executor> exec;
+    if (argc == 1 || std::string(argv[1]) == "reference") {
+        exec = gko::ReferenceExecutor::create();
+    } else if (argc == 2 && std::string(argv[1]) == "omp") {
+        exec = gko::OmpExecutor::create();
+    } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
+               gko::CudaExecutor::get_num_devices() > 0) {
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else {
+        std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
+        std::exit(-1);
+    }
+
+    // Read data
+    auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
+    // Create RHS and initial guess, both filled with ones
+    gko::size_type size = A->get_size()[0];
+    auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));
+    for (gko::size_type i = 0; i < size; i++) {
+        host_x->at(i, 0) = 1.;
+    }
+    auto x = vec::create(exec);
+    auto b = vec::create(exec);
+    x->copy_from(host_x.get());
+    b->copy_from(host_x.get());
+
+    // Calculate initial residual b - A * x by overwriting b
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto neg_one = gko::initialize<vec>({-1.0}, exec);
+    auto initres_vec = gko::initialize<vec>({0.0}, exec);
+    A->apply(lend(neg_one), lend(x), lend(one), lend(b));
+    b->compute_norm2(lend(initres_vec));
+
+    // Build lower-precision system matrix and the initial outer residual
+    auto solver_A = solver_mtx::create(exec);
+    auto inner_residual = solver_vec::create(exec);
+    auto outer_residual = vec::create(exec);
+    A->convert_to(lend(solver_A));
+    b->convert_to(lend(outer_residual));
+
+    // restore b, which was overwritten with the initial residual above
+    b->copy_from(host_x.get());
+
+    // Create inner solver
+    auto inner_solver =
+        cg::build()
+            .with_criteria(gko::stop::ResidualNormReduction<SolverType>::build()
+                               .with_reduction_factor(inner_reduction_factor)
+                               .on(exec),
+                           gko::stop::Iteration::build()
+                               .with_max_iters(max_inner_iters)
+                               .on(exec))
+            .on(exec)
+            ->generate(give(solver_A));
+
+    // Solve system
+    exec->synchronize();
+    std::chrono::nanoseconds time(0);
+    auto res_vec = gko::initialize<vec>({0.0}, exec);
+    auto initres = exec->copy_val_to_host(initres_vec->get_const_values());
+    auto inner_solution = solver_vec::create(exec);
+    auto outer_delta = vec::create(exec);
+    auto tic = std::chrono::steady_clock::now();
+    int iter = -1;
+    while (true) {
+        ++iter;
+
+        // convert residual to inner precision
+        outer_residual->convert_to(lend(inner_residual));
+        outer_residual->compute_norm2(lend(res_vec));
+        auto res = exec->copy_val_to_host(res_vec->get_const_values());
+
+        // break once the iteration limit is reached or we have converged
+        if (iter >= max_outer_iters || res / initres < outer_reduction_factor) {
+            break;
+        }
+
+        // Use the inner solver to solve
+        // A * inner_solution = inner_residual
+        // with residual as initial guess.
+        inner_solution->copy_from(lend(inner_residual));
+        inner_solver->apply(lend(inner_residual), lend(inner_solution));
+
+        // convert inner solution to outer precision
+        inner_solution->convert_to(lend(outer_delta));
+
+        // x = x + inner_solution
+        x->add_scaled(lend(one), lend(outer_delta));
+
+        // residual = b - A * x
+        outer_residual->copy_from(lend(b));
+        A->apply(lend(neg_one), lend(x), lend(one), lend(outer_residual));
+    }
+
+    auto toc = std::chrono::steady_clock::now();
+    time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+
+    // Calculate final residual A * x - b in b; only its norm is reported
+    A->apply(lend(one), lend(x), lend(neg_one), lend(b));
+    b->compute_norm2(lend(res_vec));
+
+    std::cout << "Initial residual norm sqrt(r^T r): \n";
+    write(std::cout, lend(initres_vec));
+    std::cout << "Final residual norm sqrt(r^T r): \n";
+    write(std::cout, lend(res_vec));
+
+    // Print solver statistics
+    std::cout << "MPIR iteration count:     " << iter << std::endl;
+    std::cout << "MPIR execution time [ms]: "
+              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
+}
diff --git a/examples/nine-pt-stencil-solver/build.sh b/examples/nine-pt-stencil-solver/build.sh
old mode 100644
new mode 100755
index 6f5a4dfdb0c..79af2d5055b
--- a/examples/nine-pt-stencil-solver/build.sh
+++ b/examples/nine-pt-stencil-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
index 6600e7291ba..e181c9a563a 100644
--- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
+++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -83,7 +83,7 @@ constexpr double default_alpha = 10.0 / 3.0;
 constexpr double default_beta = -2.0 / 3.0;
 constexpr double default_gamma = -1.0 / 6.0;
 
-/* Possible alternative default values are for example
+/* Possible alternative default values are
  * default_alpha = 8.0;
  * default_beta = -1.0;
  * default_gamma = -1.0;
@@ -91,18 +91,20 @@ constexpr double default_gamma = -1.0 / 6.0;
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
-void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs,
-                             double *values, double *coefs)
+template <typename ValueType, typename IndexType>
+void generate_stencil_matrix(IndexType dp, IndexType *row_ptrs,
+                             IndexType *col_idxs, ValueType *values,
+                             ValueType *coefs)
 {
-    int pos = 0;
+    IndexType pos = 0;
     const size_t dp_2 = dp * dp;
     row_ptrs[0] = pos;
-    for (int k = 0; k < dp; ++k) {
-        for (int i = 0; i < dp; ++i) {
+    for (IndexType k = 0; k < dp; ++k) {
+        for (IndexType i = 0; i < dp; ++i) {
             const size_t index = i + k * dp;
-            for (int j = -1; j <= 1; ++j) {
-                for (int l = -1; l <= 1; ++l) {
-                    const int64_t offset = l + 1 + 3 * (j + 1);
+            for (IndexType j = -1; j <= 1; ++j) {
+                for (IndexType l = -1; l <= 1; ++l) {
+                    const IndexType offset = l + 1 + 3 * (j + 1);
                     if ((k + j) >= 0 && (k + j) < dp && (i + l) >= 0 &&
                         (i + l) < dp) {
                         values[pos] = coefs[offset];
@@ -118,15 +120,17 @@ void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs,
 
 
 // Generates the RHS vector given `f` and the boundary conditions.
-template <typename Closure, typename ClosureT>
-void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
+template <typename Closure, typename ClosureT, typename ValueType,
+          typename IndexType>
+void generate_rhs(IndexType dp, Closure f, ClosureT u, ValueType *rhs,
+                  ValueType *coefs)
 {
     const size_t dp_2 = dp * dp;
-    const auto h = 1.0 / (dp + 1.0);
-    for (int i = 0; i < dp; ++i) {
-        const auto yi = (i + 1) * h;
-        for (int j = 0; j < dp; ++j) {
-            const auto xi = (j + 1) * h;
+    const ValueType h = 1.0 / (dp + 1.0);
+    for (IndexType i = 0; i < dp; ++i) {
+        const auto yi = ValueType(i + 1) * h;
+        for (IndexType j = 0; j < dp; ++j) {
+            const auto xi = ValueType(j + 1) * h;
             const auto index = i * dp + j;
             rhs[index] = -f(xi, yi) * h * h;
         }
@@ -135,7 +139,7 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
     // Iterating over the edges to add boundary values
     // and adding the overlapping 3x1 to the rhs
     for (size_t i = 0; i < dp; ++i) {
-        const auto xi = (i + 1) * h;
+        const auto xi = ValueType(i + 1) * h;
         const auto index_top = i;
         const auto index_bot = i + dp * (dp - 1);
 
@@ -148,7 +152,7 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
         rhs[index_bot] -= u(xi + h, 1.0) * coefs[8];
     }
     for (size_t i = 0; i < dp; ++i) {
-        const auto yi = (i + 1) * h;
+        const auto yi = ValueType(i + 1) * h;
         const auto index_left = i * dp;
         const auto index_right = i * dp + (dp - 1);
 
@@ -170,10 +174,11 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
 
 
 // Prints the solution `u`.
-void print_solution(int dp, const double *u)
+template <typename ValueType, typename IndexType>
+void print_solution(IndexType dp, const ValueType *u)
 {
-    for (int i = 0; i < dp; ++i) {
-        for (int j = 0; j < dp; ++j) {
+    for (IndexType i = 0; i < dp; ++i) {
+        for (IndexType j = 0; j < dp; ++j) {
             std::cout << u[i * dp + j] << ' ';
         }
         std::cout << '\n';
@@ -184,16 +189,17 @@ void print_solution(int dp, const double *u)
 
 // Computes the 1-norm of the error given the computed `u` and the correct
 // solution function `correct_u`.
-template <typename Closure>
-double calculate_error(int dp, const double *u, Closure correct_u)
+template <typename Closure, typename ValueType, typename IndexType>
+gko::remove_complex<ValueType> calculate_error(IndexType dp, const ValueType *u,
+                                               Closure correct_u)
 {
-    const auto h = 1.0 / (dp + 1);
-    auto error = 0.0;
-    for (int j = 0; j < dp; ++j) {
-        const auto xi = (j + 1) * h;
-        for (int i = 0; i < dp; ++i) {
+    const ValueType h = 1.0 / (dp + 1);
+    gko::remove_complex<ValueType> error = 0.0;
+    for (IndexType j = 0; j < dp; ++j) {
+        const auto xi = ValueType(j + 1) * h;
+        for (IndexType i = 0; i < dp; ++i) {
             using std::abs;
-            const auto yi = (i + 1) * h;
+            const auto yi = ValueType(i + 1) * h;
             error +=
                 abs(u[i * dp + j] - correct_u(xi, yi)) / abs(correct_u(xi, yi));
         }
@@ -202,26 +208,28 @@ double calculate_error(int dp, const double *u, Closure correct_u)
 }
 
 
+template <typename ValueType, typename IndexType>
 void solve_system(const std::string &executor_string,
-                  unsigned int discretization_points, int *row_ptrs,
-                  int *col_idxs, double *values, double *rhs, double *u,
-                  double accuracy)
+                  unsigned int discretization_points, IndexType *row_ptrs,
+                  IndexType *col_idxs, ValueType *values, ValueType *rhs,
+                  ValueType *u, gko::remove_complex<ValueType> reduction_factor)
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<double>;
-    using mtx = gko::matrix::Csr<double, int>;
-    using cg = gko::solver::Cg<double>;
-    using bj = gko::preconditioner::Jacobi<double, int>;
-    using val_array = gko::Array<double>;
-    using idx_array = gko::Array<int>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
+    using val_array = gko::Array<ValueType>;
+    using idx_array = gko::Array<IndexType>;
     const auto &dp = discretization_points;
-    const size_t dp_2 = dp * dp;
+    const gko::size_type dp_2 = dp * dp;
 
     // Figure out where to run the code
     const auto omp = gko::OmpExecutor::create();
     std::map<std::string, std::shared_ptr<gko::Executor>> exec_map{
         {"omp", omp},
-        {"cuda", gko::CudaExecutor::create(0, omp)},
+        {"cuda", gko::CudaExecutor::create(0, omp, true)},
+        {"hip", gko::HipExecutor::create(0, omp, true)},
         {"reference", gko::ReferenceExecutor::create()}};
     // executor where Ginkgo will perform the computation
     const auto exec = exec_map.at(executor_string);  // throws if not valid
@@ -263,8 +271,8 @@ void solve_system(const std::string &executor_string,
         cg::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(dp_2).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(accuracy)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             .with_preconditioner(bj::build().on(exec))
             .on(exec);
@@ -283,17 +291,19 @@ int main(int argc, char *argv[])
                   << std::endl;
         std::exit(-1);
     }
+    using ValueType = double;
+    using IndexType = int;
 
     const int discretization_points = argc >= 2 ? std::atoi(argv[1]) : 100;
     const auto executor_string = argc >= 3 ? argv[2] : "reference";
-    const double alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha;
-    const double beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta;
-    const double gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma;
+    const ValueType alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha;
+    const ValueType beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta;
+    const ValueType gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma;
 
     // clang-format off
-    std::array<double, 9> coefs{
+    std::array<ValueType, 9> coefs{
         gamma_c, beta_c, gamma_c,
-	beta_c, alpha_c, beta_c,
+	      beta_c, alpha_c, beta_c,
         gamma_c, beta_c, gamma_c};
     // clang-format on
 
@@ -301,38 +311,45 @@ int main(int argc, char *argv[])
     const size_t dp_2 = dp * dp;
 
     // problem:
-    auto correct_u = [](double x, double y) { return x * x * x + y * y * y; };
-    auto f = [](double x, double y) { return 6 * x + 6 * y; };
+    auto correct_u = [](ValueType x, ValueType y) {
+        return x * x * x + y * y * y;
+    };
+    auto f = [](ValueType x, ValueType y) {
+        return ValueType(6) * x + ValueType(6) * y;
+    };
 
     // matrix
-    std::vector<int> row_ptrs(dp_2 + 1);
-    std::vector<int> col_idxs((3 * dp - 2) * (3 * dp - 2));
-    std::vector<double> values((3 * dp - 2) * (3 * dp - 2));
+    std::vector<IndexType> row_ptrs(dp_2 + 1);
+    std::vector<IndexType> col_idxs((3 * dp - 2) * (3 * dp - 2));
+    std::vector<ValueType> values((3 * dp - 2) * (3 * dp - 2));
     // right hand side
-    std::vector<double> rhs(dp_2);
+    std::vector<ValueType> rhs(dp_2);
     // solution
-    std::vector<double> u(dp_2, 0.0);
+    std::vector<ValueType> u(dp_2, 0.0);
 
     generate_stencil_matrix(dp, row_ptrs.data(), col_idxs.data(), values.data(),
                             coefs.data());
     // looking for solution u = x^3: f = 6x, u(0) = 0, u(1) = 1
     generate_rhs(dp, f, correct_u, rhs.data(), coefs.data());
 
-    auto start_time = std::chrono::steady_clock::now();
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
 
+    auto start_time = std::chrono::steady_clock::now();
     solve_system(executor_string, dp, row_ptrs.data(), col_idxs.data(),
-                 values.data(), rhs.data(), u.data(), 1e-12);
-
+                 values.data(), rhs.data(), u.data(), reduction_factor);
     auto stop_time = std::chrono::steady_clock::now();
-    double runtime_duration =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time -
-                                                             start_time)
-            .count() *
+    auto runtime_duration =
+        static_cast<double>(
+            std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time -
+                                                                 start_time)
+                .count()) *
         1e-6;
 
     print_solution(dp, u.data());
     std::cout << "The average relative error is "
-              << calculate_error(dp, u.data(), correct_u) / dp_2 << std::endl;
+              << calculate_error(dp, u.data(), correct_u) /
+                     static_cast<gko::remove_complex<ValueType>>(dp_2)
+              << std::endl;
     std::cout << "The runtime is " << std::to_string(runtime_duration) << " ms"
               << std::endl;
 }
diff --git a/examples/papi-logging/build.sh b/examples/papi-logging/build.sh
index 77a12b42db4..050fceca7ce 100755
--- a/examples/papi-logging/build.sh
+++ b/examples/papi-logging/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lpapi -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lpapi -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lpapi -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lpapi -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp
index 2931eddff42..d4cfef40438 100644
--- a/examples/papi-logging/papi-logging.cpp
+++ b/examples/papi-logging/papi-logging.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -126,9 +126,12 @@ void print_papi_counters(int eventset)
 int main(int argc, char *argv[])
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<>;
-    using mtx = gko::matrix::Csr<>;
-    using cg = gko::solver::Cg<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
 
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
@@ -141,7 +144,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
@@ -153,12 +159,13 @@ int main(int argc, char *argv[])
     auto x = gko::read<vec>(std::ifstream("data/x0.mtx"), exec);
 
     // Generate solver
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     auto solver_gen =
         cg::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-20)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             .on(exec);
     auto solver = solver_gen->generate(A);
@@ -172,7 +179,7 @@ int main(int argc, char *argv[])
 
 
     // Create a PAPI logger and add it to relevant LinOps
-    auto logger = gko::log::Papi<>::create(
+    auto logger = gko::log::Papi<ValueType>::create(
         exec, gko::log::Logger::linop_apply_completed_mask |
                   gko::log::Logger::linop_advanced_apply_completed_mask);
     solver->add_logger(logger);
diff --git a/examples/performance-debugging/build.sh b/examples/performance-debugging/build.sh
index 35c20208d1c..70ed26bcc01 100755
--- a/examples/performance-debugging/build.sh
+++ b/examples/performance-debugging/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/performance-debugging/doc/results.dox b/examples/performance-debugging/doc/results.dox
index 57f7555981e..e6299ad72bf 100644
--- a/examples/performance-debugging/doc/results.dox
+++ b/examples/performance-debugging/doc/results.dox
@@ -35,7 +35,7 @@ Apply operations times (ns):
         dense::compute_dot#3: 28548
         dense::compute_norm2#2: 45677
         free: 25109
-        residual_norm_reduction::residual_norm_reduction#9: 10617
+        residual_norm::residual_norm#9: 10617
 Recurrent Residual Norms:
 [
         4.3589
diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp
index 1ed7f815a49..c5a35383447 100644
--- a/examples/performance-debugging/performance-debugging.cpp
+++ b/examples/performance-debugging/performance-debugging.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -70,14 +70,14 @@ std::unique_ptr<vec<ValueType>> create_vector(
 
 // utilities for computing norms and residuals
 template <typename ValueType>
-double get_norm(const vec<ValueType> *norm)
+gko::remove_complex<ValueType> get_norm(const vec<ValueType> *norm)
 {
-    return clone(norm->get_executor()->get_master(), norm)->at(0, 0);
+    return std::real(clone(norm->get_executor()->get_master(), norm)->at(0, 0));
 }
 
 
 template <typename ValueType>
-double compute_norm(const vec<ValueType> *b)
+gko::remove_complex<ValueType> compute_norm(const vec<ValueType> *b)
 {
     auto exec = b->get_executor();
     auto b_norm = gko::initialize<vec<ValueType>>({0.0}, exec);
@@ -87,8 +87,9 @@ double compute_norm(const vec<ValueType> *b)
 
 
 template <typename ValueType>
-double compute_residual_norm(const gko::LinOp *system_matrix,
-                             const vec<ValueType> *b, const vec<ValueType> *x)
+gko::remove_complex<ValueType> compute_residual_norm(
+    const gko::LinOp *system_matrix, const vec<ValueType> *b,
+    const vec<ValueType> *x)
 {
     auto exec = system_matrix->get_executor();
     auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
@@ -324,7 +325,8 @@ void print_usage(const char *filename)
 }
 
 
-void print_vector(const gko::matrix::Dense<> *vec)
+template <typename ValueType>
+void print_vector(const gko::matrix::Dense<ValueType> *vec)
 {
     auto elements_to_print = std::min(gko::size_type(10), vec->get_size()[0]);
     std::cout << "[" << std::endl;
@@ -342,23 +344,22 @@ int main(int argc, char *argv[])
 {
     // Parametrize the benchmark here
     // Pick a value type
-    using vtype = double;
+    using ValueType = double;
+    using IndexType = int;
     // Pick a matrix format
-    using mtx = gko::matrix::Csr<vtype>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
     // Pick a solver
-    using solver = gko::solver::Cg<vtype>;
+    using solver = gko::solver::Cg<ValueType>;
     // Pick a preconditioner type
-    using preconditioner = gko::matrix::IdentityFactory<vtype>;
+    using preconditioner = gko::matrix::IdentityFactory<ValueType>;
     // Pick a residual norm reduction value
-    auto reduction_factor = 1e-8;
-    // Pick a maximum iteration count
-    auto max_iters = 2000u;
+    const gko::remove_complex<ValueType> reduction_factor = 1e-12;
     // Pick an output file name
-    auto of_name = "log.txt";
+    const auto of_name = "log.txt";
 
 
     // Simple shortcut
-    using vec = gko::matrix::Dense<vtype>;
+    using vec = gko::matrix::Dense<ValueType>;
 
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
@@ -371,7 +372,7 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc > 1 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         print_usage(argv[0]);
     }
@@ -392,16 +393,18 @@ int main(int argc, char *argv[])
     // Remove the storage logger
     exec->remove_logger(gko::lend(storage_logger));
 
+    // Pick a maximum iteration count
+    const auto max_iters = A->get_size()[0];
     // Generate b and x vectors
-    auto b = utils::create_vector<vtype>(exec, A->get_size()[0], 1.0);
-    auto x = utils::create_vector<vtype>(exec, A->get_size()[0], 0.0);
+    auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
+    auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
 
     // Declare the solver factory. The preconditioner's arguments should be
     // adapted if needed.
     auto solver_factory =
         solver::build()
             .with_criteria(
-                gko::stop::ResidualNormReduction<vtype>::build()
+                gko::stop::ResidualNormReduction<ValueType>::build()
                     .with_reduction_factor(reduction_factor)
                     .on(exec),
                 gko::stop::Iteration::build().with_max_iters(max_iters).on(
@@ -461,9 +464,6 @@ int main(int argc, char *argv[])
 
     // Log the internal operations using the OperationLogger without timing
     {
-        // Clone x to not overwrite the original one
-        auto x_clone = gko::clone(x);
-
         // Create an OperationLogger to analyze the generate step
         auto gen_logger = std::make_shared<loggers::OperationLogger>(exec);
         // Add the generate logger to the executor
@@ -480,11 +480,11 @@ int main(int argc, char *argv[])
         auto apply_logger = std::make_shared<loggers::OperationLogger>(exec);
         exec->add_logger(apply_logger);
         // Create a ResidualLogger to log the recurent residual
-        auto res_logger = std::make_shared<loggers::ResidualLogger<vtype>>(
+        auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
             exec, gko::lend(A), gko::lend(b));
         generated_solver->add_logger(res_logger);
         // Solve the system
-        generated_solver->apply(gko::lend(b), gko::lend(x_clone));
+        generated_solver->apply(gko::lend(b), gko::lend(x));
         exec->remove_logger(gko::lend(apply_logger));
         // Write the data to the output file
         output_file << "Apply operations times (ns):" << std::endl;
diff --git a/examples/poisson-solver/build.sh b/examples/poisson-solver/build.sh
index 6a0a0c40515..09ecd8ce987 100755
--- a/examples/poisson-solver/build.sh
+++ b/examples/poisson-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp
index 0539b2f0ac6..6abb2a52560 100644
--- a/examples/poisson-solver/poisson-solver.cpp
+++ b/examples/poisson-solver/poisson-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,14 +39,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
-void generate_stencil_matrix(gko::matrix::Csr<> *matrix)
+template <typename ValueType, typename IndexType>
+void generate_stencil_matrix(gko::matrix::Csr<ValueType, IndexType> *matrix)
 {
     const auto discretization_points = matrix->get_size()[0];
     auto row_ptrs = matrix->get_row_ptrs();
     auto col_idxs = matrix->get_col_idxs();
     auto values = matrix->get_values();
     int pos = 0;
-    const double coefs[] = {-1, 2, -1};
+    const ValueType coefs[] = {-1, 2, -1};
     row_ptrs[0] = pos;
     for (int i = 0; i < discretization_points; ++i) {
         for (auto ofs : {-1, 0, 1}) {
@@ -62,14 +63,15 @@ void generate_stencil_matrix(gko::matrix::Csr<> *matrix)
 
 
 // Generates the RHS vector given `f` and the boundary conditions.
-template <typename Closure>
-void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs)
+template <typename Closure, typename ValueType>
+void generate_rhs(Closure f, ValueType u0, ValueType u1,
+                  gko::matrix::Dense<ValueType> *rhs)
 {
     const auto discretization_points = rhs->get_size()[0];
     auto values = rhs->get_values();
-    const auto h = 1.0 / (discretization_points + 1);
-    for (int i = 0; i < discretization_points; ++i) {
-        const auto xi = (i + 1) * h;
+    const ValueType h = 1.0 / static_cast<ValueType>(discretization_points + 1);
+    for (gko::size_type i = 0; i < discretization_points; ++i) {
+        const auto xi = static_cast<ValueType>(i + 1) * h;
         values[i] = -f(xi) * h * h;
     }
     values[0] += u0;
@@ -78,7 +80,9 @@ void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs)
 
 
 // Prints the solution `u`.
-void print_solution(double u0, double u1, const gko::matrix::Dense<> *u)
+template <typename Closure, typename ValueType>
+void print_solution(ValueType u0, ValueType u1,
+                    const gko::matrix::Dense<ValueType> *u)
 {
     std::cout << u0 << '\n';
     for (int i = 0; i < u->get_size()[0]; ++i) {
@@ -90,15 +94,16 @@ void print_solution(double u0, double u1, const gko::matrix::Dense<> *u)
 
 // Computes the 1-norm of the error given the computed `u` and the correct
 // solution function `correct_u`.
-template <typename Closure>
-double calculate_error(int discretization_points, const gko::matrix::Dense<> *u,
-                       Closure correct_u)
+template <typename Closure, typename ValueType>
+gko::remove_complex<ValueType> calculate_error(
+    int discretization_points, const gko::matrix::Dense<ValueType> *u,
+    Closure correct_u)
 {
-    const auto h = 1.0 / (discretization_points + 1);
+    const ValueType h = 1.0 / static_cast<ValueType>(discretization_points + 1);
     auto error = 0.0;
     for (int i = 0; i < discretization_points; ++i) {
         using std::abs;
-        const auto xi = (i + 1) * h;
+        const auto xi = static_cast<ValueType>(i + 1) * h;
         error +=
             abs(u->get_const_values()[i] - correct_u(xi)) / abs(correct_u(xi));
     }
@@ -109,10 +114,13 @@ double calculate_error(int discretization_points, const gko::matrix::Dense<> *u,
 int main(int argc, char *argv[])
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<double>;
-    using mtx = gko::matrix::Csr<double, int>;
-    using cg = gko::solver::Cg<double>;
-    using bj = gko::preconditioner::Jacobi<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
 
     if (argc < 2) {
         std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]"
@@ -129,7 +137,8 @@ int main(int argc, char *argv[])
     const auto omp = gko::OmpExecutor::create();
     std::map<std::string, std::shared_ptr<gko::Executor>> exec_map{
         {"omp", omp},
-        {"cuda", gko::CudaExecutor::create(0, omp)},
+        {"cuda", gko::CudaExecutor::create(0, omp, true)},
+        {"hip", gko::HipExecutor::create(0, omp, true)},
         {"reference", gko::ReferenceExecutor::create()}};
 
     // executor where Ginkgo will perform the computation
@@ -138,8 +147,8 @@ int main(int argc, char *argv[])
     const auto app_exec = exec_map["omp"];
 
     // problem:
-    auto correct_u = [](double x) { return x * x * x; };
-    auto f = [](double x) { return 6 * x; };
+    auto correct_u = [](ValueType x) { return x * x * x; };
+    auto f = [](ValueType x) { return ValueType(6) * x; };
     auto u0 = correct_u(0);
     auto u1 = correct_u(1);
 
@@ -154,22 +163,24 @@ int main(int argc, char *argv[])
         u->get_values()[i] = 0.0;
     }
 
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     // Generate solver and solve the system
     cg::build()
         .with_criteria(gko::stop::Iteration::build()
                            .with_max_iters(discretization_points)
                            .on(exec),
-                       gko::stop::ResidualNormReduction<>::build()
-                           .with_reduction_factor(1e-6)
+                       gko::stop::ResidualNormReduction<ValueType>::build()
+                           .with_reduction_factor(reduction_factor)
                            .on(exec))
         .with_preconditioner(bj::build().on(exec))
         .on(exec)
         ->generate(clone(exec, matrix))  // copy the matrix to the executor
         ->apply(lend(rhs), lend(u));
 
-    print_solution(u0, u1, lend(u));
+    print_solution<ValueType>(u0, u1, lend(u));
     std::cout << "The average relative error is "
               << calculate_error(discretization_points, lend(u), correct_u) /
-                     discretization_points
+                     static_cast<gko::remove_complex<ValueType>>(
+                         discretization_points)
               << std::endl;
 }
diff --git a/examples/preconditioned-solver/build.sh b/examples/preconditioned-solver/build.sh
index efbbb19aadc..5022e339ae5 100755
--- a/examples/preconditioned-solver/build.sh
+++ b/examples/preconditioned-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp
index db771e94af8..531f803fd80 100644
--- a/examples/preconditioned-solver/preconditioned-solver.cpp
+++ b/examples/preconditioned-solver/preconditioned-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,10 +42,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int main(int argc, char *argv[])
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<>;
-    using mtx = gko::matrix::Csr<>;
-    using cg = gko::solver::Cg<>;
-    using bj = gko::preconditioner::Jacobi<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
 
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
@@ -58,7 +61,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
@@ -69,13 +75,14 @@ int main(int argc, char *argv[])
     auto b = gko::read<vec>(std::ifstream("data/b.mtx"), exec);
     auto x = gko::read<vec>(std::ifstream("data/x0.mtx"), exec);
 
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     // Create solver factory
     auto solver_gen =
         cg::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-20)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             // Add preconditioner, these 2 lines are the only
             // difference from the simple solver example
diff --git a/examples/simple-solver-logging/build.sh b/examples/simple-solver-logging/build.sh
index 5062f0faf03..0a664512469 100755
--- a/examples/simple-solver-logging/build.sh
+++ b/examples/simple-solver-logging/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/simple-solver-logging/doc/results.dox b/examples/simple-solver-logging/doc/results.dox
index dbb01d1a63c..98dffea7d6a 100644
--- a/examples/simple-solver-logging/doc/results.dox
+++ b/examples/simple-solver-logging/doc/results.dox
@@ -49,9 +49,9 @@ gko::ReferenceExecutor,0x55ae09d8f2a0]
 [LOG] >>> check started for stop::Criterion[gko::stop::ResidualNormReduction<double>,0x55ae09d99260] at iteration 0 with ID 1 and finalized set to 1
 [LOG] >>> Operation[gko::matrix::dense::compute_norm2_operation<gko::matrix::Dense<double> const*, gko::matrix::Dense<double>*>,0x7ffcab765740] started on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0]
 [LOG] >>> Operation[gko::matrix::dense::compute_norm2_operation<gko::matrix::Dense<double> const*, gko::matrix::Dense<double>*>,0x7ffcab765740] completed on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0]
-[LOG] >>> Operation[gko::stop::residual_norm_reduction::residual_norm_reduction_operation<gko::matrix::Dense<double> const*&, gko::matrix::Dense<double>*, double&, unsigned char&, bool&, gko::Array<gko::stopping_status>*&, gko::Array<bool>*, bool*, bool*&>,0x7ffcab765980]
+[LOG] >>> Operation[gko::stop::residual_norm::residual_norm_operation<gko::matrix::Dense<double> const*&, gko::matrix::Dense<double>*, double&, unsigned char&, bool&, gko::Array<gko::stopping_status>*&, gko::Array<bool>*, bool*, bool*&>,0x7ffcab765980]
  started on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0]
-[LOG] >>> Operation[gko::stop::residual_norm_reduction::residual_norm_reduction_operation<gko::matrix::Dense<double> const*&, gko::matrix::Dense<double>*, double&, unsigned char&, bool&, gko::Array<gko::stopping_status>*&, gko::Array<bool>*, bool*, bool*&>,0x7ffcab765980]
+[LOG] >>> Operation[gko::stop::residual_norm::residual_norm_operation<gko::matrix::Dense<double> const*&, gko::matrix::Dense<double>*, double&, unsigned char&, bool&, gko::Array<gko::stopping_status>*&, gko::Array<bool>*, bool*, bool*&>,0x7ffcab765980]
  completed on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0]
 [LOG] >>> check completed for stop::Criterion[gko::stop::ResidualNormReduction<double>,0x55ae09d99260] at iteration 0 with ID 1 and finalized set to 1. It changed one RHS 0, stopped the iteration process 0
 [LOG] >>> allocation started on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0] with Bytes[152]
diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp
index 2fccdb65de9..3e48bcdacda 100644
--- a/examples/simple-solver-logging/simple-solver-logging.cpp
+++ b/examples/simple-solver-logging/simple-solver-logging.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,7 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
-void print_vector(const std::string &name, const gko::matrix::Dense<> *vec)
+template <typename ValueType>
+void print_vector(const std::string &name,
+                  const gko::matrix::Dense<ValueType> *vec)
 {
     std::cout << name << " = [" << std::endl;
     for (int i = 0; i < vec->get_size()[0]; ++i) {
@@ -58,9 +60,12 @@ void print_vector(const std::string &name, const gko::matrix::Dense<> *vec)
 int main(int argc, char *argv[])
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<>;
-    using mtx = gko::matrix::Csr<>;
-    using cg = gko::solver::Cg<>;
+    using ValueType = double;
+    using IndexType = int;
+
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
 
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
@@ -73,7 +78,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
@@ -88,8 +96,8 @@ int main(int argc, char *argv[])
     // file. We log all events except for all linop factory and polymorphic
     // object events. Events masks are group of events which are provided
     // for convenience.
-    std::shared_ptr<gko::log::Stream<>> stream_logger =
-        gko::log::Stream<>::create(
+    std::shared_ptr<gko::log::Stream<ValueType>> stream_logger =
+        gko::log::Stream<ValueType>::create(
             exec,
             gko::log::Logger::all_events_mask ^
                 gko::log::Logger::linop_factory_events_mask ^
@@ -102,11 +110,13 @@ int main(int argc, char *argv[])
     // Add stream_logger only to the ResidualNormReduction criterion Factory
     // Note that the logger will get automatically propagated to every criterion
     // generated from this factory.
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     using ResidualCriterionFactory =
-        gko::stop::ResidualNormReduction<>::Factory;
+        gko::stop::ResidualNormReduction<ValueType>::Factory;
     std::shared_ptr<ResidualCriterionFactory> residual_criterion =
-        ResidualCriterionFactory::create().with_reduction_factor(1e-20).on(
-            exec);
+        ResidualCriterionFactory::create()
+            .with_reduction_factor(reduction_factor)
+            .on(exec);
     residual_criterion->add_logger(stream_logger);
 
     // Generate solver
@@ -124,7 +134,7 @@ int main(int argc, char *argv[])
     // gko::log::Logger::iteration_complete_mask. See the documentation of
     // Logger class for more information.
     std::ofstream filestream("my_file.txt");
-    solver->add_logger(gko::log::Stream<>::create(
+    solver->add_logger(gko::log::Stream<ValueType>::create(
         exec, gko::log::Logger::all_events_mask, filestream));
     solver->add_logger(stream_logger);
 
@@ -153,7 +163,7 @@ int main(int argc, char *argv[])
     // convergence happened)
     auto residual =
         record_logger->get().criterion_check_completed.back()->residual.get();
-    auto residual_d = gko::as<gko::matrix::Dense<>>(residual);
+    auto residual_d = gko::as<vec>(residual);
     print_vector("Residual", residual_d);
 
     // Print solution
diff --git a/examples/simple-solver/build.sh b/examples/simple-solver/build.sh
index dd4dd0fd710..f2c94bc239c 100755
--- a/examples/simple-solver/build.sh
+++ b/examples/simple-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/simple-solver/doc/intro.dox b/examples/simple-solver/doc/intro.dox
index 2869e91a12c..70bc1ce3cc7 100644
--- a/examples/simple-solver/doc/intro.dox
+++ b/examples/simple-solver/doc/intro.dox
@@ -7,13 +7,13 @@ change the parameters and see what is best suited for your purposes.
 <h3> About the example </h3>
 Each example has the following sections:
 <ol>
-  <li> <b>Introduction:</b>This gives an overview of the example and mentions
-  any interesting aspects in the example that might help the reader.
-  <li> <b>The commented program:</b> This section is intended for you to
-  understand the details of the example so that you can play with it and understand
-  Ginkgo and its features better.
-  <li> <b>Results:</b> This section shows the results of the code when run. Though the
-  results may not be completely the same, you can expect the behaviour to be similar. 
-  <li> <b>The plain program:</b> This is the complete code without any comments to have
-  an complete overview of the code.
-  </ol>
+    <li> <b>Introduction:</b> This gives an overview of the example and mentions
+    any interesting aspects in the example that might help the reader.
+    <li> <b>The commented program:</b> This section is intended for you to
+    understand the details of the example so that you can play with it and understand
+    Ginkgo and its features better.
+    <li> <b>Results:</b> This section shows the results of the code when run. Though the
+    results may not be completely the same, you can expect the behaviour to be similar.
+    <li> <b>The plain program:</b> This is the complete code without any comments to have
+    a complete overview of the code.
+</ol>
diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp
index 0026e2bcacc..ad1f43b69f2 100644
--- a/examples/simple-solver/simple-solver.cpp
+++ b/examples/simple-solver/simple-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -49,14 +49,16 @@ int main(int argc, char *argv[])
     // with one column/one row. The advantage of this concept is that using
     // multiple vectors is a now a natural extension of adding columns/rows are
     // necessary.
-    using vec = gko::matrix::Dense<>;
+    using ValueType = double;
+    using IndexType = int;
+    using vec = gko::matrix::Dense<ValueType>;
     // The gko::matrix::Csr class is used here, but any other matrix class such
     // as gko::matrix::Coo, gko::matrix::Hybrid, gko::matrix::Ell or
     // gko::matrix::Sellp could also be used.
-    using mtx = gko::matrix::Csr<>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
     // The gko::solver::Cg is used here, but any other solver class can also be
     // used.
-    using cg = gko::solver::Cg<>;
+    using cg = gko::solver::Cg<ValueType>;
 
     // Print the ginkgo version information.
     std::cout << gko::version_info::get() << std::endl;
@@ -78,7 +80,10 @@ int main(int argc, char *argv[])
         exec = gko::OmpExecutor::create();
     } else if (argc == 2 && std::string(argv[1]) == "cuda" &&
                gko::CudaExecutor::get_num_devices() > 0) {
-        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+        exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    } else if (argc == 2 && std::string(argv[1]) == "hip" &&
+               gko::HipExecutor::get_num_devices() > 0) {
+        exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true);
     } else {
         std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
         std::exit(-1);
@@ -105,12 +110,13 @@ int main(int argc, char *argv[])
     // criteria(gko::stop) are also generated from factories using their build
     // methods. You need to specify the executors which each of the object needs
     // to be built on.
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     auto solver_gen =
         cg::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-15)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             .on(exec);
     // Generate the solver from the matrix. The solver factory built in the
diff --git a/examples/three-pt-stencil-solver/build.sh b/examples/three-pt-stencil-solver/build.sh
index 3594d40eda1..882e9c22bdf 100755
--- a/examples/three-pt-stencil-solver/build.sh
+++ b/examples/three-pt-stencil-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
index 339f4239519..504278d0ed8 100644
--- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
+++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -78,13 +78,15 @@ use Ginkgo, and the only part where Ginkgo is introduced is inside the
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
-void generate_stencil_matrix(int discretization_points, int *row_ptrs,
-                             int *col_idxs, double *values)
+template <typename ValueType, typename IndexType>
+void generate_stencil_matrix(IndexType discretization_points,
+                             IndexType *row_ptrs, IndexType *col_idxs,
+                             ValueType *values)
 {
-    int pos = 0;
-    const double coefs[] = {-1, 2, -1};
+    IndexType pos = 0;
+    const ValueType coefs[] = {-1, 2, -1};
     row_ptrs[0] = pos;
-    for (int i = 0; i < discretization_points; ++i) {
+    for (IndexType i = 0; i < discretization_points; ++i) {
         for (auto ofs : {-1, 0, 1}) {
             if (0 <= i + ofs && i + ofs < discretization_points) {
                 values[pos] = coefs[ofs + 1];
@@ -98,13 +100,13 @@ void generate_stencil_matrix(int discretization_points, int *row_ptrs,
 
 
 // Generates the RHS vector given `f` and the boundary conditions.
-template <typename Closure>
-void generate_rhs(int discretization_points, Closure f, double u0, double u1,
-                  double *rhs)
+template <typename Closure, typename ValueType, typename IndexType>
+void generate_rhs(IndexType discretization_points, Closure f, ValueType u0,
+                  ValueType u1, ValueType *rhs)
 {
-    const auto h = 1.0 / (discretization_points + 1);
-    for (int i = 0; i < discretization_points; ++i) {
-        const auto xi = (i + 1) * h;
+    const ValueType h = 1.0 / (discretization_points + 1);
+    for (IndexType i = 0; i < discretization_points; ++i) {
+        const ValueType xi = ValueType(i + 1) * h;
         rhs[i] = -f(xi) * h * h;
     }
     rhs[0] += u0;
@@ -113,11 +115,12 @@ void generate_rhs(int discretization_points, Closure f, double u0, double u1,
 
 
 // Prints the solution `u`.
-void print_solution(int discretization_points, double u0, double u1,
-                    const double *u)
+template <typename ValueType, typename IndexType>
+void print_solution(IndexType discretization_points, ValueType u0, ValueType u1,
+                    const ValueType *u)
 {
     std::cout << u0 << '\n';
-    for (int i = 0; i < discretization_points; ++i) {
+    for (IndexType i = 0; i < discretization_points; ++i) {
         std::cout << u[i] << '\n';
     }
     std::cout << u1 << std::endl;
@@ -126,40 +129,42 @@ void print_solution(int discretization_points, double u0, double u1,
 
 // Computes the 1-norm of the error given the computed `u` and the correct
 // solution function `correct_u`.
-template <typename Closure>
-double calculate_error(int discretization_points, const double *u,
-                       Closure correct_u)
+template <typename Closure, typename ValueType, typename IndexType>
+gko::remove_complex<ValueType> calculate_error(IndexType discretization_points,
+                                               const ValueType *u,
+                                               Closure correct_u)
 {
-    const auto h = 1.0 / (discretization_points + 1);
-    auto error = 0.0;
-    for (int i = 0; i < discretization_points; ++i) {
+    const ValueType h = 1.0 / (discretization_points + 1);
+    gko::remove_complex<ValueType> error = 0.0;
+    for (IndexType i = 0; i < discretization_points; ++i) {
         using std::abs;
-        const auto xi = (i + 1) * h;
+        const ValueType xi = ValueType(i + 1) * h;
         error += abs(u[i] - correct_u(xi)) / abs(correct_u(xi));
     }
     return error;
 }
 
-
+template <typename ValueType, typename IndexType>
 void solve_system(const std::string &executor_string,
-                  unsigned int discretization_points, int *row_ptrs,
-                  int *col_idxs, double *values, double *rhs, double *u,
-                  double accuracy)
+                  IndexType discretization_points, IndexType *row_ptrs,
+                  IndexType *col_idxs, ValueType *values, ValueType *rhs,
+                  ValueType *u, gko::remove_complex<ValueType> reduction_factor)
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<double>;
-    using mtx = gko::matrix::Csr<double, int>;
-    using cg = gko::solver::Cg<double>;
-    using bj = gko::preconditioner::Jacobi<double, int>;
-    using val_array = gko::Array<double>;
-    using idx_array = gko::Array<int>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
+    using val_array = gko::Array<ValueType>;
+    using idx_array = gko::Array<IndexType>;
     const auto &dp = discretization_points;
 
     // Figure out where to run the code
     const auto omp = gko::OmpExecutor::create();
     std::map<std::string, std::shared_ptr<gko::Executor>> exec_map{
         {"omp", omp},
-        {"cuda", gko::CudaExecutor::create(0, omp)},
+        {"cuda", gko::CudaExecutor::create(0, omp, true)},
+        {"hip", gko::HipExecutor::create(0, omp, true)},
         {"reference", gko::ReferenceExecutor::create()}};
     // executor where Ginkgo will perform the computation
     const auto exec = exec_map.at(executor_string);  // throws if not valid
@@ -198,11 +203,12 @@ void solve_system(const std::string &executor_string,
     // Generate solver
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(dp).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(accuracy)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build()
+                               .with_max_iters(gko::size_type(dp))
+                               .on(exec),
+                           gko::stop::ResidualNormReduction<ValueType>::build()
+                               .with_reduction_factor(reduction_factor)
+                               .on(exec))
             .with_preconditioner(bj::build().on(exec))
             .on(exec);
     auto solver = solver_gen->generate(gko::give(matrix));
@@ -214,29 +220,34 @@ void solve_system(const std::string &executor_string,
 
 int main(int argc, char *argv[])
 {
+    using ValueType = double;
+    using IndexType = int;
+
     if (argc < 2) {
         std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]"
                   << std::endl;
         std::exit(-1);
     }
 
-    const int discretization_points = argc >= 2 ? std::atoi(argv[1]) : 100;
+    const IndexType discretization_points =
+        argc >= 2 ? std::atoi(argv[1]) : 100;
     const auto executor_string = argc >= 3 ? argv[2] : "reference";
 
     // problem:
-    auto correct_u = [](double x) { return x * x * x; };
-    auto f = [](double x) { return 6 * x; };
+    auto correct_u = [](ValueType x) { return x * x * x; };
+    auto f = [](ValueType x) { return ValueType(6) * x; };
     auto u0 = correct_u(0);
     auto u1 = correct_u(1);
 
     // matrix
-    std::vector<int> row_ptrs(discretization_points + 1);
-    std::vector<int> col_idxs(3 * discretization_points - 2);
-    std::vector<double> values(3 * discretization_points - 2);
+    std::vector<IndexType> row_ptrs(discretization_points + 1);
+    std::vector<IndexType> col_idxs(3 * discretization_points - 2);
+    std::vector<ValueType> values(3 * discretization_points - 2);
     // right hand side
-    std::vector<double> rhs(discretization_points);
+    std::vector<ValueType> rhs(discretization_points);
     // solution
-    std::vector<double> u(discretization_points, 0.0);
+    std::vector<ValueType> u(discretization_points, 0.0);
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
 
     generate_stencil_matrix(discretization_points, row_ptrs.data(),
                             col_idxs.data(), values.data());
@@ -244,9 +255,10 @@ int main(int argc, char *argv[])
     generate_rhs(discretization_points, f, u0, u1, rhs.data());
 
     solve_system(executor_string, discretization_points, row_ptrs.data(),
-                 col_idxs.data(), values.data(), rhs.data(), u.data(), 1e-12);
+                 col_idxs.data(), values.data(), rhs.data(), u.data(),
+                 reduction_factor);
 
-    print_solution(discretization_points, 0, 1, u.data());
+    print_solution<ValueType, IndexType>(discretization_points, 0, 1, u.data());
     std::cout << "The average relative error is "
               << calculate_error(discretization_points, u.data(), correct_u) /
                      discretization_points
diff --git a/examples/twentyseven-pt-stencil-solver/build.sh b/examples/twentyseven-pt-stencil-solver/build.sh
old mode 100644
new mode 100755
index d38c973164b..f4d33aa2d37
--- a/examples/twentyseven-pt-stencil-solver/build.sh
+++ b/examples/twentyseven-pt-stencil-solver/build.sh
@@ -9,8 +9,8 @@ BUILD_DIR=$1
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 
 # copy libraries
-LIBRARY_DIRS="core core/device_hooks reference omp cuda"
-LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda"
+LIBRARY_DIRS="core core/device_hooks reference omp cuda hip"
+LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
 for prefix in ${LIBRARY_DIRS}; do
     for name in ${LIBRARY_NAMES}; do
@@ -23,9 +23,9 @@ done
 
 # figure out correct compiler flags
 if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then
-    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference"
+    LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip"
 else
-    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced"
+    LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd"
 fi
 if [ -z "${CXX}" ]; then
     CXX="c++"
diff --git a/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp b/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp
index f319ed35513..75fd314ebdf 100644
--- a/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp
+++ b/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -93,21 +93,20 @@ constexpr double default_delta = -1.0 / 24.0;
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
-void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs,
-                             double *values, double *coefs)
+template <typename ValueType, typename IndexType>
+void generate_stencil_matrix(IndexType dp, IndexType *row_ptrs,
+                             IndexType *col_idxs, ValueType *values,
+                             ValueType *coefs)
 {
-    int pos = 0;
-    size_t dp_2 = dp * dp;
-
-
+    IndexType pos = 0;
     row_ptrs[0] = pos;
     for (int64_t z = 0; z < dp; ++z) {
         for (int64_t y = 0; y < dp; ++y) {
             for (int64_t x = 0; x < dp; ++x) {
                 const auto index = x + dp * (y + dp * z);
-                for (int k = -1; k <= 1; ++k) {
-                    for (int j = -1; j <= 1; ++j) {
-                        for (int i = -1; i <= 1; ++i) {
+                for (IndexType k = -1; k <= 1; ++k) {
+                    for (IndexType j = -1; j <= 1; ++j) {
+                        for (IndexType i = -1; i <= 1; ++i) {
                             const int64_t offset =
                                 i + 1 + 3 * (j + 1 + 3 * (k + 1));
                             if ((x + i) >= 0 && (x + i) < dp && (y + j) >= 0 &&
@@ -127,17 +126,18 @@ void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs,
 
 
 // Generates the RHS vector given `f` and the boundary conditions.
-template <typename Closure, typename ClosureT>
-void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
+template <typename Closure, typename ClosureT, typename ValueType,
+          typename IndexType>
+void generate_rhs(IndexType dp, Closure f, ClosureT u, ValueType *rhs,
+                  ValueType *coefs)
 {
-    const size_t dp_2 = dp * dp;
-    const auto h = 1.0 / (dp + 1.0);
+    const ValueType h = 1.0 / (dp + 1.0);
     for (size_t k = 0; k < dp; ++k) {
-        const auto zi = (k + 1) * h;
+        const auto zi = ValueType(k + 1) * h;
         for (size_t j = 0; j < dp; ++j) {
-            const auto yi = (j + 1) * h;
+            const auto yi = ValueType(j + 1) * h;
             for (size_t i = 0; i < dp; ++i) {
-                const auto xi = (i + 1) * h;
+                const auto xi = ValueType(i + 1) * h;
                 const auto index = i + dp * (j + dp * k);
                 rhs[index] = -f(xi, yi, zi) * h * h;
             }
@@ -150,17 +150,18 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
     // z - ortho to front, back
     for (size_t j = 0; j < dp; ++j) {
         for (size_t k = 0; k < dp; ++k) {
-            const auto yi = (j + 1) * h;
-            const auto zi = (k + 1) * h;
+            const auto yi = ValueType(j + 1) * h;
+            const auto zi = ValueType(k + 1) * h;
             const auto index_left = dp * j + dp * dp * k;
             const auto index_right = dp * j + dp * dp * k + (dp - 1);
 
-            for (int b = -1; b <= 1; ++b) {
-                for (int c = -1; c <= 1; ++c) {
-                    rhs[index_left] -= u(0.0, yi + b * h, zi + c * h) *
-                                       coefs[3 * (b + 1) + 3 * 3 * (c + 1)];
+            for (IndexType b = -1; b <= 1; ++b) {
+                for (IndexType c = -1; c <= 1; ++c) {
+                    rhs[index_left] -=
+                        u(0.0, yi + ValueType(b) * h, zi + ValueType(c) * h) *
+                        coefs[3 * (b + 1) + 3 * 3 * (c + 1)];
                     rhs[index_right] -=
-                        u(1.0, yi + b * h, zi + c * h) *
+                        u(1.0, yi + ValueType(b) * h, zi + ValueType(c) * h) *
                         coefs[3 * (b + 1) + 3 * 3 * (c + 1) + 2];
                 }
             }
@@ -171,18 +172,20 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
     // included this case
     for (size_t i = 0; i < dp; ++i) {
         for (size_t k = 0; k < dp; ++k) {
-            const auto xi = (i + 1) * h;
-            const auto zi = (k + 1) * h;
+            const auto xi = ValueType(i + 1) * h;
+            const auto zi = ValueType(k + 1) * h;
             const auto index_top = i + dp * dp * k;
             const auto index_bot = i + dp * dp * k + dp * (dp - 1);
 
-            for (int a = -1; a <= 1; ++a) {
+            for (IndexType a = -1; a <= 1; ++a) {
                 if ((i < (dp - 1) || a < 1) && (i > 0 || a > -1)) {
-                    for (int c = -1; c <= 1; ++c) {
-                        rhs[index_top] -= u(xi + a * h, 0.0, zi + c * h) *
+                    for (IndexType c = -1; c <= 1; ++c) {
+                        rhs[index_top] -= u(xi + ValueType(a) * h, 0.0,
+                                            zi + ValueType(c) * h) *
                                           coefs[(a + 1) + 3 * 3 * (c + 1)];
                         rhs[index_bot] -=
-                            u(xi + a * h, 1.0, zi + c * h) *
+                            u(xi + ValueType(a) * h, 1.0,
+                              zi + ValueType(c) * h) *
                             coefs[(a + 1) + 3 * 3 * (c + 1) + 3 * 2];
                     }
                 }
@@ -193,19 +196,21 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
     // Now every side has to be checked
     for (size_t i = 0; i < dp; ++i) {
         for (size_t j = 0; j < dp; ++j) {
-            const auto xi = (i + 1) * h;
-            const auto yi = (j + 1) * h;
+            const auto xi = ValueType(i + 1) * h;
+            const auto yi = ValueType(j + 1) * h;
             const auto index_front = i + dp * j;
             const auto index_back = i + dp * j + dp * dp * (dp - 1);
 
-            for (int a = -1; a <= 1; ++a) {
+            for (IndexType a = -1; a <= 1; ++a) {
                 if ((i < (dp - 1) || a < 1) && (i > 0 || a > -1)) {
-                    for (int b = -1; b <= 1; ++b) {
+                    for (IndexType b = -1; b <= 1; ++b) {
                         if ((j < (dp - 1) || b < 1) && (j > 0 || j > -1)) {
-                            rhs[index_front] -= u(xi + a * h, yi + b * h, 0.0) *
+                            rhs[index_front] -= u(xi + ValueType(a) * h,
+                                                  yi + ValueType(b) * h, 0.0) *
                                                 coefs[(a + 1) + 3 * (b + 1)];
                             rhs[index_back] -=
-                                u(xi + a * h, yi + b * h, 1.0) *
+                                u(xi + ValueType(a) * h, yi + ValueType(b) * h,
+                                  1.0) *
                                 coefs[(a + 1) + 3 * (b + 1) + 3 * 3 * 2];
                         }
                     }
@@ -217,7 +222,8 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs)
 
 
 // Prints the solution `u`.
-void print_solution(int dp, const double *u)
+template <typename ValueType, typename IndexType>
+void print_solution(IndexType dp, const ValueType *u)
 {
     for (size_t k = 0; k < dp; ++k) {
         for (size_t j = 0; j < dp; ++j) {
@@ -234,18 +240,19 @@ void print_solution(int dp, const double *u)
 
 // Computes the 1-norm of the error given the computed `u` and the correct
 // solution function `correct_u`.
-template <typename Closure>
-double calculate_error(int dp, const double *u, Closure correct_u)
+template <typename Closure, typename ValueType, typename IndexType>
+gko::remove_complex<ValueType> calculate_error(IndexType dp, const ValueType *u,
+                                               Closure correct_u)
 {
     using std::abs;
     const auto h = 1.0 / (dp + 1);
-    auto error = 0.0;
-    for (int k = 0; k < dp; ++k) {
-        const auto zi = (k + 1) * h;
-        for (int j = 0; j < dp; ++j) {
-            const auto yi = (j + 1) * h;
-            for (int i = 0; i < dp; ++i) {
-                const auto xi = (i + 1) * h;
+    gko::remove_complex<ValueType> error = 0.0;
+    for (IndexType k = 0; k < dp; ++k) {
+        const auto zi = ValueType(k + 1) * h;
+        for (IndexType j = 0; j < dp; ++j) {
+            const auto yi = ValueType(j + 1) * h;
+            for (IndexType i = 0; i < dp; ++i) {
+                const auto xi = ValueType(i + 1) * h;
                 error +=
                     abs(u[k * dp * dp + i * dp + j] - correct_u(xi, yi, zi)) /
                     abs(correct_u(xi, yi, zi));
@@ -256,27 +263,28 @@ double calculate_error(int dp, const double *u, Closure correct_u)
 }
 
 
+template <typename ValueType, typename IndexType>
 void solve_system(const std::string &executor_string,
-                  unsigned int discretization_points, int *row_ptrs,
-                  int *col_idxs, double *values, double *rhs, double *u,
-                  double accuracy)
+                  IndexType discretization_points, IndexType *row_ptrs,
+                  IndexType *col_idxs, ValueType *values, ValueType *rhs,
+                  ValueType *u, gko::remove_complex<ValueType> reduction_factor)
 {
     // Some shortcuts
-    using vec = gko::matrix::Dense<double>;
-    using mtx = gko::matrix::Csr<double, int>;
-    using cg = gko::solver::Cg<double>;
-    using bj = gko::preconditioner::Jacobi<double, int>;
-    using val_array = gko::Array<double>;
-    using idx_array = gko::Array<int>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+    using cg = gko::solver::Cg<ValueType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, IndexType>;
+    using val_array = gko::Array<ValueType>;
+    using idx_array = gko::Array<IndexType>;
     const auto &dp = discretization_points;
-    const size_t dp_2 = dp * dp;
     const size_t dp_3 = dp * dp * dp;
 
     // Figure out where to run the code
     const auto omp = gko::OmpExecutor::create();
     std::map<std::string, std::shared_ptr<gko::Executor>> exec_map{
         {"omp", omp},
-        {"cuda", gko::CudaExecutor::create(0, omp)},
+        {"cuda", gko::CudaExecutor::create(0, omp, true)},
+        {"hip", gko::HipExecutor::create(0, omp, true)},
         {"reference", gko::ReferenceExecutor::create()}};
     // executor where Ginkgo will perform the computation
     const auto exec = exec_map.at(executor_string);  // throws if not valid
@@ -320,8 +328,8 @@ void solve_system(const std::string &executor_string,
         cg::build()
             .with_criteria(
                 gko::stop::Iteration::build().with_max_iters(dp_3).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(accuracy)
+                gko::stop::ResidualNormReduction<ValueType>::build()
+                    .with_reduction_factor(reduction_factor)
                     .on(exec))
             .with_preconditioner(bj::build().on(exec))
             .on(exec);
@@ -333,6 +341,8 @@ void solve_system(const std::string &executor_string,
 
 int main(int argc, char *argv[])
 {
+    using ValueType = double;
+    using IndexType = int;
     if (argc < 2) {
         std::cerr
             << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]"
@@ -341,15 +351,16 @@ int main(int argc, char *argv[])
         std::exit(-1);
     }
 
-    const int discretization_points = argc >= 2 ? std::atoi(argv[1]) : 100;
+    const IndexType discretization_points =
+        argc >= 2 ? std::atoi(argv[1]) : 100;
     const auto executor_string = argc >= 3 ? argv[2] : "reference";
-    const double alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha;
-    const double beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta;
-    const double gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma;
-    const double delta_c = argc >= 7 ? std::atof(argv[6]) : default_delta;
+    const ValueType alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha;
+    const ValueType beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta;
+    const ValueType gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma;
+    const ValueType delta_c = argc >= 7 ? std::atof(argv[6]) : default_delta;
 
     // clang-format off
-    std::array<double,27> coefs{
+    std::array<ValueType,27> coefs{
         delta_c, gamma_c, delta_c,
         gamma_c, beta_c, gamma_c,
         delta_c, gamma_c, delta_c,
@@ -369,40 +380,47 @@ int main(int argc, char *argv[])
     const size_t dp_3 = dp * dp * dp;
 
     // problem:
-    auto correct_u = [](double x, double y, double z) {
+    auto correct_u = [](ValueType x, ValueType y, ValueType z) {
         return x * x * x + y * y * y + z * z * z;
     };
-    auto f = [](double x, double y, double z) { return 6 * x + 6 * y + 6 * z; };
+    auto f = [](ValueType x, ValueType y, ValueType z) {
+        return ValueType(6) * x + ValueType(6) * y + ValueType(6) * z;
+    };
 
     // matrix
-    std::vector<int> row_ptrs(dp_3 + 1);
-    std::vector<int> col_idxs((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2));
-    std::vector<double> values((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2));
+    std::vector<IndexType> row_ptrs(dp_3 + 1);
+    std::vector<IndexType> col_idxs((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2));
+    std::vector<ValueType> values((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2));
     // right hand side
-    std::vector<double> rhs(dp_3);
+    std::vector<ValueType> rhs(dp_3);
     // solution
-    std::vector<double> u(dp_3, 0.0);
+    std::vector<ValueType> u(dp_3, 0.0);
 
     generate_stencil_matrix(dp, row_ptrs.data(), col_idxs.data(), values.data(),
                             coefs.data());
     // looking for solution u = x^3: f = 6x, u(0) = 0, u(1) = 1
     generate_rhs(dp, f, correct_u, rhs.data(), coefs.data());
 
-    auto start_time = std::chrono::steady_clock::now();
 
-    solve_system(executor_string, dp, row_ptrs.data(), col_idxs.data(),
-                 values.data(), rhs.data(), u.data(), 1e-12);
+    const gko::remove_complex<ValueType> reduction_factor = 1e-7;
 
+    auto start_time = std::chrono::steady_clock::now();
+    solve_system(executor_string, dp, row_ptrs.data(), col_idxs.data(),
+                 values.data(), rhs.data(), u.data(), reduction_factor);
     auto stop_time = std::chrono::steady_clock::now();
-    double runtime_duration =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time -
-                                                             start_time)
-            .count() *
+
+    const auto runtime_duration =
+        static_cast<double>(
+            std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time -
+                                                                 start_time)
+                .count()) *
         1e-6;
 
-    print_solution(dp, u.data());
+    print_solution<ValueType, IndexType>(dp, u.data());
     std::cout << "The average relative error is "
-              << calculate_error(dp, u.data(), correct_u) / dp_3 << std::endl;
+              << calculate_error(dp, u.data(), correct_u) /
+                     static_cast<gko::remove_complex<ValueType>>(dp_3)
+              << std::endl;
 
     std::cout << "The runtime is " << std::to_string(runtime_duration) << " ms"
               << std::endl;
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
new file mode 100644
index 00000000000..6ae0ff2c655
--- /dev/null
+++ b/hip/CMakeLists.txt
@@ -0,0 +1,293 @@
+if(NOT DEFINED ROCM_PATH)
+    if(DEFINED ENV{ROCM_PATH})
+        set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
+    elseif(DEFINED ENV{HIP_PATH})
+        set(ROCM_PATH "$ENV{HIP_PATH}/.." CACHE PATH "Path to which ROCM has been installed")
+    else()
+        set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed")
+    endif()
+endif()
+
+if(NOT DEFINED HIPBLAS_PATH)
+    if(DEFINED ENV{HIPBLAS_PATH})
+        set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed")
+    else()
+        set(HIPBLAS_PATH "${ROCM_PATH}/hipblas" CACHE PATH "Path to which HIPBLAS has been installed")
+    endif()
+endif()
+
+if(NOT DEFINED HIPSPARSE_PATH)
+    if(DEFINED ENV{HIPSPARSE_PATH})
+        set(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH} CACHE PATH "Path to which HIPSPARSE has been installed")
+    else()
+        set(HIPSPARSE_PATH "${ROCM_PATH}/hipsparse" CACHE PATH "Path to which HIPSPARSE has been installed")
+    endif()
+endif()
+
+## Both `HCC_PATH` and `HIP_HIPCC_CMAKE_LINKER_HELPER` must be defined before
+## including `FindHIP`, as they are essential but not defined at the beginning
+## of the `FindHIP` file itself. Not defining them currently results in:
+## 1. Without `HCC_PATH`: the `hcc` backend does not work properly if the path is
+##    wrongly set; if it is not set at all, potentially all compilation could fail.
+## 2. Without `HIP_HIPCC_CMAKE_LINKER_HELPER`: two compilations are required, since
+##    `FindHIP` defines this only in macro calls, which we call much later on after
+##    including the file itself.
+if(NOT DEFINED HCC_PATH)
+    if(DEFINED ENV{HCC_PATH})
+        set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed")
+    else()
+        set(HCC_PATH "${ROCM_PATH}/hcc" CACHE PATH "Path to which HCC has been installed")
+    endif()
+    set(HCC_HOME "${HCC_PATH}")
+endif()
+
+# Default HIP_CLANG_PATH unless it is already defined: prefer the environment
+# variable of the same name, otherwise fall back to the llvm/bin folder of ROCm.
+if(NOT DEFINED HIP_CLANG_PATH AND DEFINED ENV{HIP_CLANG_PATH})
+    set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed")
+elseif(NOT DEFINED HIP_CLANG_PATH)
+    set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed")
+endif()
+
+# Find HIPCC_CMAKE_LINKER_HELPER executable
+find_program(
+    HIP_HIPCC_CMAKE_LINKER_HELPER
+    NAMES hipcc_cmake_linker_helper
+    PATHS
+    "${HIP_ROOT_DIR}"
+    ENV ROCM_PATH
+    ENV HIP_PATH
+    /opt/rocm
+    /opt/rocm/hip
+    PATH_SUFFIXES bin
+    NO_DEFAULT_PATH
+)
+if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER)
+    # Now search in default paths
+    find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper)
+endif()
+
+find_program(
+    HIP_HIPCONFIG_EXECUTABLE
+    NAMES hipconfig
+    PATHS
+    "${HIP_ROOT_DIR}"
+    ENV ROCM_PATH
+    ENV HIP_PATH
+    /opt/rocm
+    /opt/rocm/hip
+    PATH_SUFFIXES bin
+    NO_DEFAULT_PATH
+)
+if(NOT HIP_HIPCONFIG_EXECUTABLE)
+    # Now search in default paths
+    find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
+endif()
+
+execute_process(
+            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
+            OUTPUT_VARIABLE GINKGO_HIP_VERSION
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ERROR_STRIP_TRAILING_WHITESPACE
+            )
+set(GINKGO_HIP_VERSION ${GINKGO_HIP_VERSION} PARENT_SCOPE)
+
+# The nvcc backend of HIP needs a CUDA installation: either the user exports
+# CUDA_PATH, or cuda.h has to be present in the default /usr/local/cuda location.
+if (GINKGO_HIP_PLATFORM MATCHES "nvcc" AND NOT DEFINED ENV{CUDA_PATH})
+    find_path(GINKGO_HIP_DEFAULT_CUDA_PATH "cuda.h" PATHS /usr/local/cuda/include NO_DEFAULT_PATH)
+    if (NOT GINKGO_HIP_DEFAULT_CUDA_PATH)
+        message(FATAL_ERROR "HIP nvcc backend was requested but CUDA could not be located. "
+            "Set and export the environment variable CUDA_PATH.")
+    endif()
+endif()
+
+if (GINKGO_HIP_PLATFORM STREQUAL "hcc")
+    # This is required by hipblas/hipsparse in the case where the platform is hcc.
+    # For nvcc platform, these aren't required and only cause trouble.
+    list(APPEND CMAKE_PREFIX_PATH
+        "${HIP_PATH}/lib/cmake"
+        "${HIP_PATH}/../lib/cmake" # hopefully catches all extra HIP dependencies, e.g. hcc
+    )
+endif()
+
+
+## Setup all CMAKE variables to find HIP and its dependencies
+list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake")
+list(APPEND CMAKE_PREFIX_PATH
+    "${HIPBLAS_PATH}/lib/cmake"
+    "${HIPSPARSE_PATH}/lib/cmake"
+)
+# Set CMAKE_MODULE_PATH and CMAKE_PREFIX_PATH as PARENT_SCOPE to easily find HIP again
+set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE)
+set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH}" PARENT_SCOPE)
+
+# Set the default per-build-type hipcc flags, analogous to CMAKE_<LANG>_FLAGS_<TYPE>.
+# The defaults are copied from the corresponding CMAKE_CXX_FLAGS_<TYPE> values.
+set(HIP_HIPCC_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the HIPCC compiler during DEBUG builds")
+set(HIP_HIPCC_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the HIPCC compiler during MINSIZEREL builds")
+set(HIP_HIPCC_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the HIPCC compiler during RELEASE builds")
+set(HIP_HIPCC_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the HIPCC compiler during RELWITHDEBINFO builds")
+
+find_package(HIP REQUIRED)
+find_package(hipblas REQUIRED)
+find_package(hipsparse REQUIRED)
+find_path(GINKGO_HIP_THRUST_PATH "thrust/complex.h"
+    PATHS "${HIP_PATH}/../include"
+    ENV HIP_THRUST_PATH)
+if (NOT GINKGO_HIP_THRUST_PATH)
+    message(FATAL_ERROR "Could not find the ROCm header thrust/complex.h which is required by Ginkgo HIP.")
+endif()
+
+set(GINKGO_HIP_SOURCES
+    base/exception.hip.cpp
+    base/executor.hip.cpp
+    base/version.hip.cpp
+    components/fill_array.hip.cpp
+    components/precision_conversion.hip.cpp
+    components/prefix_sum.hip.cpp
+    factorization/ilu_kernels.hip.cpp
+    factorization/factorization_kernels.hip.cpp
+    factorization/par_ict_kernels.hip.cpp
+    factorization/par_ilu_kernels.hip.cpp
+    factorization/par_ilut_approx_filter_kernel.hip.cpp
+    factorization/par_ilut_filter_kernel.hip.cpp
+    factorization/par_ilut_select_common.hip.cpp
+    factorization/par_ilut_select_kernel.hip.cpp
+    factorization/par_ilut_spgeam_kernel.hip.cpp
+    factorization/par_ilut_sweep_kernel.hip.cpp
+    matrix/coo_kernels.hip.cpp
+    matrix/csr_kernels.hip.cpp
+    matrix/dense_kernels.hip.cpp
+    matrix/ell_kernels.hip.cpp
+    matrix/hybrid_kernels.hip.cpp
+    matrix/sellp_kernels.hip.cpp
+    matrix/sparsity_csr_kernels.hip.cpp
+    preconditioner/isai_kernels.hip.cpp
+    preconditioner/jacobi_advanced_apply_kernel.hip.cpp
+    preconditioner/jacobi_generate_kernel.hip.cpp
+    preconditioner/jacobi_kernels.hip.cpp
+    preconditioner/jacobi_simple_apply_kernel.hip.cpp
+    solver/bicg_kernels.hip.cpp
+    solver/bicgstab_kernels.hip.cpp
+    solver/cg_kernels.hip.cpp
+    solver/cgs_kernels.hip.cpp
+    solver/fcg_kernels.hip.cpp
+    solver/gmres_kernels.hip.cpp
+    solver/ir_kernels.hip.cpp
+    solver/lower_trs_kernels.hip.cpp
+    solver/upper_trs_kernels.hip.cpp
+    stop/criterion_kernels.hip.cpp
+    stop/residual_norm_kernels.hip.cpp)
+
+set(GINKGO_HIP_NVCC_ARCH "")
+if (GINKGO_HIP_PLATFORM MATCHES "nvcc")
+    if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER)
+        set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE)
+    elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER)
+        unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    endif()
+    if (CMAKE_CUDA_HOST_COMPILER)
+        set(GINKGO_HIP_CUDA_HOST_COMPILER "-ccbin=${CMAKE_CUDA_HOST_COMPILER}")
+    endif()
+
+    # Remove false positive CUDA warnings when calling one<T>() and zero<T>()
+    # This creates a compilation bug on nvcc 9.0.102 *with* the new array_deleter
+    # merged at commit ed12b3df5d26
+    if(NOT CMAKE_CUDA_COMPILER_VERSION MATCHES "9.0")
+        set(GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr)
+    endif()
+    # add gpu architecture flags
+    include(CudaArchitectureSelector)
+    cas_target_cuda_architectures_plain(GINKGO_HIP_NVCC_ARCH
+        ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}
+        UNSUPPORTED "20" "21")
+endif()
+set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS})
+set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS})
+set(GINKGO_HIP_HCC_OPTIONS ${GINKGO_HIP_HCC_COMPILER_FLAGS})
+set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS})
+
+set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
+if (GINKGO_HIP_VERSION VERSION_GREATER_EQUAL "3.5")
+    hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES}
+        HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} "-std=c++11"
+        HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS}
+        CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}
+        NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} ${GINKGO_HIP_CUDA_HOST_COMPILER}
+        ${GINKGO_STATIC_OR_SHARED})
+else()
+    hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES}
+        HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} "-std=c++11"
+        HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS}
+        NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} ${GINKGO_HIP_CUDA_HOST_COMPILER}
+        ${GINKGO_STATIC_OR_SHARED})
+endif()
+
+if(GINKGO_HIP_AMDGPU AND GINKGO_HIP_PLATFORM MATCHES "hcc")
+    foreach(target ${GINKGO_HIP_AMDGPU})
+        target_compile_options(ginkgo_hip PRIVATE --amdgpu-target=${target})
+        target_link_libraries(ginkgo_hip PRIVATE --amdgpu-target=${target})
+    endforeach()
+endif()
+
+target_compile_options(ginkgo_hip PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${GINKGO_COMPILER_FLAGS}>)
+if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH)
+    set_property(TARGET ginkgo_hip PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*")
+endif()
+if(GINKGO_WITH_IWYU AND GINKGO_IWYU_PATH)
+    set_property(TARGET ginkgo_hip PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${GINKGO_IWYU_PATH})
+endif()
+
+if(GINKGO_HIP_PLATFORM MATCHES "hcc")
+    # Fix the exception thrown bug with `hcc` backend and shared libraries
+    set_target_properties(ginkgo_hip PROPERTIES LINKER_LANGUAGE HIP)
+
+    # Ban `-hc` flag as INTERFACE_LINK_LIBRARIES since that is propagated when building
+    # a static library, and it's definitely not a known option to any compiler.
+    ginkgo_hip_ban_link_hcflag(hcc::hccrt)
+
+    if (NOT BUILD_SHARED_LIBS)
+        # Do not let hip::device flags propagate to executables which don't
+        # directly use HIP
+        ginkgo_hip_clang_ban_hip_device_flags()
+    endif()
+    target_link_libraries(ginkgo_hip PRIVATE hip::device)
+elseif(GINKGO_HIP_PLATFORM MATCHES "nvcc")
+    find_package(CUDA 9.0 REQUIRED)
+    target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES})
+    set(HIP_CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE)
+endif()
+
+target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse)
+
+target_include_directories(ginkgo_hip
+    PUBLIC
+        ${HIP_INCLUDE_DIRS}
+    PRIVATE
+        ${GINKGO_HIP_THRUST_PATH}
+        ${HIPBLAS_INCLUDE_DIRS}
+        ${HIPSPARSE_INCLUDE_DIRS}
+        $<BUILD_INTERFACE:${ROCPRIM_INCLUDE_DIRS}>)
+
+ginkgo_compile_features(ginkgo_hip)
+ginkgo_default_includes(ginkgo_hip)
+ginkgo_install_library(ginkgo_hip hip)
+
+if (GINKGO_CHECK_CIRCULAR_DEPS)
+    ginkgo_check_headers(ginkgo_hip)
+endif()
+
+if(GINKGO_BUILD_TESTS)
+    # Here, we go through all of Ginkgo's dependencies to build a `-Wl,-rpath` string, since,
+    # for some reason, `hipcc` through CMake does not have rpath settings, unlike the other compilers.
+    get_target_property(GINKGO_LINK_LIBRARIES ginkgo LINK_LIBRARIES)
+    set(GINKGO_RPATH_FOR_HIP "-Wl,-rpath,$<TARGET_FILE_DIR:ginkgo>")
+    foreach(target ${GINKGO_LINK_LIBRARIES})
+        if("${target}" MATCHES "^ginkgo")
+            set(GINKGO_RPATH_FOR_HIP "${GINKGO_RPATH_FOR_HIP}:$<TARGET_FILE_DIR:${target}>")
+        endif()
+    endforeach()
+
+    add_subdirectory(test)
+endif()
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
new file mode 100644
index 00000000000..d698a6a8d83
--- /dev/null
+++ b/hip/base/config.hip.hpp
@@ -0,0 +1,97 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_CONFIG_HIP_HPP_
+#define GKO_HIP_BASE_CONFIG_HIP_HPP_
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <hip/device_functions.h>
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+struct config {
+    /**
+     * The type containing a bitmask over all lanes of a warp.
+     */
+#if GINKGO_HIP_PLATFORM_HCC
+    using lane_mask_type = uint64;
+#else  // GINKGO_HIP_PLATFORM_NVCC
+    using lane_mask_type = uint32;
+#endif
+
+    /**
+     * The number of threads within a HIP warp: the `warpSize` constant on the
+     * HCC platform, fixed to 32 otherwise.
+     */
+#if GINKGO_HIP_PLATFORM_HCC
+    static constexpr uint32 warp_size = warpSize;
+#else  // GINKGO_HIP_PLATFORM_NVCC
+    static constexpr uint32 warp_size = 32;
+#endif
+
+    /**
+     * The bitmask of the entire warp (all bits of `lane_mask_type` set).
+     */
+    static constexpr auto full_lane_mask = ~zero<lane_mask_type>();
+
+    /**
+     * The maximal number of threads allowed in a HIP block.
+     */
+    static constexpr uint32 max_block_size = 1024;
+
+    /**
+     * The minimal number of warps that need to be scheduled for each block
+     * to maximize GPU occupancy.
+     */
+    static constexpr uint32 min_warps_per_block = 4;
+};
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_CONFIG_HIP_HPP_
diff --git a/hip/base/device_guard.hip.hpp b/hip/base/device_guard.hip.hpp
new file mode 100644
index 00000000000..b7d63ebc152
--- /dev/null
+++ b/hip/base/device_guard.hip.hpp
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_
+#define GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_
+
+
+#include <exception>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace hip {
+
+
+/**
+ * Scope guard which makes a given HIP device the active one for its lifetime.
+ * The constructor records the currently active device id via `hipGetDevice`
+ * and activates the requested one via `hipSetDevice`; the destructor switches
+ * back to the recorded id. Guards can be neither copied nor moved. A failed
+ * restore is reported through `GKO_ASSERT_NO_HIP_ERRORS` (the destructor is
+ * `noexcept(false)`), unless an exception is already propagating.
+ */
+class device_guard {
+public:
+    device_guard(int device_id)
+    {
+        GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&original_device_id));
+        GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(device_id));
+    }
+
+    device_guard(const device_guard &other) = delete;
+
+    device_guard &operator=(const device_guard &other) = delete;
+
+    device_guard(device_guard &&other) = delete;
+
+    device_guard &operator=(device_guard &&other) = delete;
+
+    ~device_guard() noexcept(false)
+    {
+        /* Ignore the error during stack unwinding for this call */
+        if (std::uncaught_exception()) {
+            hipSetDevice(original_device_id);
+        } else {
+            GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(original_device_id));
+        }
+    }
+
+private:
+    int original_device_id{};
+};
+
+
+}  // namespace hip
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
new file mode 100644
index 00000000000..9e6f2ff7a00
--- /dev/null
+++ b/hip/base/exception.hip.cpp
@@ -0,0 +1,101 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/exception.hpp>
+
+
+#include <string>
+
+
+#include <hip/hip_runtime.h>
+#include <hipblas.h>
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+
+
+std::string HipError::get_error(int64 error_code)
+{
+    // Render the code as "<symbolic name>: <description>".
+    const auto code = static_cast<hipError_t>(error_code);
+    const std::string name = hipGetErrorName(code);
+    return name + ": " + hipGetErrorString(code);
+}
+
+
+std::string HipblasError::get_error(int64 error_code)
+{
+    // Translate a hipBLAS status value into the name of its enumerator.
+#define GKO_REGISTER_HIPBLAS_ERROR(error_name)          \
+    if (static_cast<int64>(error_name) == error_code) { \
+        return #error_name;                             \
+    }
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_SUCCESS);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_NOT_INITIALIZED);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_ALLOC_FAILED);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_INVALID_VALUE);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_ARCH_MISMATCH);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_MAPPING_ERROR);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_EXECUTION_FAILED);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_INTERNAL_ERROR);
+    GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_NOT_SUPPORTED);
+    return "Unknown error";
+#undef GKO_REGISTER_HIPBLAS_ERROR
+}
+
+
+std::string HipsparseError::get_error(int64 error_code)
+{
+    // Translate a hipSPARSE status value into the name of its enumerator.
+#define GKO_REGISTER_HIPSPARSE_ERROR(error_name)        \
+    if (static_cast<int64>(error_name) == error_code) { \
+        return #error_name;                             \
+    }
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_SUCCESS);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_NOT_INITIALIZED);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_ALLOC_FAILED);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_INVALID_VALUE);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_ARCH_MISMATCH);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_MAPPING_ERROR);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_EXECUTION_FAILED);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_INTERNAL_ERROR);
+    GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
+    return "Unknown error";
+#undef GKO_REGISTER_HIPSPARSE_ERROR
+}
+
+
+}  // namespace gko
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
new file mode 100644
index 00000000000..9592bc20b8d
--- /dev/null
+++ b/hip/base/executor.hip.cpp
@@ -0,0 +1,226 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include <iostream>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/base/device_guard.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+
+
+namespace gko {
+
+
+#include "common/base/executor.hpp.inc"
+
+
+std::shared_ptr<HipExecutor> HipExecutor::create(
+    int device_id, std::shared_ptr<Executor> master, bool device_reset)
+{
+    return std::shared_ptr<HipExecutor>(
+        new HipExecutor(device_id, std::move(master), device_reset),
+        [device_id](HipExecutor *exec) {
+            const auto reset = exec->get_device_reset();  // read before delete
+            delete exec;
+            if (!HipExecutor::get_num_execs(device_id) && reset) {
+                hip::device_guard g(device_id);
+                hipDeviceReset();
+            }
+        });
+}
+
+
+void OmpExecutor::raw_copy_to(const HipExecutor *dest, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+{
+    // Host -> device transfer performed with the destination device active.
+    if (num_bytes <= 0) return;
+    const hip::device_guard active_device(dest->get_device_id());
+    GKO_ASSERT_NO_HIP_ERRORS(
+        hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyHostToDevice));
+}
+
+
+void HipExecutor::raw_free(void *ptr) const noexcept
+{
+    hip::device_guard g(this->get_device_id());
+    const auto status = hipFree(ptr);
+    if (status == hipSuccess) return;
+    // Unfortunately, if memory free fails, there's not much we can do:
+    // report the failure when verbose output is enabled, then exit.
+#if GKO_VERBOSE_LEVEL >= 1
+    std::cerr << "Unrecoverable HIP error on device " << this->device_id_
+              << " in " << __func__ << ": " << hipGetErrorName(status)
+              << ": " << hipGetErrorString(status) << std::endl
+              << "Exiting program" << std::endl;
+#endif
+    std::exit(status);
+}
+
+
+void *HipExecutor::raw_alloc(size_type num_bytes) const
+{
+    void *dev_ptr = nullptr;
+    hip::device_guard g(this->get_device_id());  // allocate on own device
+    auto error_code = hipMalloc(&dev_ptr, num_bytes);
+    if (error_code != hipErrorMemoryAllocation) {  // OOM is left to below
+        GKO_ASSERT_NO_HIP_ERRORS(error_code);
+    }
+    GKO_ENSURE_ALLOCATED(dev_ptr, "hip", num_bytes);  // presumably: non-null
+    return dev_ptr;
+}
+
+
+void HipExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+{
+    // Device -> host transfer performed with this executor's device active.
+    if (num_bytes <= 0) return;
+    const hip::device_guard active_device(this->get_device_id());
+    GKO_ASSERT_NO_HIP_ERRORS(
+        hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyDeviceToHost));
+}
+
+
+void HipExecutor::raw_copy_to(const CudaExecutor *src, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+{
+#if GINKGO_HIP_PLATFORM_NVCC == 1
+    if (num_bytes > 0) {
+        hip::device_guard g(this->get_device_id());
+        GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(
+            dest_ptr, src->get_device_id(),  // `src` is the receiving executor
+            src_ptr, this->device_id_, num_bytes));  // `this` owns src_ptr
+    }
+#else
+    GKO_NOT_SUPPORTED(this);
+#endif
+}
+
+
+void HipExecutor::raw_copy_to(const HipExecutor *src, size_type num_bytes,
+                              const void *src_ptr, void *dest_ptr) const
+{
+    if (num_bytes > 0) {
+        hip::device_guard g(this->get_device_id());
+        GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(
+            dest_ptr, src->get_device_id(),  // `src` is the receiving executor
+            src_ptr, this->device_id_, num_bytes));  // `this` owns src_ptr
+    }
+}
+
+
+void HipExecutor::synchronize() const
+{
+    hip::device_guard g(this->get_device_id());  // act on own device
+    GKO_ASSERT_NO_HIP_ERRORS(hipDeviceSynchronize());
+}
+
+
+void HipExecutor::run(const Operation &op) const
+{
+    this->template log<log::Logger::operation_launched>(this, &op);
+    hip::device_guard g(this->get_device_id());  // op runs on own device
+    op.run(
+        std::static_pointer_cast<const HipExecutor>(this->shared_from_this()));
+    this->template log<log::Logger::operation_completed>(this, &op);
+}
+
+
+int HipExecutor::get_num_devices()
+{
+    int num_devices = 0;
+    const auto error_code = hipGetDeviceCount(&num_devices);
+    if (error_code != hipErrorNoDevice) {
+        GKO_ASSERT_NO_HIP_ERRORS(error_code);
+        return num_devices;
+    }
+    return 0;  // "no device" is reported as a count of zero, not an error
+}
+
+
+void HipExecutor::set_gpu_property()
+{
+    if (device_id_ < this->get_num_devices() && device_id_ >= 0) {  // valid id
+        hip::device_guard g(this->get_device_id());
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &num_multiprocessor_, hipDeviceAttributeMultiprocessorCount,
+            device_id_));
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &major_, hipDeviceAttributeComputeCapabilityMajor, device_id_));
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &minor_, hipDeviceAttributeComputeCapabilityMinor, device_id_));
+#if GINKGO_HIP_PLATFORM_NVCC
+        num_warps_per_sm_ = convert_sm_ver_to_cores(major_, minor_) /
+                            kernels::hip::config::warp_size;
+#else
+        // In GCN (Graphics Core Next), each multiprocessor has 4 SIMD units.
+        // Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next
+        num_warps_per_sm_ = 4;
+#endif  // GINKGO_HIP_PLATFORM_NVCC
+        warp_size_ = kernels::hip::config::warp_size;  // compile-time value
+    }
+}
+
+
+void HipExecutor::init_handles()
+{
+    if (device_id_ < this->get_num_devices() && device_id_ >= 0) {  // valid id
+        const auto id = this->get_device_id();
+        hip::device_guard g(id);  // create both handles on this device
+        this->hipblas_handle_ = handle_manager<hipblasContext>(
+            kernels::hip::hipblas::init(), [id](hipblasContext *handle) {
+                hip::device_guard g(id);  // re-select the device for cleanup
+                kernels::hip::hipblas::destroy_hipblas_handle(handle);
+            });
+        this->hipsparse_handle_ = handle_manager<hipsparseContext>(
+            kernels::hip::hipsparse::init(), [id](hipsparseContext *handle) {
+                hip::device_guard g(id);  // re-select the device for cleanup
+                kernels::hip::hipsparse::destroy_hipsparse_handle(handle);
+            });
+    }
+}
+
+
+}  // namespace gko
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
new file mode 100644
index 00000000000..7bef3278f79
--- /dev/null
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -0,0 +1,275 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
+#define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
+
+
+#include <hipblas.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+/**
+ * @brief The device specific kernels namespace.
+ *
+ * @ingroup kernels
+ */
+namespace kernels {
+/**
+ * @brief The HIP namespace.
+ *
+ * @ingroup hip
+ */
+namespace hip {
+/**
+ * @brief The HIPBLAS namespace.
+ *
+ * @ingroup hipblas
+ */
+namespace hipblas {
+/**
+ * @brief The detail namespace.
+ *
+ * @ingroup detail
+ */
+namespace detail {
+
+
+template <typename... Args>
+inline int64 not_implemented(Args &&...)  // all arguments are ignored
+{
+    return static_cast<int64>(HIPBLAS_STATUS_NOT_SUPPORTED);
+}
+
+
+}  // namespace detail
+
+
+template <typename ValueType>
+struct is_supported : std::false_type {};
+
+template <>
+struct is_supported<float> : std::true_type {};
+
+template <>
+struct is_supported<double> : std::true_type {};
+
+// hipblas supports part of the complex functions from version 0.19 on, but the
+// version is not set now, so the complex specializations stay disabled.
+/* not implemented
+template <>
+struct is_supported<std::complex<float>> : std::true_type {};
+
+template <>
+struct is_supported<std::complex<double>> : std::true_type {};
+*/
+
+
+#define GKO_BIND_HIPBLAS_GEMM(ValueType, HipblasName)                        \
+    inline void gemm(hipblasHandle_t handle, hipblasOperation_t transa,      \
+                     hipblasOperation_t transb, int m, int n, int k,         \
+                     const ValueType *alpha, const ValueType *a, int lda,    \
+                     const ValueType *b, int ldb, const ValueType *beta,     \
+                     ValueType *c, int ldc)                                  \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(HipblasName(                            \
+            handle, transa, transb, m, n, k, as_hiplibs_type(alpha),         \
+            as_hiplibs_type(a), lda, as_hiplibs_type(b), ldb,                \
+            as_hiplibs_type(beta), as_hiplibs_type(c), ldc));                \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPBLAS_GEMM(float, hipblasSgemm);
+GKO_BIND_HIPBLAS_GEMM(double, hipblasDgemm);
+/* not implemented
+GKO_BIND_HIPBLAS_GEMM(std::complex<float>, hipblasCgemm);
+GKO_BIND_HIPBLAS_GEMM(std::complex<double>, hipblasZgemm);
+*/
+template <typename ValueType>  // fallback: yields HIPBLAS_STATUS_NOT_SUPPORTED
+GKO_BIND_HIPBLAS_GEMM(ValueType, detail::not_implemented);
+
+#undef GKO_BIND_HIPBLAS_GEMM
+
+
+#define GKO_BIND_HIPBLAS_GEAM(ValueType, HipblasName)                         \
+    inline void geam(hipblasHandle_t handle, hipblasOperation_t transa,       \
+                     hipblasOperation_t transb, int m, int n,                 \
+                     const ValueType *alpha, const ValueType *a, int lda,     \
+                     const ValueType *beta, const ValueType *b, int ldb,      \
+                     ValueType *c, int ldc)                                   \
+    {                                                                         \
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(                                         \
+            HipblasName(handle, transa, transb, m, n, as_hiplibs_type(alpha), \
+                        as_hiplibs_type(a), lda, as_hiplibs_type(beta),       \
+                        as_hiplibs_type(b), ldb, as_hiplibs_type(c), ldc));   \
+    }                                                                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPBLAS_GEAM(float, hipblasSgeam);
+GKO_BIND_HIPBLAS_GEAM(double, hipblasDgeam);
+// Hipblas does not provide a complex version of geam yet.
+template <typename ValueType>  // fallback: yields HIPBLAS_STATUS_NOT_SUPPORTED
+GKO_BIND_HIPBLAS_GEAM(ValueType, detail::not_implemented);
+
+#undef GKO_BIND_HIPBLAS_GEAM
+
+
+#define GKO_BIND_HIPBLAS_SCAL(ValueType, HipblasName)                        \
+    inline void scal(hipblasHandle_t handle, int n, const ValueType *alpha,  \
+                     ValueType *x, int incx)                                 \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(HipblasName(                            \
+            handle, n, as_hiplibs_type(alpha), as_hiplibs_type(x), incx));   \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPBLAS_SCAL(float, hipblasSscal);
+GKO_BIND_HIPBLAS_SCAL(double, hipblasDscal);
+/* not implemented
+GKO_BIND_HIPBLAS_SCAL(std::complex<float>, hipblasCscal);
+GKO_BIND_HIPBLAS_SCAL(std::complex<double>, hipblasZscal);
+*/
+template <typename ValueType>  // fallback: yields HIPBLAS_STATUS_NOT_SUPPORTED
+GKO_BIND_HIPBLAS_SCAL(ValueType, detail::not_implemented);
+
+#undef GKO_BIND_HIPBLAS_SCAL
+
+
+#define GKO_BIND_HIPBLAS_AXPY(ValueType, HipblasName)                          \
+    inline void axpy(hipblasHandle_t handle, int n, const ValueType *alpha,    \
+                     const ValueType *x, int incx, ValueType *y, int incy)     \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(                                          \
+            HipblasName(handle, n, as_hiplibs_type(alpha), as_hiplibs_type(x), \
+                        incx, as_hiplibs_type(y), incy));                      \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPBLAS_AXPY(float, hipblasSaxpy);
+GKO_BIND_HIPBLAS_AXPY(double, hipblasDaxpy);
+/* not implemented
+GKO_BIND_HIPBLAS_AXPY(std::complex<float>, hipblasCaxpy);
+GKO_BIND_HIPBLAS_AXPY(std::complex<double>, hipblasZaxpy);
+*/
+template <typename ValueType>  // fallback: yields HIPBLAS_STATUS_NOT_SUPPORTED
+GKO_BIND_HIPBLAS_AXPY(ValueType, detail::not_implemented);
+
+#undef GKO_BIND_HIPBLAS_AXPY
+
+
+#define GKO_BIND_HIPBLAS_DOT(ValueType, HipblasName)                           \
+    inline void dot(hipblasHandle_t handle, int n, const ValueType *x,         \
+                    int incx, const ValueType *y, int incy, ValueType *result) \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(                                          \
+            HipblasName(handle, n, as_hiplibs_type(x), incx,                   \
+                        as_hiplibs_type(y), incy, as_hiplibs_type(result)));   \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPBLAS_DOT(float, hipblasSdot);
+GKO_BIND_HIPBLAS_DOT(double, hipblasDdot);
+/* not implemented
+GKO_BIND_HIPBLAS_DOT(std::complex<float>, hipblasCdotc);
+GKO_BIND_HIPBLAS_DOT(std::complex<double>, hipblasZdotc);
+*/
+template <typename ValueType>  // fallback: yields HIPBLAS_STATUS_NOT_SUPPORTED
+GKO_BIND_HIPBLAS_DOT(ValueType, detail::not_implemented);
+
+#undef GKO_BIND_HIPBLAS_DOT
+
+
+#define GKO_BIND_HIPBLAS_NORM2(ValueType, HipblasName)                       \
+    inline void norm2(hipblasHandle_t handle, int n, const ValueType *x,     \
+                      int incx, remove_complex<ValueType> *result)           \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(HipblasName(                            \
+            handle, n, as_hiplibs_type(x), incx, as_hiplibs_type(result)));  \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPBLAS_NORM2(float, hipblasSnrm2);
+GKO_BIND_HIPBLAS_NORM2(double, hipblasDnrm2);
+/* not implemented
+GKO_BIND_HIPBLAS_NORM2(std::complex<float>, hipblasScnrm2);
+GKO_BIND_HIPBLAS_NORM2(std::complex<double>, hipblasDznrm2);
+*/
+template <typename ValueType>  // fallback: yields HIPBLAS_STATUS_NOT_SUPPORTED
+GKO_BIND_HIPBLAS_NORM2(ValueType, detail::not_implemented);
+
+#undef GKO_BIND_HIPBLAS_NORM2
+
+
+inline hipblasContext *init()
+{
+    hipblasHandle_t handle;  // filled in by hipblasCreate
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasCreate(&handle));
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(
+        hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE));
+    return reinterpret_cast<hipblasContext *>(handle);  // caller must destroy
+}
+
+
+inline void destroy_hipblas_handle(hipblasContext *handle)
+{
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(  // cast back to the handle type, then destroy
+        hipblasDestroy(reinterpret_cast<hipblasHandle_t>(handle)));
+}
+
+
+}  // namespace hipblas
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
new file mode 100644
index 00000000000..3b7c8a978a4
--- /dev/null
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -0,0 +1,816 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
+#define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
+
+
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The HIPSPARSE namespace.
+ *
+ * @ingroup hipsparse
+ */
+namespace hipsparse {
+/**
+ * @brief The detail namespace.
+ *
+ * @ingroup detail
+ */
+namespace detail {
+// Stand-in binding: ignores all arguments and returns a fixed status code.
+template <typename... Args>
+inline int64 not_implemented(Args...)
+{
+    constexpr auto status = HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED;
+    return static_cast<int64>(status);
+}
+
+
+}  // namespace detail
+
+// Type trait: true_type only for float/double values with int32 indices.
+template <typename ValueType, typename IndexType>
+struct is_supported : std::false_type {};
+
+template <>
+struct is_supported<float, int32> : std::true_type {};
+
+template <>
+struct is_supported<double, int32> : std::true_type {};
+
+// spmv() on CSR arrays: int32 overloads forward to HipsparseName (csrmv).
+#define GKO_BIND_HIPSPARSE32_SPMV(ValueType, HipsparseName)                  \
+    inline void spmv(hipsparseHandle_t handle, hipsparseOperation_t transA,  \
+                     int32 m, int32 n, int32 nnz, const ValueType *alpha,    \
+                     const hipsparseMatDescr_t descrA,                       \
+                     const ValueType *csrValA, const int32 *csrRowPtrA,      \
+                     const int32 *csrColIndA, const ValueType *x,            \
+                     const ValueType *beta, ValueType *y)                    \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                        \
+            handle, transA, m, n, nnz, as_hiplibs_type(alpha), descrA,       \
+            as_hiplibs_type(csrValA), csrRowPtrA, csrColIndA,                \
+            as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y))); \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE64_SPMV(ValueType, HipsparseName)                    \
+    inline void spmv(hipsparseHandle_t handle, hipsparseOperation_t transA,    \
+                     int64 m, int64 n, int64 nnz, const ValueType *alpha,      \
+                     const hipsparseMatDescr_t descrA,                         \
+                     const ValueType *csrValA, const int64 *csrRowPtrA,        \
+                     const int64 *csrColIndA, const ValueType *x,              \
+                     const ValueType *beta, ValueType *y) GKO_NOT_IMPLEMENTED; \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE32_SPMV(float, hipsparseScsrmv);
+GKO_BIND_HIPSPARSE32_SPMV(double, hipsparseDcsrmv);
+GKO_BIND_HIPSPARSE64_SPMV(float, hipsparseScsrmv);
+GKO_BIND_HIPSPARSE64_SPMV(double, hipsparseDcsrmv);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_SPMV(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_SPMV(ValueType, detail::not_implemented);
+
+
+#undef GKO_BIND_HIPSPARSE32_SPMV
+#undef GKO_BIND_HIPSPARSE64_SPMV
+
+// spmm() on CSR arrays: int32 overloads forward to HipsparseName (csrmm).
+#define GKO_BIND_HIPSPARSE32_SPMM(ValueType, HipsparseName)                    \
+    inline void spmm(hipsparseHandle_t handle, hipsparseOperation_t transA,    \
+                     int32 m, int32 n, int32 k, int32 nnz,                     \
+                     const ValueType *alpha, const hipsparseMatDescr_t descrA, \
+                     const ValueType *csrValA, const int32 *csrRowPtrA,        \
+                     const int32 *csrColIndA, const ValueType *B, int32 ldb,   \
+                     const ValueType *beta, ValueType *C, int32 ldc)           \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                          \
+            handle, transA, m, n, k, nnz, as_hiplibs_type(alpha), descrA,      \
+            as_hiplibs_type(csrValA), csrRowPtrA, csrColIndA,                  \
+            as_hiplibs_type(B), ldb, as_hiplibs_type(beta),                    \
+            as_hiplibs_type(C), ldc));                                         \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE64_SPMM(ValueType, HipsparseName)                    \
+    inline void spmm(hipsparseHandle_t handle, hipsparseOperation_t transA,    \
+                     int64 m, int64 n, int64 k, int64 nnz,                     \
+                     const ValueType *alpha, const hipsparseMatDescr_t descrA, \
+                     const ValueType *csrValA, const int64 *csrRowPtrA,        \
+                     const int64 *csrColIndA, const ValueType *B, int64 ldb,   \
+                     const ValueType *beta, ValueType *C, int64 ldc)           \
+        GKO_NOT_IMPLEMENTED;                                                   \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE32_SPMM(float, hipsparseScsrmm);
+GKO_BIND_HIPSPARSE32_SPMM(double, hipsparseDcsrmm);
+GKO_BIND_HIPSPARSE64_SPMM(float, hipsparseScsrmm);
+GKO_BIND_HIPSPARSE64_SPMM(double, hipsparseDcsrmm);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_SPMM(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_SPMM(ValueType, detail::not_implemented);
+
+
+#undef GKO_BIND_HIPSPARSE32_SPMM
+#undef GKO_BIND_HIPSPARSE64_SPMM
+
+// spmv() taking a hipsparseHybMat_t; reuses the macro name #undef'd earlier.
+#define GKO_BIND_HIPSPARSE32_SPMV(ValueType, HipsparseName)                    \
+    inline void spmv(hipsparseHandle_t handle, hipsparseOperation_t transA,    \
+                     const ValueType *alpha, const hipsparseMatDescr_t descrA, \
+                     const hipsparseHybMat_t hybA, const ValueType *x,         \
+                     const ValueType *beta, ValueType *y)                      \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                          \
+            handle, transA, as_hiplibs_type(alpha), descrA, hybA,              \
+            as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y)));   \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+// float/double forward to hybmv; other value types call not_implemented.
+GKO_BIND_HIPSPARSE32_SPMV(float, hipsparseShybmv);
+GKO_BIND_HIPSPARSE32_SPMV(double, hipsparseDhybmv);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_SPMV(ValueType, detail::not_implemented);
+
+
+#undef GKO_BIND_HIPSPARSE32_SPMV
+
+// spgemm_buffer_size(): the generic template is GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void spgemm_buffer_size(
+    hipsparseHandle_t handle, IndexType m, IndexType n, IndexType k,
+    const ValueType *alpha, const hipsparseMatDescr_t descrA, IndexType nnzA,
+    const IndexType *csrRowPtrA, const IndexType *csrColIndA,
+    const hipsparseMatDescr_t descrB, IndexType nnzB,
+    const IndexType *csrRowPtrB, const IndexType *csrColIndB,
+    const ValueType *beta, const hipsparseMatDescr_t descrD, IndexType nnzD,
+    const IndexType *csrRowPtrD, const IndexType *csrColIndD,
+    csrgemm2Info_t info, size_type &result) GKO_NOT_IMPLEMENTED;
+// int32 specializations forward to HipsparseName and write into `result`.
+#define GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(ValueType, HipsparseName)        \
+    template <>                                                                \
+    inline void spgemm_buffer_size<ValueType, int32>(                          \
+        hipsparseHandle_t handle, int32 m, int32 n, int32 k,                   \
+        const ValueType *alpha, const hipsparseMatDescr_t descrA, int32 nnzA,  \
+        const int32 *csrRowPtrA, const int32 *csrColIndA,                      \
+        const hipsparseMatDescr_t descrB, int32 nnzB, const int32 *csrRowPtrB, \
+        const int32 *csrColIndB, const ValueType *beta,                        \
+        const hipsparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD, \
+        const int32 *csrColIndD, csrgemm2Info_t info, size_type &result)       \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                          \
+            handle, m, n, k, as_hiplibs_type(alpha), descrA, nnzA, csrRowPtrA, \
+            csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB,                  \
+            as_hiplibs_type(beta), descrD, nnzD, csrRowPtrD, csrColIndD, info, \
+            &result));                                                         \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+// Complex specializations are only generated for hipSPARSE >= 1.4.
+GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(float, hipsparseScsrgemm2_bufferSizeExt);
+GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(double, hipsparseDcsrgemm2_bufferSizeExt);
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(std::complex<float>,
+                                      hipsparseCcsrgemm2_bufferSizeExt);
+GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(std::complex<double>,
+                                      hipsparseZcsrgemm2_bufferSizeExt);
+#endif  // hipsparse version >= 1.4
+
+
+#undef GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE
+
+// spgemm_nnz(): the generic template is GKO_NOT_IMPLEMENTED.
+template <typename IndexType>
+void spgemm_nnz(hipsparseHandle_t handle, IndexType m, IndexType n, IndexType k,
+                const hipsparseMatDescr_t descrA, IndexType nnzA,
+                const IndexType *csrRowPtrA, const IndexType *csrColIndA,
+                const hipsparseMatDescr_t descrB, IndexType nnzB,
+                const IndexType *csrRowPtrB, const IndexType *csrColIndB,
+                const hipsparseMatDescr_t descrD, IndexType nnzD,
+                const IndexType *csrRowPtrD, const IndexType *csrColIndD,
+                const hipsparseMatDescr_t descrC, IndexType *csrRowPtrC,
+                IndexType *nnzC, csrgemm2Info_t info,
+                void *buffer) GKO_NOT_IMPLEMENTED;
+// The int32 specialization forwards unchanged to hipsparseXcsrgemm2Nnz.
+template <>
+inline void spgemm_nnz<int32>(
+    hipsparseHandle_t handle, int32 m, int32 n, int32 k,
+    const hipsparseMatDescr_t descrA, int32 nnzA, const int32 *csrRowPtrA,
+    const int32 *csrColIndA, const hipsparseMatDescr_t descrB, int32 nnzB,
+    const int32 *csrRowPtrB, const int32 *csrColIndB,
+    const hipsparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD,
+    const int32 *csrColIndD, const hipsparseMatDescr_t descrC,
+    int32 *csrRowPtrC, int32 *nnzC, csrgemm2Info_t info, void *buffer)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseXcsrgemm2Nnz(
+        handle, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB,
+        csrRowPtrB, csrColIndB, descrD, nnzD, csrRowPtrD, csrColIndD, descrC,
+        csrRowPtrC, nnzC, info, buffer));
+}
+
+// spgemm(): the generic template is GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void spgemm(hipsparseHandle_t handle, IndexType m, IndexType n, IndexType k,
+            const ValueType *alpha, const hipsparseMatDescr_t descrA,
+            IndexType nnzA, const ValueType *csrValA,
+            const IndexType *csrRowPtrA, const IndexType *csrColIndA,
+            const hipsparseMatDescr_t descrB, IndexType nnzB,
+            const ValueType *csrValB, const IndexType *csrRowPtrB,
+            const IndexType *csrColIndB, const ValueType *beta,
+            const hipsparseMatDescr_t descrD, IndexType nnzD,
+            const ValueType *csrValD, const IndexType *csrRowPtrD,
+            const IndexType *csrColIndD, const hipsparseMatDescr_t descrC,
+            ValueType *csrValC, const IndexType *csrRowPtrC,
+            IndexType *csrColIndC, csrgemm2Info_t info,
+            void *buffer) GKO_NOT_IMPLEMENTED;
+// int32 specializations forward to HipsparseName (the csrgemm2 routines).
+#define GKO_BIND_HIPSPARSE_SPGEMM(ValueType, HipsparseName)                    \
+    template <>                                                                \
+    inline void spgemm<ValueType, int32>(                                      \
+        hipsparseHandle_t handle, int32 m, int32 n, int32 k,                   \
+        const ValueType *alpha, const hipsparseMatDescr_t descrA, int32 nnzA,  \
+        const ValueType *csrValA, const int32 *csrRowPtrA,                     \
+        const int32 *csrColIndA, const hipsparseMatDescr_t descrB, int32 nnzB, \
+        const ValueType *csrValB, const int32 *csrRowPtrB,                     \
+        const int32 *csrColIndB, const ValueType *beta,                        \
+        const hipsparseMatDescr_t descrD, int32 nnzD,                          \
+        const ValueType *csrValD, const int32 *csrRowPtrD,                     \
+        const int32 *csrColIndD, const hipsparseMatDescr_t descrC,             \
+        ValueType *csrValC, const int32 *csrRowPtrC, int32 *csrColIndC,        \
+        csrgemm2Info_t info, void *buffer)                                     \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                          \
+            handle, m, n, k, as_hiplibs_type(alpha), descrA, nnzA,             \
+            as_hiplibs_type(csrValA), csrRowPtrA, csrColIndA, descrB, nnzB,    \
+            as_hiplibs_type(csrValB), csrRowPtrB, csrColIndB,                  \
+            as_hiplibs_type(beta), descrD, nnzD, as_hiplibs_type(csrValD),     \
+            csrRowPtrD, csrColIndD, descrC, as_hiplibs_type(csrValC),          \
+            csrRowPtrC, csrColIndC, info, buffer));                            \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+// Complex specializations are only generated for hipSPARSE >= 1.4.
+GKO_BIND_HIPSPARSE_SPGEMM(float, hipsparseScsrgemm2);
+GKO_BIND_HIPSPARSE_SPGEMM(double, hipsparseDcsrgemm2);
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+GKO_BIND_HIPSPARSE_SPGEMM(std::complex<float>, hipsparseCcsrgemm2);
+GKO_BIND_HIPSPARSE_SPGEMM(std::complex<double>, hipsparseZcsrgemm2);
+#endif  // hipsparse version >= 1.4
+
+
+#undef GKO_BIND_HIPSPARSE_SPGEMM
+
+// csr2hyb(): int32 overloads forward to HipsparseName (csr2hyb routines).
+#define GKO_BIND_HIPSPARSE32_CSR2HYB(ValueType, HipsparseName)               \
+    inline void csr2hyb(hipsparseHandle_t handle, int32 m, int32 n,          \
+                        const hipsparseMatDescr_t descrA,                    \
+                        const ValueType *csrValA, const int32 *csrRowPtrA,   \
+                        const int32 *csrColIndA, hipsparseHybMat_t hybA,     \
+                        int32 userEllWidth,                                  \
+                        hipsparseHybPartition_t partitionType)               \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                        \
+            handle, m, n, descrA, as_hiplibs_type(csrValA), csrRowPtrA,      \
+            csrColIndA, hybA, userEllWidth, partitionType));                 \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE64_CSR2HYB(ValueType, HipsparseName)               \
+    inline void csr2hyb(                                                     \
+        hipsparseHandle_t handle, int64 m, int64 n,                          \
+        const hipsparseMatDescr_t descrA, const ValueType *csrValA,          \
+        const int64 *csrRowPtrA, const int64 *csrColIndA,                    \
+        hipsparseHybMat_t hybA, int64 userEllWidth,                          \
+        hipsparseHybPartition_t partitionType) GKO_NOT_IMPLEMENTED;          \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE32_CSR2HYB(float, hipsparseScsr2hyb);
+GKO_BIND_HIPSPARSE32_CSR2HYB(double, hipsparseDcsr2hyb);
+GKO_BIND_HIPSPARSE64_CSR2HYB(float, hipsparseScsr2hyb);
+GKO_BIND_HIPSPARSE64_CSR2HYB(double, hipsparseDcsr2hyb);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_CSR2HYB(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_CSR2HYB(ValueType, detail::not_implemented);
+
+
+#undef GKO_BIND_HIPSPARSE32_CSR2HYB
+#undef GKO_BIND_HIPSPARSE64_CSR2HYB
+
+// transpose(): int32 overloads forward to HipsparseName (csr2csc routines).
+#define GKO_BIND_HIPSPARSE_TRANSPOSE32(ValueType, HipsparseName)              \
+    inline void transpose(hipsparseHandle_t handle, size_type m, size_type n, \
+                          size_type nnz, const ValueType *OrigValA,           \
+                          const int32 *OrigRowPtrA, const int32 *OrigColIndA, \
+                          ValueType *TransValA, int32 *TransRowPtrA,          \
+                          int32 *TransColIndA, hipsparseAction_t copyValues,  \
+                          hipsparseIndexBase_t idxBase)                       \
+    {                                                                         \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                         \
+            handle, m, n, nnz, as_hiplibs_type(OrigValA), OrigRowPtrA,        \
+            OrigColIndA, as_hiplibs_type(TransValA), TransRowPtrA,            \
+            TransColIndA, copyValues, idxBase));                              \
+    }                                                                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE_TRANSPOSE64(ValueType, HipsparseName)              \
+    inline void transpose(hipsparseHandle_t handle, size_type m, size_type n, \
+                          size_type nnz, const ValueType *OrigValA,           \
+                          const int64 *OrigRowPtrA, const int64 *OrigColIndA, \
+                          ValueType *TransValA, int64 *TransRowPtrA,          \
+                          int64 *TransColIndA, hipsparseAction_t copyValues,  \
+                          hipsparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED;  \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE_TRANSPOSE32(float, hipsparseScsr2csc);
+GKO_BIND_HIPSPARSE_TRANSPOSE32(double, hipsparseDcsr2csc);
+GKO_BIND_HIPSPARSE_TRANSPOSE64(float, hipsparseScsr2csc);
+GKO_BIND_HIPSPARSE_TRANSPOSE64(double, hipsparseDcsr2csc);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE_TRANSPOSE32(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE_TRANSPOSE64(ValueType, detail::not_implemented);
+#undef GKO_BIND_HIPSPARSE_TRANSPOSE32
+#undef GKO_BIND_HIPSPARSE_TRANSPOSE64
+// conj_transpose(): every overload's body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(ValueType, HipsparseName)        \
+    inline void conj_transpose(                                              \
+        hipsparseHandle_t handle, size_type m, size_type n, size_type nnz,   \
+        const ValueType *OrigValA, const int32 *OrigRowPtrA,                 \
+        const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \
+        int32 *TransColIndA, hipsparseAction_t copyValues,                   \
+        hipsparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED;                   \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+// HipsparseName is accepted but unused by both the 32 and the 64 macro.
+#define GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(ValueType, HipsparseName)        \
+    inline void conj_transpose(                                              \
+        hipsparseHandle_t handle, size_type m, size_type n, size_type nnz,   \
+        const ValueType *OrigValA, const int64 *OrigRowPtrA,                 \
+        const int64 *OrigColIndA, ValueType *TransValA, int64 *TransRowPtrA, \
+        int64 *TransColIndA, hipsparseAction_t copyValues,                   \
+        hipsparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED;                   \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+// float/double overloads, then generic templates for other value types.
+GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(float, hipsparseScsr2csc);
+GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(double, hipsparseDcsr2csc);
+GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(float, hipsparseScsr2csc);
+GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(double, hipsparseDcsr2csc);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(ValueType, detail::not_implemented);
+#undef GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32
+#undef GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64
+
+// csrsv2_buffer_size(): int32 overloads forward to HipsparseName.
+#define GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(ValueType, HipsparseName)     \
+    inline void csrsv2_buffer_size(                                          \
+        hipsparseHandle_t handle, hipsparseOperation_t trans,                \
+        const size_type m, size_type nnz, const hipsparseMatDescr_t descr,   \
+        const ValueType *csrVal, const int32 *csrRowPtr,                     \
+        const int32 *csrColInd, csrsv2Info_t factor_info,                    \
+        int *factor_work_size)                                               \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                        \
+            handle, trans, m, nnz, descr,                                    \
+            as_hiplibs_type(const_cast<ValueType *>(csrVal)), csrRowPtr,     \
+            csrColInd, factor_info, factor_work_size));                      \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(ValueType, HipsparseName)   \
+    inline void csrsv2_buffer_size(                                        \
+        hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \
+        size_type nnz, const hipsparseMatDescr_t descr,                    \
+        const ValueType *csrVal, const int64 *csrRowPtr,                   \
+        const int64 *csrColInd, csrsv2Info_t factor_info,                  \
+        int *factor_work_size) GKO_NOT_IMPLEMENTED;                        \
+    static_assert(true,                                                    \
+                  "This assert is used to counter the "                    \
+                  "false positive extra "                                  \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(float, hipsparseScsrsv2_bufferSize);
+GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(double, hipsparseDcsrsv2_bufferSize);
+GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(float, hipsparseScsrsv2_bufferSize);
+GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(double, hipsparseDcsrsv2_bufferSize);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(ValueType, detail::not_implemented);
+#undef GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE
+#undef GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE
+// csrsv2_analysis(): int32 overloads forward to HipsparseName.
+#define GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(ValueType, HipsparseName)        \
+    inline void csrsv2_analysis(                                              \
+        hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m,    \
+        size_type nnz, const hipsparseMatDescr_t descr,                       \
+        const ValueType *csrVal, const int32 *csrRowPtr,                      \
+        const int32 *csrColInd, csrsv2Info_t factor_info,                     \
+        hipsparseSolvePolicy_t policy, void *factor_work_vec)                 \
+    {                                                                         \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                         \
+            handle, trans, m, nnz, descr, as_hiplibs_type(csrVal), csrRowPtr, \
+            csrColInd, factor_info, policy, factor_work_vec));                \
+    }                                                                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(ValueType, HipsparseName)     \
+    inline void csrsv2_analysis(                                           \
+        hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \
+        size_type nnz, const hipsparseMatDescr_t descr,                    \
+        const ValueType *csrVal, const int64 *csrRowPtr,                   \
+        const int64 *csrColInd, csrsv2Info_t factor_info,                  \
+        hipsparseSolvePolicy_t policy, void *factor_work_vec)              \
+        GKO_NOT_IMPLEMENTED;                                               \
+    static_assert(true,                                                    \
+                  "This assert is used to counter the "                    \
+                  "false positive extra "                                  \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(float, hipsparseScsrsv2_analysis);
+GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(double, hipsparseDcsrsv2_analysis);
+GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(float, hipsparseScsrsv2_analysis);
+GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(double, hipsparseDcsrsv2_analysis);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(ValueType, detail::not_implemented);
+#undef GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS
+#undef GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS
+// csrsv2_solve(): int32 overloads forward to HipsparseName.
+#define GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(ValueType, HipsparseName)           \
+    inline void csrsv2_solve(                                                 \
+        hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m,    \
+        size_type nnz, const ValueType *one, const hipsparseMatDescr_t descr, \
+        const ValueType *csrVal, const int32 *csrRowPtr,                      \
+        const int32 *csrColInd, csrsv2Info_t factor_info,                     \
+        const ValueType *rhs, ValueType *sol, hipsparseSolvePolicy_t policy,  \
+        void *factor_work_vec)                                                \
+    {                                                                         \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(                                       \
+            HipsparseName(handle, trans, m, nnz, as_hiplibs_type(one), descr, \
+                          as_hiplibs_type(csrVal), csrRowPtr, csrColInd,      \
+                          factor_info, as_hiplibs_type(rhs),                  \
+                          as_hiplibs_type(sol), policy, factor_work_vec));    \
+    }                                                                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+// int64 overloads ignore HipsparseName; body comes from GKO_NOT_IMPLEMENTED.
+#define GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(ValueType, HipsparseName)           \
+    inline void csrsv2_solve(                                                 \
+        hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m,    \
+        size_type nnz, const ValueType *one, const hipsparseMatDescr_t descr, \
+        const ValueType *csrVal, const int64 *csrRowPtr,                      \
+        const int64 *csrColInd, csrsv2Info_t factor_info,                     \
+        const ValueType *rhs, ValueType *sol, hipsparseSolvePolicy_t policy,  \
+        void *factor_work_vec) GKO_NOT_IMPLEMENTED;                           \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+// float/double bindings, then generic fallbacks for every other ValueType.
+GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(float, hipsparseScsrsv2_solve);
+GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(double, hipsparseDcsrsv2_solve);
+GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(float, hipsparseScsrsv2_solve);
+GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(double, hipsparseDcsrsv2_solve);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(ValueType, detail::not_implemented);
+#undef GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE
+#undef GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE
+
+// Creates a hipSPARSE handle and switches it to device pointer mode.
+inline hipsparseContext *init()
+{
+    hipsparseHandle_t new_handle{};
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreate(&new_handle));
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetPointerMode(
+        new_handle, HIPSPARSE_POINTER_MODE_DEVICE));
+    return reinterpret_cast<hipsparseContext *>(new_handle);
+}
+
+// Destroys the hipSPARSE handle behind the opaque `handle` pointer.
+inline void destroy_hipsparse_handle(hipsparseContext *handle)
+{
+    const auto raw_handle = reinterpret_cast<hipsparseHandle_t>(handle);
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroy(raw_handle));
+}
+
+// Returns a matrix descriptor created by hipsparseCreateMatDescr.
+inline hipsparseMatDescr_t create_mat_descr()
+{
+    hipsparseMatDescr_t new_descr{};
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateMatDescr(&new_descr));
+    return new_descr;
+}
+
+// Releases the matrix descriptor `descr` via hipsparseDestroyMatDescr.
+inline void destroy(hipsparseMatDescr_t descr)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyMatDescr(descr));
+}
+
+// Returns a csrgemm2 info object created by hipsparseCreateCsrgemm2Info.
+inline csrgemm2Info_t create_spgemm_info()
+{
+    csrgemm2Info_t new_info{};
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrgemm2Info(&new_info));
+    return new_info;
+}
+
+// Releases the csrgemm2 info object via hipsparseDestroyCsrgemm2Info.
+inline void destroy_spgemm_info(csrgemm2Info_t info)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyCsrgemm2Info(info));
+}
+
+// Returns a csrilu02 info object created by hipsparseCreateCsrilu02Info.
+inline csrilu02Info_t create_ilu0_info()
+{
+    csrilu02Info_t new_info{};
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrilu02Info(&new_info));
+    return new_info;
+}
+
+// Releases the csrilu02 info object via hipsparseDestroyCsrilu02Info.
+inline void destroy_ilu0_info(csrilu02Info_t info)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyCsrilu02Info(info));
+}
+
+// Binding for hipsparseCreateIdentityPermutation; only int32 is specialized.
+template <typename IndexType>
+void create_identity_permutation(hipsparseHandle_t handle, IndexType size,
+                                 IndexType *permutation) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void create_identity_permutation<int32>(hipsparseHandle_t handle,
+                                               int32 size, int32 *permutation)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(
+        hipsparseCreateIdentityPermutation(handle, size, permutation));
+}
+
+// Binding for hipsparseXcsrsort_bufferSizeExt; only int32 is specialized.
+template <typename IndexType>
+void csrsort_buffer_size(hipsparseHandle_t handle, IndexType m, IndexType n,
+                         IndexType nnz, const IndexType *row_ptrs,
+                         const IndexType *col_idxs,
+                         size_type &buffer_size) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void csrsort_buffer_size<int32>(hipsparseHandle_t handle, int32 m,
+                                       int32 n, int32 nnz,
+                                       const int32 *row_ptrs,
+                                       const int32 *col_idxs,
+                                       size_type &buffer_size)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseXcsrsort_bufferSizeExt(
+        handle, m, n, nnz, row_ptrs, col_idxs, &buffer_size));
+}
+
+// Binding for hipsparseXcsrsort; only int32 is specialized.
+template <typename IndexType>
+void csrsort(hipsparseHandle_t handle, IndexType m, IndexType n, IndexType nnz,
+             const hipsparseMatDescr_t descr, const IndexType *row_ptrs,
+             IndexType *col_idxs, IndexType *permutation,
+             void *buffer) GKO_NOT_IMPLEMENTED;
+
+template <>
+inline void csrsort<int32>(hipsparseHandle_t handle, int32 m, int32 n,
+                           int32 nnz, const hipsparseMatDescr_t descr,
+                           const int32 *row_ptrs, int32 *col_idxs,
+                           int32 *permutation, void *buffer)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseXcsrsort(
+        handle, m, n, nnz, descr, row_ptrs, col_idxs, permutation, buffer));
+}
+
+// Binding for hipsparse{S,D,C,Z}gthr with zero-based, int32 indices.
+template <typename ValueType, typename IndexType>
+void gather(hipsparseHandle_t handle, IndexType nnz, const ValueType *in,
+            ValueType *out, const IndexType *permutation) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_HIPSPARSE_GATHER(ValueType, HipsparseName)                   \
+    template <>                                                               \
+    inline void gather<ValueType, int32>(hipsparseHandle_t handle, int32 nnz, \
+                                         const ValueType *in, ValueType *out, \
+                                         const int32 *permutation)            \
+    {                                                                         \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                         \
+            handle, nnz, as_hiplibs_type(in), as_hiplibs_type(out),           \
+            permutation, HIPSPARSE_INDEX_BASE_ZERO));                         \
+    }                                                                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPSPARSE_GATHER(float, hipsparseSgthr);
+GKO_BIND_HIPSPARSE_GATHER(double, hipsparseDgthr);
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+GKO_BIND_HIPSPARSE_GATHER(std::complex<float>, hipsparseCgthr);
+GKO_BIND_HIPSPARSE_GATHER(std::complex<double>, hipsparseZgthr);
+#endif  // hipsparse version >= 1.4
+
+#undef GKO_BIND_HIPSPARSE_GATHER
+
+// Binding for hipsparse{S,D,C,Z}csrilu02_bufferSize; int32 indices only.
+template <typename ValueType, typename IndexType>
+void ilu0_buffer_size(hipsparseHandle_t handle, IndexType m, IndexType nnz,
+                      const hipsparseMatDescr_t descr, const ValueType *vals,
+                      const IndexType *row_ptrs, const IndexType *col_idxs,
+                      csrilu02Info_t info,
+                      size_type &buffer_size) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(ValueType, HipsparseName)        \
+    template <>                                                              \
+    inline void ilu0_buffer_size<ValueType, int32>(                          \
+        hipsparseHandle_t handle, int32 m, int32 nnz,                        \
+        const hipsparseMatDescr_t descr, const ValueType *vals,              \
+        const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info,   \
+        size_type &buffer_size)                                              \
+    {                                                                        \
+        int tmp_buffer_size{}; /* hipSPARSE writes the size as int */        \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(                                      \
+            HipsparseName(handle, m, nnz, descr,                             \
+                          as_hiplibs_type(const_cast<ValueType *>(vals)),    \
+                          row_ptrs, col_idxs, info, &tmp_buffer_size));      \
+        buffer_size = tmp_buffer_size;                                       \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(float, hipsparseScsrilu02_bufferSize);
+GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(double, hipsparseDcsrilu02_bufferSize);
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(std::complex<float>,
+                                    hipsparseCcsrilu02_bufferSize);
+GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(std::complex<double>,
+                                    hipsparseZcsrilu02_bufferSize);
+#endif  // hipsparse version >= 1.4
+
+#undef GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE
+
+// Binding for hipsparse{S,D,C,Z}csrilu02_analysis; int32 indices only.
+template <typename ValueType, typename IndexType>
+void ilu0_analysis(hipsparseHandle_t handle, IndexType m, IndexType nnz,
+                   const hipsparseMatDescr_t descr, const ValueType *vals,
+                   const IndexType *row_ptrs, const IndexType *col_idxs,
+                   csrilu02Info_t info, hipsparseSolvePolicy_t policy,
+                   void *buffer) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(ValueType, HipsparseName)           \
+    template <>                                                              \
+    inline void ilu0_analysis<ValueType, int32>(                             \
+        hipsparseHandle_t handle, int32 m, int32 nnz,                        \
+        const hipsparseMatDescr_t descr, const ValueType *vals,              \
+        const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info,   \
+        hipsparseSolvePolicy_t policy, void *buffer)                         \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(                                      \
+            HipsparseName(handle, m, nnz, descr, as_hiplibs_type(vals),      \
+                          row_ptrs, col_idxs, info, policy, buffer));        \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(float, hipsparseScsrilu02_analysis);
+GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(double, hipsparseDcsrilu02_analysis);
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(std::complex<float>,
+                                 hipsparseCcsrilu02_analysis);
+GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(std::complex<double>,
+                                 hipsparseZcsrilu02_analysis);
+#endif  // hipsparse version >= 1.4
+
+#undef GKO_BIND_HIPSPARSE_ILU0_ANALYSIS
+
+// Binding for hipsparse{S,D,C,Z}csrilu02; int32 indices only.
+template <typename ValueType, typename IndexType>
+void ilu0(hipsparseHandle_t handle, IndexType m, IndexType nnz,
+          const hipsparseMatDescr_t descr, ValueType *vals,
+          const IndexType *row_ptrs, const IndexType *col_idxs,
+          csrilu02Info_t info, hipsparseSolvePolicy_t policy,
+          void *buffer) GKO_NOT_IMPLEMENTED;
+
+#define GKO_BIND_HIPSPARSE_ILU0(ValueType, HipsparseName)                    \
+    template <>                                                              \
+    inline void ilu0<ValueType, int32>(                                      \
+        hipsparseHandle_t handle, int32 m, int32 nnz,                        \
+        const hipsparseMatDescr_t descr, ValueType *vals,                    \
+        const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info,   \
+        hipsparseSolvePolicy_t policy, void *buffer)                         \
+    {                                                                        \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(                                      \
+            HipsparseName(handle, m, nnz, descr, as_hiplibs_type(vals),      \
+                          row_ptrs, col_idxs, info, policy, buffer));        \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPSPARSE_ILU0(float, hipsparseScsrilu02);
+GKO_BIND_HIPSPARSE_ILU0(double, hipsparseDcsrilu02);
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+GKO_BIND_HIPSPARSE_ILU0(std::complex<float>, hipsparseCcsrilu02);
+GKO_BIND_HIPSPARSE_ILU0(std::complex<double>, hipsparseZcsrilu02);
+#endif  // hipsparse version >= 1.4
+
+#undef GKO_BIND_HIPSPARSE_ILU0
+
+
+}  // namespace hipsparse
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp
new file mode 100644
index 00000000000..a80cc24f989
--- /dev/null
+++ b/hip/base/math.hip.hpp
@@ -0,0 +1,52 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_MATH_HIP_HPP_
+#define GKO_HIP_BASE_MATH_HIP_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include <thrust/complex.h>
+
+
+namespace gko {
+
+// The .inc file is included textually, so its contents land in namespace gko.
+#include "common/base/math.hpp.inc"
+
+
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_MATH_HIP_HPP_
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
new file mode 100644
index 00000000000..f5601c5003a
--- /dev/null
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -0,0 +1,156 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_POINTER_MODE_GUARD_HIP_HPP_
+#define GKO_HIP_BASE_POINTER_MODE_GUARD_HIP_HPP_
+
+
+#include <exception>
+
+
+#include <hip/hip_runtime.h>
+#include <hipblas.h>
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace hipblas {
+
+
+/**
+ * This class defines a pointer mode guard for the hip functions and the hip
+ * module. The guard is used to make sure that the correct pointer mode has been
+ * set when using scalars for the hipblas functions. The class records the
+ * current handle and sets the pointer mode to host for the current scope. After
+ * the scope has been exited, the destructor sets the pointer mode back to
+ * device.
+ */
+class pointer_mode_guard {
+public:
+    /** Stores `handle` and switches it to host pointer mode. */
+    pointer_mode_guard(hipblasContext *handle) : l_handle{handle}
+    {
+        GKO_ASSERT_NO_HIPBLAS_ERRORS(
+            hipblasSetPointerMode(reinterpret_cast<hipblasHandle_t>(handle),
+                                  HIPBLAS_POINTER_MODE_HOST));
+    }
+
+    // The guard can neither be copied nor moved.
+    pointer_mode_guard(const pointer_mode_guard &other) = delete;
+    pointer_mode_guard &operator=(const pointer_mode_guard &other) = delete;
+    pointer_mode_guard(pointer_mode_guard &&other) = delete;
+    pointer_mode_guard &operator=(pointer_mode_guard &&other) = delete;
+
+    // Switches the handle back to device pointer mode. The result of that
+    // call is only checked when no other exception is currently propagating.
+    ~pointer_mode_guard() noexcept(false)
+    {
+        /* Ignore the error during stack unwinding for this call */
+        if (std::uncaught_exception()) {
+            hipblasSetPointerMode(reinterpret_cast<hipblasHandle_t>(l_handle),
+                                  HIPBLAS_POINTER_MODE_DEVICE);
+        } else {
+            GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasSetPointerMode(
+                reinterpret_cast<hipblasHandle_t>(l_handle),
+                HIPBLAS_POINTER_MODE_DEVICE));
+        }
+    }
+
+private:
+    hipblasContext *l_handle;
+};
+
+
+}  // namespace hipblas
+
+
+namespace hipsparse {
+
+
+/**
+ * This class defines a pointer mode guard for the hip functions and the hip
+ * module. The guard is used to make sure that the correct pointer mode has been
+ * set when using scalars for the hipsparse functions. The class records the
+ * current handle and sets the pointer mode to host for the current scope. After
+ * the scope has been exited, the destructor sets the pointer mode back to
+ * device.
+ */
+class pointer_mode_guard {
+public:
+    /** Stores `handle` and switches it to host pointer mode. */
+    pointer_mode_guard(hipsparseContext *handle) : l_handle{handle}
+    {
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(
+            hipsparseSetPointerMode(reinterpret_cast<hipsparseHandle_t>(handle),
+                                    HIPSPARSE_POINTER_MODE_HOST));
+    }
+
+    // The guard can neither be copied nor moved.
+    pointer_mode_guard(const pointer_mode_guard &other) = delete;
+    pointer_mode_guard &operator=(const pointer_mode_guard &other) = delete;
+    pointer_mode_guard(pointer_mode_guard &&other) = delete;
+    pointer_mode_guard &operator=(pointer_mode_guard &&other) = delete;
+
+    // Switches the handle back to device pointer mode. The result of that
+    // call is only checked when no other exception is currently propagating.
+    ~pointer_mode_guard() noexcept(false)
+    {
+        /* Ignore the error during stack unwinding for this call */
+        if (std::uncaught_exception()) {
+            hipsparseSetPointerMode(
+                reinterpret_cast<hipsparseHandle_t>(l_handle),
+                HIPSPARSE_POINTER_MODE_DEVICE);
+        } else {
+            GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetPointerMode(
+                reinterpret_cast<hipsparseHandle_t>(l_handle),
+                HIPSPARSE_POINTER_MODE_DEVICE));
+        }
+    }
+
+private:
+    hipsparseContext *l_handle;
+};
+
+
+}  // namespace hipsparse
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_POINTER_MODE_GUARD_HIP_HPP_
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
new file mode 100644
index 00000000000..11ed7c9d847
--- /dev/null
+++ b/hip/base/types.hip.hpp
@@ -0,0 +1,260 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_TYPES_HIP_HPP_
+#define GKO_HIP_BASE_TYPES_HIP_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+#include <hip/hip_complex.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipblas.h>
+#include <thrust/complex.h>
+
+
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+namespace gko {
+
+
+namespace kernels {
+namespace hip {
+namespace detail {
+
+// Trait mapping T to the type used by the HIP libraries (see specializations).
+template <typename T>
+struct hiplibs_type_impl {
+    using type = T;
+};
+
+template <typename T>
+struct hiplibs_type_impl<T *> {
+    using type = typename hiplibs_type_impl<T>::type *;
+};
+
+template <typename T>
+struct hiplibs_type_impl<T &> {
+    using type = typename hiplibs_type_impl<T>::type &;
+};
+
+template <typename T>
+struct hiplibs_type_impl<const T> {
+    using type = const typename hiplibs_type_impl<T>::type;
+};
+
+template <typename T>
+struct hiplibs_type_impl<volatile T> {
+    using type = volatile typename hiplibs_type_impl<T>::type;
+};
+
+template <>
+struct hiplibs_type_impl<std::complex<float>> {
+    using type = hipComplex;
+};
+
+template <>
+struct hiplibs_type_impl<std::complex<double>> {
+    using type = hipDoubleComplex;
+};
+
+template <typename T>
+struct hiplibs_type_impl<thrust::complex<T>> {
+    using type = typename hiplibs_type_impl<std::complex<T>>::type;
+};
+
+template <typename T>
+struct hip_type_impl {  // default: T maps to itself
+    using type = T;
+};
+
+template <typename T>
+struct hip_type_impl<T *> {
+    using type = typename hip_type_impl<T>::type *;
+};
+
+template <typename T>
+struct hip_type_impl<T &> {
+    using type = typename hip_type_impl<T>::type &;
+};
+
+template <typename T>
+struct hip_type_impl<const T> {
+    using type = const typename hip_type_impl<T>::type;
+};
+
+template <typename T>
+struct hip_type_impl<volatile T> {
+    using type = volatile typename hip_type_impl<T>::type;
+};
+
+template <typename T>
+struct hip_type_impl<std::complex<T>> {  // std::complex -> thrust::complex
+    using type = thrust::complex<T>;
+};
+
+template <>
+struct hip_type_impl<hipDoubleComplex> {
+    using type = thrust::complex<double>;
+};
+
+template <>
+struct hip_type_impl<hipComplex> {
+    using type = thrust::complex<float>;
+};
+
+template <typename T>
+constexpr hipblasDatatype_t hip_data_type_impl()
+{
+    return HIPBLAS_C_16F;  // fallback for types without a specialization
+}
+
+template <>
+constexpr hipblasDatatype_t hip_data_type_impl<float16>()
+{
+    return HIPBLAS_R_16F;
+}
+
+template <>
+constexpr hipblasDatatype_t hip_data_type_impl<float>()
+{
+    return HIPBLAS_R_32F;
+}
+
+template <>
+constexpr hipblasDatatype_t hip_data_type_impl<double>()
+{
+    return HIPBLAS_R_64F;
+}
+
+template <>
+constexpr hipblasDatatype_t hip_data_type_impl<std::complex<float>>()
+{
+    return HIPBLAS_C_32F;
+}
+
+template <>
+constexpr hipblasDatatype_t hip_data_type_impl<std::complex<double>>()
+{
+    return HIPBLAS_C_64F;
+}
+
+
+}  // namespace detail
+
+
+/**
+ * This is an alias for the `hipblasDatatype_t` equivalent of `T`. By default,
+ * HIPBLAS_C_16F (which no supported C++ type maps to) is returned.
+ *
+ * @tparam T  a type
+ *
+ * @returns the actual `hipblasDatatype_t`
+ */
+template <typename T>
+constexpr hipblasDatatype_t hip_data_type()
+{
+    return detail::hip_data_type_impl<T>();
+}
+
+
+/**
+ * This is an alias for HIP's equivalent of `T` (thrust::complex for complex).
+ *
+ * @tparam T  a type
+ */
+template <typename T>
+using hip_type = typename detail::hip_type_impl<T>::type;
+
+
+/**
+ * Reinterprets the passed in value as a HIP type.
+ *
+ * @param val  the value to reinterpret
+ *
+ * @return `val` reinterpreted to HIP type
+ */
+template <typename T>
+inline xstd::enable_if_t<
+    std::is_pointer<T>::value || std::is_reference<T>::value, hip_type<T>>
+as_hip_type(T val)  // overload selected for pointer and reference types
+{
+    return reinterpret_cast<hip_type<T>>(val);
+}
+
+/**
+ * @copydoc as_hip_type()
+ * @note Overload for values: reinterprets the function-local copy of `val`.
+ */
+template <typename T>
+inline xstd::enable_if_t<
+    !(std::is_pointer<T>::value || std::is_reference<T>::value), hip_type<T>>
+as_hip_type(T val)
+{
+    return *reinterpret_cast<hip_type<T> *>(&val);
+}
+
+
+/**
+ * This is an alias for the equivalent of type T used in HIP libraries
+ * (HIPBLAS, HIPSPARSE, etc.).
+ *
+ * @tparam T  a type
+ */
+template <typename T>
+using hiplibs_type = typename detail::hiplibs_type_impl<T>::type;
+
+
+/**
+ * Reinterprets the passed in value as the equivalent type used by the HIP
+ * libraries (see hiplibs_type), using a single reinterpret_cast.
+ *
+ * @param val  the value to reinterpret
+ *
+ * @return `val` reinterpreted to the type used by the HIP libraries
+ */
+template <typename T>
+inline hiplibs_type<T> as_hiplibs_type(T val)
+{
+    return reinterpret_cast<hiplibs_type<T>>(val);
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_TYPES_HIP_HPP_
diff --git a/hip/base/version.hip.cpp b/hip/base/version.hip.cpp
new file mode 100644
index 00000000000..5c5473cbd55
--- /dev/null
+++ b/hip/base/version.hip.cpp
@@ -0,0 +1,48 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/version.hpp>
+
+
+namespace gko {
+
+
+version version_info::get_hip_version() noexcept
+{
+    // The module is compiled against the headers of the same release, so its
+    // version equals the header version. The two can only differ when shared
+    // libraries from different versions of Ginkgo are mixed.
+    return get_header_version();
+}
+
+
+}  // namespace gko
diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp
new file mode 100644
index 00000000000..c5ef42dba80
--- /dev/null
+++ b/hip/components/atomic.hip.hpp
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
+#define GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/atomic.hpp.inc"
+
+
+/**
+ * @internal
+ * @note Not a single atomic operation on complex<float>: the real and the
+ *       imaginary part are updated by two separate atomic_add calls.
+ */
+__forceinline__ __device__ thrust::complex<float> atomic_add(
+    thrust::complex<float> *__restrict__ address, thrust::complex<float> val)
+{
+    auto parts = reinterpret_cast<hipComplex *>(address);
+    // Add the real component first, then the imaginary one
+    auto re = atomic_add(static_cast<float *>(&(parts->x)), val.real());
+    auto im = atomic_add(static_cast<float *>(&(parts->y)), val.imag());
+    return {re, im};
+}
+
+
+/**
+ * @internal
+ * @note Not a single atomic operation on complex<double>: the real and the
+ *       imaginary part are updated by two separate atomic_add calls.
+ */
+__forceinline__ __device__ thrust::complex<double> atomic_add(
+    thrust::complex<double> *__restrict__ address, thrust::complex<double> val)
+{
+    auto parts = reinterpret_cast<hipDoubleComplex *>(address);
+    // Add the real component first, then the imaginary one
+    auto re = atomic_add(static_cast<double *>(&(parts->x)), val.real());
+    auto im = atomic_add(static_cast<double *>(&(parts->y)), val.imag());
+    return {re, im};
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
new file mode 100644
index 00000000000..a893479ec98
--- /dev/null
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -0,0 +1,511 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_COOPERATIVE_GROUPS_HIP_HPP_
+#define GKO_HIP_COMPONENTS_COOPERATIVE_GROUPS_HIP_HPP_
+
+
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+/**
+ * Ginkgo uses cooperative groups to handle communication among the threads.
+ *
+ * However, HIP's implementation of cooperative groups is still quite limited
+ * in functionality, and some parts are not supported on all hardware
+ * interesting for Ginkgo. For this reason, Ginkgo exposes only a part of the
+ * original functionality, and possibly extends it if it is required. Thus,
+ * developers should include and use this header and the gko::group namespace
+ * instead of the standard cooperative_groups.h header. The interface exposed
+ * by Ginkgo's implementation is equivalent to the standard interface, with some
+ * useful extensions.
+ *
+ * A cooperative group (both from standard HIP and from Ginkgo) is not a
+ * specific type, but a concept. That is, any type satisfying the interface
+ * imposed by the cooperative groups API is considered a cooperative
+ * group (a.k.a. "duck typing"). To maximize the generality of components that
+ * need cooperative groups, instead of creating the group manually, consider
+ * requesting one as an input parameter. Make sure its type is a template
+ * parameter to maximize the set of groups for which your algorithm can be
+ * invoked. To maximize the number of contexts in which your algorithm can be
+ * called and avoid hidden requirements, do not depend on a specific setup of
+ * kernel launch parameters (i.e. grid dimensions and block dimensions).
+ * Instead, use the thread_rank() method of the group to distinguish between
+ * distinct threads of a group.
+ *
+ * The original HIP implementation does not provide ways to verify if a certain
+ * type represents a cooperative group. Ginkgo's implementation provides
+ * metafunctions which do that. Additionally, not all cooperative groups have
+ * equivalent functionality, so Ginkgo splits the cooperative group concept into
+ * three sub-concepts which describe what functionality is available. Here is a
+ * list of concepts and their interfaces:
+ *
+ * ```c++
+ * concept Group {
+ *   unsigned size() const;
+ *   unsigned thread_rank() const;
+ * };
+ *
+ * concept SynchronizableGroup : Group {
+ *   void sync();
+ * };
+ *
+ * concept CommunicatorGroup : SynchronizableGroup {
+ *   template <typename T>
+ *   T shfl(T var, int srcLane);
+ *   T shfl_up(T var, unsigned delta);
+ *   T shfl_down(T var, unsigned delta);
+ *   T shfl_xor(T var, int laneMask);
+ *   int all(int predicate);
+ *   int any(int predicate);
+ *   unsigned ballot(int predicate);
+ * };
+ * ```
+ *
+ * To check if a group T satisfies one of the concepts, one can use the
+ * metafunctions is_group<T>::value, is_synchronizable_group<T>::value and
+ * is_communicator_group<T>::value.
+ *
+ * @note Please note that the current implementation of cooperative groups
+ *       contains only a subset of functionalities provided by those APIs. If
+ *       you need more functionality, please add the appropriate implementations
+ *       to existing cooperative groups, or create new groups if the existing
+ *       groups do not cover your use-case. For an example, see the
+ *       enable_extended_shuffle mixin, which adds extended shuffles support
+ *       to built-in HIP cooperative groups.
+ */
+namespace group {
+
+
+// metafunctions
+
+
+namespace detail {
+
+
+/**
+ * Primary template behind is_group: a type is not a Group unless a
+ * specialization of this template says otherwise.
+ */
+template <typename T>
+struct is_group_impl : std::false_type {};
+
+
+/**
+ * Primary template behind is_synchronizable_group: a type is not a
+ * SynchronizableGroup unless a specialization of this template says otherwise.
+ */
+template <typename T>
+struct is_synchronizable_group_impl : std::false_type {};
+
+
+/**
+ * Primary template behind is_communicator_group: a type is not a
+ * CommunicatorGroup unless a specialization of this template says otherwise.
+ *
+ * The default has to be false, like for the other two metafunctions;
+ * otherwise every type without a specialization (including non-group types)
+ * would be reported as a CommunicatorGroup.
+ */
+template <typename T>
+struct is_communicator_group_impl : std::false_type {};
+
+}  // namespace detail
+
+
+/**
+ * Check if T is a Group.
+ *
+ * T is decayed before the lookup, so references and cv-qualified versions of
+ * a type give the same answer as the type itself.
+ */
+template <typename T>
+using is_group = detail::is_group_impl<xstd::decay_t<T>>;
+
+
+/**
+ * Check if T is a SynchronizableGroup.
+ *
+ * T is decayed before the lookup, so references and cv-qualified versions of
+ * a type give the same answer as the type itself.
+ */
+template <typename T>
+using is_synchronizable_group =
+    detail::is_synchronizable_group_impl<xstd::decay_t<T>>;
+
+
+/**
+ * Check if T is a CommunicatorGroup.
+ *
+ * T is decayed before the lookup, so references and cv-qualified versions of
+ * a type give the same answer as the type itself.
+ */
+template <typename T>
+using is_communicator_group =
+    detail::is_communicator_group_impl<xstd::decay_t<T>>;
+
+
+// types
+
+
+namespace detail {
+
+
+/**
+ * This is a limited implementation of the HIP thread_block_tile.
+ *
+ * A tile covers Size consecutive lanes of a warp. Every thread stores its rank
+ * inside the tile, the offset of the first lane of its tile inside the warp,
+ * and a lane mask that selects exactly the lanes of its tile.
+ *
+ * With GINKGO_HIP_PLATFORM_HCC, `any`, `all` and `ballot` use the warp-wide
+ * intrinsics directly when Size equals config::warp_size, and otherwise
+ * restrict the result of the warp-wide `__ballot` to the lanes of the tile.
+ * On the other platform branch, the `*_sync` intrinsics are called with the
+ * lane mask of the tile.
+ *
+ * @tparam Size  the number of threads in the tile
+ */
+template <unsigned Size>
+class thread_block_tile {
+    /**
+     * Mask with Size consecutive ones starting at the least significant bit.
+     */
+    static constexpr auto lane_mask_base = ~config::lane_mask_type{} >>
+                                           (config::warp_size - Size);
+
+public:
+    /**
+     * Sets up the tile of the calling thread from its position in the block.
+     */
+    __device__ thread_block_tile() : data_{Size, 0, 0, lane_mask_base}
+    {
+        // linearized index of the thread within its block
+        auto tid =
+            unsigned(threadIdx.x +
+                     blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z));
+        data_.rank = tid % Size;
+        // lane of the first thread of this tile within the warp
+        data_.lane_offset = (tid % config::warp_size) / Size * Size;
+        // move the Size ones of lane_mask_base onto the lanes of this tile
+        data_.mask <<= data_.lane_offset;
+    }
+
+    /**
+     * Returns the rank of the calling thread within the tile.
+     */
+    __device__ __forceinline__ unsigned thread_rank() const noexcept
+    {
+        return data_.rank;
+    }
+
+    /**
+     * Returns the number of threads in the tile.
+     */
+    __device__ __forceinline__ unsigned size() const noexcept { return Size; }
+
+    /**
+     * Synchronizes the lanes of the tile with __syncwarp when
+     * GINKGO_HIP_PLATFORM_NVCC is set. Otherwise, no code is generated for
+     * this call.
+     */
+    __device__ __forceinline__ void sync() const noexcept
+    {
+#if GINKGO_HIP_PLATFORM_NVCC
+        __syncwarp(data_.mask);
+#endif
+    }
+
+    // GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) generates the member
+    // function ShflOp(ValueType, SelectorType), which forwards to the
+    // intrinsic of the same name with Size as the width argument. With
+    // GINKGO_HIP_PLATFORM_HCC the intrinsic without the _sync suffix is
+    // called, otherwise the _sync variant is called with the tile's lane mask.
+    // The trailing static_assert only consumes the semicolon that follows
+    // each macro invocation.
+#if GINKGO_HIP_PLATFORM_HCC
+#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType)                       \
+    __device__ __forceinline__ ValueType ShflOp(                             \
+        ValueType var, SelectorType selector) const noexcept                 \
+    {                                                                        \
+        return __##ShflOp(var, selector, Size);                              \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#else
+#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType)                       \
+    __device__ __forceinline__ ValueType ShflOp(                             \
+        ValueType var, SelectorType selector) const noexcept                 \
+    {                                                                        \
+        return __##ShflOp##_sync(data_.mask, var, selector, Size);           \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif
+
+    // shuffle overloads for the natively supported 32 and 64 bit types
+    GKO_BIND_SHFL(shfl, int32, int32);
+    GKO_BIND_SHFL(shfl, float, int32);
+    GKO_BIND_SHFL(shfl, uint32, int32);
+    GKO_BIND_SHFL(shfl, double, int32);
+
+    GKO_BIND_SHFL(shfl_up, int32, uint32);
+    GKO_BIND_SHFL(shfl_up, uint32, uint32);
+    GKO_BIND_SHFL(shfl_up, float, uint32);
+    GKO_BIND_SHFL(shfl_up, double, uint32);
+
+    GKO_BIND_SHFL(shfl_down, int32, uint32);
+    GKO_BIND_SHFL(shfl_down, uint32, uint32);
+    GKO_BIND_SHFL(shfl_down, float, uint32);
+    GKO_BIND_SHFL(shfl_down, double, uint32);
+
+    GKO_BIND_SHFL(shfl_xor, int32, int32);
+    GKO_BIND_SHFL(shfl_xor, float, int32);
+    GKO_BIND_SHFL(shfl_xor, uint32, int32);
+    GKO_BIND_SHFL(shfl_xor, double, int32);
+
+    /**
+     * Returns true iff the predicate is true for at least one thread in the
+     * group. Note that the whole group needs to execute the same operation.
+     */
+    __device__ __forceinline__ int any(int predicate) const noexcept
+    {
+#if GINKGO_HIP_PLATFORM_HCC
+        if (Size == config::warp_size) {
+            return __any(predicate);
+        } else {
+            // only look at the votes of the lanes belonging to this tile
+            return (__ballot(predicate) & data_.mask) != 0;
+        }
+#else
+        return __any_sync(data_.mask, predicate);
+#endif
+    }
+
+    /**
+     * Returns true iff the predicate is true for all threads in the group.
+     * Note that the whole group needs to execute the same operation.
+     */
+    __device__ __forceinline__ int all(int predicate) const noexcept
+    {
+#if GINKGO_HIP_PLATFORM_HCC
+        if (Size == config::warp_size) {
+            return __all(predicate);
+        } else {
+            // every lane of this tile has to have its bit set
+            return (__ballot(predicate) & data_.mask) == data_.mask;
+        }
+#else
+        return __all_sync(data_.mask, predicate);
+#endif
+    }
+
+    /**
+     * Returns a bitmask containing the value of the given predicate
+     * for all threads in the group.
+     * This means that the ith bit is equal to the predicate of the
+     * thread with thread_rank() == i in the group.
+     * Note that the whole group needs to execute the same operation.
+     */
+    __device__ __forceinline__ config::lane_mask_type ballot(
+        int predicate) const noexcept
+    {
+#if GINKGO_HIP_PLATFORM_HCC
+        if (Size == config::warp_size) {
+            return __ballot(predicate);
+        } else {
+            // keep the bits of this tile and move them to the low-order bits
+            return (__ballot(predicate) & data_.mask) >> data_.lane_offset;
+        }
+#else
+        if (Size == config::warp_size) {
+            return __ballot_sync(data_.mask, predicate);
+        } else {
+            // move the bits of this tile to the low-order bits
+            return __ballot_sync(data_.mask, predicate) >> data_.lane_offset;
+        }
+#endif
+    }
+
+private:
+    struct alignas(8) {
+        // number of threads in the tile (initialized with Size)
+        unsigned size;
+        // rank of the calling thread within the tile
+        unsigned rank;
+        // lane of the first thread of the tile within the warp
+        unsigned lane_offset;
+        // lane mask with the bits of the lanes of this tile set
+        config::lane_mask_type mask;
+    } data_;
+};
+
+
+}  // namespace detail
+
+
+namespace detail {
+
+
+// Mixin which extends Group with shuffle operations for value types of any
+// size that is a multiple of 4 bytes. The shuffles provided by Group itself
+// stay available; the generalized versions move a value one 32 bit word at a
+// time through the uint32 shuffle of Group.
+template <typename Group>
+class enable_extended_shuffle : public Group {
+public:
+    using Group::Group;
+    using Group::shfl;
+    using Group::shfl_down;
+    using Group::shfl_up;
+    using Group::shfl_xor;
+
+// Generates the member template `_name(var, selector)`, which hands the uint32
+// overload of Group::_name to shuffle_impl.
+#define GKO_ENABLE_SHUFFLE_OPERATION(_name, SelectorType)                   \
+    template <typename ValueType>                                           \
+    __device__ __forceinline__ ValueType _name(const ValueType &var,        \
+                                               SelectorType selector) const \
+    {                                                                       \
+        return shuffle_impl(                                                \
+            [this](uint32 word, SelectorType sel) {                         \
+                return static_cast<const Group *>(this)->_name(word, sel);  \
+            },                                                              \
+            var, selector);                                                 \
+    }
+
+    GKO_ENABLE_SHUFFLE_OPERATION(shfl, int32)
+    GKO_ENABLE_SHUFFLE_OPERATION(shfl_up, uint32)
+    GKO_ENABLE_SHUFFLE_OPERATION(shfl_down, uint32)
+    GKO_ENABLE_SHUFFLE_OPERATION(shfl_xor, int32)
+
+#undef GKO_ENABLE_SHUFFLE_OPERATION
+
+private:
+    // Applies intrinsic_shuffle with the given selector to every 32 bit word
+    // of var and returns the value assembled from the shuffled words.
+    template <typename ShuffleOperator, typename ValueType,
+              typename SelectorType>
+    static __device__ __forceinline__ ValueType
+    shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var,
+                 SelectorType selector)
+    {
+        static_assert(sizeof(ValueType) % sizeof(uint32) == 0,
+                      "Unable to shuffle sizes which are not 4-byte multiples");
+        constexpr auto num_words = sizeof(ValueType) / sizeof(uint32);
+        ValueType shuffled;
+        // word-wise views of the input and the output value
+        const auto src_words = reinterpret_cast<const uint32 *>(&var);
+        const auto dst_words = reinterpret_cast<uint32 *>(&shuffled);
+#pragma unroll
+        for (std::size_t word = 0; word < num_words; ++word) {
+            dst_words[word] = intrinsic_shuffle(src_words[word], selector);
+        }
+        return shuffled;
+    }
+};
+
+
+}  // namespace detail
+
+
+// The tile type handed out to users: detail::thread_block_tile extended with
+// the generalized shuffles of detail::enable_extended_shuffle.
+// Implementing this as an alias declaration (`using`) messes up with SFINAE for
+// some reason, probably a bug in NVCC. If it is a complete type, everything
+// works fine.
+template <size_type Size>
+struct thread_block_tile
+    : detail::enable_extended_shuffle<detail::thread_block_tile<Size>> {
+    // inherit the constructors of the base class
+    using detail::enable_extended_shuffle<
+        detail::thread_block_tile<Size>>::enable_extended_shuffle;
+};
+
+
+// Creates the tile of Size threads that contains the calling thread. The
+// group argument is not used.
+// Only participates in overload resolution if Size is positive, not larger
+// than the warp size, and divides the warp size, i.e. tiled_partition is only
+// supported with 1, 2, 4, 8, 16, 32, 64 (hip).
+template <size_type Size, typename Group>
+__device__ __forceinline__ gko::xstd::enable_if_t<
+    (Size <= kernels::hip::config::warp_size) && (Size > 0) &&
+        (kernels::hip::config::warp_size % Size == 0),
+    thread_block_tile<Size>>
+tiled_partition(const Group &)
+{
+    return thread_block_tile<Size>();
+}
+
+
+namespace detail {
+
+
+// thread_block_tile is registered as a Group, a SynchronizableGroup and a
+// CommunicatorGroup.
+template <size_type Size>
+struct is_group_impl<thread_block_tile<Size>> : std::true_type {};
+template <size_type Size>
+struct is_synchronizable_group_impl<thread_block_tile<Size>> : std::true_type {
+};
+template <size_type Size>
+struct is_communicator_group_impl<thread_block_tile<Size>> : std::true_type {};
+
+
+}  // namespace detail
+
+
+/**
+ * Group containing all threads of the thread block of the calling thread.
+ *
+ * The constructor is private; objects are obtained from the friend function
+ * this_thread_block().
+ */
+class thread_block {
+    friend __device__ __forceinline__ thread_block this_thread_block();
+
+public:
+    /**
+     * Returns the linearized index of the calling thread within its block.
+     */
+    __device__ __forceinline__ unsigned thread_rank() const noexcept
+    {
+        return data_.rank;
+    }
+
+    /**
+     * Returns the number of threads in the block.
+     */
+    __device__ __forceinline__ unsigned size() const noexcept
+    {
+        return data_.size;
+    }
+
+    /**
+     * Synchronizes the threads of the block with __syncthreads().
+     */
+    __device__ __forceinline__ void sync() const noexcept { __syncthreads(); }
+
+private:
+    // size is the product of the block dimensions, rank linearizes threadIdx
+    // with x as the fastest running dimension
+    __device__ thread_block()
+        : data_{static_cast<unsigned>(blockDim.x * blockDim.y * blockDim.z),
+                static_cast<unsigned>(
+                    threadIdx.x +
+                    blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z))}
+    {}
+    struct alignas(8) {
+        unsigned size;
+        unsigned rank;
+    } data_;
+};
+
+
+// Returns the thread_block group of the calling thread.
+__device__ __forceinline__ thread_block this_thread_block()
+{
+    return thread_block();
+}
+
+
+namespace detail {
+
+// thread_block is registered as a Group and a SynchronizableGroup. No
+// is_communicator_group_impl specialization is provided for it here.
+template <>
+struct is_group_impl<thread_block> : std::true_type {};
+template <>
+struct is_synchronizable_group_impl<thread_block> : std::true_type {};
+
+
+}  // namespace detail
+
+
+/**
+ * This is a limited implementation of the CUDA grid_group that works even on
+ * devices that do not support device-wide synchronization and without special
+ * kernel launch syntax.
+ *
+ * It only provides size() and thread_rank(); there is no sync() operation.
+ * The constructor is private; objects are obtained from the friend function
+ * this_grid().
+ *
+ * Note that this implementation (as well as the one from CUDA's cooperative
+ * groups) does not support large grids, since it uses 32 bits to represent
+ * sizes and ranks, while at least 73 bits (63 bit grid + 10 bit block) would
+ * have to be used to represent the full space of thread ranks.
+ */
+class grid_group {
+    friend __device__ grid_group this_grid();
+
+public:
+    /**
+     * Returns the total number of threads in the grid.
+     */
+    __device__ unsigned size() const noexcept { return data_.size; }
+
+    /**
+     * Returns the linearized index of the calling thread within the grid.
+     */
+    __device__ unsigned thread_rank() const noexcept { return data_.rank; }
+
+private:
+    // size is the product of all block and grid dimensions; rank linearizes
+    // (threadIdx, blockIdx) with threadIdx.x as the fastest running dimension
+    // clang-format off
+    __device__ grid_group()
+        : data_{
+                blockDim.x * blockDim.y * blockDim.z *
+                    gridDim.x * gridDim.y * gridDim.z,
+                threadIdx.x + blockDim.x *
+                    (threadIdx.y + blockDim.y *
+                        (threadIdx.z + blockDim.z *
+                            (blockIdx.x + gridDim.x *
+                                (blockIdx.y + gridDim.y * blockIdx.z))))}
+    {}
+    // clang-format on
+
+    struct alignas(8) {
+        unsigned size;
+        unsigned rank;
+    } data_;
+};
+
+// Returns the grid_group of the calling thread.
+// The this_grid function of the standard cooperative groups header is not
+// used, as grid_group is not universally supported. Instead, our limited
+// implementation is returned.
+__device__ inline grid_group this_grid() { return {}; }
+
+
+}  // namespace group
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_COOPERATIVE_GROUPS_HIP_HPP_
diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp
new file mode 100644
index 00000000000..729e5c3336c
--- /dev/null
+++ b/hip/components/diagonal_block_manipulation.hip.hpp
@@ -0,0 +1,57 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
+#define GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace csr {
+
+
+#include "common/components/diagonal_block_manipulation.hpp.inc"
+
+
+}  // namespace csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
diff --git a/hip/components/fill_array.hip.cpp b/hip/components/fill_array.hip.cpp
new file mode 100644
index 00000000000..e738a68811e
--- /dev/null
+++ b/hip/components/fill_array.hip.cpp
@@ -0,0 +1,76 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/fill_array.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace components {
+
+// Number of threads per block used to launch the fill_array kernel.
+constexpr int default_block_size = 512;
+
+
+#include "common/components/fill_array.hpp.inc"
+
+
+/**
+ * Launches the fill_array kernel which is given the array, its length n and
+ * the fill value val.
+ *
+ * @param exec  the executor (not used by this function)
+ * @param array  the device array to fill
+ * @param n  the number of elements of the array
+ * @param val  the fill value
+ */
+template <typename ValueType>
+void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType *array,
+                size_type n, ValueType val)
+{
+    // An empty array needs no work, and it would make the grid size computed
+    // below zero, which is not a valid launch configuration.
+    if (n == 0) {
+        return;
+    }
+    const dim3 block_size(default_block_size, 1, 1);
+    // enough blocks to have one thread per array element
+    const dim3 grid_size(ceildiv(n, block_size.x), 1, 1);
+    hipLaunchKernelGGL(kernel::fill_array, dim3(grid_size), dim3(block_size), 0,
+                       0, n, as_hip_type(array), as_hip_type(val));
+}
+
+
+// Explicit instantiations of fill_array, generated by the value type and
+// index type instantiation macros (defined elsewhere), plus one for size_type.
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
new file mode 100644
index 00000000000..1c731862be8
--- /dev/null
+++ b/hip/components/format_conversion.hip.hpp
@@ -0,0 +1,133 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_
+#define GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace ell {
+namespace kernel {
+
+
+/**
+ * @internal
+ *
+ * It counts the number of explicit nonzeros per row of Ell.
+ *
+ * Only the declaration is provided by this header; the definition is not
+ * visible here.
+ *
+ * NOTE(review): presumably `values` holds the Ell values with the given
+ * `stride` and `result` receives one count per row - confirm against the
+ * kernel definition.
+ */
+template <typename ValueType, typename IndexType>
+__global__ void count_nnz_per_row(size_type num_rows, size_type max_nnz_per_row,
+                                  size_type stride,
+                                  const ValueType *__restrict__ values,
+                                  IndexType *__restrict__ result);
+
+
+}  // namespace kernel
+}  // namespace ell
+
+
+namespace coo {
+namespace kernel {
+
+
+/**
+ * @internal
+ *
+ * It converts the row index of Coo to the row pointer of Csr.
+ *
+ * Only the declaration is provided by this header; the definition is not
+ * visible here.
+ *
+ * NOTE(review): presumably `idxs` holds `num_nonzeros` row indices and `ptrs`
+ * has `length` entries - confirm against the kernel definition.
+ */
+template <typename IndexType>
+__global__ void convert_row_idxs_to_ptrs(const IndexType *__restrict__ idxs,
+                                         size_type num_nonzeros,
+                                         IndexType *__restrict__ ptrs,
+                                         size_type length);
+
+
+}  // namespace kernel
+
+
+namespace host_kernel {
+
+
+/**
+ * @internal
+ *
+ * It calculates the number of warps used in Coo Spmv depending on the GPU
+ * architecture and the number of stored elements.
+ *
+ * The number of (sub)warps derived from the executor properties is multiplied
+ * by a platform-dependent factor which grows with nnz. The result is capped by
+ * ceildiv(nnz, config::warp_size).
+ *
+ * @tparam subwarp_size  the subwarp size used to scale the warp count
+ *
+ * @param exec  the executor providing the multiprocessor and warp counts
+ * @param nnz  the number of stored elements
+ *
+ * @return the number of warps to use
+ */
+template <size_type subwarp_size = config::warp_size>
+__host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
+                                    const size_type nnz)
+{
+    const size_type device_warps = exec->get_num_multiprocessor() *
+                                   exec->get_num_warps_per_sm() *
+                                   config::warp_size / subwarp_size;
+    // pick the oversubscription factor from the number of stored elements
+#if GINKGO_HIP_PLATFORM_NVCC
+    const size_type factor = (nnz >= 2e6) ? 128 : (nnz >= 2e5) ? 32 : 8;
+#else
+    const size_type factor = (nnz >= 1e7) ? 32 : (nnz >= 1e5) ? 8 : 2;
+#endif  // GINKGO_HIP_PLATFORM_NVCC
+    // upper bound on the result, derived from nnz and the full warp size
+    const auto nnz_warps = size_type(ceildiv(nnz, config::warp_size));
+    return std::min(factor * device_warps, nnz_warps);
+}
+
+
+}  // namespace host_kernel
+}  // namespace coo
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_
diff --git a/hip/components/intrinsics.hip.hpp b/hip/components/intrinsics.hip.hpp
new file mode 100644
index 00000000000..8d9d0579013
--- /dev/null
+++ b/hip/components/intrinsics.hip.hpp
@@ -0,0 +1,53 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
+#define GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/intrinsics.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp
new file mode 100644
index 00000000000..30289d41ed2
--- /dev/null
+++ b/hip/components/merging.hip.hpp
@@ -0,0 +1,56 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
+#define GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
+
+
+#include "core/base/utils.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/merging.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
diff --git a/hip/components/precision_conversion.hip.cpp b/hip/components/precision_conversion.hip.cpp
new file mode 100644
index 00000000000..6720cf8c92b
--- /dev/null
+++ b/hip/components/precision_conversion.hip.cpp
@@ -0,0 +1,68 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/precision_conversion.hpp"
+
+
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace components {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/components/precision_conversion.hpp.inc"
+
+
+/**
+ * Launches the `convert_precision` device kernel (textually included above
+ * from common/components/precision_conversion.hpp.inc) on `size` entries,
+ * using blocks of `default_block_size` threads.
+ *
+ * @param exec  the executor this kernel is dispatched for; it is not used by
+ *              the launch itself
+ * @param size  the number of entries to process; for zero entries no kernel
+ *              is launched
+ * @param in  the source array, read as SourceType
+ * @param out  the target array, written as TargetType
+ */
+template <typename SourceType, typename TargetType>
+void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
+                       size_type size, const SourceType *in, TargetType *out)
+{
+    auto num_blocks = ceildiv(size, default_block_size);
+    // A kernel launch with an empty grid is an invalid configuration, so an
+    // empty input must not reach the launch at all.
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(convert_precision), num_blocks,
+                           default_block_size, 0, 0, size, as_hip_type(in),
+                           as_hip_type(out));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+
+
+}  // namespace components
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp
new file mode 100644
index 00000000000..2fe526cdb2d
--- /dev/null
+++ b/hip/components/prefix_sum.hip.cpp
@@ -0,0 +1,73 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include "hip/components/prefix_sum.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace components {
+
+
+constexpr int prefix_sum_block_size = 512;
+
+
+/**
+ * Runs the two-stage prefix sum on `counts`.
+ *
+ * `start_prefix_sum` is launched with one block per `prefix_sum_block_size`
+ * entries together with a temporary array holding one value per block,
+ * followed by `finalize_prefix_sum` on the same grid. Both kernels receive
+ * `counts` itself, i.e. the array is updated in place.
+ *
+ * @param exec  the executor used to allocate the temporary block sums
+ * @param counts  the array to scan
+ * @param num_entries  the number of entries in `counts`; for zero entries
+ *                     the function returns without launching any kernel
+ */
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const HipExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    // An empty array needs no work, and launching the kernels below with an
+    // empty grid would be an invalid launch configuration.
+    if (num_entries > 0) {
+        auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
+        Array<IndexType> block_sum_array(exec, num_blocks);
+        auto block_sums = block_sum_array.get_data();
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(start_prefix_sum<prefix_sum_block_size>),
+            dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries,
+            counts, block_sums);
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(finalize_prefix_sum<prefix_sum_block_size>),
+            dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries,
+            counts, block_sums);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// instantiate for size_type as well, as this is used in the Sellp format
+template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp
new file mode 100644
index 00000000000..0f1059f964e
--- /dev/null
+++ b/hip/components/prefix_sum.hip.hpp
@@ -0,0 +1,59 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
+#define GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
+
+
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/prefix_sum.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp
new file mode 100644
index 00000000000..953cead968f
--- /dev/null
+++ b/hip/components/reduction.hip.hpp
@@ -0,0 +1,105 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
+#define GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/components/reduction.hpp.inc"
+
+
+/**
+ * Compute a reduction using add operation (+).
+ *
+ * Inputs with more than `default_block_size` entries are reduced in two
+ * passes: the first launch uses at most `default_block_size` blocks and
+ * stores one partial result per block in a temporary array, the second
+ * launch reduces these partial results with a single block. Smaller inputs
+ * are reduced directly from `source` by the single-block launch.
+ *
+ * @param exec  Executor associated to the array
+ * @param size  size of the array
+ * @param source  the pointer of the array
+ *
+ * @return the reduction result
+ */
+template <typename ValueType>
+__host__ ValueType reduce_add_array(std::shared_ptr<const HipExecutor> exec,
+                                    size_type size, const ValueType *source)
+{
+    auto block_results_val = source;
+    size_type grid_dim = size;
+    // The temporary must outlive the branch below: block_results_val may
+    // point into it and is still read by the final kernel launch. Declaring
+    // it inside the branch would release the buffer before that launch.
+    auto block_results = Array<ValueType>(exec);
+    if (size > default_block_size) {
+        const auto n = ceildiv(size, default_block_size);
+        grid_dim = (n <= default_block_size) ? n : default_block_size;
+
+        block_results.resize_and_reset(grid_dim);
+
+        hipLaunchKernelGGL(
+            reduce_add_array, dim3(grid_dim), dim3(default_block_size), 0, 0,
+            size, as_hip_type(source), as_hip_type(block_results.get_data()));
+
+        block_results_val = block_results.get_const_data();
+    }
+
+    auto d_result = Array<ValueType>(exec, 1);
+
+    // Single-block pass over either the partial results or the raw input.
+    hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0,
+                       0, grid_dim, as_hip_type(block_results_val),
+                       as_hip_type(d_result.get_data()));
+    auto answer = exec->copy_val_to_host(d_result.get_const_data());
+    return answer;
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp
new file mode 100644
index 00000000000..7611b23fdee
--- /dev/null
+++ b/hip/components/searching.hip.hpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
+#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/searching.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp
new file mode 100644
index 00000000000..8733778f7e4
--- /dev/null
+++ b/hip/components/segment_scan.hip.hpp
@@ -0,0 +1,56 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
+#define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
+
+
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/segment_scan.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp
new file mode 100644
index 00000000000..704c8f9dd07
--- /dev/null
+++ b/hip/components/sorting.hip.hpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
+#define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/sorting.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp
new file mode 100644
index 00000000000..6016c26cf68
--- /dev/null
+++ b/hip/components/thread_ids.hip.hpp
@@ -0,0 +1,60 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
+#define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
+
+
+#include "hip/base/config.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The HIP thread namespace.
+ *
+ * @ingroup hip_thread
+ */
+namespace thread {
+
+
+#include "common/components/thread_ids.hpp.inc"
+
+
+}  // namespace thread
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
diff --git a/hip/components/uninitialized_array.hip.hpp b/hip/components/uninitialized_array.hip.hpp
new file mode 100644
index 00000000000..7780ebb10f5
--- /dev/null
+++ b/hip/components/uninitialized_array.hip.hpp
@@ -0,0 +1,53 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
+#define GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/uninitialized_array.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp
new file mode 100644
index 00000000000..ee2abc649e1
--- /dev/null
+++ b/hip/components/warp_blas.hip.hpp
@@ -0,0 +1,60 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
+#define GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
+
+
+#include <cassert>
+
+
+#include <ginkgo/config.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/warp_blas.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp
new file mode 100644
index 00000000000..7a0f0a4f607
--- /dev/null
+++ b/hip/factorization/factorization_kernels.hip.cpp
@@ -0,0 +1,261 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/factorization_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+constexpr int default_block_size{512};
+
+
+#include "common/factorization/factorization_kernels.hpp.inc"
+
+
+/**
+ * Extends the CSR matrix `mtx` so that missing diagonal entries are stored.
+ *
+ * The steps visible here are:
+ * 1. `find_missing_diagonal_elements` fills a per-row addition array and
+ *    sets a device flag if anything has to change; if the flag stays unset
+ *    the matrix is left untouched.
+ * 2. The addition array is turned into offsets with a prefix sum; its last
+ *    entry is read back as the total number of added entries.
+ * 3. `add_missing_diagonal_elements` writes the new value / column index
+ *    arrays, `update_row_ptrs` adds the offsets to the row pointers, and the
+ *    new arrays are moved into `mtx` through a CsrBuilder.
+ *
+ * @param exec  the executor used for all allocations and copies
+ * @param mtx  the matrix to modify in place
+ * @param is_sorted  selects the `true` / `false` instantiation of the search
+ *                   kernel; by its name it states whether the column
+ *                   indices of each row are sorted
+ */
+template <typename ValueType, typename IndexType>
+void add_diagonal_elements(std::shared_ptr<const HipExecutor> exec,
+                           matrix::Csr<ValueType, IndexType> *mtx,
+                           bool is_sorted)
+{
+    // TODO: Runtime can be optimized by choosing an appropriate size for the
+    //       subwarp dependent on the matrix properties
+    constexpr int subwarp_size = config::warp_size;
+    auto mtx_size = mtx->get_size();
+    auto num_rows = static_cast<IndexType>(mtx_size[0]);
+    auto num_cols = static_cast<IndexType>(mtx_size[1]);
+    if (num_rows == 0) {
+        // A matrix without rows has no diagonal to complete. Returning here
+        // also keeps the kernels below from being launched with an empty
+        // grid, which is an invalid launch configuration.
+        return;
+    }
+    size_type row_ptrs_size = num_rows + 1;
+
+    Array<IndexType> row_ptrs_addition(exec, row_ptrs_size);
+    // The flag is initialized on the host and copied to the device so the
+    // search kernel can raise it.
+    Array<bool> needs_change_host{exec->get_master(), 1};
+    needs_change_host.get_data()[0] = false;
+    Array<bool> needs_change_device{exec, 1};
+    needs_change_device = needs_change_host;
+
+    auto hip_old_values = as_hip_type(mtx->get_const_values());
+    auto hip_old_col_idxs = as_hip_type(mtx->get_const_col_idxs());
+    auto hip_old_row_ptrs = as_hip_type(mtx->get_row_ptrs());
+    auto hip_row_ptrs_add = as_hip_type(row_ptrs_addition.get_data());
+
+    const dim3 block_dim{default_block_size, 1, 1};
+    // One subwarp per row: each block covers block_dim.x / subwarp_size rows.
+    const dim3 grid_dim{
+        static_cast<uint32>(ceildiv(num_rows, block_dim.x / subwarp_size)), 1,
+        1};
+    if (is_sorted) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::find_missing_diagonal_elements<true, subwarp_size>),
+            grid_dim, block_dim, 0, 0, num_rows, num_cols, hip_old_col_idxs,
+            hip_old_row_ptrs, hip_row_ptrs_add,
+            as_hip_type(needs_change_device.get_data()));
+    } else {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::find_missing_diagonal_elements<false, subwarp_size>),
+            grid_dim, block_dim, 0, 0, num_rows, num_cols, hip_old_col_idxs,
+            hip_old_row_ptrs, hip_row_ptrs_add,
+            as_hip_type(needs_change_device.get_data()));
+    }
+    // Copy the flag back to the host to decide whether any work remains.
+    needs_change_host = needs_change_device;
+    if (!needs_change_host.get_const_data()[0]) {
+        return;
+    }
+
+    components::prefix_sum(exec, hip_row_ptrs_add, row_ptrs_size);
+    exec->synchronize();
+
+    // After the prefix sum the last entry holds the total number of
+    // additions.
+    auto total_additions =
+        exec->copy_val_to_host(hip_row_ptrs_add + row_ptrs_size - 1);
+    size_type new_num_elems = static_cast<size_type>(total_additions) +
+                              mtx->get_num_stored_elements();
+
+
+    Array<ValueType> new_values{exec, new_num_elems};
+    Array<IndexType> new_col_idxs{exec, new_num_elems};
+    auto hip_new_values = as_hip_type(new_values.get_data());
+    auto hip_new_col_idxs = as_hip_type(new_col_idxs.get_data());
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::add_missing_diagonal_elements<subwarp_size>),
+        grid_dim, block_dim, 0, 0, num_rows, hip_old_values, hip_old_col_idxs,
+        hip_old_row_ptrs, hip_new_values, hip_new_col_idxs, hip_row_ptrs_add);
+
+    const dim3 grid_dim_row_ptrs_update{
+        static_cast<uint32>(ceildiv(num_rows, block_dim.x)), 1, 1};
+    hipLaunchKernelGGL(kernel::update_row_ptrs, grid_dim_row_ptrs_update,
+                       block_dim, 0, 0, num_rows + 1, hip_old_row_ptrs,
+                       hip_row_ptrs_add);
+
+    // Hand the new arrays over to the matrix; the row pointers were already
+    // updated in place above.
+    matrix::CsrBuilder<ValueType, IndexType> mtx_builder{mtx};
+    mtx_builder.get_value_array() = std::move(new_values);
+    mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
+
+
+// Launches kernel::count_nnz_per_l_u_row on system_matrix, then runs a prefix
+// sum over the num_rows + 1 entries of both l_row_ptrs and u_row_ptrs.
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l_u(
+    std::shared_ptr<const HipExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const auto num_blocks = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(default_block_size)));
+    // a grid without blocks is not a valid launch, so skip empty matrices
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(kernel::count_nnz_per_l_u_row, dim3(num_blocks),
+                           dim3(default_block_size), 0, 0, num_rows,
+                           as_hip_type(system_matrix->get_const_row_ptrs()),
+                           as_hip_type(system_matrix->get_const_col_idxs()),
+                           as_hip_type(system_matrix->get_const_values()),
+                           as_hip_type(l_row_ptrs), as_hip_type(u_row_ptrs));
+    }
+    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    components::prefix_sum(exec, u_row_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
+
+
+// Launches kernel::initialize_l_u for system_matrix; the column index and
+// value arrays of csr_l and csr_u are passed to it as writable arguments.
+template <typename ValueType, typename IndexType>
+void initialize_l_u(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *system_matrix,
+                    matrix::Csr<ValueType, IndexType> *csr_l,
+                    matrix::Csr<ValueType, IndexType> *csr_u)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    if (num_rows == 0) return;  // a grid without blocks cannot be launched
+    const auto num_blocks = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(default_block_size)));
+    hipLaunchKernelGGL(
+        kernel::initialize_l_u, dim3(num_blocks), dim3(default_block_size), 0,
+        0, num_rows, as_hip_type(system_matrix->get_const_row_ptrs()),
+        as_hip_type(system_matrix->get_const_col_idxs()),
+        as_hip_type(system_matrix->get_const_values()),
+        as_hip_type(csr_l->get_const_row_ptrs()),
+        as_hip_type(csr_l->get_col_idxs()), as_hip_type(csr_l->get_values()),
+        as_hip_type(csr_u->get_const_row_ptrs()),
+        as_hip_type(csr_u->get_col_idxs()), as_hip_type(csr_u->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+
+
+// Launches kernel::count_nnz_per_l_row on system_matrix, then runs a prefix
+// sum over the num_rows + 1 entries of l_row_ptrs.
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l(
+    std::shared_ptr<const HipExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const auto num_blocks = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(default_block_size)));
+    // a grid without blocks is not a valid launch, so skip empty matrices
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(kernel::count_nnz_per_l_row, dim3(num_blocks),
+                           dim3(default_block_size), 0, 0, num_rows,
+                           as_hip_type(system_matrix->get_const_row_ptrs()),
+                           as_hip_type(system_matrix->get_const_col_idxs()),
+                           as_hip_type(system_matrix->get_const_values()),
+                           as_hip_type(l_row_ptrs));
+    }
+    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
+
+
+// Launches kernel::initialize_l for system_matrix; the column index and value
+// arrays of csr_l are writable kernel arguments, diag_sqrt is forwarded as is.
+template <typename ValueType, typename IndexType>
+void initialize_l(std::shared_ptr<const HipExecutor> exec,
+                  const matrix::Csr<ValueType, IndexType> *system_matrix,
+                  matrix::Csr<ValueType, IndexType> *csr_l, bool diag_sqrt)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    if (num_rows == 0) return;  // a grid without blocks cannot be launched
+    const auto num_blocks = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(default_block_size)));
+    hipLaunchKernelGGL(kernel::initialize_l, dim3(num_blocks),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(system_matrix->get_const_row_ptrs()),
+                       as_hip_type(system_matrix->get_const_col_idxs()),
+                       as_hip_type(system_matrix->get_const_values()),
+                       as_hip_type(csr_l->get_const_row_ptrs()),
+                       as_hip_type(csr_l->get_col_idxs()),
+                       as_hip_type(csr_l->get_values()), diag_sqrt);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp
new file mode 100644
index 00000000000..8888856e898
--- /dev/null
+++ b/hip/factorization/ilu_kernels.hip.cpp
@@ -0,0 +1,98 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/ilu_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "hip/base/device_guard.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace ilu_factorization {
+
+
+// Runs hipSPARSE ILU(0) on m: queries the workspace size, performs the
+// analysis and then the factorization, which gets m's values writable.
+template <typename ValueType, typename IndexType>
+void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
+                matrix::Csr<ValueType, IndexType> *m)
+{
+    const auto device_id = exec->get_device_id();
+    auto handle = exec->get_hipsparse_handle();
+    gko::hip::device_guard guard{device_id};
+    auto descr = hipsparse::create_mat_descr();
+    auto ilu_info = hipsparse::create_ilu0_info();
+    const auto policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
+    const auto row_ptrs = m->get_const_row_ptrs();
+    const auto col_idxs = m->get_const_col_idxs();
+    const auto vals = m->get_const_values();
+    IndexType size = m->get_size()[0];
+    IndexType num_elems = m->get_num_stored_elements();
+
+    // query the workspace size for ILU(0) and allocate it
+    size_type workspace_size{};
+    hipsparse::ilu0_buffer_size(handle, size, num_elems, descr, vals, row_ptrs,
+                                col_idxs, ilu_info, workspace_size);
+    Array<char> workspace{exec, workspace_size};
+
+    // analysis step followed by the factorization itself
+    hipsparse::ilu0_analysis(handle, size, num_elems, descr, vals, row_ptrs,
+                             col_idxs, ilu_info, policy, workspace.get_data());
+    hipsparse::ilu0(handle, size, num_elems, descr, m->get_values(), row_ptrs,
+                    col_idxs, ilu_info, policy, workspace.get_data());
+
+    hipsparse::destroy_ilu0_info(ilu_info);
+    hipsparse::destroy(descr);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+
+
+}  // namespace ilu_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
new file mode 100644
index 00000000000..d987ff36856
--- /dev/null
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -0,0 +1,213 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/merging.hip.hpp"
+#include "hip/components/prefix_sum.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+constexpr auto default_block_size = 512;
+
+
+// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ict_spgeam_kernels.hpp.inc"
+#include "common/factorization/par_ict_sweep_kernels.hpp.inc"
+
+
+namespace {
+
+
+// Builds l_new from llt, a and l in two passes using one subwarp per row:
+// ict_tri_spgeam_nnz counts the entries per row for the row pointers, then
+// ict_tri_spgeam_init fills the resized column index and value arrays.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *llt,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    matrix::Csr<ValueType, IndexType> *l_new)
+{
+    auto num_rows = static_cast<IndexType>(llt->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    // an empty grid cannot be launched: skip the kernels if there are no rows
+    const bool has_rows = num_blocks > 0;
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    auto llt_row_ptrs = llt->get_const_row_ptrs();
+    auto llt_col_idxs = llt->get_const_col_idxs();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    // count non-zeros per row
+    if (has_rows) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::ict_tri_spgeam_nnz<subwarp_size>),
+            dim3(num_blocks), dim3(default_block_size), 0, 0, llt_row_ptrs,
+            llt_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, num_rows);
+    }
+    // build row ptrs and read back the total number of entries
+    components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1);
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    // resize output arrays
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    // fill columns and values (pointers are fetched after the resize)
+    if (has_rows) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::ict_tri_spgeam_init<subwarp_size>),
+            dim3(num_blocks), dim3(default_block_size), 0, 0, llt_row_ptrs,
+            llt_col_idxs, as_hip_type(llt->get_const_values()), a_row_ptrs,
+            a_col_idxs, as_hip_type(a->get_const_values()),
+            l->get_const_row_ptrs(), l->get_const_col_idxs(),
+            as_hip_type(l->get_const_values()), l_new_row_ptrs,
+            l_new->get_col_idxs(), as_hip_type(l_new->get_values()), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+// Launches kernel::ict_sweep with at least one subwarp per stored entry of l.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_factor(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Coo<ValueType, IndexType> *l_coo)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    auto num_blocks = ceildiv(total_nnz, default_block_size / subwarp_size);
+    if (num_blocks == 0) return;  // a grid without blocks cannot be launched
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::ict_sweep<subwarp_size>), dim3(num_blocks),
+        dim3(default_block_size), 0, 0, a->get_const_row_ptrs(),
+        a->get_const_col_idxs(), as_hip_type(a->get_const_values()),
+        l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
+        l->get_const_col_idxs(), as_hip_type(l->get_values()), total_nnz);
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
+
+
+}  // namespace
+
+
+// Dispatches to a compiled subwarp size based on the average nnz per row.
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *llt,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    matrix::Csr<ValueType, IndexType> *l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        llt->get_num_stored_elements() + a->get_num_stored_elements();
+    // an empty matrix must not divide by zero; it counts as 0 entries per row
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    auto fits = [&](int size) {
+        return total_nnz_per_row <= size || size == config::warp_size;
+    };
+    select_add_candidates(compiled_kernels(), fits, syn::value_list<int>(),
+                          syn::type_list<>(), exec, llt, a, l, l_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+// Dispatches to a compiled subwarp size based on twice l's nnz per row.
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Coo<ValueType, IndexType> *l_coo)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = 2 * l->get_num_stored_elements();
+    // an empty matrix must not divide by zero; it counts as 0 entries per row
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    auto fits = [&](int size) {
+        return total_nnz_per_row <= size || size == config::warp_size;
+    };
+    select_compute_factor(compiled_kernels(), fits, syn::value_list<int>(),
+                          syn::type_list<>(), exec, a, l, l_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
new file mode 100644
index 00000000000..d8caeb90d16
--- /dev/null
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -0,0 +1,101 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilu_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilu_factorization {
+
+
+constexpr int default_block_size{512};
+
+
+#include "common/factorization/par_ilu_kernels.hpp.inc"
+
+
+// Launches kernel::compute_l_u_factors `iterations` times over the stored
+// entries of system_matrix; it gets the l_factor/u_factor values writable.
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const HipExecutor> exec,
+                         size_type iterations,
+                         const matrix::Coo<ValueType, IndexType> *system_matrix,
+                         matrix::Csr<ValueType, IndexType> *l_factor,
+                         matrix::Csr<ValueType, IndexType> *u_factor)
+{
+    iterations = (iterations == 0) ? 10 : iterations;  // 0 selects 10
+    const auto num_elements = system_matrix->get_num_stored_elements();
+    if (num_elements == 0) return;  // a grid without blocks cannot be launched
+    const auto num_blocks = static_cast<uint32>(
+        ceildiv(num_elements, static_cast<size_type>(default_block_size)));
+    for (size_type i = 0; i < iterations; ++i) {
+        hipLaunchKernelGGL(kernel::compute_l_u_factors, dim3(num_blocks),
+                           dim3(default_block_size), 0, 0, num_elements,
+                           as_hip_type(system_matrix->get_const_row_idxs()),
+                           as_hip_type(system_matrix->get_const_col_idxs()),
+                           as_hip_type(system_matrix->get_const_values()),
+                           as_hip_type(l_factor->get_const_row_ptrs()),
+                           as_hip_type(l_factor->get_const_col_idxs()),
+                           as_hip_type(l_factor->get_values()),
+                           as_hip_type(u_factor->get_const_row_ptrs()),
+                           as_hip_type(u_factor->get_const_col_idxs()),
+                           as_hip_type(u_factor->get_values()));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
+
+
+}  // namespace par_ilu_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
new file mode 100644
index 00000000000..e4cd1d6bff7
--- /dev/null
+++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
@@ -0,0 +1,211 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/prefix_sum.hip.hpp"
+#include "hip/components/sorting.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/factorization/par_ilut_select_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// subwarp sizes for filter kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_filter_kernels.hpp.inc"
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+// Filters m into m_out using an approximate threshold found by bucket counts.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter_approx(syn::value_list<int, subwarp_size>,
+                             std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> *tmp,
+                             remove_complex<ValueType> *threshold,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+    // tmp is split into four sub-buffers whose sizes are in units of ValueType
+    size_type tmp_size_totals =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp->resize_and_reset(tmp_size);
+    // typed pointers into tmp, laid out as: totals, partials, oracles, tree
+    auto total_counts = reinterpret_cast<IndexType *>(tmp->get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType *>(tmp->get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char *>(
+        tmp->get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType *>(tmp->get_data() + tmp_size_totals +
+                                    tmp_size_partials + tmp_size_oracles);
+    // the counting step receives the values and all four sub-buffers
+    sampleselect_count(values, size, tree, oracles, partial_counts,
+                       total_counts);
+
+    // determine bucket with correct rank
+    auto bucket = static_cast<unsigned char>(
+        sampleselect_find_bucket(exec, total_counts, rank).idx);
+    *threshold =
+        exec->copy_val_to_host(tree + kernel::searchtree_inner_size + bucket);
+    // we implicitly set the first splitter to -inf, but 0 works as well
+    if (bucket == 0) {
+        *threshold = zero<AbsType>();
+    }
+
+    // filter the elements
+    auto old_row_ptrs = m->get_const_row_ptrs();
+    auto old_col_idxs = m->get_const_col_idxs();
+    auto old_vals = m->get_const_values();
+    // compute nnz for each row
+    auto num_rows = static_cast<IndexType>(m->get_size()[0]);
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, block_size);
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter_nnz<subwarp_size>),
+                       dim3(num_blocks), dim3(default_block_size), 0, 0,
+                       old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs);
+
+    // build row pointers
+    components::prefix_sum(exec, new_row_ptrs, num_rows + 1);
+
+    // build matrix
+    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
+    // resize arrays and update aliases (COO shares col_idxs/values as views)
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    IndexType *new_row_idxs{};  // stays null when m_out_coo is null
+    if (m_out_coo) {
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        coo_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            Array<ValueType>::view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter<subwarp_size>),
+                       dim3(num_blocks), dim3(default_block_size), 0, 0,
+                       old_row_ptrs, old_col_idxs, as_hip_type(old_vals),
+                       oracles, num_rows, bucket, new_row_ptrs, new_row_idxs,
+                       new_col_idxs, as_hip_type(new_vals));
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx,
+                                    threshold_filter_approx);
+
+
+// Dispatches to a compiled subwarp size based on the average nnz per row of m.
+template <typename ValueType, typename IndexType>
+void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> &tmp,
+                             remove_complex<ValueType> &threshold,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto num_rows = m->get_size()[0];
+    auto total_nnz = m->get_num_stored_elements();
+    // an empty matrix must not divide by zero; it counts as 0 entries per row
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    auto fits = [&](int size) {
+        return total_nnz_per_row <= size || size == config::warp_size;
+    };
+    select_threshold_filter_approx(
+        compiled_kernels(), fits, syn::value_list<int>(), syn::type_list<>(),
+        exec, m, rank, &tmp, &threshold, m_out, m_out_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp
new file mode 100644
index 00000000000..f1b57bd9f32
--- /dev/null
+++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp
@@ -0,0 +1,166 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// number of threads per block used for the kernel launches in this file
+constexpr auto default_block_size = 512;
+
+
+// subwarp sizes for filter kernels
+// (the entry point accepts a size once the average number of stored elements
+// per row fits into it, and accepts config::warp_size unconditionally)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_filter_kernels.hpp.inc"
+
+
+namespace {
+
+
+/**
+ * Filters the matrix `a` with the given threshold, providing `subwarp_size`
+ * threads per row, and stores the surviving entries in `m_out`.
+ *
+ * The work is done in two passes: a first kernel counts the surviving
+ * entries of every row, a prefix sum turns the counts into row pointers, and
+ * a second kernel writes the column indices and values. The filter criterion
+ * itself is implemented by the kernels and is not visible here.
+ *
+ * @param exec  executor used for the prefix sum, for reading the new number
+ *              of stored elements back to the host and for the COO views
+ * @param a  input matrix
+ * @param threshold  threshold passed to both kernels
+ * @param m_out  output matrix; its row pointers are overwritten and its
+ *               column index and value arrays are resized
+ * @param m_out_coo  optional COO output (may be null); its row index array
+ *                   is resized, its column index and value arrays become
+ *                   views of the corresponding arrays of `m_out`
+ * @param lower  flag passed to both kernels
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter(syn::value_list<int, subwarp_size>,
+                      std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType> *m_out,
+                      matrix::Coo<ValueType, IndexType> *m_out_coo, bool lower)
+{
+    auto old_row_ptrs = a->get_const_row_ptrs();
+    auto old_col_idxs = a->get_const_col_idxs();
+    auto old_vals = a->get_const_values();
+    // compute nnz for each row
+    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
+    // number of rows handled by one thread block
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, block_size);
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    // a grid without blocks is not a valid launch configuration, so the
+    // kernel is skipped for a matrix without rows
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::threshold_filter_nnz<subwarp_size>),
+            dim3(num_blocks), dim3(default_block_size), 0, 0, old_row_ptrs,
+            as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower);
+    }
+
+    // build row pointers
+    components::prefix_sum(exec, new_row_ptrs, num_rows + 1);
+
+    // build matrix
+    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
+    // resize arrays and update aliases
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    // fetch the pointers only after resizing the arrays
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    // stays null when no COO output was requested
+    IndexType *new_row_idxs{};
+    if (m_out_coo) {
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        // the COO matrix shares column indices and values with m_out
+        coo_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            Array<ValueType>::view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::threshold_filter<subwarp_size>),
+            dim3(num_blocks), dim3(default_block_size), 0, 0, old_row_ptrs,
+            old_col_idxs, as_hip_type(old_vals), num_rows, threshold,
+            new_row_ptrs, new_row_idxs, new_col_idxs, as_hip_type(new_vals),
+            lower);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
+
+
+}  // namespace
+
+/**
+ * Entry point of the threshold filter: chooses a subwarp size from the
+ * average number of stored elements per row of `a` and forwards all
+ * arguments to the matching compiled implementation.
+ *
+ * @param exec  executor handed on to the implementation
+ * @param a  input matrix
+ * @param threshold  threshold handed on to the implementation
+ * @param m_out  output matrix
+ * @param m_out_coo  optional COO output (may be null)
+ * @param lower  flag handed on to the implementation
+ */
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType> *m_out,
+                      matrix::Coo<ValueType, IndexType> *m_out_coo, bool lower)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = a->get_num_stored_elements();
+    // A matrix without rows must not reach the division: integer division by
+    // zero is undefined behavior. Its average row length is treated as zero.
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_threshold_filter(
+        compiled_kernels(),
+        // accept a subwarp size once the average row fits into it; the full
+        // warp is the fallback for longer rows
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
+        m_out_coo, lower);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
new file mode 100644
index 00000000000..ce3d65876b3
--- /dev/null
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -0,0 +1,127 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// prevent compilation failure related to disappearing assert(...) statements
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include "hip/factorization/par_ilut_select_common.hip.hpp"
+
+
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/prefix_sum.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+#include "hip/components/sorting.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+/**
+ * Runs the counting phase of the sample select algorithm on `size` input
+ * values by launching four kernels in sequence: search tree construction,
+ * bucket counting, a per-bucket prefix sum over the block-local counts and a
+ * prefix sum over the bucket totals.
+ *
+ * @param values  input values
+ * @param size  number of input values
+ * @param tree  storage for the search tree built from the input
+ * @param oracles  storage written by the bucket counting kernel (one entry
+ *                 per input value, judging by how callers size it)
+ * @param partial_counts  storage for the block-local bucket counts
+ * @param total_counts  storage for the bucket totals and their prefix sum
+ */
+template <typename ValueType, typename IndexType>
+void sampleselect_count(const ValueType *values, IndexType size,
+                        remove_complex<ValueType> *tree, unsigned char *oracles,
+                        IndexType *partial_counts, IndexType *total_counts)
+{
+    constexpr auto num_buckets = kernel::searchtree_width;
+    // grid size of the element-wise kernel: the thread count is based on
+    // items_per_thread values per thread, with default_block_size threads
+    // per block
+    auto grid_size = static_cast<IndexType>(
+        ceildiv(ceildiv(size, items_per_thread), default_block_size));
+
+    // 1) pick sample, build searchtree (a single block)
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), dim3(1),
+                       dim3(num_buckets), 0, 0, as_hip_type(values), size,
+                       tree);
+
+    // 2) determine bucket sizes
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), dim3(grid_size),
+                       dim3(default_block_size), 0, 0, as_hip_type(values),
+                       size, tree, partial_counts, oracles, items_per_thread);
+
+    // 3) compute prefix sum and total sum over block-local values
+    //    (one block per bucket)
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum),
+                       dim3(num_buckets), dim3(default_block_size), 0, 0,
+                       partial_counts, total_counts, grid_size);
+
+    // 4) compute prefix sum over bucket counts
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<num_buckets>), dim3(1),
+                       dim3(num_buckets), 0, 0, num_buckets, total_counts,
+                       total_counts + num_buckets);
+}
+
+
+// signature used to explicitly instantiate sampleselect_count for every
+// value/index type combination
+#define DECLARE_SSSS_COUNT(ValueType, IndexType)                               \
+    void sampleselect_count(const ValueType *values, IndexType size,           \
+                            remove_complex<ValueType> *tree,                   \
+                            unsigned char *oracles, IndexType *partial_counts, \
+                            IndexType *total_counts)
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
+
+
+/**
+ * Launches the find_bucket kernel (one block of config::warp_size threads)
+ * for the given rank and reads its result back to the host.
+ *
+ * The first three entries of `prefix_sum` are copied to the host after the
+ * kernel ran and returned as bucket index, bucket begin and bucket size.
+ *
+ * @param exec  executor owning `prefix_sum`; its master receives the copy
+ * @param prefix_sum  device array handed to the kernel and read back
+ * @param rank  rank handed to the kernel
+ * @return the three values read back, in the order idx, begin, size
+ */
+template <typename IndexType>
+sampleselect_bucket<IndexType> sampleselect_find_bucket(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType *prefix_sum,
+    IndexType rank)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::find_bucket), dim3(1),
+                       dim3(config::warp_size), 0, 0, prefix_sum, rank);
+
+    // host-side landing zone for the kernel result
+    IndexType host_result[3]{};
+    exec->get_master()->copy_from(exec.get(), 3, prefix_sum, host_result);
+
+    sampleselect_bucket<IndexType> found{};
+    found.idx = host_result[0];
+    found.begin = host_result[1];
+    found.size = host_result[2];
+    return found;
+}
+
+
+// signature used to explicitly instantiate sampleselect_find_bucket for every
+// index type
+#define DECLARE_SSSS_FIND_BUCKET(IndexType)                                 \
+    sampleselect_bucket<IndexType> sampleselect_find_bucket(                \
+        std::shared_ptr<const DefaultExecutor> exec, IndexType *prefix_sum, \
+        IndexType rank)
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/hip/factorization/par_ilut_select_common.hip.hpp b/hip/factorization/par_ilut_select_common.hip.hpp
new file mode 100644
index 00000000000..0758eaa2eaf
--- /dev/null
+++ b/hip/factorization/par_ilut_select_common.hip.hpp
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_
+#define GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace par_ilut_factorization {
+
+
+// number of threads per block used by the element-wise select kernels
+constexpr auto default_block_size = 512;
+// number of input values per thread assumed when sizing the launch grid; also
+// passed to the kernels
+constexpr auto items_per_thread = 16;
+
+
+/**
+ * Runs the counting phase of the sample select algorithm: builds the search
+ * tree, counts the bucket sizes and computes the prefix sums over them.
+ *
+ * @param values  input values
+ * @param size  number of input values
+ * @param tree  storage for the search tree
+ * @param oracles  storage written by the bucket counting kernel
+ * @param partial_counts  storage for the block-local bucket counts
+ * @param total_counts  storage for the bucket totals and their prefix sum
+ */
+template <typename ValueType, typename IndexType>
+void sampleselect_count(const ValueType *values, IndexType size,
+                        remove_complex<ValueType> *tree, unsigned char *oracles,
+                        IndexType *partial_counts, IndexType *total_counts);
+
+
+/**
+ * Result of sampleselect_find_bucket: the three values read back from the
+ * device after the find_bucket kernel ran.
+ */
+template <typename IndexType>
+struct sampleselect_bucket {
+    // index of the bucket; callers pass it on as the bucket to extract
+    IndexType idx;
+    // offset of the bucket; callers subtract it from the rank to obtain a
+    // bucket-local rank
+    IndexType begin;
+    // number of elements in the bucket
+    IndexType size;
+};
+
+
+/**
+ * Launches the find_bucket kernel for the given rank and returns the three
+ * values it leaves at the beginning of `prefix_sum` (bucket index, begin and
+ * size), copied to the host.
+ *
+ * @param exec  executor owning `prefix_sum`
+ * @param prefix_sum  device array handed to the kernel and read back
+ * @param rank  rank handed to the kernel
+ */
+template <typename IndexType>
+sampleselect_bucket<IndexType> sampleselect_find_bucket(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType *prefix_sum,
+    IndexType rank);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_
\ No newline at end of file
diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp
new file mode 100644
index 00000000000..6916344f2bc
--- /dev/null
+++ b/hip/factorization/par_ilut_select_kernel.hip.cpp
@@ -0,0 +1,189 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/prefix_sum.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+#include "hip/components/sorting.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/factorization/par_ilut_select_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+/**
+ * Launches the filter_bucket kernel, which is used by threshold_select to
+ * extract the values of one bucket into `out`.
+ *
+ * @param values  input values
+ * @param size  number of input values
+ * @param oracles  oracles produced by sampleselect_count for `values`
+ * @param partial_counts  block-local counts produced by sampleselect_count
+ * @param bucket  index of the bucket to extract
+ * @param out  output storage for the extracted values
+ */
+template <typename ValueType, typename IndexType>
+void sampleselect_filter(const ValueType *values, IndexType size,
+                         const unsigned char *oracles,
+                         const IndexType *partial_counts, IndexType bucket,
+                         remove_complex<ValueType> *out)
+{
+    // same launch geometry as the counting kernel: the thread count is based
+    // on items_per_thread values per thread
+    auto grid_size = static_cast<IndexType>(
+        ceildiv(ceildiv(size, items_per_thread), default_block_size));
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), dim3(grid_size),
+                       dim3(default_block_size), 0, 0, as_hip_type(values),
+                       size, bucket, oracles, partial_counts, out,
+                       items_per_thread);
+}
+
+
+/**
+ * Determines the selection threshold for the stored values of `m`: the
+ * element of (0-based) rank `rank` among the `remove_complex<ValueType>`
+ * values extracted by the bucket kernels (presumably the magnitudes of the
+ * entries -- the kernels are not visible here).
+ *
+ * The input is split into `kernel::searchtree_width` buckets and only the
+ * bucket containing the requested rank is kept. This is repeated on the
+ * remaining bucket until it is small enough for the single-block base case
+ * kernel. After more than five such rounds the remaining bucket is copied to
+ * the host and handled by std::nth_element instead.
+ *
+ * @param exec  executor used for the device-to-host copies
+ * @param m  matrix whose stored values are searched
+ * @param rank  rank of the element to select
+ * @param tmp1  scratch storage; resized here and carved into bucket totals,
+ *              block-local counts, oracles and the search tree
+ * @param tmp2  scratch storage; resized here to hold two bucket buffers
+ * @param threshold  receives the selected value
+ */
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *m,
+                      IndexType rank, Array<ValueType> &tmp1,
+                      Array<remove_complex<ValueType>> &tmp2,
+                      remove_complex<ValueType> &threshold)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+
+    // tmp1 is an Array<ValueType> that is carved into differently typed
+    // regions, so every region size is rounded up to a multiple of ValueType
+    size_type tmp_size_totals =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size_vals =
+        size / bucket_count * 4;  // pessimistic estimate for temporary storage
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp1.resize_and_reset(tmp_size);
+    tmp2.resize_and_reset(tmp_size_vals);
+
+    // region layout inside tmp1: totals | partials | oracles | tree
+    auto total_counts = reinterpret_cast<IndexType *>(tmp1.get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType *>(tmp1.get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char *>(
+        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType *>(tmp1.get_data() + tmp_size_totals +
+                                    tmp_size_partials + tmp_size_oracles);
+
+    sampleselect_count(values, size, tree, oracles, partial_counts,
+                       total_counts);
+
+    // determine bucket with correct rank, use bucket-local rank
+    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
+    rank -= bucket.begin;
+
+    if (bucket.size * 2 > tmp_size_vals) {
+        // we need to reallocate tmp2
+        tmp2.resize_and_reset(bucket.size * 2);
+    }
+    // tmp2 is split into two halves that alternate as input and output
+    auto tmp21 = tmp2.get_data();
+    auto tmp22 = tmp2.get_data() + bucket.size;
+    // extract target bucket
+    sampleselect_filter(values, size, oracles, partial_counts, bucket.idx,
+                        tmp22);
+
+    // recursively select from smaller buckets
+    int step{};
+    while (bucket.size > kernel::basecase_size) {
+        // the previous output becomes the input of this round
+        std::swap(tmp21, tmp22);
+        const auto *tmp_in = tmp21;
+        auto tmp_out = tmp22;
+
+        sampleselect_count(tmp_in, bucket.size, tree, oracles, partial_counts,
+                           total_counts);
+        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
+        // The oracles and partial counts were just recomputed for tmp_in, so
+        // the bucket to extract is the one found in this round. The index
+        // from the previous level refers to a different bucketing, and would
+        // yield data that matches neither new_bucket.size nor the updated
+        // rank.
+        sampleselect_filter(tmp_in, bucket.size, oracles, partial_counts,
+                            new_bucket.idx, tmp_out);
+
+        rank -= new_bucket.begin;
+        bucket = new_bucket;
+        // we should never need more than 5 recursion steps, this would mean
+        // 256^5 = 2^40. fall back to standard library algorithm in that case.
+        ++step;
+        if (step > 5) {
+            // copy the remaining bucket to the host
+            Array<AbsType> cpu_out_array{
+                exec->get_master(),
+                Array<AbsType>::view(exec, bucket.size, tmp_out)};
+            auto begin = cpu_out_array.get_data();
+            auto end = begin + bucket.size;
+            auto middle = begin + rank;
+            std::nth_element(begin, middle, end);
+            threshold = *middle;
+            return;
+        }
+    }
+
+    // base case
+    // the beginning of tmp1 is reused as the one-element output buffer
+    auto out_ptr = reinterpret_cast<AbsType *>(tmp1.get_data());
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), dim3(1),
+                       dim3(kernel::basecase_block_size), 0, 0, tmp22,
+                       bucket.size, rank, out_ptr);
+    threshold = exec->copy_val_to_host(out_ptr);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp
new file mode 100644
index 00000000000..3d00ce153ba
--- /dev/null
+++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp
@@ -0,0 +1,185 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/merging.hip.hpp"
+#include "hip/components/prefix_sum.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// number of threads per block used for the kernel launches in this file
+constexpr auto default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+// (the entry point accepts a size once the average number of stored elements
+// per row of lu and a combined fits into it, and accepts config::warp_size
+// unconditionally)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_spgeam_kernels.hpp.inc"
+
+
+namespace {
+
+
+/**
+ * Builds `l_new` and `u_new` from `lu`, `a`, `l` and `u`, providing
+ * `subwarp_size` threads per row.
+ *
+ * The work is done in two passes: a first kernel counts the entries of every
+ * row of both outputs from the sparsity patterns of `lu` and `a`, prefix
+ * sums turn the counts into row pointers, and a second kernel fills in the
+ * column indices and values. How entries are combined is implemented by the
+ * kernels and is not visible here.
+ *
+ * @param exec  executor used for the prefix sums and for reading the new
+ *              numbers of stored elements back to the host
+ * @param lu  input matrix; its row count determines the launch size
+ * @param a  input matrix
+ * @param l  input matrix
+ * @param u  input matrix
+ * @param l_new  output matrix; its row pointers are overwritten and its
+ *               column index and value arrays are resized
+ * @param u_new  output matrix; its row pointers are overwritten and its
+ *               column index and value arrays are resized
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *lu,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Csr<ValueType, IndexType> *u,
+                    matrix::Csr<ValueType, IndexType> *l_new,
+                    matrix::Csr<ValueType, IndexType> *u_new)
+{
+    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
+    // number of rows handled by one thread block
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
+    auto lu_row_ptrs = lu->get_const_row_ptrs();
+    auto lu_col_idxs = lu->get_const_col_idxs();
+    auto lu_vals = lu->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    // count non-zeros per row
+    // (a grid without blocks is not a valid launch configuration, so the
+    // kernel is skipped for a matrix without rows)
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::tri_spgeam_nnz<subwarp_size>),
+            dim3(num_blocks), dim3(default_block_size), 0, 0, lu_row_ptrs,
+            lu_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs,
+            u_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs
+    components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum(exec, u_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
+    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
+
+    // fetch the pointers only after resizing the arrays
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    // fill columns and values
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::tri_spgeam_init<subwarp_size>),
+            dim3(num_blocks), dim3(default_block_size), 0, 0, lu_row_ptrs,
+            lu_col_idxs, as_hip_type(lu_vals), a_row_ptrs, a_col_idxs,
+            as_hip_type(a_vals), l_row_ptrs, l_col_idxs, as_hip_type(l_vals),
+            u_row_ptrs, u_col_idxs, as_hip_type(u_vals), l_new_row_ptrs,
+            l_new_col_idxs, as_hip_type(l_new_vals), u_new_row_ptrs,
+            u_new_col_idxs, as_hip_type(u_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+}  // namespace
+
+
+/**
+ * Entry point of add_candidates: chooses a subwarp size from the average
+ * number of stored elements per row of `lu` and `a` combined and forwards
+ * all arguments to the matching compiled implementation.
+ *
+ * @param exec  executor handed on to the implementation
+ * @param lu  input matrix
+ * @param a  input matrix; its row count is used for the average
+ * @param l  input matrix
+ * @param u  input matrix
+ * @param l_new  output matrix
+ * @param u_new  output matrix
+ */
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *lu,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Csr<ValueType, IndexType> *u,
+                    matrix::Csr<ValueType, IndexType> *l_new,
+                    matrix::Csr<ValueType, IndexType> *u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        lu->get_num_stored_elements() + a->get_num_stored_elements();
+    // A matrix without rows must not reach the division: integer division by
+    // zero is undefined behavior. Its average row length is treated as zero.
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_add_candidates(
+        compiled_kernels(),
+        // accept a subwarp size once the average row fits into it; the full
+        // warp is the fallback for longer rows
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
+        u_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp
new file mode 100644
index 00000000000..15fb33ec34e
--- /dev/null
+++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp
@@ -0,0 +1,150 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/merging.hip.hpp"
+#include "hip/components/prefix_sum.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/searching.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// Number of threads per block used when launching the sweep kernel.
+constexpr auto default_block_size = 512;
+
+
+// Subwarp sizes for which the sweep kernel is instantiated; one of them is
+// selected at runtime by compute_l_u_factors().
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_sweep_kernels.hpp.inc"
+
+
+namespace {
+
+
+/**
+ * Launches `kernel::sweep` for one compiled subwarp size.
+ *
+ * The grid is sized from the total number of stored entries of `l` and `u`:
+ * every block of `default_block_size` threads covers
+ * `default_block_size / subwarp_size` entries.
+ * NOTE(review): this presumably maps one subwarp to one entry; the kernel
+ * body lives in the included .inc file -- confirm there.
+ *
+ * @param exec  executor (not used by the launch itself)
+ * @param a  system matrix, read only
+ * @param l  L factor; its values are updated by the kernel
+ * @param l_coo  provides the row indices of the L entries
+ * @param u  U factor; its values are updated by the kernel
+ * @param u_coo  provides row and column indices of the U entries
+ * @param u_csc  second storage of U whose row pointers, column indices and
+ *               values are passed to the kernel
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_l_u_factors(syn::value_list<int, subwarp_size>,
+                         std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *a,
+                         matrix::Csr<ValueType, IndexType> *l,
+                         const matrix::Coo<ValueType, IndexType> *l_coo,
+                         matrix::Csr<ValueType, IndexType> *u,
+                         const matrix::Coo<ValueType, IndexType> *u_coo,
+                         matrix::Csr<ValueType, IndexType> *u_csc)
+{
+    const auto l_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    const auto u_nnz = static_cast<IndexType>(u->get_num_stored_elements());
+    const auto entry_count = static_cast<IndexType>(
+        l->get_num_stored_elements() + u->get_num_stored_elements());
+    const auto entries_per_block = default_block_size / subwarp_size;
+    const auto grid_size = ceildiv(entry_count, entries_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::sweep<subwarp_size>), dim3(grid_size),
+        dim3(default_block_size), 0, 0, a->get_const_row_ptrs(),
+        a->get_const_col_idxs(), as_hip_type(a->get_const_values()),
+        l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
+        l->get_const_col_idxs(), as_hip_type(l->get_values()), l_nnz,
+        u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
+        as_hip_type(u->get_values()), u_csc->get_const_row_ptrs(),
+        u_csc->get_const_col_idxs(), as_hip_type(u_csc->get_values()), u_nnz);
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
+                                    compute_l_u_factors);
+
+
+}  // namespace
+
+
+/**
+ * Public entry point for the ParILUT sweep.
+ *
+ * Computes the average number of stored entries per row of `l` and `u`
+ * combined (integer division by the row count of `a`) and dispatches through
+ * `select_compute_l_u_factors`. A compiled subwarp size is eligible if the
+ * average fits into it, or if it equals the full warp size.
+ *
+ * All arguments are forwarded unchanged to the selected implementation.
+ */
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *a,
+                         matrix::Csr<ValueType, IndexType> *l,
+                         const matrix::Coo<ValueType, IndexType> *l_coo,
+                         matrix::Csr<ValueType, IndexType> *u,
+                         const matrix::Coo<ValueType, IndexType> *u_coo,
+                         matrix::Csr<ValueType, IndexType> *u_csc)
+{
+    const auto row_count = a->get_size()[0];
+    const auto factor_nnz =
+        l->get_num_stored_elements() + u->get_num_stored_elements();
+    const auto avg_nnz_per_row = factor_nnz / row_count;
+    // The full warp size is always acceptable, it acts as the fallback.
+    const auto is_eligible = [&](int candidate_subwarp_size) {
+        if (avg_nnz_per_row <= candidate_subwarp_size) {
+            return true;
+        }
+        return candidate_subwarp_size == config::warp_size;
+    };
+    select_compute_l_u_factors(compiled_kernels(), is_eligible,
+                               syn::value_list<int>(), syn::type_list<>(),
+                               exec, a, l, l_coo, u, u_coo, u_csc);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
new file mode 100644
index 00000000000..c4907e899c7
--- /dev/null
+++ b/hip/matrix/coo_kernels.hip.cpp
@@ -0,0 +1,264 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/coo_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/fill_array.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/format_conversion.hip.hpp"
+#include "hip/components/segment_scan.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+/**
+ * @brief The HIP namespace.
+ *
+ * @ingroup hip
+ */
+namespace hip {
+/**
+ * @brief The Coordinate matrix format namespace.
+ *
+ * @ingroup coo
+ */
+namespace coo {
+
+
+// Threads per block for the conversion kernels launched in this file
+// (convert_row_idxs_to_ptrs, fill_in_dense).
+constexpr int default_block_size = 512;
+// Number of warps in one SpMV thread block; the SpMV launches below use a
+// (warp_size x warps_in_block) block shape.
+constexpr int warps_in_block = 4;
+// Total number of threads in such an SpMV block.
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
+
+
+#include "common/matrix/coo_kernels.hpp.inc"
+
+
+/**
+ * Computes c = A * b for a COO matrix.
+ *
+ * All stored values of `c` are first set to zero, then the product is
+ * accumulated into `c` via spmv2().
+ *
+ * @param exec  executor to run on
+ * @param a  COO matrix
+ * @param b  dense input
+ * @param c  dense output, overwritten
+ */
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const HipExecutor> exec,
+          const matrix::Coo<ValueType, IndexType> *a,
+          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+{
+    auto *out_values = c->get_values();
+    const auto out_count = c->get_num_stored_elements();
+    // reset the output so that spmv2 can accumulate into it
+    components::fill_array(exec, out_values, out_count, zero<ValueType>());
+    spmv2(exec, a, b, c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
+
+
+/**
+ * Computes c = alpha * A * b + beta * c for a COO matrix.
+ *
+ * Implemented in two steps: `c` is scaled by `beta` first, then
+ * advanced_spmv2() is called with `alpha`, `a`, `b` and `c`.
+ *
+ * @param exec  executor to run on
+ * @param alpha  scaling factor passed on to advanced_spmv2()
+ * @param a  COO matrix
+ * @param b  dense input
+ * @param beta  scaling factor applied to `c` before the product is added
+ * @param c  dense input/output
+ */
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *alpha,
+                   const matrix::Coo<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   const matrix::Dense<ValueType> *beta,
+                   matrix::Dense<ValueType> *c)
+{
+    // c <- beta * c
+    dense::scale(exec, beta, c);
+    // hand the scaled c on together with alpha, a and b
+    advanced_spmv2(exec, alpha, a, b, c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
+
+
+/**
+ * Launches the COO product kernel for `a`, `b` and `c` without resetting
+ * `c` beforehand (spmv() zeroes `c` and then calls this function).
+ *
+ * Nothing is launched when calculate_nwarps() reports no warps. For fewer
+ * than four right-hand-side columns the `abstract_spmv` kernel is used with
+ * one grid row per column; otherwise `abstract_spmm` is used with one grid
+ * row per `warp_size` columns.
+ *
+ * @param exec  executor, used to determine the number of warps
+ * @param a  COO matrix
+ * @param b  dense input
+ * @param c  dense output, passed to the kernel as a writable buffer
+ */
+template <typename ValueType, typename IndexType>
+void spmv2(std::shared_ptr<const HipExecutor> exec,
+           const matrix::Coo<ValueType, IndexType> *a,
+           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+{
+    const auto nnz = a->get_num_stored_elements();
+    const auto b_ncols = b->get_size()[1];
+    const dim3 coo_block(config::warp_size, warps_in_block, 1);
+    const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
+
+    if (!(nwarps > 0)) {
+        return;
+    }
+
+    const auto grid_x = ceildiv(nwarps, warps_in_block);
+    const auto lines_per_warp = ceildiv(nnz, nwarps * config::warp_size);
+
+    // TODO: b_ncols needs to be tuned.
+    if (b_ncols < 4) {
+        const dim3 coo_grid(grid_x, b_ncols);
+        const int num_lines = lines_per_warp;
+        hipLaunchKernelGGL(
+            abstract_spmv, dim3(coo_grid), dim3(coo_block), 0, 0, nnz,
+            num_lines, as_hip_type(a->get_const_values()),
+            a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(c->get_values()), c->get_stride());
+        return;
+    }
+
+    const int num_elems = lines_per_warp * config::warp_size;
+    const dim3 coo_grid(grid_x, ceildiv(b_ncols, config::warp_size));
+    hipLaunchKernelGGL(
+        abstract_spmm, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, num_elems,
+        as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+        as_hip_type(a->get_const_row_idxs()), b_ncols,
+        as_hip_type(b->get_const_values()), b->get_stride(),
+        as_hip_type(c->get_values()), c->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
+
+
+/**
+ * Launches the scaled COO product kernel for `alpha`, `a`, `b` and `c`
+ * without scaling `c` beforehand (advanced_spmv() scales `c` by beta and
+ * then calls this function).
+ *
+ * Nothing is launched when calculate_nwarps() reports no warps. For fewer
+ * than four right-hand-side columns the `abstract_spmv` kernel is used with
+ * one grid row per column; otherwise `abstract_spmm` is used with one grid
+ * row per `warp_size` columns.
+ *
+ * @param exec  executor, used to determine the number of warps
+ * @param alpha  scaling factor, passed to the kernel as a device pointer
+ * @param a  COO matrix
+ * @param b  dense input
+ * @param c  dense output, passed to the kernel as a writable buffer
+ */
+template <typename ValueType, typename IndexType>
+void advanced_spmv2(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Dense<ValueType> *alpha,
+                    const matrix::Coo<ValueType, IndexType> *a,
+                    const matrix::Dense<ValueType> *b,
+                    matrix::Dense<ValueType> *c)
+{
+    const auto nnz = a->get_num_stored_elements();
+    const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
+    const dim3 coo_block(config::warp_size, warps_in_block, 1);
+    const auto b_ncols = b->get_size()[1];
+
+    if (!(nwarps > 0)) {
+        return;
+    }
+
+    const auto grid_x = ceildiv(nwarps, warps_in_block);
+    const auto lines_per_warp = ceildiv(nnz, nwarps * config::warp_size);
+
+    // TODO: b_ncols needs to be tuned.
+    if (b_ncols < 4) {
+        const int num_lines = lines_per_warp;
+        const dim3 coo_grid(grid_x, b_ncols);
+        hipLaunchKernelGGL(
+            abstract_spmv, dim3(coo_grid), dim3(coo_block), 0, 0, nnz,
+            num_lines, as_hip_type(alpha->get_const_values()),
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            as_hip_type(a->get_const_row_idxs()),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(c->get_values()), c->get_stride());
+        return;
+    }
+
+    const int num_elems = lines_per_warp * config::warp_size;
+    const dim3 coo_grid(grid_x, ceildiv(b_ncols, config::warp_size));
+    hipLaunchKernelGGL(
+        abstract_spmm, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, num_elems,
+        as_hip_type(alpha->get_const_values()),
+        as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+        as_hip_type(a->get_const_row_idxs()), b_ncols,
+        as_hip_type(b->get_const_values()), b->get_stride(),
+        as_hip_type(c->get_values()), c->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
+
+
+/**
+ * Launches `kernel::convert_row_idxs_to_ptrs` to turn an array of row
+ * indices into a row pointer array.
+ *
+ * @param exec  executor (not used by the launch itself)
+ * @param idxs  row indices, `num_nonzeros` entries
+ * @param num_nonzeros  number of entries in `idxs`
+ * @param ptrs  output array with `length` entries
+ * @param length  size of `ptrs`; convert_to_csr() passes num_rows + 1
+ */
+template <typename IndexType>
+void convert_row_idxs_to_ptrs(std::shared_ptr<const HipExecutor> exec,
+                              const IndexType *idxs, size_type num_nonzeros,
+                              IndexType *ptrs, size_type length)
+{
+    // one thread per stored entry
+    // NOTE(review): for num_nonzeros == 0 this yields a grid of size zero,
+    // so no thread runs and `ptrs` is presumably left unwritten -- confirm
+    // whether callers can pass an empty matrix.
+    const auto grid_dim = ceildiv(num_nonzeros, default_block_size);
+
+    hipLaunchKernelGGL(kernel::convert_row_idxs_to_ptrs, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, as_hip_type(idxs),
+                       num_nonzeros, as_hip_type(ptrs), length);
+}
+
+
+/**
+ * Fills the row pointers of `result` from the row indices of `source`.
+ *
+ * Only the row pointer array of `result` is written here. The number of
+ * entries and the number of rows are both taken from `result`, not from
+ * `source`.
+ *
+ * @param exec  executor to run on
+ * @param source  COO matrix providing the row indices
+ * @param result  CSR matrix whose row pointers are written
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
+{
+    const auto row_ptr_count = result->get_size()[0] + 1;
+    const auto entry_count = result->get_num_stored_elements();
+
+    convert_row_idxs_to_ptrs(exec, source->get_const_row_idxs(), entry_count,
+                             result->get_row_ptrs(), row_ptr_count);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL);
+
+
+/**
+ * Converts a COO matrix into a dense matrix.
+ *
+ * The dense storage of `result` is zeroed first, then the stored entries of
+ * `source` are written into it. Kernels are only launched when their grid is
+ * non-empty: launching with a zero-sized grid is an invalid launch
+ * configuration and would leave an error behind for an empty matrix.
+ *
+ * @param exec  executor (not used by the launches themselves)
+ * @param source  COO matrix to convert
+ * @param result  dense matrix, overwritten; its size and stride determine
+ *                the initialization grid
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
+{
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+    const auto stride = result->get_stride();
+
+    const auto nnz = source->get_num_stored_elements();
+
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
+    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+                             ceildiv(num_rows, block_size.y), 1);
+    // Nothing to zero out if the result has no rows or no storage per row.
+    if (init_grid_dim.x > 0 && init_grid_dim.y > 0) {
+        hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim),
+                           dim3(block_size), 0, 0, num_rows, num_cols, stride,
+                           as_hip_type(result->get_values()));
+    }
+
+    const auto grid_dim = ceildiv(nnz, default_block_size);
+    // An empty source has no entries to scatter; skip the empty launch.
+    if (grid_dim > 0) {
+        hipLaunchKernelGGL(kernel::fill_in_dense, dim3(grid_dim),
+                           dim3(default_block_size), 0, 0, nnz,
+                           as_hip_type(source->get_const_row_idxs()),
+                           as_hip_type(source->get_const_col_idxs()),
+                           as_hip_type(source->get_const_values()), stride,
+                           as_hip_type(result->get_values()));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL);
+
+
+}  // namespace coo
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
new file mode 100644
index 00000000000..cf49d441032
--- /dev/null
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -0,0 +1,1174 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/csr_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+
+
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/merging.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/segment_scan.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Compressed sparse row matrix format namespace.
+ *
+ * @ingroup csr
+ */
+namespace csr {
+
+
+// Default number of threads per block for kernels in this file.
+constexpr int default_block_size = 512;
+// Number of warps in one SpMV thread block.
+constexpr int warps_in_block = 4;
+// Total number of threads in one SpMV thread block.
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
+// Short alias for the warp size.
+constexpr int wsize = config::warp_size;
+// Multiplier applied to the device's warp count when capping the grid size
+// of the classical SpMV kernel (see classical_spmv()).
+constexpr int classical_overweight = 32;
+
+
+/**
+ * A compile-time list of the number items per threads for which spmv kernel
+ * should be compiled.
+ */
+using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
+
+// Subwarp sizes for which the classical SpMV kernel is compiled, in
+// descending order.
+using classical_kernels =
+    syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
+
+// Subwarp sizes for the spgeam kernels.
+// NOTE(review): not used in this section of the file -- confirm usage below.
+using spgeam_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/matrix/csr_kernels.hpp.inc"
+
+
+namespace host_kernel {
+
+
+/**
+ * Launches the merge-path SpMV kernels for one compiled `items_per_thread`.
+ *
+ * The right-hand side is processed one column at a time. For every column
+ * two kernels are launched: `abstract_merge_path_spmv`, which also writes one
+ * (row, value) pair per block into the temporary arrays `row_out` and
+ * `val_out`, followed by a single-block `abstract_reduce` that consumes those
+ * arrays and `c`.
+ *
+ * `alpha` and `beta` must either both be null (plain variant) or both be
+ * non-null (scaled variant); any other combination raises
+ * GKO_KERNEL_NOT_FOUND.
+ *
+ * @param exec  executor used to allocate the temporary arrays
+ * @param a  CSR matrix
+ * @param b  dense input
+ * @param c  dense output
+ * @param alpha  optional scaling factor, passed as a device pointer
+ * @param beta  optional scaling factor, passed as a device pointer
+ */
+template <int items_per_thread, typename ValueType, typename IndexType>
+void merge_path_spmv(syn::value_list<int, items_per_thread>,
+                     std::shared_ptr<const HipExecutor> exec,
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Dense<ValueType> *b,
+                     matrix::Dense<ValueType> *c,
+                     const matrix::Dense<ValueType> *alpha = nullptr,
+                     const matrix::Dense<ValueType> *beta = nullptr)
+{
+    // total work = number of rows + number of stored entries
+    const IndexType total = a->get_size()[0] + a->get_num_stored_elements();
+    const IndexType grid_num =
+        ceildiv(total, spmv_block_size * items_per_thread);
+    const dim3 grid(grid_num);
+    const dim3 block(spmv_block_size);
+    // one temporary (row, value) slot per thread block
+    Array<IndexType> row_out(exec, grid_num);
+    Array<ValueType> val_out(exec, grid_num);
+
+    for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) {
+        if (alpha == nullptr && beta == nullptr) {
+            // plain variant: offset b and c to the current column
+            const auto b_vals = b->get_const_values() + column_id;
+            auto c_vals = c->get_values() + column_id;
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(
+                    kernel::abstract_merge_path_spmv<items_per_thread>),
+                dim3(grid), dim3(block), 0, 0,
+                static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()), as_hip_type(b_vals),
+                b->get_stride(), as_hip_type(c_vals), c->get_stride(),
+                as_hip_type(row_out.get_data()),
+                as_hip_type(val_out.get_data()));
+            // single block folds the per-block temporaries into c
+            hipLaunchKernelGGL(kernel::abstract_reduce, dim3(1),
+                               dim3(spmv_block_size), 0, 0, grid_num,
+                               as_hip_type(val_out.get_data()),
+                               as_hip_type(row_out.get_data()),
+                               as_hip_type(c_vals), c->get_stride());
+
+        } else if (alpha != nullptr && beta != nullptr) {
+            // scaled variant: same structure, with alpha and beta passed on
+            const auto b_vals = b->get_const_values() + column_id;
+            auto c_vals = c->get_values() + column_id;
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(
+                    kernel::abstract_merge_path_spmv<items_per_thread>),
+                dim3(grid), dim3(block), 0, 0,
+                static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(alpha->get_const_values()),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()), as_hip_type(b_vals),
+                b->get_stride(), as_hip_type(beta->get_const_values()),
+                as_hip_type(c_vals), c->get_stride(),
+                as_hip_type(row_out.get_data()),
+                as_hip_type(val_out.get_data()));
+            hipLaunchKernelGGL(kernel::abstract_reduce, dim3(1),
+                               dim3(spmv_block_size), 0, 0, grid_num,
+                               as_hip_type(val_out.get_data()),
+                               as_hip_type(row_out.get_data()),
+                               as_hip_type(alpha->get_const_values()),
+                               as_hip_type(c_vals), c->get_stride());
+        } else {
+            // exactly one of alpha / beta was given: no matching kernel
+            GKO_KERNEL_NOT_FOUND;
+        }
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
+
+
+/**
+ * Determines the number of items per thread for the merge-path SpMV kernel.
+ *
+ * A base value is chosen (per compute capability when HIP targets the NVCC
+ * platform, a fixed value otherwise), rescaled by `4 / sizeof(IndexType)`
+ * and finally raised to the minimum that satisfies
+ * `sizeof(IndexType) + sizeof(ValueType) <= items * sizeof(IndexType)`.
+ *
+ * @param exec  executor queried for the compute capability (NVCC only)
+ *
+ * @return the number of items per thread
+ */
+template <typename ValueType, typename IndexType>
+int compute_items_per_thread(std::shared_ptr<const HipExecutor> exec)
+{
+#if GINKGO_HIP_PLATFORM_NVCC
+
+
+    // compute capability major.minor packed as 0xMm
+    const int compute_capability =
+        (exec->get_major_version() << 4) + exec->get_minor_version();
+    // The base value is decided to make the occupancy 100%
+    // TODO: Extend this list when new GPU is released
+    //       Tune this parameter
+    // 128 threads/block the number of items per threads
+    // 3.0 3.5: 6
+    // 3.7: 14
+    // 5.0, 5.3, 6.0, 6.2: 8
+    // 5.2, 6.1, 7.0: 12
+    int base_items = 6;
+    switch (compute_capability) {
+    case 0x37:
+        base_items = 14;
+        break;
+    case 0x50:
+    case 0x53:
+    case 0x60:
+    case 0x62:
+        base_items = 8;
+        break;
+    case 0x52:
+    case 0x61:
+    case 0x70:
+        base_items = 12;
+        break;
+    default:
+        break;
+    }
+
+
+#else
+
+
+    // HIP uses the minimal base value to make the code work correctly.
+    // TODO: this parameter should be tuned.
+    int base_items = 6;
+
+
+#endif  // GINKGO_HIP_PLATFORM_NVCC
+
+
+    // Lower bound guaranteeing
+    // sizeof(IndexType) + sizeof(ValueType)
+    // <= items_per_thread * sizeof(IndexType)
+    constexpr int lower_bound =
+        ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
+    const int scaled_items = base_items * 4 / sizeof(IndexType);
+    return std::max(lower_bound, scaled_items);
+}
+
+
+/**
+ * Launches the classical CSR SpMV kernel for one compiled subwarp size.
+ *
+ * The grid's x-dimension is the number of blocks needed to give every row
+ * one subwarp, capped by the device's warp count times
+ * `classical_overweight`; the y-dimension is the number of columns of `b`.
+ *
+ * `alpha` and `beta` must either both be null (plain variant) or both be
+ * non-null (scaled variant); any other combination raises
+ * GKO_KERNEL_NOT_FOUND.
+ *
+ * @param exec  executor queried for the device's warp and multiprocessor
+ *              counts
+ * @param a  CSR matrix
+ * @param b  dense input
+ * @param c  dense output
+ * @param alpha  optional scaling factor, passed as a device pointer
+ * @param beta  optional scaling factor, passed as a device pointer
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void classical_spmv(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Dense<ValueType> *b,
+                    matrix::Dense<ValueType> *c,
+                    const matrix::Dense<ValueType> *alpha = nullptr,
+                    const matrix::Dense<ValueType> *beta = nullptr)
+{
+    const auto warp_budget = exec->get_num_warps_per_sm() *
+                             exec->get_num_multiprocessor() *
+                             classical_overweight;
+    const auto blocks_for_rows =
+        ceildiv(a->get_size()[0], spmv_block_size / subwarp_size);
+    const auto block_cap = int64(warp_budget / warps_in_block);
+    const auto gridx = std::min(blocks_for_rows, block_cap);
+    const dim3 grid(gridx, b->get_size()[1]);
+    const dim3 block(spmv_block_size);
+
+    const bool has_alpha = alpha != nullptr;
+    const bool has_beta = beta != nullptr;
+
+    if (!has_alpha && !has_beta) {
+        // plain variant
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::abstract_classical_spmv<subwarp_size>),
+            dim3(grid), dim3(block), 0, 0, a->get_size()[0],
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            as_hip_type(a->get_const_row_ptrs()),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(c->get_values()), c->get_stride());
+    } else if (has_alpha && has_beta) {
+        // scaled variant
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::abstract_classical_spmv<subwarp_size>),
+            dim3(grid), dim3(block), 0, 0, a->get_size()[0],
+            as_hip_type(alpha->get_const_values()),
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            as_hip_type(a->get_const_row_ptrs()),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()),
+            c->get_stride());
+    } else {
+        // exactly one of alpha / beta was given: no matching kernel
+        GKO_KERNEL_NOT_FOUND;
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
+
+
+}  // namespace host_kernel
+
+
+/**
+ * Computes c = A * b for a CSR matrix, dispatching on the name of the
+ * matrix' strategy:
+ *
+ * - "load_balance": zeroes `c` and launches `kernel::abstract_spmv`; throws
+ *   via GKO_NOT_SUPPORTED if the matrix has no srow elements.
+ * - "merge_path": selects a merge_path_spmv() instantiation whose
+ *   items-per-thread equals compute_items_per_thread().
+ * - "classical": selects a classical_spmv() instantiation based on the
+ *   maximum row length reported by the strategy object.
+ * - "sparselib" / "cusparse": calls hipsparse; only available for type
+ *   combinations supported by hipsparse and for unit strides of `b` and `c`,
+ *   otherwise GKO_NOT_IMPLEMENTED is raised.
+ *
+ * Any other strategy name raises GKO_NOT_IMPLEMENTED.
+ *
+ * @param exec  executor to run on
+ * @param a  CSR matrix
+ * @param b  dense input
+ * @param c  dense output, overwritten
+ */
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const HipExecutor> exec,
+          const matrix::Csr<ValueType, IndexType> *a,
+          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+{
+    // query the strategy name once instead of once per comparison
+    const auto strategy_name = a->get_strategy()->get_name();
+    if (strategy_name == "load_balance") {
+        // the kernel accumulates into c, so it has to start from zero
+        components::fill_array(exec, c->get_values(),
+                               c->get_num_stored_elements(), zero<ValueType>());
+        const IndexType nwarps = a->get_num_srow_elements();
+        if (nwarps > 0) {
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
+            const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
+                                b->get_size()[1]);
+            hipLaunchKernelGGL(
+                kernel::abstract_spmv, dim3(csr_grid), dim3(csr_block), 0, 0,
+                nwarps, static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()),
+                as_hip_type(b->get_const_values()),
+                as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
+                as_hip_type(c->get_stride()));
+        } else {
+            GKO_NOT_SUPPORTED(nwarps);
+        }
+    } else if (strategy_name == "merge_path") {
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+    } else if (strategy_name == "classical") {
+        IndexType max_length_per_row = 0;
+        using Tcsr = matrix::Csr<ValueType, IndexType>;
+        // both the classical and the automatical strategy type can report
+        // the name "classical" and provide the maximum row length
+        if (auto strategy =
+                std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                    a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else if (auto strategy = std::dynamic_pointer_cast<
+                       const typename Tcsr::automatical>(a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else {
+            GKO_NOT_SUPPORTED(a->get_strategy());
+        }
+        host_kernel::select_classical_spmv(
+            classical_kernels(),
+            [&max_length_per_row](int compiled_info) {
+                return max_length_per_row >= compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+    } else if (strategy_name == "sparselib" || strategy_name == "cusparse") {
+        if (hipsparse::is_supported<ValueType, IndexType>::value) {
+            // TODO: add implementation for int64 and multiple RHS
+            // Reject unsupported strides before the matrix descriptor is
+            // created, so that the exception cannot leak the descriptor.
+            if (b->get_stride() != 1 || c->get_stride() != 1) {
+                GKO_NOT_IMPLEMENTED;
+            }
+            auto handle = exec->get_hipsparse_handle();
+            auto descr = hipsparse::create_mat_descr();
+            {
+                // the guard is scoped to the hipsparse call only
+                hipsparse::pointer_mode_guard pm_guard(handle);
+                auto row_ptrs = a->get_const_row_ptrs();
+                auto col_idxs = a->get_const_col_idxs();
+                auto alpha = one<ValueType>();
+                auto beta = zero<ValueType>();
+                hipsparse::spmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                a->get_size()[0], a->get_size()[1],
+                                a->get_num_stored_elements(), &alpha, descr,
+                                a->get_const_values(), row_ptrs, col_idxs,
+                                b->get_const_values(), &beta, c->get_values());
+            }
+            hipsparse::destroy(descr);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL);
+
+
+// Advanced sparse matrix-vector product on the HIP executor, dispatched on the
+// name of the strategy stored in `a`. Judging by the calls below, the result
+// is c = alpha * a * b + beta * c.
+//
+// Strategy names handled: "load_balance", "sparselib" / "cusparse" (both go
+// through hipSPARSE), "classical" and "merge_path". Any other name ends in
+// GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *alpha,
+                   const matrix::Csr<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   const matrix::Dense<ValueType> *beta,
+                   matrix::Dense<ValueType> *c)
+{
+    if (a->get_strategy()->get_name() == "load_balance") {
+        // c is scaled by beta up front; the kernel below only receives alpha
+        dense::scale(exec, beta, c);
+
+        const IndexType nwarps = a->get_num_srow_elements();
+
+        if (nwarps > 0) {
+            // one grid column per column of b
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
+            const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
+                                b->get_size()[1]);
+            hipLaunchKernelGGL(
+                kernel::abstract_spmv, dim3(csr_grid), dim3(csr_block), 0, 0,
+                nwarps, static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(alpha->get_const_values()),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()),
+                as_hip_type(b->get_const_values()),
+                as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
+                as_hip_type(c->get_stride()));
+        } else {
+            GKO_NOT_SUPPORTED(nwarps);
+        }
+    } else if (a->get_strategy()->get_name() == "sparselib" ||
+               a->get_strategy()->get_name() == "cusparse") {
+        if (hipsparse::is_supported<ValueType, IndexType>::value) {
+            // TODO: add implementation for int64 and multiple RHS
+            // Validate the strides before the matrix descriptor exists:
+            // bailing out afterwards would skip hipsparse::destroy(descr)
+            // and leave the descriptor behind.
+            if (b->get_stride() != 1 || c->get_stride() != 1) {
+                GKO_NOT_IMPLEMENTED;
+            }
+
+            auto descr = hipsparse::create_mat_descr();
+
+            auto row_ptrs = a->get_const_row_ptrs();
+            auto col_idxs = a->get_const_col_idxs();
+
+            hipsparse::spmv(exec->get_hipsparse_handle(),
+                            HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+                            a->get_size()[1], a->get_num_stored_elements(),
+                            alpha->get_const_values(), descr,
+                            a->get_const_values(), row_ptrs, col_idxs,
+                            b->get_const_values(), beta->get_const_values(),
+                            c->get_values());
+
+            hipsparse::destroy(descr);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else if (a->get_strategy()->get_name() == "classical") {
+        // the row-length bound is read from the concrete strategy object,
+        // which has to be either `classical` or `automatical`
+        IndexType max_length_per_row = 0;
+        using Tcsr = matrix::Csr<ValueType, IndexType>;
+        if (auto strategy =
+                std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                    a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else if (auto strategy = std::dynamic_pointer_cast<
+                       const typename Tcsr::automatical>(a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else {
+            GKO_NOT_SUPPORTED(a->get_strategy());
+        }
+        host_kernel::select_classical_spmv(
+            classical_kernels(),
+            [&max_length_per_row](int compiled_info) {
+                return max_length_per_row >= compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
+    } else if (a->get_strategy()->get_name() == "merge_path") {
+        // pick the compiled kernel whose items-per-thread value matches
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
+
+
+// Sparse matrix-matrix product on the HIP executor, built on the hipsparse
+// spgemm wrappers (buffer size query, nnz count, numeric phase). The scalar
+// passed to hipSPARSE is one and the additional "d" operand is handed over
+// as null pointers with zero non-zeros, so this presumably yields c = a * b.
+//
+// The row pointers of `c` are written in place, while its column index and
+// value arrays are resized through a CsrBuilder once the number of non-zeros
+// is known. Value/index combinations for which hipsparse::is_supported is
+// false end in GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void spgemm(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        // `alpha` and `c_nnz` below live on the host.
+        // NOTE(review): the guard presumably switches the handle to host
+        // pointer mode for the duration of this scope - confirm.
+        hipsparse::pointer_mode_guard pm_guard(handle);
+        auto a_descr = hipsparse::create_mat_descr();
+        auto b_descr = hipsparse::create_mat_descr();
+        auto c_descr = hipsparse::create_mat_descr();
+        auto d_descr = hipsparse::create_mat_descr();
+        auto info = hipsparse::create_spgemm_info();
+
+        auto alpha = one<ValueType>();
+        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        // placeholders for the unused "d" operand and its scalar
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto zero_nnz = IndexType{};
+        // a is m x k, b is k x n
+        auto m = static_cast<IndexType>(a->get_size()[0]);
+        auto n = static_cast<IndexType>(b->get_size()[1]);
+        auto k = static_cast<IndexType>(a->get_size()[1]);
+        auto c_row_ptrs = c->get_row_ptrs();
+        matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+        auto &c_col_idxs_array = c_builder.get_col_idx_array();
+        auto &c_vals_array = c_builder.get_value_array();
+
+        // allocate buffer
+        size_type buffer_size{};
+        hipsparse::spgemm_buffer_size(
+            handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            zero_nnz, null_index, null_index, info, buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz
+        IndexType c_nnz{};
+        hipsparse::spgemm_nnz(
+            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
+            b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index,
+            null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer);
+
+        // accumulate non-zeros
+        // the output arrays are sized only now that c_nnz is known
+        c_col_idxs_array.resize_and_reset(c_nnz);
+        c_vals_array.resize_and_reset(c_nnz);
+        auto c_col_idxs = c_col_idxs_array.get_data();
+        auto c_vals = c_vals_array.get_data();
+        hipsparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                          b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
+                          null_value, null_index, null_index, c_descr, c_vals,
+                          c_row_ptrs, c_col_idxs, info, buffer);
+
+        // release the hipSPARSE objects in reverse order of creation
+        hipsparse::destroy_spgemm_info(info);
+        hipsparse::destroy(d_descr);
+        hipsparse::destroy(c_descr);
+        hipsparse::destroy(b_descr);
+        hipsparse::destroy(a_descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+
+
+namespace {
+
+
+// Builds c = alpha * A + beta * B from raw CSR arrays, with `subwarp_size`
+// selecting the compiled kernel variant.
+//
+// Three steps: a counting kernel writes per-row counts into the row pointer
+// array of `c`, a prefix sum turns them into row pointers, and a second
+// kernel fills the column indices and values after the arrays of `c` have
+// been resized to the total number of non-zeros. The number of rows is taken
+// from `c`.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const HipExecutor> exec, const ValueType *alpha,
+            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
+            const ValueType *a_vals, const ValueType *beta,
+            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
+            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
+{
+    auto m = static_cast<IndexType>(c->get_size()[0]);
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz for alpha * A + beta * B
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);
+    // With zero rows the grid would be empty, which is not a valid launch
+    // configuration; there is nothing to count in that case anyway.
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam_nnz<subwarp_size>),
+                           dim3(num_blocks), dim3(default_block_size), 0, 0,
+                           a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m,
+                           c_row_ptrs);
+    }
+
+    // build row pointers
+    components::prefix_sum(exec, c_row_ptrs, m + 1);
+
+    // accumulate non-zeros for alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    // the last row pointer holds the total number of non-zeros
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    // same empty-grid consideration as for the counting kernel
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam<subwarp_size>),
+                           dim3(num_blocks), dim3(default_block_size), 0, 0,
+                           as_hip_type(alpha), a_row_ptrs, a_col_idxs,
+                           as_hip_type(a_vals), as_hip_type(beta), b_row_ptrs,
+                           b_col_idxs, as_hip_type(b_vals), m, c_row_ptrs,
+                           c_col_idxs, as_hip_type(c_vals));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
+
+}  // namespace
+
+
+// Advanced sparse matrix-matrix product on the HIP executor; judging by the
+// calls below, c = alpha * a * b + beta * d.
+//
+// The product a * b is first computed into temporary CSR arrays with the
+// hipsparse spgemm wrappers (scalar one, "d" operand passed as null), then
+// the temporary is combined with `d` by the spgeam helper, which also takes
+// care of sizing the arrays of `c`. Value/index combinations for which
+// hipsparse::is_supported is false end in GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
+                     const matrix::Dense<ValueType> *alpha,
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     const matrix::Dense<ValueType> *beta,
+                     const matrix::Csr<ValueType, IndexType> *d,
+                     matrix::Csr<ValueType, IndexType> *c)
+{
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        // `one_value` and `c_nnz` below live on the host
+        hipsparse::pointer_mode_guard pm_guard(handle);
+        auto a_descr = hipsparse::create_mat_descr();
+        auto b_descr = hipsparse::create_mat_descr();
+        auto c_descr = hipsparse::create_mat_descr();
+        auto d_descr = hipsparse::create_mat_descr();
+        auto info = hipsparse::create_spgemm_info();
+
+        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto d_vals = d->get_const_values();
+        auto d_row_ptrs = d->get_const_row_ptrs();
+        auto d_col_idxs = d->get_const_col_idxs();
+        // placeholders for the "d" operand of the hipSPARSE call, which is
+        // not used here: d is added afterwards by the spgeam step
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto one_value = one<ValueType>();
+        // a is m x k, b is k x n
+        auto m = static_cast<IndexType>(a->get_size()[0]);
+        auto n = static_cast<IndexType>(b->get_size()[1]);
+        auto k = static_cast<IndexType>(a->get_size()[1]);
+
+        // allocate buffer
+        size_type buffer_size{};
+        hipsparse::spgemm_buffer_size(
+            handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            IndexType{}, null_index, null_index, info, buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz
+        Array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
+        auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data();
+        IndexType c_nnz{};
+        hipsparse::spgemm_nnz(
+            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
+            b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index,
+            null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer);
+
+        // accumulate non-zeros for A * B
+        Array<IndexType> c_tmp_col_idxs_array(exec, c_nnz);
+        Array<ValueType> c_tmp_vals_array(exec, c_nnz);
+        auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data();
+        auto c_tmp_vals = c_tmp_vals_array.get_data();
+        hipsparse::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
+                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                          b_row_ptrs, b_col_idxs, null_value, d_descr,
+                          IndexType{}, null_value, null_index, null_index,
+                          c_descr, c_tmp_vals, c_tmp_row_ptrs, c_tmp_col_idxs,
+                          info, buffer);
+
+        // destroy hipsparse context
+        hipsparse::destroy_spgemm_info(info);
+        hipsparse::destroy(d_descr);
+        hipsparse::destroy(c_descr);
+        hipsparse::destroy(b_descr);
+        hipsparse::destroy(a_descr);
+
+        // The average row length only steers the choice of subwarp size.
+        // With zero rows the quotient is undefined, so fall back to zero
+        // instead of dividing by m.
+        auto total_nnz = c_nnz + d->get_num_stored_elements();
+        auto nnz_per_row = m > 0 ? total_nnz / m : decltype(total_nnz){};
+        select_spgeam(
+            spgeam_kernels(),
+            [&](int compiled_subwarp_size) {
+                return compiled_subwarp_size >= nnz_per_row ||
+                       compiled_subwarp_size == config::warp_size;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec,
+            alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs,
+            c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs,
+            d_vals, c);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+
+
+// Sparse matrix addition c = alpha * a + beta * b on CSR matrices.
+//
+// The work is forwarded to the spgeam helper through select_spgeam; the
+// kernel variant is the first compiled subwarp size that is at least the
+// average number of stored elements per row of a and b combined, or the full
+// warp size if none is large enough.
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const DefaultExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    const auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        a->get_num_stored_elements() + b->get_num_stored_elements();
+    // The average only steers the choice of subwarp size. A matrix without
+    // rows would make the quotient undefined, so use zero in that case.
+    auto nnz_per_row =
+        num_rows > 0 ? total_nnz / num_rows : decltype(total_nnz){};
+    select_spgeam(
+        spgeam_kernels(),
+        [&](int compiled_subwarp_size) {
+            return compiled_subwarp_size >= nnz_per_row ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec,
+        alpha->get_const_values(), a->get_const_row_ptrs(),
+        a->get_const_col_idxs(), a->get_const_values(),
+        beta->get_const_values(), b->get_const_row_ptrs(),
+        b->get_const_col_idxs(), b->get_const_values(), c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
+
+// Launches kernel::convert_row_ptrs_to_idxs over `num_rows` rows, reading the
+// row pointers from `ptrs` and writing into `idxs`. One thread per row is
+// requested, in blocks of default_block_size threads.
+template <typename IndexType>
+void convert_row_ptrs_to_idxs(std::shared_ptr<const HipExecutor> exec,
+                              const IndexType *ptrs, size_type num_rows,
+                              IndexType *idxs)
+{
+    const dim3 block(default_block_size);
+    const dim3 grid(ceildiv(num_rows, default_block_size));
+
+    hipLaunchKernelGGL(kernel::convert_row_ptrs_to_idxs, grid, block, 0, 0,
+                       num_rows, as_hip_type(ptrs), as_hip_type(idxs));
+}
+
+
+// CSR -> COO conversion step handled on the device: expands the row pointers
+// of `source` into the row index array of `result`. Only the row indices are
+// written here; the row count is taken from `result`.
+template <typename ValueType, typename IndexType>
+void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
+{
+    convert_row_ptrs_to_idxs(exec, source->get_const_row_ptrs(),
+                             result->get_size()[0], result->get_row_idxs());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL);
+
+
+// CSR -> Dense conversion in two launches: kernel::initialize_zero_dense
+// clears the dense storage of `result` (including the padding up to its
+// stride), then kernel::fill_in_dense writes the stored entries of `source`.
+template <typename ValueType, typename IndexType>
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
+{
+    const auto rows = result->get_size()[0];
+    const auto cols = result->get_size()[1];
+    const auto ld = result->get_stride();
+
+    // step 1: zero the output with a 2D launch (x spans the stride, y the
+    // rows)
+    const dim3 zero_block(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
+    const dim3 zero_grid(ceildiv(ld, zero_block.x),
+                         ceildiv(rows, zero_block.y), 1);
+    hipLaunchKernelGGL(kernel::initialize_zero_dense, zero_grid, zero_block, 0,
+                       0, rows, cols, ld, as_hip_type(result->get_values()));
+
+    // step 2: scatter the CSR entries, one thread per row
+    const dim3 fill_block(default_block_size);
+    const dim3 fill_grid(ceildiv(rows, default_block_size));
+    hipLaunchKernelGGL(kernel::fill_in_dense, fill_grid, fill_block, 0, 0,
+                       rows, as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(source->get_const_col_idxs()),
+                       as_hip_type(source->get_const_values()), ld,
+                       as_hip_type(result->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL);
+
+
+// CSR -> SELL-P conversion on the HIP executor.
+//
+// Steps: count the stored elements of every row, derive the slice lengths
+// from those counts, prefix-sum the slice sets, and finally copy values and
+// column indices into the arrays of `result`. A slice size or stride factor
+// of zero in `result` is replaced by the corresponding matrix:: default.
+template <typename ValueType, typename IndexType>
+void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
+{
+    const auto num_rows = result->get_size()[0];
+
+    auto result_values = result->get_values();
+    auto result_col_idxs = result->get_col_idxs();
+    auto slice_lengths = result->get_slice_lengths();
+    auto slice_sets = result->get_slice_sets();
+
+    const auto slice_size = (result->get_slice_size() == 0)
+                                ? matrix::default_slice_size
+                                : result->get_slice_size();
+    const auto stride_factor = (result->get_stride_factor() == 0)
+                                   ? matrix::default_stride_factor
+                                   : result->get_stride_factor();
+    const int slice_num = ceildiv(num_rows, slice_size);
+
+    const auto source_values = source->get_const_values();
+    const auto source_row_ptrs = source->get_const_row_ptrs();
+    const auto source_col_idxs = source->get_const_col_idxs();
+
+    // per-row counts, one thread per row
+    auto nnz_per_row = Array<size_type>(exec, num_rows);
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(source_row_ptrs),
+                       as_hip_type(nnz_per_row.get_data()));
+
+    // slice lengths, one block of warp_size threads per slice
+    grid_dim = slice_num;
+
+    hipLaunchKernelGGL(kernel::calculate_slice_lengths, dim3(grid_dim),
+                       dim3(config::warp_size), 0, 0, num_rows, slice_size,
+                       stride_factor, as_hip_type(nnz_per_row.get_const_data()),
+                       as_hip_type(slice_lengths), as_hip_type(slice_sets));
+
+    // turn the per-slice entries of slice_sets into offsets
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
+
+    // copy the entries, one thread per row
+    grid_dim = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(kernel::fill_in_sellp, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, slice_size,
+                       as_hip_type(source_values), as_hip_type(source_row_ptrs),
+                       as_hip_type(source_col_idxs), as_hip_type(slice_lengths),
+                       as_hip_type(slice_sets), as_hip_type(result_col_idxs),
+                       as_hip_type(result_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
+
+
+// CSR -> ELL conversion on the HIP executor: kernel::initialize_zero_ell
+// resets the value and column index arrays of `result`, then
+// kernel::fill_in_ell copies the stored entries of `source` into them.
+template <typename ValueType, typename IndexType>
+void convert_to_ell(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
+{
+    const auto source_values = source->get_const_values();
+    const auto source_row_ptrs = source->get_const_row_ptrs();
+    const auto source_col_idxs = source->get_const_col_idxs();
+
+    auto result_values = result->get_values();
+    auto result_col_idxs = result->get_col_idxs();
+    const auto stride = result->get_stride();
+    const auto max_nnz_per_row = result->get_num_stored_elements_per_row();
+    const auto num_rows = result->get_size()[0];
+
+    // one thread per ELL slot for the reset
+    const auto init_grid_dim =
+        ceildiv(max_nnz_per_row * num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::initialize_zero_ell, dim3(init_grid_dim),
+                       dim3(default_block_size), 0, 0, max_nnz_per_row, stride,
+                       as_hip_type(result_values),
+                       as_hip_type(result_col_idxs));
+
+    // warp_size threads are requested per row for the copy
+    const auto grid_dim =
+        ceildiv(num_rows * config::warp_size, default_block_size);
+
+    hipLaunchKernelGGL(kernel::fill_in_ell, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, stride,
+                       as_hip_type(source_values), as_hip_type(source_row_ptrs),
+                       as_hip_type(source_col_idxs), as_hip_type(result_values),
+                       as_hip_type(result_col_idxs));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
+
+
+// Runs a chain of device reductions over the per-row non-zero counts of
+// `source` and stores the single resulting value in `*result`:
+//   1. count the stored elements of every row,
+//   2. reduce those counts per slice (using `slice_size`, `stride_factor`),
+//   3. reduce the per-slice values into one partial result per block,
+//   4. reduce the partial results with a single block.
+template <typename ValueType, typename IndexType>
+void calculate_total_cols(std::shared_ptr<const HipExecutor> exec,
+                          const matrix::Csr<ValueType, IndexType> *source,
+                          size_type *result, size_type stride_factor,
+                          size_type slice_size)
+{
+    const auto num_rows = source->get_size()[0];
+    const auto slice_num = ceildiv(num_rows, slice_size);
+
+    // 1. per-row counts, one thread per row
+    Array<size_type> row_nnz(exec, num_rows);
+    const auto count_blocks = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(count_blocks),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(row_nnz.get_data()));
+
+    // 2. per-slice reduction, warp_size threads requested per slice
+    const auto slice_blocks =
+        ceildiv(slice_num * config::warp_size, default_block_size);
+    Array<size_type> slice_max(exec, slice_num);
+    hipLaunchKernelGGL(kernel::reduce_max_nnz_per_slice, dim3(slice_blocks),
+                       dim3(default_block_size), 0, 0, num_rows, slice_size,
+                       stride_factor, as_hip_type(row_nnz.get_const_data()),
+                       as_hip_type(slice_max.get_data()));
+
+    // 3. one partial result per block
+    const auto partial_count = ceildiv(slice_num, default_block_size);
+    Array<size_type> partials(exec, partial_count);
+    hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(partial_count),
+                       dim3(default_block_size), 0, 0, slice_num,
+                       as_hip_type(slice_max.get_const_data()),
+                       as_hip_type(partials.get_data()));
+
+    // 4. final reduction of the partial results in a single block
+    Array<size_type> total(exec, 1);
+    hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(1),
+                       dim3(default_block_size), 0, 0, partial_count,
+                       as_hip_type(partials.get_const_data()),
+                       as_hip_type(total.get_data()));
+
+    *result = exec->copy_val_to_host(total.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL);
+
+
+// Writes the transpose of `orig` into `trans` through hipsparse::transpose,
+// copying the values (HIPSPARSE_ACTION_NUMERIC) with zero-based indices.
+// Value/index combinations for which hipsparse::is_supported is false end in
+// GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void transpose(std::shared_ptr<const HipExecutor> exec,
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
+{
+    if (!hipsparse::is_supported<ValueType, IndexType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    hipsparseAction_t action = HIPSPARSE_ACTION_NUMERIC;
+    hipsparseIndexBase_t index_base = HIPSPARSE_INDEX_BASE_ZERO;
+
+    auto handle = exec->get_hipsparse_handle();
+    const auto num_rows = orig->get_size()[0];
+    const auto num_cols = orig->get_size()[1];
+    const auto nnz = orig->get_num_stored_elements();
+
+    // note the output order: values, then col_idxs, then row_ptrs of `trans`
+    hipsparse::transpose(handle, num_rows, num_cols, nnz,
+                         orig->get_const_values(), orig->get_const_row_ptrs(),
+                         orig->get_const_col_idxs(), trans->get_values(),
+                         trans->get_col_idxs(), trans->get_row_ptrs(), action,
+                         index_base);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+
+
+// Conjugate transpose of `orig` into `trans`: a plain transpose through
+// hipsparse::transpose, followed by conjugate_kernel applied to every stored
+// value of `trans`. Value/index combinations for which
+// hipsparse::is_supported is false end in GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void conj_transpose(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
+{
+    if (!hipsparse::is_supported<ValueType, IndexType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    // step 1: transpose, copying values, zero-based indices
+    hipsparseAction_t action = HIPSPARSE_ACTION_NUMERIC;
+    hipsparseIndexBase_t index_base = HIPSPARSE_INDEX_BASE_ZERO;
+
+    hipsparse::transpose(
+        exec->get_hipsparse_handle(), orig->get_size()[0],
+        orig->get_size()[1], orig->get_num_stored_elements(),
+        orig->get_const_values(), orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), trans->get_values(), trans->get_col_idxs(),
+        trans->get_row_ptrs(), action, index_base);
+
+    // step 2: conjugate in place, one thread per stored value
+    const auto trans_nnz = trans->get_num_stored_elements();
+    const dim3 conj_block(default_block_size, 1, 1);
+    const dim3 conj_grid(ceildiv(trans_nnz, conj_block.x), 1, 1);
+    hipLaunchKernelGGL(conjugate_kernel, conj_grid, conj_block, 0, 0,
+                       trans_nnz, as_hip_type(trans->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+
+
+// Row permutation of a CSR matrix for the HIP executor. No kernel is provided
+// here: the entire function body is the GKO_NOT_IMPLEMENTED macro, so none of
+// the arguments is used.
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const HipExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+
+
+// Column permutation of a CSR matrix for the HIP executor. No kernel is
+// provided here: the entire function body is the GKO_NOT_IMPLEMENTED macro,
+// so none of the arguments is used.
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const HipExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
+
+
+// Inverse row permutation of a CSR matrix for the HIP executor. No kernel is
+// provided here: the entire function body is the GKO_NOT_IMPLEMENTED macro,
+// so none of the arguments is used.
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+// Inverse column permutation of a CSR matrix for the HIP executor. No kernel
+// is provided here: the entire function body is the GKO_NOT_IMPLEMENTED
+// macro, so none of the arguments is used.
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const HipExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
+// Determines a single reduced value over the per-row non-zero counts of
+// `source` (kernel::reduce_max_nnz) and stores it in `*result`.
+//
+// The reduction runs in two passes: up to default_block_size blocks each
+// write one partial result, then one block reduces those partials.
+template <typename ValueType, typename IndexType>
+void calculate_max_nnz_per_row(std::shared_ptr<const HipExecutor> exec,
+                               const matrix::Csr<ValueType, IndexType> *source,
+                               size_type *result)
+{
+    const auto num_rows = source->get_size()[0];
+
+    Array<size_type> row_nnz(exec, num_rows);
+    Array<size_type> partials(exec, default_block_size);
+    Array<size_type> reduced(exec, 1);
+
+    // count the stored elements of every row, one thread per row
+    const auto row_blocks = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(row_blocks),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(row_nnz.get_data()));
+
+    // first pass: never use more blocks than `partials` has entries
+    const auto reduce_blocks =
+        row_blocks > default_block_size ? default_block_size : row_blocks;
+    hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(reduce_blocks),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(row_nnz.get_const_data()),
+                       as_hip_type(partials.get_data()));
+
+    // second pass: a single block folds the partial results
+    hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(1),
+                       dim3(default_block_size), 0, 0, reduce_blocks,
+                       as_hip_type(partials.get_const_data()),
+                       as_hip_type(reduced.get_data()));
+
+    *result = exec->copy_val_to_host(reduced.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
+
+
+// CSR -> Hybrid (ELL + COO) conversion on the HIP executor.
+//
+// Steps: reset the ELL part of `result`, compute a per-row count for the COO
+// part from the row pointers and the ELL width, prefix-sum those counts into
+// per-row COO offsets, and finally distribute the entries of `source` over
+// the ELL and COO arrays.
+template <typename ValueType, typename IndexType>
+void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
+{
+    auto ell_val = result->get_ell_values();
+    auto ell_col = result->get_ell_col_idxs();
+    auto coo_val = result->get_coo_values();
+    auto coo_col = result->get_coo_col_idxs();
+    auto coo_row = result->get_coo_row_idxs();
+    const auto stride = result->get_ell_stride();
+    const auto max_nnz_per_row = result->get_ell_num_stored_elements_per_row();
+    const auto num_rows = result->get_size()[0];
+    // one thread per ELL slot for the reset
+    auto grid_dim = ceildiv(max_nnz_per_row * num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::initialize_zero_ell, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, max_nnz_per_row, stride,
+                       as_hip_type(ell_val), as_hip_type(ell_col));
+
+    // per-row COO counts, one thread per row
+    grid_dim = ceildiv(num_rows, default_block_size);
+    auto coo_offset = Array<size_type>(exec, num_rows);
+    hipLaunchKernelGGL(kernel::calculate_hybrid_coo_row_nnz, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       max_nnz_per_row,
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(coo_offset.get_data()));
+
+    // counts -> offsets into the COO arrays
+    components::prefix_sum(exec, coo_offset.get_data(), num_rows);
+
+    // warp_size threads are requested per row for the copy
+    grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
+    hipLaunchKernelGGL(kernel::fill_in_hybrid, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, stride,
+                       max_nnz_per_row, as_hip_type(source->get_const_values()),
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(source->get_const_col_idxs()),
+                       as_hip_type(coo_offset.get_const_data()),
+                       as_hip_type(ell_val), as_hip_type(ell_col),
+                       as_hip_type(coo_val), as_hip_type(coo_col),
+                       as_hip_type(coo_row));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
+
+
+/**
+ * Launches kernel::calculate_nnz_per_row on the row pointers of a CSR matrix.
+ *
+ * The grid holds enough blocks of default_block_size threads to cover every
+ * row of `source`.
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param source  CSR matrix whose row pointers are read
+ * @param result  array whose data buffer is handed to the kernel as output
+ */
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row(std::shared_ptr<const HipExecutor> exec,
+                                const matrix::Csr<ValueType, IndexType> *source,
+                                Array<size_type> *result)
+{
+    const auto row_count = source->get_size()[0];
+    const auto block_count = ceildiv(row_count, default_block_size);
+
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(block_count),
+                       dim3(default_block_size), 0, 0, row_count,
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(result->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL);
+
+
+/**
+ * Sorts the entries of a CSR matrix by column index using hipSPARSE.
+ *
+ * The values are copied to a temporary array, an identity permutation is
+ * created, csrsort reorders the column indices while recording the
+ * permutation, and gather then rewrites the values from the temporary copy
+ * according to that permutation.
+ *
+ * Type combinations for which hipsparse::is_supported is false end up in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * @param exec  executor providing the hipSPARSE handle and device memory
+ * @param to_sort  CSR matrix whose column indices and values are reordered
+ *                 in place
+ */
+template <typename ValueType, typename IndexType>
+void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
+                          matrix::Csr<ValueType, IndexType> *to_sort)
+{
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        auto descr = hipsparse::create_mat_descr();
+        // The descriptor is a manually managed resource: make sure it is
+        // released even if an allocation or a hipSPARSE call below throws.
+        try {
+            auto m = IndexType(to_sort->get_size()[0]);
+            auto n = IndexType(to_sort->get_size()[1]);
+            auto nnz = IndexType(to_sort->get_num_stored_elements());
+            auto row_ptrs = to_sort->get_const_row_ptrs();
+            auto col_idxs = to_sort->get_col_idxs();
+            auto vals = to_sort->get_values();
+
+            // copy values
+            Array<ValueType> tmp_vals_array(exec, nnz);
+            exec->copy(nnz, vals, tmp_vals_array.get_data());
+            auto tmp_vals = tmp_vals_array.get_const_data();
+
+            // init identity permutation
+            Array<IndexType> permutation_array(exec, nnz);
+            auto permutation = permutation_array.get_data();
+            hipsparse::create_identity_permutation(handle, nnz, permutation);
+
+            // allocate buffer
+            size_type buffer_size{};
+            hipsparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs,
+                                           col_idxs, buffer_size);
+            Array<char> buffer_array{exec, buffer_size};
+            auto buffer = buffer_array.get_data();
+
+            // sort column indices
+            hipsparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+                               permutation, buffer);
+
+            // sort values
+            hipsparse::gather(handle, nnz, tmp_vals, vals, permutation);
+        } catch (...) {
+            hipsparse::destroy(descr);
+            throw;
+        }
+
+        hipsparse::destroy(descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
+
+
+/**
+ * Checks on the device whether a CSR matrix is sorted by column index.
+ *
+ * `*is_sorted` is set to true, mirrored into a one-element device array, and
+ * kernel::check_unsorted is launched on the row pointers and column indices
+ * with that device flag as output. Afterwards the device array is assigned
+ * back to the host view that wraps `is_sorted`.
+ *
+ * @param exec  executor used for the device flag; its master executor owns
+ *              the host view
+ * @param to_check  CSR matrix that is inspected
+ * @param is_sorted  host location receiving the result
+ */
+template <typename ValueType, typename IndexType>
+void is_sorted_by_column_index(
+    std::shared_ptr<const HipExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *to_check, bool *is_sorted)
+{
+    // Optimistic start value that is handed to the kernel.
+    *is_sorted = true;
+    auto host_flag = Array<bool>::view(exec->get_master(), 1, is_sorted);
+    auto device_flag = Array<bool>{exec, host_flag};
+
+    auto threads_per_block = default_block_size;
+    auto row_count = static_cast<IndexType>(to_check->get_size()[0]);
+    auto block_count = ceildiv(row_count, threads_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::check_unsorted), dim3(block_count),
+        dim3(threads_per_block), 0, 0, to_check->get_const_row_ptrs(),
+        to_check->get_const_col_idxs(), row_count, device_flag.get_data());
+
+    // Assigning into the view transfers the flag back into *is_sorted.
+    host_flag = device_flag;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
+
+
+}  // namespace csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
new file mode 100644
index 00000000000..4d5eb1da4b3
--- /dev/null
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -0,0 +1,690 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/dense_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup dense
+ */
+namespace dense {
+
+
+// Default number of threads per block used by the kernel launches below.
+constexpr auto default_block_size = 512;
+
+
+// NOTE(review): presumably provides the kernel::* device kernels launched in
+// this file; the included file is not visible here -- confirm.
+#include "common/matrix/dense_kernels.hpp.inc"
+
+
+/**
+ * Multiplies two dense matrices through hipBLAS gemm with the scalars fixed
+ * to one and zero, storing the result in `c`.
+ *
+ * The operands are handed to gemm in the order (b, a), with the dimensions
+ * taken from c's columns, c's rows and a's columns.
+ * NOTE(review): presumably the usual way of driving a column-major BLAS from
+ * row-major storage -- confirm against the hipblas bindings.
+ *
+ * Value types for which hipblas::is_supported is false end up in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * @param exec  executor providing the hipBLAS handle
+ * @param a  first factor
+ * @param b  second factor
+ * @param c  matrix receiving the result
+ */
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const HipExecutor> exec,
+                  const matrix::Dense<ValueType> *a,
+                  const matrix::Dense<ValueType> *b,
+                  matrix::Dense<ValueType> *c)
+{
+    if (!hipblas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else {
+        auto blas_handle = exec->get_hipblas_handle();
+        // The guard stays alive until the end of this branch, i.e. for the
+        // whole gemm call that receives host-side scalars.
+        hipblas::pointer_mode_guard mode_guard(blas_handle);
+        auto unit_scalar = one<ValueType>();
+        auto null_scalar = zero<ValueType>();
+        hipblas::gemm(blas_handle, HIPBLAS_OP_N, HIPBLAS_OP_N,
+                      c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                      &unit_scalar, b->get_const_values(), b->get_stride(),
+                      a->get_const_values(), a->get_stride(), &null_scalar,
+                      c->get_values(), c->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+/**
+ * Scaled dense matrix product through hipBLAS gemm.
+ *
+ * Same call shape as simple_apply, but the two scalars are taken from the
+ * value buffers of `alpha` and `beta` instead of host constants, and no
+ * pointer-mode guard is installed.
+ *
+ * Value types for which hipblas::is_supported is false end up in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * @param exec  executor providing the hipBLAS handle
+ * @param alpha  scalar applied to the product
+ * @param a  first factor
+ * @param b  second factor
+ * @param beta  scalar applied to the previous content of `c`
+ * @param c  matrix that is updated
+ */
+template <typename ValueType>
+void apply(std::shared_ptr<const HipExecutor> exec,
+           const matrix::Dense<ValueType> *alpha,
+           const matrix::Dense<ValueType> *a, const matrix::Dense<ValueType> *b,
+           const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *c)
+{
+    if (!hipblas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else {
+        const auto alpha_ptr = alpha->get_const_values();
+        const auto beta_ptr = beta->get_const_values();
+        hipblas::gemm(exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N,
+                      c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                      alpha_ptr, b->get_const_values(), b->get_stride(),
+                      a->get_const_values(), a->get_stride(), beta_ptr,
+                      c->get_values(), c->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+
+
+/**
+ * Scales the entries of `x` by `alpha`.
+ *
+ * A single-column `x` with a hipBLAS-supported value type is forwarded to
+ * hipblas::scal. Every other case launches kernel::scale, which additionally
+ * receives the number of columns of `alpha`.
+ *
+ * @param exec  executor providing the hipBLAS handle
+ * @param alpha  scaling factor(s)
+ * @param x  matrix that is scaled in place
+ */
+template <typename ValueType>
+void scale(std::shared_ptr<const HipExecutor> exec,
+           const matrix::Dense<ValueType> *alpha, matrix::Dense<ValueType> *x)
+{
+    const auto num_rows = x->get_size()[0];
+    const auto num_cols = x->get_size()[1];
+    const bool use_hipblas =
+        hipblas::is_supported<ValueType>::value && num_cols == 1;
+
+    if (!use_hipblas) {
+        // TODO: tune this parameter
+        constexpr auto threads_per_block = default_block_size;
+        const dim3 launch_grid =
+            ceildiv(num_rows * num_cols, threads_per_block);
+        const dim3 launch_block{config::warp_size, 1,
+                                threads_per_block / config::warp_size};
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::scale<threads_per_block>),
+            dim3(launch_grid), dim3(launch_block), 0, 0, num_rows, num_cols,
+            alpha->get_size()[1], as_hip_type(alpha->get_const_values()),
+            as_hip_type(x->get_values()), x->get_stride());
+    } else {
+        hipblas::scal(exec->get_hipblas_handle(), num_rows,
+                      alpha->get_const_values(), x->get_values(),
+                      x->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
+
+
+/**
+ * Adds `alpha` times `x` onto `y`.
+ *
+ * A single-column `x` with a hipBLAS-supported value type is forwarded to
+ * hipblas::axpy. Every other case launches kernel::add_scaled, which
+ * additionally receives the number of columns of `alpha`.
+ *
+ * @param exec  executor providing the hipBLAS handle
+ * @param alpha  scaling factor(s)
+ * @param x  matrix that is read
+ * @param y  matrix that is updated in place
+ */
+template <typename ValueType>
+void add_scaled(std::shared_ptr<const HipExecutor> exec,
+                const matrix::Dense<ValueType> *alpha,
+                const matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *y)
+{
+    const auto num_rows = x->get_size()[0];
+    const auto num_cols = x->get_size()[1];
+    const bool use_hipblas =
+        hipblas::is_supported<ValueType>::value && num_cols == 1;
+
+    if (!use_hipblas) {
+        // TODO: tune this parameter
+        constexpr auto threads_per_block = default_block_size;
+        const dim3 launch_grid =
+            ceildiv(num_rows * num_cols, threads_per_block);
+        const dim3 launch_block{config::warp_size, 1,
+                                threads_per_block / config::warp_size};
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::add_scaled<threads_per_block>),
+            dim3(launch_grid), dim3(launch_block), 0, 0, num_rows, num_cols,
+            alpha->get_size()[1], as_hip_type(alpha->get_const_values()),
+            as_hip_type(x->get_const_values()), x->get_stride(),
+            as_hip_type(y->get_values()), y->get_stride());
+    } else {
+        hipblas::axpy(exec->get_hipblas_handle(), num_rows,
+                      alpha->get_const_values(), x->get_const_values(),
+                      x->get_stride(), y->get_values(), y->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
+
+
+/**
+ * Computes one dot product per column of `x` and `y`.
+ *
+ * For hipBLAS-supported value types each column is handled by one
+ * hipblas::dot call. Otherwise every column is reduced in two launches:
+ * kernel::compute_partial_dot writes one partial value per block into a
+ * temporary array, and kernel::finalize_dot_computation, launched with a
+ * single block, combines them into the column's entry of `result`.
+ *
+ * @param exec  executor providing the hipBLAS handle and temporary storage
+ * @param x  first operand
+ * @param y  second operand
+ * @param result  matrix whose values receive one entry per column
+ */
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const HipExecutor> exec,
+                 const matrix::Dense<ValueType> *x,
+                 const matrix::Dense<ValueType> *y,
+                 matrix::Dense<ValueType> *result)
+{
+    const auto vec_length = x->get_size()[0];
+    const auto num_vecs = x->get_size()[1];
+
+    if (!hipblas::is_supported<ValueType>::value) {
+        // TODO: these are tuning parameters obtained experimentally, once
+        // we decide how to handle this uniformly, they should be modified
+        // appropriately
+        constexpr auto work_per_thread = 32;
+        constexpr auto block_size = 1024;
+        constexpr auto work_per_block = work_per_thread * block_size;
+
+        const dim3 launch_grid = ceildiv(vec_length, work_per_block);
+        const dim3 launch_block{config::warp_size, 1,
+                                block_size / config::warp_size};
+        // One partial result per block of the first launch.
+        Array<ValueType> partial(exec, launch_grid.x);
+        // TODO: write a kernel which does this more efficiently
+        for (size_type j = 0; j < num_vecs; ++j) {
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(kernel::compute_partial_dot<block_size>),
+                dim3(launch_grid), dim3(launch_block), 0, 0, vec_length,
+                as_hip_type(x->get_const_values() + j), x->get_stride(),
+                as_hip_type(y->get_const_values() + j), y->get_stride(),
+                as_hip_type(partial.get_data()));
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(kernel::finalize_dot_computation<block_size>),
+                dim3(1), dim3(launch_block), 0, 0, launch_grid.x,
+                as_hip_type(partial.get_const_data()),
+                as_hip_type(result->get_values() + j));
+        }
+    } else {
+        // TODO: write a custom kernel which does this more efficiently
+        for (size_type j = 0; j < num_vecs; ++j) {
+            hipblas::dot(exec->get_hipblas_handle(), vec_length,
+                         x->get_const_values() + j, x->get_stride(),
+                         y->get_const_values() + j, y->get_stride(),
+                         result->get_values() + j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+
+
+/**
+ * Computes one 2-norm per column of `x`.
+ *
+ * For hipBLAS-supported value types each column is handled by one
+ * hipblas::norm2 call. Otherwise every column is reduced in two launches:
+ * kernel::compute_partial_norm2 writes one partial value per block into a
+ * temporary array, and kernel::finalize_norm2_computation, launched with a
+ * single block, combines them into the column's entry of `result`.
+ *
+ * @param exec  executor providing the hipBLAS handle and temporary storage
+ * @param x  matrix whose columns are measured
+ * @param result  matrix (of the real counterpart of ValueType) whose values
+ *                receive one entry per column
+ */
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *x,
+                   matrix::Dense<remove_complex<ValueType>> *result)
+{
+    const auto vec_length = x->get_size()[0];
+    const auto num_vecs = x->get_size()[1];
+
+    if (!hipblas::is_supported<ValueType>::value) {
+        using norm_type = remove_complex<ValueType>;
+        // TODO: these are tuning parameters obtained experimentally, once
+        // we decide how to handle this uniformly, they should be modified
+        // appropriately
+        constexpr auto work_per_thread = 32;
+        constexpr auto block_size = 1024;
+        constexpr auto work_per_block = work_per_thread * block_size;
+
+        const dim3 launch_grid = ceildiv(vec_length, work_per_block);
+        const dim3 launch_block{config::warp_size, 1,
+                                block_size / config::warp_size};
+        // One partial result per block of the first launch.
+        Array<norm_type> partial(exec, launch_grid.x);
+        // TODO: write a kernel which does this more efficiently
+        for (size_type j = 0; j < num_vecs; ++j) {
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(kernel::compute_partial_norm2<block_size>),
+                dim3(launch_grid), dim3(launch_block), 0, 0, vec_length,
+                as_hip_type(x->get_const_values() + j), x->get_stride(),
+                as_hip_type(partial.get_data()));
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(kernel::finalize_norm2_computation<block_size>),
+                dim3(1), dim3(launch_block), 0, 0, launch_grid.x,
+                as_hip_type(partial.get_const_data()),
+                as_hip_type(result->get_values() + j));
+        }
+    } else {
+        for (size_type j = 0; j < num_vecs; ++j) {
+            hipblas::norm2(exec->get_hipblas_handle(), vec_length,
+                           x->get_const_values() + j, x->get_stride(),
+                           result->get_values() + j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+
+
+/**
+ * Converts a dense matrix into COO format.
+ *
+ * The nonzeros of every row are counted with calculate_nonzeros_per_row, the
+ * counts are turned into offsets by a prefix sum, and kernel::fill_in_coo is
+ * launched with those offsets, the dense values and the three COO output
+ * arrays.
+ *
+ * @param exec  executor used for the temporary array and the prefix sum
+ * @param source  dense matrix that is read
+ * @param result  COO matrix whose row indices, column indices and values are
+ *                written; its size provides the row and column counts
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
+{
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+
+    auto row_idxs = result->get_row_idxs();
+    auto col_idxs = result->get_col_idxs();
+    auto values = result->get_values();
+
+    auto stride = source->get_stride();
+
+    // Per-row nonzero counts, converted in place into starting offsets.
+    auto nnz_prefix_sum = Array<size_type>(exec, num_rows);
+    calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum);
+
+    components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows);
+
+    // Enough blocks of default_block_size threads to cover every row.
+    const size_type grid_dim = ceildiv(num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::fill_in_coo, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, num_cols,
+                       stride, as_hip_type(nnz_prefix_sum.get_const_data()),
+                       as_hip_type(source->get_const_values()),
+                       as_hip_type(row_idxs), as_hip_type(col_idxs),
+                       as_hip_type(values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
+
+
+/**
+ * Converts a dense matrix into CSR format.
+ *
+ * kernel::count_nnz_per_row writes into the row pointer array, a prefix sum
+ * over num_rows + 1 entries follows, and kernel::fill_in_csr is then
+ * launched with the dense values, the row pointers and the CSR output
+ * arrays.
+ *
+ * @param exec  executor used for the prefix sum
+ * @param source  dense matrix that is read
+ * @param result  CSR matrix whose row pointers, column indices and values
+ *                are written; its size provides the row and column counts
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
+{
+    auto row_count = result->get_size()[0];
+    auto col_count = result->get_size()[1];
+    auto dense_stride = source->get_stride();
+
+    auto csr_row_ptrs = result->get_row_ptrs();
+    auto csr_col_idxs = result->get_col_idxs();
+    auto csr_values = result->get_values();
+
+    // The counting grid is derived from the source's row count, with each
+    // block accounting for default_block_size / warp_size rows (rounded up).
+    const auto rows_per_block = ceildiv(default_block_size, config::warp_size);
+    const auto count_grid = ceildiv(source->get_size()[0], rows_per_block);
+    hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(count_grid),
+                       dim3(default_block_size), 0, 0, row_count, col_count,
+                       dense_stride, as_hip_type(source->get_const_values()),
+                       as_hip_type(csr_row_ptrs));
+
+    components::prefix_sum(exec, csr_row_ptrs, row_count + 1);
+
+    // Enough blocks of default_block_size threads to cover every row.
+    size_type fill_grid = ceildiv(row_count, default_block_size);
+    hipLaunchKernelGGL(kernel::fill_in_csr, dim3(fill_grid),
+                       dim3(default_block_size), 0, 0, row_count, col_count,
+                       dense_stride, as_hip_type(source->get_const_values()),
+                       as_hip_type(csr_row_ptrs), as_hip_type(csr_col_idxs),
+                       as_hip_type(csr_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
+
+
+/**
+ * Converts a dense matrix into ELL format with a single launch of
+ * kernel::fill_in_ell.
+ *
+ * The grid holds enough blocks of default_block_size threads to cover the
+ * stride of `result`.
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param source  dense matrix that is read
+ * @param result  ELL matrix whose column indices and values are written; its
+ *                size, stride and stored elements per row are only read
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_ell(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
+{
+    auto row_count = result->get_size()[0];
+    auto col_count = result->get_size()[1];
+    auto entries_per_row = result->get_num_stored_elements_per_row();
+
+    auto dense_stride = source->get_stride();
+    auto ell_stride = result->get_stride();
+
+    auto ell_col_idxs = result->get_col_idxs();
+    auto ell_values = result->get_values();
+
+    auto block_count = ceildiv(ell_stride, default_block_size);
+    hipLaunchKernelGGL(kernel::fill_in_ell, dim3(block_count),
+                       dim3(default_block_size), 0, 0, row_count, col_count,
+                       dense_stride, as_hip_type(source->get_const_values()),
+                       entries_per_row, ell_stride, as_hip_type(ell_col_idxs),
+                       as_hip_type(ell_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
+
+
+/**
+ * Dense to Hybrid conversion for the HIP executor.
+ *
+ * No implementation is provided in this file: the function body is supplied
+ * by the GKO_NOT_IMPLEMENTED macro.
+ *
+ * @param exec  the HIP executor
+ * @param source  dense matrix to convert
+ * @param result  Hybrid matrix that would receive the conversion
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
+
+
+/**
+ * Converts a dense matrix into SELL-P format.
+ *
+ * The per-row nonzero counts are computed first, then
+ * kernel::calculate_slice_lengths is launched with one block of warp_size
+ * threads per slice, a prefix sum is run over slice_num + 1 entries of the
+ * slice sets, and finally kernel::fill_in_sellp writes the column indices
+ * and values.
+ *
+ * @param exec  executor used for the temporary array and the prefix sum
+ * @param source  dense matrix that is read
+ * @param result  SELL-P matrix whose slice lengths, slice sets, column
+ *                indices and values are written
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
+{
+    const auto stride = source->get_stride();
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+
+    auto vals = result->get_values();
+    auto col_idxs = result->get_col_idxs();
+    auto slice_lengths = result->get_slice_lengths();
+    auto slice_sets = result->get_slice_sets();
+
+    // A slice size / stride factor of zero on the result selects the
+    // library defaults.
+    const auto slice_size = (result->get_slice_size() == 0)
+                                ? matrix::default_slice_size
+                                : result->get_slice_size();
+    const auto stride_factor = (result->get_stride_factor() == 0)
+                                   ? matrix::default_stride_factor
+                                   : result->get_stride_factor();
+    const int slice_num = ceildiv(num_rows, slice_size);
+
+    auto nnz_per_row = Array<size_type>(exec, num_rows);
+    calculate_nonzeros_per_row(exec, source, &nnz_per_row);
+
+    // One block per slice for the slice-length computation.
+    auto grid_dim = slice_num;
+
+    hipLaunchKernelGGL(kernel::calculate_slice_lengths, dim3(grid_dim),
+                       dim3(config::warp_size), 0, 0, num_rows, slice_size,
+                       slice_num, stride_factor,
+                       as_hip_type(nnz_per_row.get_const_data()),
+                       as_hip_type(slice_lengths), as_hip_type(slice_sets));
+
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
+
+    // Enough blocks of default_block_size threads to cover every row.
+    grid_dim = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::fill_in_sellp, dim3(grid_dim), dim3(default_block_size), 0, 0,
+        num_rows, num_cols, slice_size, stride,
+        as_hip_type(source->get_const_values()), as_hip_type(slice_lengths),
+        as_hip_type(slice_sets), as_hip_type(col_idxs), as_hip_type(vals));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
+
+
+/**
+ * Dense to SparsityCsr conversion for the HIP executor.
+ *
+ * No implementation is provided in this file: the function body is supplied
+ * by the GKO_NOT_IMPLEMENTED macro.
+ *
+ * @param exec  the HIP executor
+ * @param source  dense matrix to convert
+ * @param result  SparsityCsr matrix that would receive the conversion
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_sparsity_csr(std::shared_ptr<const HipExecutor> exec,
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
+
+
+/**
+ * Counts the nonzeros of a dense matrix.
+ *
+ * The per-row counts are computed into a temporary array with
+ * calculate_nonzeros_per_row and then summed up by reduce_add_array.
+ *
+ * @param exec  executor used for the temporary array and the reduction
+ * @param source  dense matrix that is inspected
+ * @param result  host location receiving the total
+ */
+template <typename ValueType>
+void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Dense<ValueType> *source, size_type *result)
+{
+    const auto row_count = source->get_size()[0];
+
+    Array<size_type> row_nnz(exec, row_count);
+    calculate_nonzeros_per_row(exec, source, &row_nnz);
+
+    *result = reduce_add_array(exec, row_count, row_nnz.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL);
+
+
+/**
+ * Determines the largest per-row nonzero count of a dense matrix.
+ *
+ * After the per-row counts are computed, kernel::reduce_max_nnz is launched
+ * twice: first with up to default_block_size blocks, each writing one entry
+ * of a temporary array, then with a single block over that array. The final
+ * value is copied to the host.
+ *
+ * @param exec  executor used for temporary storage and the host copy
+ * @param source  dense matrix that is inspected
+ * @param result  host location receiving the maximum
+ */
+template <typename ValueType>
+void calculate_max_nnz_per_row(std::shared_ptr<const HipExecutor> exec,
+                               const matrix::Dense<ValueType> *source,
+                               size_type *result)
+{
+    const auto row_count = source->get_size()[0];
+
+    Array<size_type> row_nnz(exec, row_count);
+    calculate_nonzeros_per_row(exec, source, &row_nnz);
+
+    // Cap the first-stage grid at default_block_size blocks so that the
+    // second stage fits into a single block.
+    const auto blocks_needed = ceildiv(row_count, default_block_size);
+    const size_type first_stage_blocks = (blocks_needed > default_block_size)
+                                             ? default_block_size
+                                             : blocks_needed;
+
+    Array<size_type> partial_max(exec, first_stage_blocks);
+    hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(first_stage_blocks),
+                       dim3(default_block_size),
+                       default_block_size * sizeof(size_type), 0, row_count,
+                       as_hip_type(row_nnz.get_const_data()),
+                       as_hip_type(partial_max.get_data()));
+
+    Array<size_type> device_max(exec, 1);
+    hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(1),
+                       dim3(default_block_size),
+                       default_block_size * sizeof(size_type), 0,
+                       first_stage_blocks,
+                       as_hip_type(partial_max.get_const_data()),
+                       as_hip_type(device_max.get_data()));
+
+    *result = exec->copy_val_to_host(device_max.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
+
+
+/**
+ * Launches kernel::count_nnz_per_row on a dense matrix.
+ *
+ * The grid is sized so that every block accounts for
+ * default_block_size / warp_size rows (rounded up).
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param source  dense matrix whose values are read
+ * @param result  array whose data buffer is handed to the kernel as output
+ */
+template <typename ValueType>
+void calculate_nonzeros_per_row(std::shared_ptr<const HipExecutor> exec,
+                                const matrix::Dense<ValueType> *source,
+                                Array<size_type> *result)
+{
+    auto rows_per_block = ceildiv(default_block_size, config::warp_size);
+    const size_t block_count =
+        ceildiv(source->get_size()[0], rows_per_block);
+
+    const dim3 launch_grid(block_count, 1, 1);
+    const dim3 launch_block(default_block_size, 1, 1);
+    hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(launch_grid),
+                       dim3(launch_block), 0, 0, source->get_size()[0],
+                       source->get_size()[1], source->get_stride(),
+                       as_hip_type(source->get_const_values()),
+                       as_hip_type(result->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL);
+
+
+/**
+ * Computes the total column count used when storing a dense matrix in
+ * slices of `slice_size` rows.
+ *
+ * After the per-row nonzero counts are computed,
+ * kernel::reduce_max_nnz_per_slice produces one value per slice (it also
+ * receives `stride_factor`), and kernel::reduce_total_cols is launched twice
+ * -- once over all slices, once with a single block over the per-block
+ * results -- before the final value is copied to the host.
+ *
+ * @param exec  executor used for temporary storage and the host copy
+ * @param source  dense matrix that is inspected
+ * @param result  host location receiving the total
+ * @param stride_factor  forwarded to kernel::reduce_max_nnz_per_slice
+ * @param slice_size  number of rows per slice
+ */
+template <typename ValueType>
+void calculate_total_cols(std::shared_ptr<const HipExecutor> exec,
+                          const matrix::Dense<ValueType> *source,
+                          size_type *result, size_type stride_factor,
+                          size_type slice_size)
+{
+    const auto num_rows = source->get_size()[0];
+    const auto slice_num = ceildiv(num_rows, slice_size);
+
+    auto nnz_per_row = Array<size_type>(exec, num_rows);
+
+    calculate_nonzeros_per_row(exec, source, &nnz_per_row);
+
+    auto max_nnz_per_slice = Array<size_type>(exec, slice_num);
+
+    // The per-slice reduction is launched with warp_size threads per slice.
+    auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size);
+
+    hipLaunchKernelGGL(kernel::reduce_max_nnz_per_slice, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, slice_size,
+                       stride_factor, as_hip_type(nnz_per_row.get_const_data()),
+                       as_hip_type(max_nnz_per_slice.get_data()));
+
+    // First reduction stage: one partial result per block.
+    grid_dim = ceildiv(slice_num, default_block_size);
+    auto block_results = Array<size_type>(exec, grid_dim);
+
+    hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(grid_dim),
+                       dim3(default_block_size),
+                       default_block_size * sizeof(size_type), 0, slice_num,
+                       as_hip_type(max_nnz_per_slice.get_const_data()),
+                       as_hip_type(block_results.get_data()));
+
+    auto d_result = Array<size_type>(exec, 1);
+
+    // Second reduction stage: a single block combines the partial results.
+    hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(1),
+                       dim3(default_block_size),
+                       default_block_size * sizeof(size_type), 0, grid_dim,
+                       as_hip_type(block_results.get_const_data()),
+                       as_hip_type(d_result.get_data()));
+
+    *result = exec->copy_val_to_host(d_result.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL);
+
+
+/**
+ * Transposes a dense matrix through hipBLAS geam.
+ *
+ * geam is called with HIPBLAS_OP_T on `orig` scaled by one, plus zero times
+ * a second operand; the values of `orig` are passed again for that second
+ * operand with trans' column count as its leading dimension.
+ * NOTE(review): the pointer-mode guard presumably switches hipBLAS to host
+ * scalars for &alpha / &beta -- confirm against its definition.
+ *
+ * Value types for which hipblas::is_supported is false end up in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * @param exec  executor providing the hipBLAS handle
+ * @param orig  matrix that is read
+ * @param trans  matrix receiving the transpose
+ */
+template <typename ValueType>
+void transpose(std::shared_ptr<const HipExecutor> exec,
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
+{
+    if (hipblas::is_supported<ValueType>::value) {
+        auto handle = exec->get_hipblas_handle();
+        {
+            hipblas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();
+            auto beta = zero<ValueType>();
+            hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N,
+                          orig->get_size()[0], orig->get_size()[1], &alpha,
+                          orig->get_const_values(), orig->get_stride(), &beta,
+                          orig->get_const_values(), trans->get_size()[1],
+                          trans->get_values(), trans->get_stride());
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
+
+
+/**
+ * Conjugate-transposes a dense matrix through hipBLAS geam.
+ *
+ * geam is called with HIPBLAS_OP_C on `orig` scaled by one, plus zero times
+ * a second operand; the values of `orig` are passed again for that second
+ * operand with trans' column count as its leading dimension.
+ *
+ * Value types for which hipblas::is_supported is false end up in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * @param exec  executor providing the hipBLAS handle
+ * @param orig  matrix that is read
+ * @param trans  matrix receiving the conjugate transpose
+ */
+template <typename ValueType>
+void conj_transpose(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
+{
+    if (!hipblas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else {
+        auto blas_handle = exec->get_hipblas_handle();
+        // The guard stays alive until the end of this branch, i.e. for the
+        // whole geam call that receives host-side scalars.
+        hipblas::pointer_mode_guard mode_guard(blas_handle);
+        auto unit_scalar = one<ValueType>();
+        auto null_scalar = zero<ValueType>();
+        hipblas::geam(blas_handle, HIPBLAS_OP_C, HIPBLAS_OP_N,
+                      orig->get_size()[0], orig->get_size()[1], &unit_scalar,
+                      orig->get_const_values(), orig->get_stride(),
+                      &null_scalar, orig->get_const_values(),
+                      trans->get_size()[1], trans->get_values(),
+                      trans->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
+
+
+/**
+ * Launches kernel::row_permute on a dense matrix.
+ *
+ * The grid holds enough blocks of default_block_size threads to cover every
+ * entry of `orig`.
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param permutation_indices  permutation handed to the kernel
+ * @param orig  matrix that is read
+ * @param row_permuted  matrix whose values are written
+ */
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const HipExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted)
+{
+    constexpr auto threads_per_block = default_block_size;
+    const auto num_rows = orig->get_size()[0];
+    const auto num_cols = orig->get_size()[1];
+
+    const dim3 launch_grid = ceildiv(num_rows * num_cols, threads_per_block);
+    const dim3 launch_block{config::warp_size, 1,
+                            threads_per_block / config::warp_size};
+    hipLaunchKernelGGL(
+        kernel::row_permute<threads_per_block>, dim3(launch_grid),
+        dim3(launch_block), 0, 0, num_rows, num_cols,
+        as_hip_type(permutation_indices->get_const_data()),
+        as_hip_type(orig->get_const_values()), orig->get_stride(),
+        as_hip_type(row_permuted->get_values()), row_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
+
+
+/**
+ * Launches kernel::column_permute on a dense matrix.
+ *
+ * The grid holds enough blocks of default_block_size threads to cover every
+ * entry of `orig`.
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param permutation_indices  permutation handed to the kernel
+ * @param orig  matrix that is read
+ * @param column_permuted  matrix whose values are written
+ */
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const HipExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
+{
+    constexpr auto threads_per_block = default_block_size;
+    const auto num_rows = orig->get_size()[0];
+    const auto num_cols = orig->get_size()[1];
+
+    const dim3 launch_grid = ceildiv(num_rows * num_cols, threads_per_block);
+    const dim3 launch_block{config::warp_size, 1,
+                            threads_per_block / config::warp_size};
+    hipLaunchKernelGGL(
+        kernel::column_permute<threads_per_block>, dim3(launch_grid),
+        dim3(launch_block), 0, 0, num_rows, num_cols,
+        as_hip_type(permutation_indices->get_const_data()),
+        as_hip_type(orig->get_const_values()), orig->get_stride(),
+        as_hip_type(column_permuted->get_values()),
+        column_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
+
+
+/**
+ * Launches kernel::inverse_row_permute on a dense matrix.
+ *
+ * The grid holds enough blocks of default_block_size threads to cover every
+ * entry of `orig`.
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param permutation_indices  permutation handed to the kernel
+ * @param orig  matrix that is read
+ * @param row_permuted  matrix whose values are written
+ */
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
+{
+    constexpr auto threads_per_block = default_block_size;
+    const auto num_rows = orig->get_size()[0];
+    const auto num_cols = orig->get_size()[1];
+
+    const dim3 launch_grid = ceildiv(num_rows * num_cols, threads_per_block);
+    const dim3 launch_block{config::warp_size, 1,
+                            threads_per_block / config::warp_size};
+    hipLaunchKernelGGL(
+        kernel::inverse_row_permute<threads_per_block>, dim3(launch_grid),
+        dim3(launch_block), 0, 0, num_rows, num_cols,
+        as_hip_type(permutation_indices->get_const_data()),
+        as_hip_type(orig->get_const_values()), orig->get_stride(),
+        as_hip_type(row_permuted->get_values()), row_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+/**
+ * Launches kernel::inverse_column_permute on a dense matrix.
+ *
+ * The grid holds enough blocks of default_block_size threads to cover every
+ * entry of `orig`.
+ *
+ * @param exec  the HIP executor (not used inside this function)
+ * @param permutation_indices  permutation handed to the kernel
+ * @param orig  matrix that is read
+ * @param column_permuted  matrix whose values are written
+ */
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const HipExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
+{
+    constexpr auto threads_per_block = default_block_size;
+    const auto num_rows = orig->get_size()[0];
+    const auto num_cols = orig->get_size()[1];
+
+    const dim3 launch_grid = ceildiv(num_rows * num_cols, threads_per_block);
+    const dim3 launch_block{config::warp_size, 1,
+                            threads_per_block / config::warp_size};
+    hipLaunchKernelGGL(
+        kernel::inverse_column_permute<threads_per_block>, dim3(launch_grid),
+        dim3(launch_block), 0, 0, num_rows, num_cols,
+        as_hip_type(permutation_indices->get_const_data()),
+        as_hip_type(orig->get_const_values()), orig->get_stride(),
+        as_hip_type(column_permuted->get_values()),
+        column_permuted->get_stride());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
+}  // namespace dense
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
new file mode 100644
index 00000000000..c29da194aa6
--- /dev/null
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -0,0 +1,378 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/ell_kernels.hpp"
+
+
+#include <array>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/format_conversion.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The ELL matrix format namespace.
+ *
+ * @ingroup ell
+ */
+namespace ell {
+
+
+constexpr int default_block_size = 512;
+
+
+// TODO: num_threads_per_core and ratio are parameters that should be tuned
+/**
+ * num_threads_per_core is the oversubscribing parameter. There are
+ * `num_threads_per_core` threads assigned to each physical core.
+ */
+constexpr int num_threads_per_core = 4;
+
+
+/**
+ * ratio is the threshold that decides when several threads are used to
+ * reduce each row (#stored elements per row / #rows > ratio).
+ */
+constexpr double ratio = 1e-2;
+
+
+/**
+ * max_thread_per_worker is the maximum number of threads per worker. The
+ * `compiled_kernels` list must be <0, 1, 2, 4, ..., max_thread_per_worker>.
+ */
+constexpr int max_thread_per_worker = 32;
+
+
+/**
+ * A compile-time list of sub-warp sizes for which the spmv kernels should be
+ * compiled.
+ * 0 is a special case where it uses a sub-warp size of max_thread_per_worker
+ * in combination with atomic_adds.
+ */
+using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
+
+
+#include "common/matrix/ell_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int info, typename ValueType, typename IndexType>
+void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
+                   const matrix::Ell<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   matrix::Dense<ValueType> *c,
+                   const matrix::Dense<ValueType> *alpha = nullptr,
+                   const matrix::Dense<ValueType> *beta = nullptr)
+{
+    // info == 0 selects the atomic kernel with max_thread_per_worker threads.
+    constexpr bool atomic = (info == 0);
+    constexpr int num_thread_per_worker = atomic ? max_thread_per_worker : info;
+    const auto nrows = a->get_size()[0];
+    const dim3 block_dim(default_block_size / num_thread_per_worker,
+                         num_thread_per_worker, 1);
+    const dim3 grid_dim(ceildiv(nrows * num_worker_per_row, block_dim.x),
+                        b->get_size()[1], 1);
+    if (alpha != nullptr && beta != nullptr) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
+            dim3(grid_dim), dim3(block_dim), 0, 0, nrows, num_worker_per_row,
+            as_hip_type(alpha->get_const_values()),
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            a->get_stride(), a->get_num_stored_elements_per_row(),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()),
+            c->get_stride());
+    } else if (alpha == nullptr && beta == nullptr) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
+            dim3(grid_dim), dim3(block_dim), 0, 0, nrows, num_worker_per_row,
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            a->get_stride(), a->get_num_stored_elements_per_row(),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(c->get_values()), c->get_stride());
+    } else {
+        GKO_KERNEL_NOT_FOUND;
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
+
+
+template <typename ValueType, typename IndexType>
+std::array<int, 3> compute_thread_worker_and_atomicity(
+    std::shared_ptr<const HipExecutor> exec,
+    const matrix::Ell<ValueType, IndexType> *a)
+{
+    // Returned as {num_thread_per_worker, atomic, num_worker_per_row}.
+    int num_thread_per_worker = 1;
+    int atomic = 0;
+    int num_worker_per_row = 1;
+    const auto nrows = a->get_size()[0];
+    const auto ell_ncols = a->get_num_stored_elements_per_row();
+    // TODO: num_threads_per_core should be tuned for AMD gpu
+    const auto nwarps = exec->get_num_warps_per_sm() *
+                        exec->get_num_multiprocessor() * num_threads_per_core;
+
+    // Use several threads to reduce each row when the matrix is wide.
+    // Starting from 1, num_thread_per_worker is doubled while it stays at
+    // most ell_ncols and at most max_thread_per_worker, so that every thread
+    // has work. Only when it reaches max_thread_per_worker can more than one
+    // worker share a row; their number is limited by ell_ncols and by the
+    // warps available per row (nwarps / nrows). Workers sharing a row write
+    // to the same position, hence atomic is set. nrows > 0 is checked first
+    // so that an empty matrix keeps the defaults and never divides by zero.
+    if (nrows > 0 && static_cast<double>(ell_ncols) / nrows > ratio) {
+        while (num_thread_per_worker < max_thread_per_worker &&
+               (num_thread_per_worker << 1) <= ell_ncols) {
+            num_thread_per_worker <<= 1;
+        }
+        if (num_thread_per_worker == max_thread_per_worker) {
+            num_worker_per_row =
+                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows);
+            num_worker_per_row = std::max(num_worker_per_row, 1);
+        }
+        if (num_worker_per_row > 1) {
+            atomic = 1;
+        }
+    }
+    return {num_thread_per_worker, atomic, num_worker_per_row};
+}
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const HipExecutor> exec,
+          const matrix::Ell<ValueType, IndexType> *a,
+          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+{
+    const auto params = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = params[0];
+    const bool atomic = (params[1] != 0);
+    const int num_worker_per_row = params[2];
+
+    /**
+     * info selects the hip kernel: 0 picks the kernel which uses atomic
+     * operations (c is set to zero beforehand), any other value picks the
+     * kernel without atomic_add and is its number of threads per worker.
+     */
+    const int info = atomic ? 0 : num_thread_per_worker;
+    if (atomic) {
+        components::fill_array(exec, c->get_values(),
+                               c->get_num_stored_elements(), zero<ValueType>());
+    }
+    select_abstract_spmv(
+        compiled_kernels(),
+        [info](int compiled_info) { return compiled_info == info; },
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b,
+        c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *alpha,
+                   const matrix::Ell<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   const matrix::Dense<ValueType> *beta,
+                   matrix::Dense<ValueType> *c)
+{
+    const auto params = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = params[0];
+    const bool atomic = (params[1] != 0);
+    const int num_worker_per_row = params[2];
+
+    /**
+     * info selects the hip kernel: 0 picks the kernel which uses atomic
+     * operations (c is scaled by beta beforehand), any other value picks the
+     * kernel without atomic_add and is its number of threads per worker.
+     */
+    const int info = atomic ? 0 : num_thread_per_worker;
+    if (atomic) {
+        dense::scale(exec, beta, c);
+    }
+    select_abstract_spmv(
+        compiled_kernels(),
+        [info](int compiled_info) { return compiled_info == info; },
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b, c,
+        alpha, beta);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
+{
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+    const auto result_stride = result->get_stride();
+    auto result_values = as_hip_type(result->get_values());
+
+    // Step 1: set the dense result to zero.
+    const dim3 init_block(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
+    const dim3 init_grid(ceildiv(result_stride, init_block.x),
+                         ceildiv(num_rows, init_block.y), 1);
+    hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid),
+                       dim3(init_block), 0, 0, num_rows, num_cols,
+                       result_stride, result_values);
+
+    // Step 2: write the entries stored in the ELL matrix into the result.
+    const auto fill_grid = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::fill_in_dense, dim3(fill_grid), dim3(default_block_size), 0, 0,
+        num_rows, source->get_num_stored_elements_per_row(),
+        source->get_stride(), as_hip_type(source->get_const_col_idxs()),
+        as_hip_type(source->get_const_values()), result_stride, result_values);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
+{
+    auto num_rows = result->get_size()[0];
+
+    auto row_ptrs = result->get_row_ptrs();
+    auto col_idxs = result->get_col_idxs();
+    auto values = result->get_values();
+
+    const auto stride = source->get_stride();
+    const auto max_nnz_per_row = source->get_num_stored_elements_per_row();
+
+    // Step 1: count the nonzeros of each row into row_ptrs (one warp per row).
+    constexpr auto rows_per_block =
+        ceildiv(default_block_size, config::warp_size);
+    const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block);
+    hipLaunchKernelGGL(
+        kernel::count_nnz_per_row, dim3(grid_dim_nnz), dim3(default_block_size),
+        0, 0, num_rows, max_nnz_per_row, stride,
+        as_hip_type(source->get_const_values()), as_hip_type(row_ptrs));
+
+    // Step 2: turn the per-row counts into row pointers.
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
+
+    // Step 3: copy the column indices and values into the CSR arrays.
+    size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0,
+        num_rows, max_nnz_per_row, stride,
+        as_hip_type(source->get_const_values()),
+        as_hip_type(source->get_const_col_idxs()), as_hip_type(row_ptrs),
+        as_hip_type(col_idxs), as_hip_type(values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    size_type *result)
+{
+    // Count the nonzeros of every row first, then add up the per-row counts.
+    const auto num_rows = source->get_size()[0];
+    Array<size_type> row_nnz(exec, num_rows);
+    calculate_nonzeros_per_row(exec, source, &row_nnz);
+    const auto row_nnz_data = row_nnz.get_const_data();
+    *result = reduce_add_array(exec, num_rows, row_nnz_data);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row(std::shared_ptr<const HipExecutor> exec,
+                                const matrix::Ell<ValueType, IndexType> *source,
+                                Array<size_type> *result)
+{
+    const auto num_rows = source->get_size()[0];
+
+    // The launch provides config::warp_size threads for every row.
+    const auto threads_per_row = config::warp_size;
+    const auto grid_dim =
+        ceildiv(num_rows * threads_per_row, default_block_size);
+
+    hipLaunchKernelGGL(
+        kernel::count_nnz_per_row, dim3(grid_dim), dim3(default_block_size), 0,
+        0, num_rows, source->get_num_stored_elements_per_row(),
+        source->get_stride(), as_hip_type(source->get_const_values()),
+        as_hip_type(result->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL);
+
+
+}  // namespace ell
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp
new file mode 100644
index 00000000000..e9efb0eb8ee
--- /dev/null
+++ b/hip/matrix/hybrid_kernels.hip.cpp
@@ -0,0 +1,194 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/hybrid_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_kernels.hpp"
+#include "core/matrix/ell_kernels.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/format_conversion.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/segment_scan.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Hybrid matrix format namespace.
+ *
+ * @ingroup hybrid
+ */
+namespace hybrid {
+
+
+constexpr int default_block_size = 512;  // threads per 1D thread block
+constexpr int warps_in_block = 4;  // y-dimension of the Coo thread blocks
+
+
+#include "common/matrix/hybrid_kernels.hpp.inc"
+
+// The conversion to Dense is not implemented for the HIP executor.
+template <typename ValueType, typename IndexType>
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
+{
+    const auto num_rows = source->get_size()[0];
+    auto coo_offset = Array<IndexType>(exec, num_rows + 1);
+    auto coo_val = source->get_const_coo_values();
+    auto coo_col = source->get_const_coo_col_idxs();
+    auto coo_row = source->get_const_coo_row_idxs();
+    auto ell_val = source->get_const_ell_values();
+    auto ell_col = source->get_const_ell_col_idxs();
+    const auto stride = source->get_ell_stride();
+    const auto max_nnz_per_row = source->get_ell_num_stored_elements_per_row();
+    const auto coo_num_stored_elements = source->get_coo_num_stored_elements();
+
+    // Convert the Coo row indices into row pointers stored in coo_offset
+    size_type grid_num = ceildiv(coo_num_stored_elements, default_block_size);
+    hipLaunchKernelGGL(coo::kernel::convert_row_idxs_to_ptrs, dim3(grid_num),
+                       dim3(default_block_size), 0, 0, as_hip_type(coo_row),
+                       coo_num_stored_elements,
+                       as_hip_type(coo_offset.get_data()), num_rows + 1);
+
+    // Compute the row ptrs of Csr: count Ell and Coo nonzeros per row first
+    auto row_ptrs = result->get_row_ptrs();
+    auto coo_row_ptrs = Array<IndexType>(exec, num_rows);
+    // NOTE(review): grid_num presumes warps_in_block rows per block; confirm.
+    components::fill_array(exec, row_ptrs, num_rows + 1, zero<IndexType>());
+    grid_num = ceildiv(num_rows, warps_in_block);
+    hipLaunchKernelGGL(ell::kernel::count_nnz_per_row, dim3(grid_num),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       max_nnz_per_row, stride, as_hip_type(ell_val),
+                       as_hip_type(row_ptrs));
+    // Coo part: per-row counts are collected in the zeroed coo_row_ptrs.
+    components::fill_array(exec, coo_row_ptrs.get_data(), num_rows,
+                           zero<IndexType>());
+
+    auto nwarps =
+        coo::host_kernel::calculate_nwarps(exec, coo_num_stored_elements);
+    if (nwarps > 0) {
+        int num_lines =
+            ceildiv(coo_num_stored_elements, nwarps * config::warp_size);
+        const dim3 coo_block(config::warp_size, warps_in_block, 1);
+        const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1);
+
+        hipLaunchKernelGGL(
+            kernel::count_coo_row_nnz, dim3(coo_grid), dim3(coo_block), 0, 0,
+            coo_num_stored_elements, num_lines, as_hip_type(coo_val),
+            as_hip_type(coo_row), as_hip_type(coo_row_ptrs.get_data()));
+    }
+    // kernel::add merges coo_row_ptrs into row_ptrs; grid_num is reused.
+    hipLaunchKernelGGL(kernel::add, dim3(grid_num), dim3(default_block_size), 0,
+                       0, num_rows, as_hip_type(row_ptrs),
+                       as_hip_type(coo_row_ptrs.get_const_data()));
+    // Turn the per-row counts into row pointers.
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
+
+    // Fill in the column indices and values of the Csr matrix
+    grid_num = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::fill_in_csr, dim3(grid_num), dim3(default_block_size), 0, 0,
+        num_rows, max_nnz_per_row, stride, as_hip_type(ell_val),
+        as_hip_type(ell_col), as_hip_type(coo_val), as_hip_type(coo_col),
+        as_hip_type(coo_offset.get_const_data()), as_hip_type(row_ptrs),
+        as_hip_type(result->get_col_idxs()), as_hip_type(result->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    size_type *result)
+{
+    // Nonzeros of the Ell part.
+    size_type ell_nnz = 0;
+    ell::count_nonzeros(exec, source->get_ell(), &ell_nnz);
+
+    // Nonzeros of the Coo part; stays 0 when calculate_nwarps returns 0.
+    size_type coo_nnz = 0;
+    const auto coo_stored = source->get_coo_num_stored_elements();
+    const auto nwarps = coo::host_kernel::calculate_nwarps(exec, coo_stored);
+    if (nwarps > 0) {
+        const auto coo = source->get_coo();
+        const auto num_rows = source->get_size()[0];
+        const int num_lines = ceildiv(coo_stored, nwarps * config::warp_size);
+        const dim3 coo_block(config::warp_size, warps_in_block, 1);
+        const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1);
+        Array<IndexType> row_nnz(exec, num_rows);
+        components::fill_array(exec, row_nnz.get_data(), num_rows,
+                               zero<IndexType>());
+        hipLaunchKernelGGL(kernel::count_coo_row_nnz, dim3(coo_grid),
+                           dim3(coo_block), 0, 0, coo_stored, num_lines,
+                           as_hip_type(coo->get_const_values()),
+                           as_hip_type(coo->get_const_row_idxs()),
+                           as_hip_type(row_nnz.get_data()));
+        coo_nnz = reduce_add_array(exec, num_rows, row_nnz.get_const_data());
+    }
+    *result = ell_nnz + coo_nnz;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL);
+
+
+}  // namespace hybrid
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
new file mode 100644
index 00000000000..95a621f1886
--- /dev/null
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -0,0 +1,227 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/sellp_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The SELL-P matrix format namespace.
+ *
+ * @ingroup sellp
+ */
+namespace sellp {
+
+
+constexpr auto default_block_size = 512;  // threads per 1D thread block
+
+
+#include "common/matrix/sellp_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const HipExecutor> exec,
+          const matrix::Sellp<ValueType, IndexType> *a,
+          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+{
+    // default_slice_size threads per block; grid.y spans the columns of b.
+    const dim3 block_dim(matrix::default_slice_size);
+    const dim3 grid_dim(ceildiv(a->get_size()[0], matrix::default_slice_size),
+                        b->get_size()[1]);
+    hipLaunchKernelGGL(
+        spmv_kernel, dim3(grid_dim), dim3(block_dim), 0, 0, a->get_size()[0],
+        b->get_size()[1], b->get_stride(), c->get_stride(),
+        a->get_const_slice_lengths(), a->get_const_slice_sets(),
+        as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+        as_hip_type(b->get_const_values()), as_hip_type(c->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *alpha,
+                   const matrix::Sellp<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   const matrix::Dense<ValueType> *beta,
+                   matrix::Dense<ValueType> *c)
+{
+    // default_slice_size threads per block; grid.y spans the columns of b.
+    const dim3 block_dim(matrix::default_slice_size);
+    const dim3 grid_dim(ceildiv(a->get_size()[0], matrix::default_slice_size),
+                        b->get_size()[1]);
+    hipLaunchKernelGGL(
+        advanced_spmv_kernel, dim3(grid_dim), dim3(block_dim), 0, 0,
+        a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(),
+        a->get_const_slice_lengths(), a->get_const_slice_sets(),
+        as_hip_type(alpha->get_const_values()),
+        as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+        as_hip_type(b->get_const_values()),
+        as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
+{
+    const auto num_rows = source->get_size()[0];
+    const auto num_cols = source->get_size()[1];
+    const auto slice_size = source->get_slice_size();
+    const auto result_stride = result->get_stride();
+    auto result_values = as_hip_type(result->get_values());
+
+    // Step 1: set the dense result to zero.
+    const dim3 init_block(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
+    const dim3 init_grid(ceildiv(result_stride, init_block.x),
+                         ceildiv(num_rows, init_block.y), 1);
+    hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid),
+                       dim3(init_block), 0, 0, num_rows, num_cols,
+                       result_stride, result_values);
+
+    // Step 2: fill in the stored entries, launching threads_per_row threads
+    // for every row of every slice (padding rows of the last slice included).
+    constexpr auto threads_per_row = config::warp_size;
+    const auto padded_rows = slice_size * ceildiv(num_rows, slice_size);
+    const auto grid_dim =
+        ceildiv(padded_rows * threads_per_row, default_block_size);
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::fill_in_dense<threads_per_row>), dim3(grid_dim),
+        dim3(default_block_size), 0, 0, num_rows, num_cols, result_stride,
+        slice_size, as_hip_type(source->get_const_slice_lengths()),
+        as_hip_type(source->get_const_slice_sets()),
+        as_hip_type(source->get_const_col_idxs()),
+        as_hip_type(source->get_const_values()), result_values);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
+{
+    const auto num_rows = source->get_size()[0];
+    const auto slice_size = source->get_slice_size();
+
+    const auto source_values = source->get_const_values();
+    const auto source_slice_sets = source->get_const_slice_sets();
+    const auto source_col_idxs = source->get_const_col_idxs();
+
+    auto result_values = result->get_values();
+    auto result_col_idxs = result->get_col_idxs();
+    auto result_row_ptrs = result->get_row_ptrs();
+
+    // Step 1: count the nonzeros of every row into result_row_ptrs; the launch
+    // provides config::warp_size threads per row.
+    auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::count_nnz_per_row, dim3(grid_dim), dim3(default_block_size), 0,
+        0, num_rows, slice_size, as_hip_type(source_slice_sets),
+        as_hip_type(source_values), as_hip_type(result_row_ptrs));
+
+    // Step 2: turn the per-row counts into row pointers.
+    components::prefix_sum(exec, result_row_ptrs, num_rows + 1);
+
+    // Step 3: copy the column indices and values into the CSR arrays.
+    grid_dim = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0,
+        num_rows, slice_size, as_hip_type(source_slice_sets),
+        as_hip_type(source_col_idxs), as_hip_type(source_values),
+        as_hip_type(result_row_ptrs), as_hip_type(result_col_idxs),
+        as_hip_type(result_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    size_type *result)
+{
+    const auto num_rows = source->get_size()[0];
+    Array<size_type> nnz_per_row(exec, num_rows);
+
+    // Per-row counts; the launch provides config::warp_size threads per row.
+    const auto grid_dim =
+        ceildiv(num_rows * config::warp_size, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::count_nnz_per_row, dim3(grid_dim), dim3(default_block_size), 0,
+        0, num_rows, source->get_slice_size(),
+        as_hip_type(source->get_const_slice_sets()),
+        as_hip_type(source->get_const_values()),
+        as_hip_type(nnz_per_row.get_data()));
+
+    // Sum of the per-row counts.
+    *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL);
+
+
+}  // namespace sellp
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
new file mode 100644
index 00000000000..8ab3066f1ff
--- /dev/null
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -0,0 +1,124 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/sparsity_csr_kernels.hpp"
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The SparsityCsr matrix format namespace. Every kernel in it is
+ *        declared with GKO_NOT_IMPLEMENTED in place of a function body.
+ * @ingroup sparsity
+ */
+namespace sparsity_csr {
+
+// spmv taking operands a and b and the output c; HIP version not implemented.
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const HipExecutor> exec,
+          const matrix::SparsityCsr<ValueType, IndexType> *a,
+          const matrix::Dense<ValueType> *b,
+          matrix::Dense<ValueType> *c) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
+
+// spmv variant with the extra operands alpha and beta; not implemented.
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *alpha,
+                   const matrix::SparsityCsr<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   const matrix::Dense<ValueType> *beta,
+                   matrix::Dense<ValueType> *c) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
+
+// Diagonal element count reported via num_diagonal_elements; not implemented.
+template <typename ValueType, typename IndexType>
+void count_num_diagonal_elements(
+    std::shared_ptr<const HipExecutor> exec,
+    const matrix::SparsityCsr<ValueType, IndexType> *matrix,
+    size_type *num_diagonal_elements) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL);
+
+// Diagonal removal given row_ptrs and col_idxs, output matrix; not implemented.
+template <typename ValueType, typename IndexType>
+void remove_diagonal_elements(
+    std::shared_ptr<const HipExecutor> exec, const IndexType *row_ptrs,
+    const IndexType *col_idxs,
+    matrix::SparsityCsr<ValueType, IndexType> *matrix) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
+
+// Transpose reading orig and writing trans; HIP version not implemented.
+template <typename ValueType, typename IndexType>
+void transpose(std::shared_ptr<const HipExecutor> exec,
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
+
+// Sort of to_sort by column index; HIP version not implemented.
+template <typename ValueType, typename IndexType>
+void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
+                          matrix::SparsityCsr<ValueType, IndexType> *to_sort)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
+
+// Sortedness check of to_check, answer reported via is_sorted; not implemented.
+template <typename ValueType, typename IndexType>
+void is_sorted_by_column_index(
+    std::shared_ptr<const HipExecutor> exec,
+    const matrix::SparsityCsr<ValueType, IndexType> *to_check,
+    bool *is_sorted) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
+
+
+}  // namespace sparsity_csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp
new file mode 100644
index 00000000000..f2289eba530
--- /dev/null
+++ b/hip/preconditioner/isai_kernels.hip.cpp
@@ -0,0 +1,166 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/isai_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/merging.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Isai preconditioner namespace.
+ * @ref Isai
+ * @ingroup isai
+ */
+namespace isai {
+
+// Launch configuration: a subwarp spans row_size_limit threads, 2 per block.
+constexpr int subwarp_size{row_size_limit};
+constexpr int subwarps_per_block{2};
+constexpr int default_block_size{subwarps_per_block * subwarp_size};
+
+
+#include "common/preconditioner/isai_kernels.hpp.inc"
+
+
+// Launches the lower- or upper-triangular inverse generation kernel (chosen
+// by `lower`), then calls prefix_sum on both excess pointer arrays.
+template <typename ValueType, typename IndexType>
+void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Csr<ValueType, IndexType> *input,
+                          matrix::Csr<ValueType, IndexType> *inverse,
+                          IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs,
+                          bool lower)
+{
+    const auto num_rows = input->get_size()[0];
+    const auto rows = static_cast<IndexType>(num_rows);
+    const auto in_vals = as_hip_type(input->get_const_values());
+    const auto out_vals = as_hip_type(inverse->get_values());
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_rows, threads.x / subwarp_size), 1, 1);
+    if (!lower) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::generate_u_inverse<subwarp_size, subwarps_per_block>),
+            blocks, threads, 0, 0, rows, input->get_const_row_ptrs(),
+            input->get_const_col_idxs(), in_vals, inverse->get_row_ptrs(),
+            inverse->get_col_idxs(), out_vals, excess_rhs_ptrs, excess_nz_ptrs);
+    } else {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::generate_l_inverse<subwarp_size, subwarps_per_block>),
+            blocks, threads, 0, 0, rows, input->get_const_row_ptrs(),
+            input->get_const_col_idxs(), in_vals, inverse->get_row_ptrs(),
+            inverse->get_col_idxs(), out_vals, excess_rhs_ptrs, excess_nz_ptrs);
+    }
+    components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1);
+    components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
+
+
+// Launches kernel::generate_excess_system; the grid is sized from input's rows.
+template <typename ValueType, typename IndexType>
+void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Csr<ValueType, IndexType> *input,
+                            const matrix::Csr<ValueType, IndexType> *inverse,
+                            const IndexType *excess_rhs_ptrs,
+                            const IndexType *excess_nz_ptrs,
+                            matrix::Csr<ValueType, IndexType> *excess_system,
+                            matrix::Dense<ValueType> *excess_rhs)
+{
+    const auto num_rows = input->get_size()[0];
+    const auto rows = static_cast<IndexType>(num_rows);
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_rows, threads.x / subwarp_size), 1, 1);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::generate_excess_system<subwarp_size>), blocks,
+        threads, 0, 0, rows, input->get_const_row_ptrs(),
+        input->get_const_col_idxs(), as_hip_type(input->get_const_values()),
+        inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(),
+        excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(),
+        excess_system->get_col_idxs(), as_hip_type(excess_system->get_values()),
+        as_hip_type(excess_rhs->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+
+
+// Launches kernel::copy_excess_solution with inverse's values as writable arg.
+template <typename ValueType, typename IndexType>
+void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
+                             const IndexType *excess_rhs_ptrs,
+                             const matrix::Dense<ValueType> *excess_solution,
+                             matrix::Csr<ValueType, IndexType> *inverse)
+{
+    const auto num_rows = inverse->get_size()[0];
+    const auto rows = static_cast<IndexType>(num_rows);
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_rows, threads.x / subwarp_size), 1, 1);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::copy_excess_solution<subwarp_size>), blocks,
+        threads, 0, 0, rows, inverse->get_const_row_ptrs(), excess_rhs_ptrs,
+        as_hip_type(excess_solution->get_const_values()),
+        as_hip_type(inverse->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp
new file mode 100644
index 00000000000..d7d3e87970c
--- /dev/null
+++ b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp
@@ -0,0 +1,149 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "core/base/extended_float.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/warp_blas.hip.hpp"
+#include "hip/preconditioner/jacobi_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+#include "common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
+
+
+namespace {
+
+
+// Launches the advanced apply kernel instantiated for max_block_size.
+template <int warps_per_block, int max_block_size, typename ValueType,
+          typename IndexType>
+void advanced_apply(
+    syn::value_list<int, max_block_size>, size_type num_blocks,
+    const precision_reduction *block_precisions,
+    const IndexType *block_pointers, const ValueType *blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    const ValueType *alpha, const ValueType *b, size_type b_stride,
+    ValueType *x, size_type x_stride)
+{
+    constexpr int subwarp_size = get_larger_power(max_block_size);
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
+    const dim3 grid(ceildiv(num_blocks, warps_per_block * blocks_per_warp));
+    const dim3 block(subwarp_size, blocks_per_warp, warps_per_block);
+    const auto d_blocks = as_hip_type(blocks);
+    const auto d_alpha = as_hip_type(alpha);
+    const auto d_b = as_hip_type(b);
+    const auto d_x = as_hip_type(x);
+    if (block_precisions == nullptr) {  // no block_precisions: plain kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::advanced_apply<max_block_size, subwarp_size,
+                                                   warps_per_block>),
+            grid, block, 0, 0, d_blocks, storage_scheme, block_pointers,
+            num_blocks, d_alpha, d_b, b_stride, d_x, x_stride);
+    } else {  // adaptive kernel additionally receives block_precisions
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::advanced_adaptive_apply<max_block_size, subwarp_size,
+                                                warps_per_block>),
+            grid, block, 0, 0, d_blocks, storage_scheme, block_precisions,
+            block_pointers, num_blocks, d_alpha, d_b, b_stride, d_x, x_stride);
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply);
+
+
+}  // namespace
+
+
+// Scales x by beta, then dispatches select_advanced_apply once per column of b.
+template <typename ValueType, typename IndexType>
+void apply(std::shared_ptr<const HipExecutor> exec, size_type num_blocks,
+           uint32 max_block_size,
+           const preconditioner::block_interleaved_storage_scheme<IndexType>
+               &storage_scheme,
+           const Array<precision_reduction> &block_precisions,
+           const Array<IndexType> &block_pointers,
+           const Array<ValueType> &blocks,
+           const matrix::Dense<ValueType> *alpha,
+           const matrix::Dense<ValueType> *b,
+           const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)
+{
+    // TODO: write a special kernel for multiple RHS
+    dense::scale(exec, beta, x);
+    const auto fits = [&](int size) { return max_block_size <= size; };
+    const auto num_rhs = b->get_size()[1];
+    for (size_type col = 0; col < num_rhs; ++col) {
+        select_advanced_apply(
+            compiled_kernels(), fits,
+            syn::value_list<int, config::min_warps_per_block>(),
+            syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+            block_pointers.get_const_data(), blocks.get_const_data(),
+            storage_scheme, alpha->get_const_values(),
+            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
+            x->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+
+
+}  // namespace jacobi
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/jacobi_common.hip.hpp b/hip/preconditioner/jacobi_common.hip.hpp
new file mode 100644
index 00000000000..d81dd3f9e97
--- /dev/null
+++ b/hip/preconditioner/jacobi_common.hip.hpp
@@ -0,0 +1,84 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_PRECONDITIONER_JACOBI_COMMON_HIP_HPP_
+#define GKO_HIP_PRECONDITIONER_JACOBI_COMMON_HIP_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/synthesizer/containers.hpp>
+
+
+#include "hip/base/config.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace jacobi {
+
+
+/**
+ * A compile-time list of block sizes for which dedicated generate and apply
+ * kernels should be compiled.
+ */
+#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
+using compiled_kernels = syn::as_list<syn::range<1, config::warp_size + 1, 1>>;
+#else
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 13, 16, 32, config::warp_size>;
+#endif
+
+
+/**
+ * Returns the first value in the sequence `guess`, `2 * guess`, `4 * guess`,
+ * ... that is not smaller than `value`; with the default `guess` of 1 this is
+ * the smallest power of two greater than or equal to `value`.
+ *
+ * @param value  the lower bound for the result
+ * @param guess  the starting candidate, doubled until it reaches `value`
+ *
+ * @return the first candidate that is greater than or equal to `value`
+ */
+constexpr int get_larger_power(int value, int guess = 1)
+{
+    return guess >= value ? guess : get_larger_power(value, guess << 1);
+}
+
+
+}  // namespace jacobi
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_PRECONDITIONER_JACOBI_COMMON_HIP_HPP_
diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp
new file mode 100644
index 00000000000..6f8def4af6e
--- /dev/null
+++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp
@@ -0,0 +1,150 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "core/base/extended_float.hpp"
+#include "core/components/fill_array.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/diagonal_block_manipulation.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/components/warp_blas.hip.hpp"
+#include "hip/preconditioner/jacobi_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+#include "common/preconditioner/jacobi_generate_kernel.hpp.inc"
+
+
+namespace {
+
+
+// Launches the (adaptive) generate kernel instantiated for max_block_size.
+template <int warps_per_block, int max_block_size, typename ValueType,
+          typename IndexType>
+void generate(syn::value_list<int, max_block_size>,
+              const matrix::Csr<ValueType, IndexType> *mtx,
+              remove_complex<ValueType> accuracy, ValueType *block_data,
+              const preconditioner::block_interleaved_storage_scheme<IndexType>
+                  &storage_scheme,
+              remove_complex<ValueType> *conditioning,
+              precision_reduction *block_precisions,
+              const IndexType *block_ptrs, size_type num_blocks)
+{
+    constexpr int subwarp_size = get_larger_power(max_block_size);
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
+    const dim3 grid(ceildiv(num_blocks, warps_per_block * blocks_per_warp));
+    const dim3 block(subwarp_size, blocks_per_warp, warps_per_block);
+    const auto rows = mtx->get_size()[0];
+    const auto ptrs = mtx->get_const_row_ptrs();
+    const auto cols = mtx->get_const_col_idxs();
+    const auto vals = as_hip_type(mtx->get_const_values());
+    const auto d_blocks = as_hip_type(block_data);
+    if (block_precisions == nullptr) {  // no block_precisions: plain kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::generate<max_block_size, subwarp_size,
+                                             warps_per_block>),
+            grid, block, 0, 0, rows, ptrs, cols, vals, d_blocks, storage_scheme,
+            block_ptrs, num_blocks);
+    } else {  // adaptive kernel also gets accuracy/conditioning/precisions
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::adaptive_generate<max_block_size, subwarp_size,
+                                          warps_per_block>),
+            grid, block, 0, 0, rows, ptrs, cols, vals, as_hip_type(accuracy),
+            d_blocks, storage_scheme, as_hip_type(conditioning),
+            block_precisions, block_ptrs, num_blocks);
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate);
+
+
+}  // namespace
+
+
+// Zero-fills blocks, then dispatches select_generate for max_block_size.
+template <typename ValueType, typename IndexType>
+void generate(std::shared_ptr<const HipExecutor> exec,
+              const matrix::Csr<ValueType, IndexType> *system_matrix,
+              size_type num_blocks, uint32 max_block_size,
+              remove_complex<ValueType> accuracy,
+              const preconditioner::block_interleaved_storage_scheme<IndexType>
+                  &storage_scheme,
+              Array<remove_complex<ValueType>> &conditioning,
+              Array<precision_reduction> &block_precisions,
+              const Array<IndexType> &block_pointers, Array<ValueType> &blocks)
+{
+    // zero-initialize the whole block storage first
+    components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(),
+                           zero<ValueType>());
+    const auto fits = [&](int size) { return max_block_size <= size; };
+    select_generate(
+        compiled_kernels(), fits,
+        syn::value_list<int, config::min_warps_per_block>(), syn::type_list<>(),
+        system_matrix, accuracy, blocks.get_data(), storage_scheme,
+        conditioning.get_data(), block_precisions.get_data(),
+        block_pointers.get_const_data(), num_blocks);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_GENERATE_KERNEL);
+
+
+}  // namespace jacobi
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
new file mode 100644
index 00000000000..b2d249f12b7
--- /dev/null
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -0,0 +1,262 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/preconditioner/jacobi_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+namespace {
+
+
+// a total of 32 (NVCC) or 16 (HCC) warps per block, i.e. 1024 threads
+#if GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
+// with current architectures, at most 32 warps can be scheduled per SM (and
+// current GPUs have at most 84 SMs)
+constexpr int default_grid_size = 32 * 32 * 128;
+
+
+#include "common/preconditioner/jacobi_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+size_type find_natural_blocks(std::shared_ptr<const HipExecutor> exec,
+                              const matrix::Csr<ValueType, IndexType> *mtx,
+                              int32 max_block_size,
+                              IndexType *__restrict__ block_ptrs)
+{
+    const auto num_rows = mtx->get_size()[0];
+    Array<size_type> nums(exec, 1);
+    // clamp the unsigned `num_rows - 1` so an empty matrix cannot wrap around
+    Array<bool> matching(exec, num_rows > 0 ? num_rows - 1 : 0);
+    const dim3 block_size(config::warp_size, 1, 1);
+    const dim3 grid_size(ceildiv(num_rows * config::warp_size, block_size.x));
+    // a launch with an empty grid is an invalid configuration, so skip it
+    if (grid_size.x > 0) {
+        hipLaunchKernelGGL(compare_adjacent_rows, grid_size, block_size, 0, 0,
+                           num_rows, max_block_size, mtx->get_const_row_ptrs(),
+                           mtx->get_const_col_idxs(), matching.get_data());
+    }
+    hipLaunchKernelGGL(generate_natural_block_pointer, dim3(1), dim3(1), 0, 0,
+                       num_rows, max_block_size, matching.get_const_data(),
+                       block_ptrs, nums.get_data());
+    nums.set_executor(exec->get_master());
+    return nums.get_const_data()[0];
+}
+
+
+template <typename IndexType>
+inline size_type agglomerate_supervariables(
+    std::shared_ptr<const HipExecutor> exec, int32 max_block_size,
+    size_type num_natural_blocks, IndexType *block_ptrs)
+{
+    // One-element device buffer; its entry is read back as the return value.
+    Array<size_type> result(exec, 1);
+    const dim3 one(1);
+    hipLaunchKernelGGL(agglomerate_supervariables_kernel, one, one, 0, 0,
+                       max_block_size, num_natural_blocks, block_ptrs,
+                       result.get_data());
+    result.set_executor(exec->get_master());  // make it host-readable
+    return *result.get_const_data();
+}
+
+
+}  // namespace
+
+
+void initialize_precisions(std::shared_ptr<const HipExecutor> exec,
+                           const Array<precision_reduction> &source,
+                           Array<precision_reduction> &precisions)
+{
+    const auto block_size = default_num_warps * config::warp_size;
+    const int32 wanted = ceildiv(precisions.get_num_elems(), block_size);
+    const auto grid_size = min(default_grid_size, wanted);
+    if (grid_size <= 0) return;  // an empty grid is an invalid launch config
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array<default_num_warps>),
+                       dim3(grid_size), dim3(block_size), 0, 0,
+                       source.get_const_data(), source.get_num_elems(),
+                       precisions.get_data(), precisions.get_num_elems());
+}
+
+
+template <typename ValueType, typename IndexType>
+void find_blocks(std::shared_ptr<const HipExecutor> exec,
+                 const matrix::Csr<ValueType, IndexType> *system_matrix,
+                 uint32 max_block_size, size_type &num_blocks,
+                 Array<IndexType> &block_pointers)
+{
+    auto *ptrs = block_pointers.get_data();  // handed to both passes below
+    const auto found =
+        find_natural_blocks(exec, system_matrix, max_block_size, ptrs);
+    num_blocks = agglomerate_supervariables(exec, max_block_size, found, ptrs);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
+
+
+namespace {
+
+
+template <bool conjugate, int warps_per_block, int max_block_size,
+          typename ValueType, typename IndexType>
+void transpose_jacobi(
+    syn::value_list<int, max_block_size>, size_type num_blocks,
+    const precision_reduction *block_precisions,
+    const IndexType *block_pointers, const ValueType *blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    ValueType *out_blocks)
+{
+    constexpr int subwarp_size = get_larger_power(max_block_size);
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
+    // each thread block is assigned warps_per_block * blocks_per_warp blocks
+    const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
+                         1, 1);
+    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
+    if (grid_size.x == 0) return;  // an empty grid is an invalid launch config
+    if (block_precisions) {  // per-block precisions given: adaptive kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                adaptive_transpose_jacobi<conjugate, max_block_size,
+                                          subwarp_size, warps_per_block>),
+            grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme,
+            block_precisions, block_pointers, num_blocks,
+            as_hip_type(out_blocks));
+    } else {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(transpose_jacobi<conjugate, max_block_size,
+                                             subwarp_size, warps_per_block>),
+            grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme,
+            block_pointers, num_blocks, as_hip_type(out_blocks));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    // The leading `false` in the value list is the launcher's `conjugate`
+    // template argument: this entry point transposes without conjugation.
+    const auto fits = [&](int size) { return max_block_size <= size; };
+    select_transpose_jacobi(
+        compiled_kernels(), fits,
+        syn::value_list<int, false, config::min_warps_per_block>(),
+        syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+        block_pointers.get_const_data(), blocks.get_const_data(),
+        storage_scheme, out_blocks.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void conj_transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    // The leading `true` in the value list is the launcher's `conjugate`
+    // template argument: this entry point selects the conjugating variant.
+    const auto fits = [&](int size) { return max_block_size <= size; };
+    select_transpose_jacobi(
+        compiled_kernels(), fits,
+        syn::value_list<int, true, config::min_warps_per_block>(),
+        syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+        block_pointers.get_const_data(), blocks.get_const_data(),
+        storage_scheme, out_blocks.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_dense(  // no HIP kernel here; the macro stands in for a body
+    std::shared_ptr<const HipExecutor> exec, size_type num_blocks,
+    const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    ValueType *result_values, size_type result_stride) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
+
+
+}  // namespace jacobi
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
new file mode 100644
index 00000000000..0c2fefb1afc
--- /dev/null
+++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
@@ -0,0 +1,143 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "core/base/extended_float.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/warp_blas.hip.hpp"
+#include "hip/preconditioner/jacobi_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+#include "common/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+
+
+namespace {
+
+
+template <int warps_per_block, int max_block_size, typename ValueType,
+          typename IndexType>
+void apply(syn::value_list<int, max_block_size>, size_type num_blocks,
+           const precision_reduction *block_precisions,
+           const IndexType *block_pointers, const ValueType *blocks,
+           const preconditioner::block_interleaved_storage_scheme<IndexType>
+               &storage_scheme,
+           const ValueType *b, size_type b_stride, ValueType *x,
+           size_type x_stride)
+{
+    constexpr int subwarp_size = get_larger_power(max_block_size);
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
+    const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
+                         1, 1);
+    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
+    if (grid_size.x == 0) return;  // an empty grid is an invalid launch config
+    if (block_precisions) {  // per-block precisions given: adaptive kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::adaptive_apply<max_block_size, subwarp_size,
+                                                   warps_per_block>),
+            dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks),
+            storage_scheme, block_precisions, block_pointers, num_blocks,
+            as_hip_type(b), b_stride, as_hip_type(x), x_stride);
+    } else {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                kernel::apply<max_block_size, subwarp_size, warps_per_block>),
+            dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks),
+            storage_scheme, block_pointers, num_blocks, as_hip_type(b),
+            b_stride, as_hip_type(x), x_stride);
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void simple_apply(
+    std::shared_ptr<const HipExecutor> exec, size_type num_blocks,
+    uint32 max_block_size,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *x)
+{
+    const auto fits = [&](int size) { return max_block_size <= size; };
+    const auto num_rhs = b->get_size()[1];
+    // TODO: write a special kernel for multiple RHS
+    // until then, every right-hand-side column gets its own dispatch
+    for (size_type col = 0; col < num_rhs; ++col) {
+        select_apply(
+            compiled_kernels(), fits,
+            syn::value_list<int, config::min_warps_per_block>(),
+            syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
+            block_pointers.get_const_data(), blocks.get_const_data(),
+            storage_scheme, b->get_const_values() + col, b->get_stride(),
+            x->get_values() + col, x->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
+
+
+}  // namespace jacobi
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/bicg_kernels.hip.cpp b/hip/solver/bicg_kernels.hip.cpp
new file mode 100644
index 00000000000..e773520b258
--- /dev/null
+++ b/hip/solver/bicg_kernels.hip.cpp
@@ -0,0 +1,149 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/bicg_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The BICG solver namespace.
+ *
+ * @ingroup bicg
+ */
+namespace bicg {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/solver/bicg_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const HipExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho, matrix::Dense<ValueType> *r2,
+                matrix::Dense<ValueType> *z2, matrix::Dense<ValueType> *p2,
+                matrix::Dense<ValueType> *q2,
+                Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of b
+    const auto num_rows = b->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * b->get_stride(), block_size.x));
+    hipLaunchKernelGGL(
+        initialize_kernel, grid_size, block_size, 0, 0,
+        num_rows, b->get_size()[1], b->get_stride(),
+        as_hip_type(b->get_const_values()), as_hip_type(r->get_values()),
+        as_hip_type(z->get_values()), as_hip_type(p->get_values()),
+        as_hip_type(q->get_values()), as_hip_type(r2->get_values()),
+        as_hip_type(z2->get_values()), as_hip_type(p2->get_values()),
+        as_hip_type(q2->get_values()), as_hip_type(prev_rho->get_values()),
+        as_hip_type(rho->get_values()), as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const HipExecutor> exec,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
+            matrix::Dense<ValueType> *p2, const matrix::Dense<ValueType> *z2,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *prev_rho,
+            const Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of p
+    const auto num_rows = p->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * p->get_stride(), block_size.x));
+    hipLaunchKernelGGL(
+        step_1_kernel, grid_size, block_size, 0, 0,
+        num_rows, p->get_size()[1], p->get_stride(),
+        as_hip_type(p->get_values()), as_hip_type(z->get_const_values()),
+        as_hip_type(p2->get_values()), as_hip_type(z2->get_const_values()),
+        as_hip_type(rho->get_const_values()),
+        as_hip_type(prev_rho->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const HipExecutor> exec,
+            matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
+            matrix::Dense<ValueType> *r2, const matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *q,
+            const matrix::Dense<ValueType> *q2,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho,
+            const Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of p
+    const auto num_rows = p->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * p->get_stride(), block_size.x));
+    hipLaunchKernelGGL(
+        step_2_kernel, grid_size, block_size, 0, 0,
+        num_rows, p->get_size()[1], p->get_stride(), x->get_stride(),
+        as_hip_type(x->get_values()), as_hip_type(r->get_values()),
+        as_hip_type(r2->get_values()), as_hip_type(p->get_const_values()),
+        as_hip_type(q->get_const_values()), as_hip_type(q2->get_const_values()),
+        as_hip_type(beta->get_const_values()),
+        as_hip_type(rho->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+
+
+}  // namespace bicg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/bicgstab_kernels.hip.cpp b/hip/solver/bicgstab_kernels.hip.cpp
new file mode 100644
index 00000000000..a8776876f6f
--- /dev/null
+++ b/hip/solver/bicgstab_kernels.hip.cpp
@@ -0,0 +1,205 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/bicgstab_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The BICGSTAB solver namespace.
+ *
+ * @ingroup bicgstab
+ */
+namespace bicgstab {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/solver/bicgstab_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const HipExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *rr, matrix::Dense<ValueType> *y,
+                matrix::Dense<ValueType> *s, matrix::Dense<ValueType> *t,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *v,
+                matrix::Dense<ValueType> *p, matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho, matrix::Dense<ValueType> *alpha,
+                matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *gamma,
+                matrix::Dense<ValueType> *omega,
+                Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of b
+    const auto num_rows = b->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * b->get_stride(), block_size.x));
+    hipLaunchKernelGGL(
+        initialize_kernel, grid_size, block_size, 0, 0,
+        num_rows, b->get_size()[1], b->get_stride(),
+        as_hip_type(b->get_const_values()), as_hip_type(r->get_values()),
+        as_hip_type(rr->get_values()), as_hip_type(y->get_values()),
+        as_hip_type(s->get_values()), as_hip_type(t->get_values()),
+        as_hip_type(z->get_values()), as_hip_type(v->get_values()),
+        as_hip_type(p->get_values()), as_hip_type(prev_rho->get_values()),
+        as_hip_type(rho->get_values()), as_hip_type(alpha->get_values()),
+        as_hip_type(beta->get_values()), as_hip_type(gamma->get_values()),
+        as_hip_type(omega->get_values()), as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *r, matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *v,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *prev_rho,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Dense<ValueType> *omega,
+            const Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of r
+    const auto num_rows = r->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * r->get_stride(), block_size.x));
+    hipLaunchKernelGGL(step_1_kernel, grid_size, block_size, 0, 0, num_rows,
+                       r->get_size()[1], r->get_stride(),
+                       as_hip_type(r->get_const_values()),
+                       as_hip_type(p->get_values()),
+                       as_hip_type(v->get_const_values()),
+                       as_hip_type(rho->get_const_values()),
+                       as_hip_type(prev_rho->get_const_values()),
+                       as_hip_type(alpha->get_const_values()),
+                       as_hip_type(omega->get_const_values()),
+                       as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *r, matrix::Dense<ValueType> *s,
+            const matrix::Dense<ValueType> *v,
+            const matrix::Dense<ValueType> *rho,
+            matrix::Dense<ValueType> *alpha,
+            const matrix::Dense<ValueType> *beta,
+            const Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of r
+    const auto num_rows = r->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * r->get_stride(), block_size.x));
+    hipLaunchKernelGGL(
+        step_2_kernel, grid_size, block_size, 0, 0,
+        num_rows, r->get_size()[1], r->get_stride(),
+        as_hip_type(r->get_const_values()), as_hip_type(s->get_values()),
+        as_hip_type(v->get_const_values()),
+        as_hip_type(rho->get_const_values()), as_hip_type(alpha->get_values()),
+        as_hip_type(beta->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+
+
+template <typename ValueType>
+void step_3(
+    std::shared_ptr<const HipExecutor> exec, matrix::Dense<ValueType> *x,
+    matrix::Dense<ValueType> *r, const matrix::Dense<ValueType> *s,
+    const matrix::Dense<ValueType> *t, const matrix::Dense<ValueType> *y,
+    const matrix::Dense<ValueType> *z, const matrix::Dense<ValueType> *alpha,
+    const matrix::Dense<ValueType> *beta, const matrix::Dense<ValueType> *gamma,
+    matrix::Dense<ValueType> *omega, const Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of r
+    const auto num_rows = r->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * r->get_stride(), block_size.x));
+    hipLaunchKernelGGL(
+        step_3_kernel, grid_size, block_size, 0, 0,
+        num_rows, r->get_size()[1], r->get_stride(), x->get_stride(),
+        as_hip_type(x->get_values()), as_hip_type(r->get_values()),
+        as_hip_type(s->get_const_values()), as_hip_type(t->get_const_values()),
+        as_hip_type(y->get_const_values()), as_hip_type(z->get_const_values()),
+        as_hip_type(alpha->get_const_values()),
+        as_hip_type(beta->get_const_values()),
+        as_hip_type(gamma->get_const_values()),
+        as_hip_type(omega->get_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+
+
+template <typename ValueType>
+void finalize(std::shared_ptr<const HipExecutor> exec,
+              matrix::Dense<ValueType> *x, const matrix::Dense<ValueType> *y,
+              const matrix::Dense<ValueType> *alpha,
+              Array<stopping_status> *stop_status)
+{
+    // size the grid to cover all get_size()[0] * get_stride() entries of y
+    const auto num_rows = y->get_size()[0];
+    const dim3 block_size(default_block_size);
+    const dim3 grid_size(ceildiv(num_rows * y->get_stride(), block_size.x));
+    hipLaunchKernelGGL(finalize_kernel, grid_size, block_size, 0, 0, num_rows,
+                       y->get_size()[1], y->get_stride(), x->get_stride(),
+                       as_hip_type(x->get_values()),
+                       as_hip_type(y->get_const_values()),
+                       as_hip_type(alpha->get_const_values()),
+                       as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+
+
+}  // namespace bicgstab
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/cg_kernels.hip.cpp b/hip/solver/cg_kernels.hip.cpp
new file mode 100644
index 00000000000..688a6ab7f49
--- /dev/null
+++ b/hip/solver/cg_kernels.hip.cpp
@@ -0,0 +1,141 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/cg_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The CG solver namespace.
+ *
+ * @ingroup cg
+ */
+namespace cg {
+
+
+constexpr int default_block_size = 512;  // x-extent of each launch block below
+
+
+#include "common/solver/cg_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const HipExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho,
+                Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of b.
+    const auto num_entries = b->get_size()[0] * b->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        initialize_kernel, launch_grid, launch_block, 0, 0,
+        b->get_size()[0], b->get_size()[1], b->get_stride(),
+        as_hip_type(b->get_const_values()), as_hip_type(r->get_values()),
+        as_hip_type(z->get_values()), as_hip_type(p->get_values()),
+        as_hip_type(q->get_values()), as_hip_type(prev_rho->get_values()),
+        as_hip_type(rho->get_values()), as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const HipExecutor> exec,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *prev_rho,
+            const Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of p.
+    const auto num_entries = p->get_size()[0] * p->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(step_1_kernel, launch_grid, launch_block, 0, 0,
+                       p->get_size()[0], p->get_size()[1], p->get_stride(),
+                       as_hip_type(p->get_values()),
+                       as_hip_type(z->get_const_values()),
+                       as_hip_type(rho->get_const_values()),
+                       as_hip_type(prev_rho->get_const_values()),
+                       as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const HipExecutor> exec,
+            matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
+            const matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *q,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho,
+            const Array<stopping_status> *stop_status)
+{
+    // Grid extent follows p (rows * stride); x contributes only its stride.
+    const auto num_entries = p->get_size()[0] * p->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        step_2_kernel, launch_grid, launch_block, 0, 0,
+        p->get_size()[0], p->get_size()[1], p->get_stride(), x->get_stride(),
+        as_hip_type(x->get_values()), as_hip_type(r->get_values()),
+        as_hip_type(p->get_const_values()), as_hip_type(q->get_const_values()),
+        as_hip_type(beta->get_const_values()),
+        as_hip_type(rho->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
+
+
+}  // namespace cg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/cgs_kernels.hip.cpp b/hip/solver/cgs_kernels.hip.cpp
new file mode 100644
index 00000000000..b5597777790
--- /dev/null
+++ b/hip/solver/cgs_kernels.hip.cpp
@@ -0,0 +1,176 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/cgs_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The CGS solver namespace.
+ *
+ * @ingroup cgs
+ */
+namespace cgs {
+
+
+constexpr int default_block_size = 512;  // x-extent of each launch block below
+
+
+#include "common/solver/cgs_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const HipExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *r_tld, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *u,
+                matrix::Dense<ValueType> *u_hat,
+                matrix::Dense<ValueType> *v_hat, matrix::Dense<ValueType> *t,
+                matrix::Dense<ValueType> *alpha, matrix::Dense<ValueType> *beta,
+                matrix::Dense<ValueType> *gamma,
+                matrix::Dense<ValueType> *rho_prev,
+                matrix::Dense<ValueType> *rho,
+                Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of b.
+    const auto num_entries = b->get_size()[0] * b->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        initialize_kernel, launch_grid, launch_block, 0, 0,
+        b->get_size()[0], b->get_size()[1], b->get_stride(),
+        as_hip_type(b->get_const_values()), as_hip_type(r->get_values()),
+        as_hip_type(r_tld->get_values()), as_hip_type(p->get_values()),
+        as_hip_type(q->get_values()), as_hip_type(u->get_values()),
+        as_hip_type(u_hat->get_values()), as_hip_type(v_hat->get_values()),
+        as_hip_type(t->get_values()), as_hip_type(alpha->get_values()),
+        as_hip_type(beta->get_values()), as_hip_type(gamma->get_values()),
+        as_hip_type(rho_prev->get_values()), as_hip_type(rho->get_values()),
+        as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *r, matrix::Dense<ValueType> *u,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *q,
+            matrix::Dense<ValueType> *beta, const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *rho_prev,
+            const Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of p.
+    const auto num_entries = p->get_size()[0] * p->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        step_1_kernel, launch_grid, launch_block, 0, 0,
+        p->get_size()[0], p->get_size()[1], p->get_stride(),
+        as_hip_type(r->get_const_values()), as_hip_type(u->get_values()),
+        as_hip_type(p->get_values()), as_hip_type(q->get_const_values()),
+        as_hip_type(beta->get_values()), as_hip_type(rho->get_const_values()),
+        as_hip_type(rho_prev->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *u,
+            const matrix::Dense<ValueType> *v_hat, matrix::Dense<ValueType> *q,
+            matrix::Dense<ValueType> *t, matrix::Dense<ValueType> *alpha,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *gamma,
+            const Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of u.
+    const auto num_entries = u->get_size()[0] * u->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        step_2_kernel, launch_grid, launch_block, 0, 0,
+        u->get_size()[0], u->get_size()[1], u->get_stride(),
+        as_hip_type(u->get_const_values()),
+        as_hip_type(v_hat->get_const_values()), as_hip_type(q->get_values()),
+        as_hip_type(t->get_values()), as_hip_type(alpha->get_values()),
+        as_hip_type(rho->get_const_values()),
+        as_hip_type(gamma->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
+
+
+template <typename ValueType>
+void step_3(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *t,
+            const matrix::Dense<ValueType> *u_hat, matrix::Dense<ValueType> *r,
+            matrix::Dense<ValueType> *x, const matrix::Dense<ValueType> *alpha,
+            const Array<stopping_status> *stop_status)
+{
+    // Grid extent follows t (rows * stride); x contributes only its stride.
+    const auto num_entries = t->get_size()[0] * t->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        step_3_kernel, launch_grid, launch_block, 0, 0,
+        t->get_size()[0], t->get_size()[1], t->get_stride(), x->get_stride(),
+        as_hip_type(t->get_const_values()),
+        as_hip_type(u_hat->get_const_values()), as_hip_type(r->get_values()),
+        as_hip_type(x->get_values()), as_hip_type(alpha->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
+
+
+}  // namespace cgs
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
new file mode 100644
index 00000000000..3bf0e56c7fa
--- /dev/null
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -0,0 +1,251 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_SOLVER_COMMON_TRS_KERNELS_HIP_HPP_
+#define GKO_HIP_SOLVER_COMMON_TRS_KERNELS_HIP_HPP_
+
+
+#include <functional>
+#include <memory>
+
+
+#include <hip/hip_runtime.h>
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/matrix/dense_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/device_guard.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace solver {
+
+
+struct SolveStruct {
+    virtual void dummy(){};  // only virtual member; makes the type polymorphic
+};
+
+
+namespace hip {
+
+
+struct SolveStruct : gko::solver::SolveStruct {
+    csrsv2Info_t solve_info;
+    hipsparseSolvePolicy_t policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
+    hipsparseMatDescr_t factor_descr;
+    int factor_work_size;
+    void *factor_work_vec = nullptr;
+    SolveStruct()
+    {
+        // Create a zero-based, general, non-unit-diagonal matrix
+        // descriptor, followed by the csrsv2 info object.
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateMatDescr(&factor_descr));
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(
+            hipsparseSetMatIndexBase(factor_descr, HIPSPARSE_INDEX_BASE_ZERO));
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(
+            hipsparseSetMatType(factor_descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetMatDiagType(
+            factor_descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrsv2Info(&solve_info));
+    }
+
+    // Holds raw hipSPARSE objects and a buffer released with hipFree, so
+    // copying and moving are both disabled.
+    SolveStruct(const SolveStruct &) = delete;
+    SolveStruct(SolveStruct &&) = delete;
+    SolveStruct &operator=(const SolveStruct &) = delete;
+    SolveStruct &operator=(SolveStruct &&) = delete;
+
+    ~SolveStruct()
+    {
+        // The status codes returned by the release calls are not checked.
+        hipsparseDestroyMatDescr(factor_descr);
+        if (solve_info) {
+            hipsparseDestroyCsrsv2Info(solve_info);
+        }
+        if (factor_work_vec) {
+            hipFree(factor_work_vec);
+            factor_work_vec = nullptr;
+        }
+    }
+};
+
+
+}  // namespace hip
+}  // namespace solver
+
+
+namespace kernels {
+namespace hip {
+namespace {
+
+
+void should_perform_transpose_kernel(std::shared_ptr<const HipExecutor> exec,
+                                     bool &do_transpose)
+{
+    do_transpose = true;  // unconditional: exec is not consulted
+}
+
+
+void init_struct_kernel(std::shared_ptr<const HipExecutor> exec,
+                        std::shared_ptr<solver::SolveStruct> &solve_struct)
+{  // Points solve_struct at a newly constructed HIP SolveStruct.
+    solve_struct = std::make_shared<solver::hip::SolveStruct>();
+}
+
+
+template <typename ValueType, typename IndexType>
+void generate_kernel(std::shared_ptr<const HipExecutor> exec,
+                     const matrix::Csr<ValueType, IndexType> *matrix,
+                     solver::SolveStruct *solve_struct,
+                     const gko::size_type num_rhs, bool is_upper)
+{  // Prepares solve_struct for solves with `matrix`; num_rhs is unused here.
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        if (auto hip_solve_struct =
+                dynamic_cast<solver::hip::SolveStruct *>(solve_struct)) {
+            auto handle = exec->get_hipsparse_handle();
+            if (is_upper) {
+                GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetMatFillMode(
+                    hip_solve_struct->factor_descr, HIPSPARSE_FILL_MODE_UPPER));
+            }
+            // Size query, allocation and analysis run under one guard.
+            {
+                hipsparse::pointer_mode_guard pm_guard(handle);
+                hipsparse::csrsv2_buffer_size(
+                    handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                    matrix->get_size()[0], matrix->get_num_stored_elements(),
+                    hip_solve_struct->factor_descr, matrix->get_const_values(),
+                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+                    hip_solve_struct->solve_info,
+                    &hip_solve_struct->factor_work_size);
+                // Reset the pointer after freeing so a throwing alloc cannot
+                // leave it dangling; factor_work_size is a byte count.
+                if (hip_solve_struct->factor_work_vec != nullptr) {
+                    exec->free(hip_solve_struct->factor_work_vec);
+                    hip_solve_struct->factor_work_vec = nullptr;
+                }
+                hip_solve_struct->factor_work_vec =
+                    exec->alloc<char>(hip_solve_struct->factor_work_size);
+                hipsparse::csrsv2_analysis(
+                    handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                    matrix->get_size()[0], matrix->get_num_stored_elements(),
+                    hip_solve_struct->factor_descr, matrix->get_const_values(),
+                    matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+                    hip_solve_struct->solve_info, hip_solve_struct->policy,
+                    hip_solve_struct->factor_work_vec);
+            }
+        } else {
+            GKO_NOT_SUPPORTED(solve_struct);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void solve_kernel(std::shared_ptr<const HipExecutor> exec,
+                  const matrix::Csr<ValueType, IndexType> *matrix,
+                  const solver::SolveStruct *solve_struct,
+                  matrix::Dense<ValueType> *trans_b,
+                  matrix::Dense<ValueType> *trans_x,
+                  const matrix::Dense<ValueType> *b,
+                  matrix::Dense<ValueType> *x)
+{
+    // Calls hipSPARSE csrsv2_solve directly on b/x when b has stride 1;
+    // otherwise works row by row on the transposed copies trans_b/trans_x.
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        if (auto hip_solve_struct =
+                dynamic_cast<const solver::hip::SolveStruct *>(solve_struct)) {
+            ValueType one = 1.0;
+            auto handle = exec->get_hipsparse_handle();
+
+            {
+                hipsparse::pointer_mode_guard pm_guard(handle);
+                if (b->get_stride() == 1) {
+                    hipsparse::csrsv2_solve(
+                        handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                        matrix->get_size()[0],
+                        matrix->get_num_stored_elements(), &one,
+                        hip_solve_struct->factor_descr,
+                        matrix->get_const_values(),
+                        matrix->get_const_row_ptrs(),
+                        matrix->get_const_col_idxs(),
+                        hip_solve_struct->solve_info, b->get_const_values(),
+                        x->get_values(), hip_solve_struct->policy,
+                        hip_solve_struct->factor_work_vec);
+                } else {
+                    dense::transpose(exec, b, trans_b);
+                    dense::transpose(exec, x, trans_x);
+                    for (size_type i = 0; i < trans_b->get_size()[0]; ++i) {
+                        hipsparse::csrsv2_solve(
+                            handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                            matrix->get_size()[0],
+                            matrix->get_num_stored_elements(), &one,
+                            hip_solve_struct->factor_descr,
+                            matrix->get_const_values(),
+                            matrix->get_const_row_ptrs(),
+                            matrix->get_const_col_idxs(),
+                            hip_solve_struct->solve_info,
+                            trans_b->get_values() + i * trans_b->get_stride(),
+                            trans_x->get_values() + i * trans_x->get_stride(),
+                            hip_solve_struct->policy,
+                            hip_solve_struct->factor_work_vec);
+                    }
+                    dense::transpose(exec, trans_x, x);
+                }
+            }
+        } else {
+            GKO_NOT_SUPPORTED(solve_struct);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+}  // namespace
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_SOLVER_COMMON_TRS_KERNELS_HIP_HPP_
diff --git a/hip/solver/fcg_kernels.hip.cpp b/hip/solver/fcg_kernels.hip.cpp
new file mode 100644
index 00000000000..750aa5743d7
--- /dev/null
+++ b/hip/solver/fcg_kernels.hip.cpp
@@ -0,0 +1,144 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/fcg_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The FCG solver namespace.
+ *
+ * @ingroup fcg
+ */
+namespace fcg {
+
+
+constexpr int default_block_size = 512;  // x-extent of each launch block below
+
+
+#include "common/solver/fcg_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const HipExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *t,
+                matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho, matrix::Dense<ValueType> *rho_t,
+                Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of b.
+    const auto num_entries = b->get_size()[0] * b->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        initialize_kernel, launch_grid, launch_block, 0, 0,
+        b->get_size()[0], b->get_size()[1], b->get_stride(),
+        as_hip_type(b->get_const_values()), as_hip_type(r->get_values()),
+        as_hip_type(z->get_values()), as_hip_type(p->get_values()),
+        as_hip_type(q->get_values()), as_hip_type(t->get_values()),
+        as_hip_type(prev_rho->get_values()), as_hip_type(rho->get_values()),
+        as_hip_type(rho_t->get_values()), as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const HipExecutor> exec,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
+            const matrix::Dense<ValueType> *rho_t,
+            const matrix::Dense<ValueType> *prev_rho,
+            const Array<stopping_status> *stop_status)
+{
+    // The 1D grid is sized to span rows * stride entries of p.
+    const auto num_entries = p->get_size()[0] * p->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(step_1_kernel, launch_grid, launch_block, 0, 0,
+                       p->get_size()[0], p->get_size()[1], p->get_stride(),
+                       as_hip_type(p->get_values()),
+                       as_hip_type(z->get_const_values()),
+                       as_hip_type(rho_t->get_const_values()),
+                       as_hip_type(prev_rho->get_const_values()),
+                       as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const HipExecutor> exec,
+            matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
+            matrix::Dense<ValueType> *t, const matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *q,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho,
+            const Array<stopping_status> *stop_status)
+{
+    // Grid extent follows p (rows * stride); x contributes only its stride.
+    const auto num_entries = p->get_size()[0] * p->get_stride();
+    const dim3 launch_block(default_block_size, 1, 1);
+    const dim3 launch_grid(ceildiv(num_entries, launch_block.x), 1, 1);
+    hipLaunchKernelGGL(
+        step_2_kernel, launch_grid, launch_block, 0, 0,
+        p->get_size()[0], p->get_size()[1], p->get_stride(), x->get_stride(),
+        as_hip_type(x->get_values()), as_hip_type(r->get_values()),
+        as_hip_type(t->get_values()), as_hip_type(p->get_const_values()),
+        as_hip_type(q->get_const_values()),
+        as_hip_type(beta->get_const_values()),
+        as_hip_type(rho->get_const_values()),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
+
+
+}  // namespace fcg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp
new file mode 100644
index 00000000000..2780b149660
--- /dev/null
+++ b/hip/solver/gmres_kernels.hip.cpp
@@ -0,0 +1,350 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/gmres_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/fill_array.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The GMRES solver namespace.
+ *
+ * @ingroup gmres
+ */
+namespace gmres {
+
+
+constexpr int default_block_size = 512;
+// default_dot_dim cannot be 64 in HIP because a 64 x 64 block (4096 threads)
+// exceeds the maximum block size.
+constexpr int default_dot_dim = 32;
+constexpr int default_dot_size = default_dot_dim * default_dot_dim;
+
+
+#include "common/solver/gmres_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void initialize_1(std::shared_ptr<const HipExecutor> exec,
+                  const matrix::Dense<ValueType> *b,
+                  matrix::Dense<ValueType> *residual,
+                  matrix::Dense<ValueType> *givens_sin,
+                  matrix::Dense<ValueType> *givens_cos,
+                  Array<stopping_status> *stop_status, size_type krylov_dim)
+{
+    // Thread count covers max(rows * stride, krylov_dim * num_rhs) entries.
+    constexpr auto block_size = default_block_size;
+    const auto num_rows = b->get_size()[0];
+    const auto num_rhs = b->get_size()[1];
+    const auto len = std::max(num_rows * b->get_stride(), krylov_dim * num_rhs);
+    const dim3 grid_dim(ceildiv(len, block_size), 1, 1);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(initialize_1_kernel<block_size>), dim3(grid_dim),
+        dim3(block_size, 1, 1), 0, 0, num_rows, num_rhs, krylov_dim,
+        as_hip_type(b->get_const_values()), b->get_stride(),
+        as_hip_type(residual->get_values()), residual->get_stride(),
+        as_hip_type(givens_sin->get_values()), givens_sin->get_stride(),
+        as_hip_type(givens_cos->get_values()), givens_cos->get_stride(),
+        as_hip_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL);
+
+
+template <typename ValueType>
+void initialize_2(std::shared_ptr<const HipExecutor> exec,
+                  const matrix::Dense<ValueType> *residual,
+                  matrix::Dense<remove_complex<ValueType>> *residual_norm,
+                  matrix::Dense<ValueType> *residual_norm_collection,
+                  matrix::Dense<ValueType> *krylov_bases,
+                  Array<size_type> *final_iter_nums, size_type krylov_dim)
+{
+    constexpr auto block_size = default_block_size;
+    const auto num_rows = residual->get_size()[0];
+    const auto num_rhs = residual->get_size()[1];
+
+    // Fill residual_norm first: the kernel launched below reads it through
+    // get_const_values().
+    residual->compute_norm2(residual_norm);
+
+    // NOTE(review): this grid covers num_rows * num_rhs threads, whereas
+    // several other launches in this file cover rows * stride; confirm that
+    // initialize_2_2_kernel does not need the stride-based size.
+    const dim3 grid_dim(ceildiv(num_rows * num_rhs, block_size), 1, 1);
+    const dim3 block_dim(block_size, 1, 1);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(initialize_2_2_kernel<block_size>), dim3(grid_dim),
+        dim3(block_dim), 0, 0, num_rows, num_rhs,
+        as_hip_type(residual->get_const_values()), residual->get_stride(),
+        as_hip_type(residual_norm->get_const_values()),
+        as_hip_type(residual_norm_collection->get_values()),
+        as_hip_type(krylov_bases->get_values()), krylov_bases->get_stride(),
+        as_hip_type(final_iter_nums->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL);
+
+
+template <typename ValueType>
+void finish_arnoldi(std::shared_ptr<const HipExecutor> exec, size_type num_rows,
+                    matrix::Dense<ValueType> *krylov_bases,
+                    matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
+                    const stopping_status *stop_status)
+{
+    // Gram-Schmidt step on basis iter + 1; pseudo-code follows each stage.
+    const auto stride_krylov = krylov_bases->get_stride();
+    const auto stride_hessenberg = hessenberg_iter->get_stride();
+    const auto num_rhs = hessenberg_iter->get_size()[1];
+    const auto hipblas_handle = exec->get_hipblas_handle();
+    const dim3 grid_size(ceildiv(num_rhs, default_dot_dim),
+                         exec->get_num_multiprocessor() * 2);
+    const dim3 block_size(default_dot_dim, default_dot_dim);
+    // Each basis is a num_rows x num_rhs block whose rows are stride_krylov
+    // entries apart, so basis j starts j * num_rows * stride_krylov entries in
+    // (the same as j * num_rows * num_rhs when the rows are not padded).
+    const auto basis_size = num_rows * stride_krylov;
+    auto next_krylov_basis =
+        krylov_bases->get_values() + (iter + 1) * basis_size;
+    for (size_type k = 0; k < iter + 1; ++k) {
+        const auto k_krylov_bases =
+            krylov_bases->get_const_values() + k * basis_size;
+        const auto hess_k =
+            hessenberg_iter->get_values() + k * stride_hessenberg;
+        if (num_rhs > 1) {
+            // TODO: this condition should be tuned
+            // A single rhs uses the vendor's dot; several rhs use our own
+            // multidot_kernel (on a zeroed row), parallelizing over the rhs.
+            components::fill_array(exec, hess_k, num_rhs, zero<ValueType>());
+            hipLaunchKernelGGL(
+                multidot_kernel, dim3(grid_size), dim3(block_size), 0, 0, k,
+                num_rows, num_rhs, as_hip_type(k_krylov_bases),
+                as_hip_type(next_krylov_basis), stride_krylov,
+                as_hip_type(hessenberg_iter->get_values()), stride_hessenberg,
+                as_hip_type(stop_status));
+        } else {
+            hipblas::dot(hipblas_handle, num_rows, k_krylov_bases,
+                         stride_krylov, next_krylov_basis, stride_krylov,
+                         hess_k);
+        }
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(update_next_krylov_kernel<default_block_size>),
+            dim3(ceildiv(basis_size, default_block_size)),
+            dim3(default_block_size), 0, 0, k, num_rows, num_rhs,
+            as_hip_type(k_krylov_bases), as_hip_type(next_krylov_basis),
+            stride_krylov, as_hip_type(hessenberg_iter->get_const_values()),
+            stride_hessenberg, as_hip_type(stop_status));
+    }
+    // for i in 1:iter
+    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
+    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
+    // end
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(update_hessenberg_2_kernel<default_block_size>),
+        dim3(num_rhs), dim3(default_block_size), 0, 0, iter, num_rows, num_rhs,
+        as_hip_type(next_krylov_basis), stride_krylov,
+        as_hip_type(hessenberg_iter->get_values()), stride_hessenberg,
+        as_hip_type(stop_status));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(update_krylov_kernel<default_block_size>),
+        dim3(ceildiv(basis_size, default_block_size)),
+        dim3(default_block_size), 0, 0, iter, num_rows, num_rhs,
+        as_hip_type(next_krylov_basis), stride_krylov,
+        as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg,
+        as_hip_type(stop_status));
+    // next_krylov_basis /= hessenberg(iter, iter + 1)
+    // End of arnoldi
+}
+
+
+template <typename ValueType>
+void givens_rotation(std::shared_ptr<const HipExecutor> exec,
+                     matrix::Dense<ValueType> *givens_sin,
+                     matrix::Dense<ValueType> *givens_cos,
+                     matrix::Dense<ValueType> *hessenberg_iter,
+                     matrix::Dense<remove_complex<ValueType>> *residual_norm,
+                     matrix::Dense<ValueType> *residual_norm_collection,
+                     size_type iter, const Array<stopping_status> *stop_status)
+{
+    // TODO: tune block_size for optimal performance
+    constexpr auto block_size = default_block_size;
+    // The grid provides at least one thread per column of hessenberg_iter.
+    const auto num_rows = hessenberg_iter->get_size()[0];
+    const auto num_cols = hessenberg_iter->get_size()[1];
+    const dim3 grid_dim(
+        static_cast<unsigned int>(ceildiv(num_cols, block_size)), 1, 1);
+    const dim3 block_dim(block_size, 1, 1);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(givens_rotation_kernel<block_size>), dim3(grid_dim),
+        dim3(block_dim), 0, 0, num_rows, num_cols, iter,
+        as_hip_type(hessenberg_iter->get_values()),
+        hessenberg_iter->get_stride(), as_hip_type(givens_sin->get_values()),
+        givens_sin->get_stride(), as_hip_type(givens_cos->get_values()),
+        givens_cos->get_stride(), as_hip_type(residual_norm->get_values()),
+        as_hip_type(residual_norm_collection->get_values()),
+        residual_norm_collection->get_stride(),
+        as_hip_type(stop_status->get_const_data()));
+}
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const HipExecutor> exec, size_type num_rows,
+            matrix::Dense<ValueType> *givens_sin,
+            matrix::Dense<ValueType> *givens_cos,
+            matrix::Dense<remove_complex<ValueType>> *residual_norm,
+            matrix::Dense<ValueType> *residual_norm_collection,
+            matrix::Dense<ValueType> *krylov_bases,
+            matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
+            Array<size_type> *final_iter_nums,
+            const Array<stopping_status> *stop_status)
+{
+    // Three stages, in order: bump iteration counters, Arnoldi, Givens.
+    const auto num_entries = final_iter_nums->get_num_elems();
+    const auto num_blocks =
+        static_cast<unsigned int>(ceildiv(num_entries, default_block_size));
+    hipLaunchKernelGGL(increase_final_iteration_numbers_kernel,
+                       dim3(num_blocks), dim3(default_block_size), 0, 0,
+                       as_hip_type(final_iter_nums->get_data()),
+                       as_hip_type(stop_status->get_const_data()), num_entries);
+    finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter,
+                   stop_status->get_const_data());
+    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
+                    residual_norm, residual_norm_collection, iter, stop_status);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void solve_upper_triangular(
+    const matrix::Dense<ValueType> *residual_norm_collection,
+    const matrix::Dense<ValueType> *hessenberg, matrix::Dense<ValueType> *y,
+    const Array<size_type> *final_iter_nums)
+{
+    // TODO: tune block_size for optimal performance
+    constexpr auto block_size = default_block_size;
+    // The grid provides at least one thread per right-hand side.
+    const auto num_rhs = residual_norm_collection->get_size()[1];
+    const auto num_blocks =
+        static_cast<unsigned int>(ceildiv(num_rhs, block_size));
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(solve_upper_triangular_kernel<block_size>),
+        dim3(num_blocks, 1, 1), dim3(block_size, 1, 1), 0, 0,
+        hessenberg->get_size()[1], num_rhs,
+        as_hip_type(residual_norm_collection->get_const_values()),
+        residual_norm_collection->get_stride(),
+        as_hip_type(hessenberg->get_const_values()), hessenberg->get_stride(),
+        as_hip_type(y->get_values()), y->get_stride(),
+        as_hip_type(final_iter_nums->get_const_data()));
+}
+
+
+template <typename ValueType>
+void calculate_qy(const matrix::Dense<ValueType> *krylov_bases,
+                  const matrix::Dense<ValueType> *y,
+                  matrix::Dense<ValueType> *before_preconditioner,
+                  const Array<size_type> *final_iter_nums)
+{
+    // Calculate qy
+    // before_preconditioner = krylov_bases * y
+    constexpr auto block_size = default_block_size;
+    const auto num_rows = before_preconditioner->get_size()[0];
+    const auto num_rhs = before_preconditioner->get_size()[1];
+    const auto num_cols = krylov_bases->get_size()[1];
+    const auto stride_before_preconditioner =
+        before_preconditioner->get_stride();
+
+    // The grid spans num_rows * stride entries of before_preconditioner,
+    // rounded up to whole blocks.
+    const auto num_blocks = static_cast<unsigned int>(
+        ceildiv(num_rows * stride_before_preconditioner, block_size));
+    const dim3 grid_dim(num_blocks, 1, 1);
+    const dim3 block_dim(block_size, 1, 1);
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(calculate_Qy_kernel<block_size>), dim3(grid_dim),
+        dim3(block_dim), 0, 0, num_rows, num_cols, num_rhs,
+        as_hip_type(krylov_bases->get_const_values()),
+        krylov_bases->get_stride(), as_hip_type(y->get_const_values()),
+        y->get_stride(), as_hip_type(before_preconditioner->get_values()),
+        stride_before_preconditioner,
+        as_hip_type(final_iter_nums->get_const_data()));
+}
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *residual_norm_collection,
+            const matrix::Dense<ValueType> *krylov_bases,
+            const matrix::Dense<ValueType> *hessenberg,
+            matrix::Dense<ValueType> *y,
+            matrix::Dense<ValueType> *before_preconditioner,
+            const Array<size_type> *final_iter_nums)
+{  // Two stages: compute y, then build before_preconditioner from it.
+    solve_upper_triangular(residual_norm_collection, hessenberg, y,
+                           final_iter_nums);  // writes y
+    calculate_qy(krylov_bases, y, before_preconditioner, final_iter_nums);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL);
+
+
+}  // namespace gmres
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/ir_kernels.hip.cpp b/hip/solver/ir_kernels.hip.cpp
new file mode 100644
index 00000000000..5993c4b120b
--- /dev/null
+++ b/hip/solver/ir_kernels.hip.cpp
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/ir_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The IR solver namespace.
+ *
+ * @ingroup ir
+ */
+namespace ir {
+
+
+constexpr int default_block_size = 512;
+
+
+#include "common/solver/ir_kernels.hpp.inc"
+
+
+void initialize(std::shared_ptr<const HipExecutor> exec,
+                Array<stopping_status> *stop_status)
+{
+    // The grid provides at least one thread per stopping status entry.
+    const auto num_entries = stop_status->get_num_elems();
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_entries, threads.x), 1, 1);
+
+    hipLaunchKernelGGL(initialize_kernel, dim3(blocks), dim3(threads), 0, 0,
+                       num_entries, stop_status->get_data());
+}
+
+
+}  // namespace ir
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
new file mode 100644
index 00000000000..d4e66513ebe
--- /dev/null
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -0,0 +1,110 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/lower_trs_kernels.hpp"
+
+
+#include <memory>
+
+
+#include <hip/hip_runtime.h>
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/solver/lower_trs.hpp>
+
+
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/solver/common_trs_kernels.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The LOWER_TRS solver namespace.
+ *
+ * @ingroup lower_trs
+ */
+namespace lower_trs {
+
+
+void should_perform_transpose(std::shared_ptr<const HipExecutor> exec,
+                              bool &do_transpose)
+{  // Forwards to the TRS helper, which presumably sets do_transpose.
+    should_perform_transpose_kernel(exec, do_transpose);
+}
+
+
+void init_struct(std::shared_ptr<const HipExecutor> exec,
+                 std::shared_ptr<solver::SolveStruct> &solve_struct)
+{  // Forwards to the TRS helper, which presumably populates solve_struct.
+    init_struct_kernel(exec, solve_struct);
+}
+
+
+template <typename ValueType, typename IndexType>
+void generate(std::shared_ptr<const HipExecutor> exec,
+              const matrix::Csr<ValueType, IndexType> *matrix,
+              solver::SolveStruct *solve_struct, const gko::size_type num_rhs)
+{  // The trailing bool presumably selects the triangle (is_upper); confirm.
+    generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct, num_rhs,
+                                          false);  // upper_trs passes true
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void solve(std::shared_ptr<const HipExecutor> exec,
+           const matrix::Csr<ValueType, IndexType> *matrix,
+           const solver::SolveStruct *solve_struct,
+           matrix::Dense<ValueType> *trans_b, matrix::Dense<ValueType> *trans_x,
+           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *x)
+{  // Plain forwarder; upper_trs::solve makes the identical call.
+    solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
+                                       trans_x, b, x);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
+
+
+}  // namespace lower_trs
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
new file mode 100644
index 00000000000..0f27b6ceb28
--- /dev/null
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -0,0 +1,110 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/upper_trs_kernels.hpp"
+
+
+#include <memory>
+
+
+#include <hip/hip_runtime.h>
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/solver/upper_trs.hpp>
+
+
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/solver/common_trs_kernels.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The UPPER_TRS solver namespace.
+ *
+ * @ingroup upper_trs
+ */
+namespace upper_trs {
+
+
+void should_perform_transpose(std::shared_ptr<const HipExecutor> exec,
+                              bool &do_transpose)
+{  // Forwards to the TRS helper, which presumably sets do_transpose.
+    should_perform_transpose_kernel(exec, do_transpose);
+}
+
+
+void init_struct(std::shared_ptr<const HipExecutor> exec,
+                 std::shared_ptr<solver::SolveStruct> &solve_struct)
+{  // Forwards to the TRS helper, which presumably populates solve_struct.
+    init_struct_kernel(exec, solve_struct);
+}
+
+
+template <typename ValueType, typename IndexType>
+void generate(std::shared_ptr<const HipExecutor> exec,
+              const matrix::Csr<ValueType, IndexType> *matrix,
+              solver::SolveStruct *solve_struct, const gko::size_type num_rhs)
+{  // The trailing bool presumably selects the triangle (is_upper); confirm.
+    generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct, num_rhs,
+                                          true);  // lower_trs passes false
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void solve(std::shared_ptr<const HipExecutor> exec,
+           const matrix::Csr<ValueType, IndexType> *matrix,
+           const solver::SolveStruct *solve_struct,
+           matrix::Dense<ValueType> *trans_b, matrix::Dense<ValueType> *trans_x,
+           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *x)
+{  // Plain forwarder; lower_trs::solve makes the identical call.
+    solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
+                                       trans_x, b, x);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
+
+
+}  // namespace upper_trs
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
new file mode 100644
index 00000000000..0c2cf4da378
--- /dev/null
+++ b/hip/stop/criterion_kernels.hip.cpp
@@ -0,0 +1,87 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/stop/criterion_kernels.hpp"
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/stop/stopping_status.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Set all statuses namespace.
+ * @ref set_status
+ * @ingroup set_all_statuses
+ */
+namespace set_all_statuses {
+
+
+constexpr int default_block_size = 512;
+
+
+__global__ __launch_bounds__(default_block_size) void set_all_statuses(
+    size_type num_elems, uint8 stoppingId, bool setFinalized,
+    stopping_status *stop_status)
+{
+    // Threads whose flat index lies past the array have nothing to do.
+    const auto entry = thread::get_thread_id_flat();
+    if (entry >= num_elems) return;
+    stop_status[entry].stop(stoppingId, setFinalized);
+}
+
+
+void set_all_statuses(std::shared_ptr<const HipExecutor> exec, uint8 stoppingId,
+                      bool setFinalized, Array<stopping_status> *stop_status)
+{
+    // The grid provides at least one thread per stopping status entry.
+    const auto num_entries = stop_status->get_num_elems();
+    const dim3 threads(default_block_size, 1, 1);
+    const dim3 blocks(ceildiv(num_entries, threads.x), 1, 1);
+    hipLaunchKernelGGL((set_all_statuses), dim3(blocks), dim3(threads), 0, 0,
+                       num_entries, stoppingId, setFinalized,
+                       as_hip_type(stop_status->get_data()));
+}
+
+
+}  // namespace set_all_statuses
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
new file mode 100644
index 00000000000..d104a29d8a8
--- /dev/null
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -0,0 +1,130 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/stop/residual_norm_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Residual norm stopping criterion namespace.
+ * @ref resnorm
+ * @ingroup resnorm
+ */
+namespace residual_norm {
+
+
+constexpr int default_block_size = 512;
+
+
+// Marks every column with tau < rel_residual_goal * orig_tau as converged.
+// device_storage[0] / [1] collect the all_converged / one_changed flags.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void residual_norm_kernel(
+    size_type num_cols, ValueType rel_residual_goal,
+    const ValueType *__restrict__ tau, const ValueType *__restrict__ orig_tau,
+    uint8 stoppingId, bool setFinalized,
+    stopping_status *__restrict__ stop_status,
+    bool *__restrict__ device_storage)
+{
+    const auto col = thread::get_thread_id_flat();
+    if (col >= num_cols) return;
+    if (tau[col] < rel_residual_goal * orig_tau[col]) {
+        stop_status[col].converge(stoppingId, setFinalized);
+        device_storage[1] = true;
+    } else if (!stop_status[col].has_stopped()) {
+        // every thread taking this branch stores the same value (false), so
+        // conflicting writes to the all_converged flag cause no problem
+        device_storage[0] = false;
+    }
+}
+
+
+__global__ __launch_bounds__(1) void init_kernel(
+    bool *__restrict__ device_storage)
+{
+    device_storage[1] = false;  // one_changed starts out false
+    device_storage[0] = true;   // all_converged starts out true
+}
+
+
+// Runs the residual norm check on the device and copies both flags to the host.
+template <typename ValueType>
+void residual_norm(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *tau,
+                   const matrix::Dense<ValueType> *orig_tau,
+                   ValueType rel_residual_goal, uint8 stoppingId,
+                   bool setFinalized, Array<stopping_status> *stop_status,
+                   Array<bool> *device_storage, bool *all_converged,
+                   bool *one_changed)
+{
+    static_assert(!is_complex_s<ValueType>::value,
+                  "ValueType must not be complex in this function!");
+    const auto num_cols = tau->get_size()[1];
+    const dim3 block_dim(default_block_size, 1, 1);
+    const dim3 grid_dim(ceildiv(num_cols, block_dim.x), 1, 1);
+    // reset the flags: all_converged = true, one_changed = false
+    hipLaunchKernelGGL((init_kernel), dim3(1), dim3(1), 0, 0,
+                       as_hip_type(device_storage->get_data()));
+    hipLaunchKernelGGL((residual_norm_kernel), dim3(grid_dim), dim3(block_dim),
+                       0, 0, num_cols, rel_residual_goal,
+                       as_hip_type(tau->get_const_values()),
+                       as_hip_type(orig_tau->get_const_values()), stoppingId,
+                       setFinalized, as_hip_type(stop_status->get_data()),
+                       as_hip_type(device_storage->get_data()));
+    const auto flags = device_storage->get_const_data();
+    *all_converged = exec->copy_val_to_host(flags);
+    *one_changed = exec->copy_val_to_host(flags + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+    GKO_DECLARE_RESIDUAL_NORM_KERNEL);
+
+
+}  // namespace residual_norm
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/test/CMakeLists.txt b/hip/test/CMakeLists.txt
new file mode 100644
index 00000000000..fd1fa2941d8
--- /dev/null
+++ b/hip/test/CMakeLists.txt
@@ -0,0 +1,10 @@
+# PROJECT_SOURCE_DIR (unlike CMAKE_SOURCE_DIR) also works as a subproject.
+include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)
+add_subdirectory(base)
+add_subdirectory(components)
+add_subdirectory(factorization)
+add_subdirectory(matrix)
+add_subdirectory(solver)
+add_subdirectory(preconditioner)
+add_subdirectory(stop)
+add_subdirectory(utils)
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
new file mode 100644
index 00000000000..4719886d4d9
--- /dev/null
+++ b/hip/test/base/CMakeLists.txt
@@ -0,0 +1,8 @@
+ginkgo_create_hip_test(hip_executor)
+ginkgo_create_hip_test(math)
+# hcc needs the roc::hipblas/roc::hipsparse libraries; nvcc only the headers.
+if (GINKGO_HIP_PLATFORM MATCHES "hcc")
+    ginkgo_create_hip_test(exception_helpers roc::hipblas roc::hipsparse)
+else()
+    ginkgo_create_hip_test(exception_helpers)
+endif()
diff --git a/hip/test/base/exception_helpers.hip.cpp b/hip/test/base/exception_helpers.hip.cpp
new file mode 100644
index 00000000000..8261cc24f0e
--- /dev/null
+++ b/hip/test/base/exception_helpers.hip.cpp
@@ -0,0 +1,83 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include <hip/hip_runtime.h>
+#include <hipblas.h>
+#include <hipsparse.h>
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+// A nonzero status passed to GKO_ASSERT_NO_HIP_ERRORS must raise HipError.
+TEST(AssertNoHipErrors, ThrowsOnError)
+{
+    ASSERT_THROW(GKO_ASSERT_NO_HIP_ERRORS(1), gko::HipError);
+}
+
+
+TEST(AssertNoHipErrors, DoesNotThrowOnSuccess)
+{
+    ASSERT_NO_THROW(GKO_ASSERT_NO_HIP_ERRORS(hipSuccess));
+}
+
+// The same contract is checked for the hipBLAS status macro.
+TEST(AssertNoHipblasErrors, ThrowsOnError)
+{
+    ASSERT_THROW(GKO_ASSERT_NO_HIPBLAS_ERRORS(1), gko::HipblasError);
+}
+
+
+TEST(AssertNoHipblasErrors, DoesNotThrowOnSuccess)
+{
+    ASSERT_NO_THROW(GKO_ASSERT_NO_HIPBLAS_ERRORS(HIPBLAS_STATUS_SUCCESS));
+}
+
+// ... and for the hipSPARSE status macro.
+TEST(AssertNoHipsparseErrors, ThrowsOnError)
+{
+    ASSERT_THROW(GKO_ASSERT_NO_HIPSPARSE_ERRORS(1), gko::HipsparseError);
+}
+
+
+TEST(AssertNoHipsparseErrors, DoesNotThrowOnSuccess)
+{
+    ASSERT_NO_THROW(GKO_ASSERT_NO_HIPSPARSE_ERRORS(HIPSPARSE_STATUS_SUCCESS));
+}
+
+
+}  // namespace
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
new file mode 100644
index 00000000000..635639fc21e
--- /dev/null
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -0,0 +1,263 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// prevent compilation failure related to disappearing assert(...) statements
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Records which executor ran the operation: -1 (OpenMP), -2 (reference),
+// -3 (CUDA), or the device reported by hipGetDevice (HIP).
+class ExampleOperation : public gko::Operation {
+public:
+    explicit ExampleOperation(int &result) : value(result) {}
+
+    void run(std::shared_ptr<const gko::OmpExecutor>) const override
+    {
+        value = -1;
+    }
+    void run(std::shared_ptr<const gko::ReferenceExecutor>) const override
+    {
+        value = -2;
+    }
+    void run(std::shared_ptr<const gko::CudaExecutor>) const override
+    {
+        value = -3;
+    }
+
+    void run(std::shared_ptr<const gko::HipExecutor>) const override
+    {
+        hipGetDevice(&value);
+    }
+
+    int &value;
+};
+
+
+// Fixture: OpenMP host plus HIP executors on the first and last device.
+class HipExecutor : public ::testing::Test {
+protected:
+    HipExecutor() : omp(gko::OmpExecutor::create()) {}
+
+    void SetUp()
+    {
+        const auto num_devices = gko::HipExecutor::get_num_devices();
+        ASSERT_GT(num_devices, 0);
+        hip = gko::HipExecutor::create(0, omp);
+        hip2 = gko::HipExecutor::create(num_devices - 1, omp);
+    }
+
+    void TearDown()
+    {
+        // ensure that previous calls finished and didn't throw an error
+        if (hip) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::HipExecutor> hip;
+    std::shared_ptr<gko::HipExecutor> hip2;
+};
+
+
+TEST_F(HipExecutor, CanInstantiateTwoExecutorsOnOneDevice)
+{
+    // Both executors go out of scope at the end of the test; their automatic
+    // deinitialization must not create any error.
+    auto first = gko::HipExecutor::create(0, omp);
+    auto second = gko::HipExecutor::create(0, omp);
+}
+
+
+TEST_F(HipExecutor, MasterKnowsNumberOfDevices)
+{
+    int count = 0;
+    // check the status so a failed query is not mistaken for a device count
+    GKO_ASSERT_NO_HIP_ERRORS(hipGetDeviceCount(&count));
+    auto num_devices = gko::HipExecutor::get_num_devices();
+
+    ASSERT_EQ(count, num_devices);
+}
+
+
+TEST_F(HipExecutor, AllocatesAndFreesMemory)
+{
+    int *buffer = nullptr;
+    // neither allocating two ints on the device nor releasing them may throw
+    ASSERT_NO_THROW(buffer = hip->alloc<int>(2));
+    ASSERT_NO_THROW(hip->free(buffer));
+}
+
+
+TEST_F(HipExecutor, FailsWhenOverallocating)
+{
+    // 2^50 ints (4PB), far more than a device is expected to provide
+    const gko::size_type too_many = 1ll << 50;
+    int *buffer = nullptr;
+    ASSERT_THROW(
+        {
+            buffer = hip->alloc<int>(too_many);
+            hip->synchronize();
+        },
+        gko::AllocationError);
+    // buffer is still nullptr here if the allocation threw as expected
+    hip->free(buffer);
+}
+
+
+__global__ void check_data(int *data)
+{
+    // execute a trap instruction unless data holds exactly {3, 8}
+    if (data[0] == 3 && data[1] == 8) return;
+#if GINKGO_HIP_PLATFORM_HCC
+    asm("s_trap 0x02;");
+#else  // GINKGO_HIP_PLATFORM_NVCC
+    asm("trap;");
+#endif
+}
+
+TEST_F(HipExecutor, CopiesDataToHip)
+{
+    int host_data[] = {3, 8};
+    auto *device_data = hip->alloc<int>(2);
+    // upload the two values, then let check_data inspect them on the device
+    hip->copy_from(omp.get(), 2, host_data, device_data);
+    hipLaunchKernelGGL((check_data), dim3(1), dim3(1), 0, 0, device_data);
+    // a failed check is expected to surface as an error from synchronize()
+    ASSERT_NO_THROW(hip->synchronize());
+    hip->free(device_data);
+}
+
+
+// Writes the values {3, 8} that the copy tests compare against.
+__global__ void init_data(int *data)
+{
+    data[0] = 3;
+    data[1] = 8;
+}
+
+TEST_F(HipExecutor, CopiesDataFromHip)
+{
+    int host_data[2];
+    auto device_data = hip->alloc<int>(2);
+    hipLaunchKernelGGL((init_data), dim3(1), dim3(1), 0, 0, device_data);
+
+    omp->copy_from(hip.get(), 2, device_data, host_data);
+    EXPECT_EQ(3, host_data[0]);
+    ASSERT_EQ(8, host_data[1]);
+    hip->free(device_data);
+}
+
+
+/* Only a meaningful check when multiple GPUs exist */
+TEST_F(HipExecutor, PreservesDeviceSettings)
+{
+    const auto selected_device = gko::HipExecutor::get_num_devices() - 1;
+    int active_device = -1;
+    GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(selected_device));
+    // neither allocating nor freeing through `hip` may change the active device
+    auto buffer = hip->alloc<int>(2);
+    GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&active_device));
+    ASSERT_EQ(active_device, selected_device);
+    hip->free(buffer);
+    GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&active_device));
+    ASSERT_EQ(active_device, selected_device);
+}
+
+
+TEST_F(HipExecutor, RunsOnProperDevice)
+{
+    int device_seen = -1;
+    // device 0 is active, yet the operation must observe hip2's own device
+    GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0));
+    hip2->run(ExampleOperation(device_seen));
+
+    ASSERT_EQ(device_seen, hip2->get_device_id());
+}
+
+
+TEST_F(HipExecutor, CopiesDataFromHipToHip)
+{
+    int copy[2];
+    auto orig = hip->alloc<int>(2);
+    GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0));
+    hipLaunchKernelGGL((init_data), dim3(1), dim3(1), 0, 0, orig);
+    // copy the initialized buffer from `hip` into memory owned by `hip2`
+    auto copy_hip2 = hip2->alloc<int>(2);
+    hip2->copy_from(hip.get(), 2, orig, copy_hip2);
+    // Check that the data is really on GPU2 and ensure we did not cheat
+    int value = -1;
+    GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(hip2->get_device_id()));
+    hipLaunchKernelGGL((check_data), dim3(1), dim3(1), 0, 0, copy_hip2);
+    GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0));
+    hip2->run(ExampleOperation(value));
+    ASSERT_EQ(value, hip2->get_device_id());
+    // Put the results on OpenMP and run CPU side assertions
+    omp->copy_from(hip2.get(), 2, copy_hip2, copy);
+    EXPECT_EQ(3, copy[0]);
+    ASSERT_EQ(8, copy[1]);
+    // release each buffer through the executor that allocated it
+    hip2->free(copy_hip2);
+    hip->free(orig);
+}
+
+
+TEST_F(HipExecutor, Synchronizes)
+{
+    // TODO: design a proper unit test once we support streams
+    ASSERT_NO_THROW(hip->synchronize());
+}
+
+
+}  // namespace
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
new file mode 100644
index 00000000000..818506a8d25
--- /dev/null
+++ b/hip/test/base/math.hip.cpp
@@ -0,0 +1,169 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// prevent compilation failure related to disappearing assert(...) statements
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include <cmath>
+#include <complex>
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/base/math.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+namespace kernel {
+
+
+// Returns true iff isfin accepts 0, -0 and 1 and rejects every infinity/NaN
+// expression listed below.
+template <typename T, typename FuncType>
+__device__ bool test_real_is_finite_function(FuncType isfin)
+{
+    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T quiet_nan = NAN;
+    const bool finite_ok = isfin(T{0}) && isfin(-T{0}) && isfin(T{1});
+    const bool nonfinite_accepted =
+        isfin(inf) || isfin(-inf) || isfin(quiet_nan) || isfin(inf - inf) ||
+        isfin(inf / inf) || isfin(inf * T{2}) || isfin(T{1} / T{0}) ||
+        isfin(T{0} / T{0});
+    return finite_ok && !nonfinite_accepted;
+}
+
+
+// Complex variant: isfin must reject an infinite or NaN real/imaginary part.
+template <typename ComplexType, typename FuncType>
+__device__ bool test_complex_is_finite_function(FuncType isfin)
+{
+    static_assert(gko::is_complex_s<ComplexType>::value,
+                  "Template type must be a complex type.");
+    using T = gko::remove_complex<ComplexType>;
+    using c_type = gko::kernels::hip::hip_type<ComplexType>;
+    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T quiet_nan = NAN;
+    const bool finite_ok =
+        isfin(c_type{T{0}, T{0}}) && isfin(c_type{-T{0}, -T{0}}) &&
+        isfin(c_type{T{1}, T{0}}) && isfin(c_type{T{0}, T{1}});
+    const bool nonfinite_accepted =
+        isfin(c_type{inf, T{0}}) || isfin(c_type{-inf, T{0}}) ||
+        isfin(c_type{quiet_nan, T{0}}) || isfin(c_type{T{0}, inf}) ||
+        isfin(c_type{T{0}, -inf}) || isfin(c_type{T{0}, quiet_nan});
+    return finite_ok && !nonfinite_accepted;
+}
+
+
+}  // namespace kernel
+
+
+// Kernels that store the outcome of the gko::is_finite checks in *result.
+template <typename T>
+__global__ void test_real_is_finite(bool *result)
+{
+    auto isfin = [](T val) { return gko::is_finite(val); };
+    *result = kernel::test_real_is_finite_function<T>(isfin);
+}
+
+template <typename ComplexType>
+__global__ void test_complex_is_finite(bool *result)
+{
+    auto isfin = [](ComplexType val) { return gko::is_finite(val); };
+    *result = kernel::test_complex_is_finite_function<ComplexType>(isfin);
+}
+
+
+// Fixture: launches the is_finite kernels and reads their result on the host.
+class IsFinite : public ::testing::Test {
+protected:
+    IsFinite()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref))
+    {}
+
+    template <typename T>
+    bool test_real_is_finite_kernel()
+    {
+        gko::Array<bool> outcome(hip, 1);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(test_real_is_finite<T>), dim3(1),
+                           dim3(1), 0, 0, outcome.get_data());
+        outcome.set_executor(ref);
+        return outcome.get_data()[0];
+    }
+
+    template <typename T>
+    bool test_complex_is_finite_kernel()
+    {
+        gko::Array<bool> outcome(hip, 1);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(test_complex_is_finite<T>), dim3(1),
+                           dim3(1), 0, 0, outcome.get_data());
+        outcome.set_executor(ref);
+        return outcome.get_data()[0];
+    }
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+};
+
+
+TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel<float>()); }
+
+// Same check in double precision.
+TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel<double>()); }
+
+// The complex checks are instantiated with thrust::complex value types.
+TEST_F(IsFinite, FloatComplex)
+{
+    ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<float>>());
+}
+
+
+TEST_F(IsFinite, DoubleComplex)
+{
+    ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<double>>());
+}
+
+
+}  // namespace
diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..b3bec2595f9
--- /dev/null
+++ b/hip/test/components/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Create one HIP test for each of the listed component test names.
+foreach(component_test
+        cooperative_groups_kernels fill_array merging_kernels
+        precision_conversion prefix_sum
+        searching_kernels sorting_kernels)
+    ginkgo_create_hip_test(${component_test})
+endforeach()
diff --git a/hip/test/components/cooperative_groups_kernels.hip.cpp b/hip/test/components/cooperative_groups_kernels.hip.cpp
new file mode 100644
index 00000000000..823dcef0df1
--- /dev/null
+++ b/hip/test/components/cooperative_groups_kernels.hip.cpp
@@ -0,0 +1,343 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// TODO remove when the HIP includes are fixed
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+#include <cstring>
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/base/types.hip.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+using namespace gko::kernels::hip;
+
+
+// Fixture: runs a kernel on one block and asserts its success flag stays true.
+class CooperativeGroups : public ::testing::Test {
+protected:
+    CooperativeGroups()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          result(ref, 1),
+          dresult(hip)
+    {
+        *result.get_data() = true;
+        dresult = result;
+    }
+
+    template <typename Kernel>
+    void test(Kernel kernel)
+    {
+        launch_and_check(kernel, config::warp_size);
+    }
+
+    template <typename Kernel>
+    void test_subwarp(Kernel kernel)
+    {
+        launch_and_check(kernel, config::warp_size / 2);
+    }
+
+    template <typename Kernel>
+    void launch_and_check(Kernel kernel, int num_threads)
+    {
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(num_threads),
+                           0, 0, dresult.get_data());
+        result = dresult;
+        auto success = *result.get_const_data();
+        ASSERT_TRUE(success);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    gko::Array<bool> result;
+    gko::Array<bool> dresult;
+};
+
+
+constexpr static int subwarp_size = config::warp_size / 4;
+
+
+// Clears *success when a check fails; a passing check never writes to it.
+__device__ void test_assert(bool *success, bool partial)
+{
+    if (partial) return;
+    *success = false;
+}
+
+
+// Checks shfl_up/shfl_down by one lane (clamped at the ends) and shfl from 0.
+__global__ void cg_shuffle(bool *s)
+{
+    auto tile =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto id = int(tile.thread_rank());
+    test_assert(s, tile.shfl_up(id, 1) == max(0, id - 1));
+    test_assert(s, tile.shfl_down(id, 1) == min(id + 1, config::warp_size - 1));
+    test_assert(s, tile.shfl(id, 0) == 0);
+}
+
+TEST_F(CooperativeGroups, Shuffle) { test(cg_shuffle); }
+
+
+// all() must hold only when the predicate is true on every lane of the warp.
+__global__ void cg_all(bool *s)
+{
+    auto tile =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    test_assert(s, tile.all(true));
+    test_assert(s, !tile.all(false));
+    test_assert(s, !tile.all(threadIdx.x < 13));
+}
+
+TEST_F(CooperativeGroups, All) { test(cg_all); }
+
+
+// any() must hold as soon as the predicate is true on at least one lane.
+__global__ void cg_any(bool *s)
+{
+    auto tile =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    test_assert(s, tile.any(true));
+    test_assert(s, tile.any(threadIdx.x == 0));
+    test_assert(s, !tile.any(false));
+}
+
+TEST_F(CooperativeGroups, Any) { test(cg_any); }
+
+
+// ballot() must set exactly the bits of the lanes whose predicate is true.
+__global__ void cg_ballot(bool *s)
+{
+    auto tile =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    test_assert(s, tile.ballot(false) == 0);
+    test_assert(s, tile.ballot(true) == ~config::lane_mask_type{});
+    test_assert(s, tile.ballot(threadIdx.x < 4) == 0xf);
+}
+
+TEST_F(CooperativeGroups, Ballot) { test(cg_ballot); }
+
+
+__global__ void cg_subwarp_shuffle(bool *s)
+{
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto i = int(group.thread_rank());
+    test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0));
+    test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1));
+    auto group_base = threadIdx.x / subwarp_size * subwarp_size;
+    test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base);
+    if (threadIdx.x / subwarp_size == 1) {  // same checks in divergent code
+        test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0));
+        test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1));
+        test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base);
+    } else {
+        test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1));
+        test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base);
+        test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0));
+    }
+}
+
+// The first test launches a full warp, the second one only half a warp.
+TEST_F(CooperativeGroups, SubwarpShuffle) { test(cg_subwarp_shuffle); }
+
+
+TEST_F(CooperativeGroups, SubwarpShuffle2) { test_subwarp(cg_subwarp_shuffle); }
+
+
+__global__ void cg_subwarp_all(bool *s)
+{
+    auto grp = threadIdx.x / subwarp_size;
+    bool test_grp = grp == 1;
+    auto i = threadIdx.x % subwarp_size;
+    // only subwarp 1 (test_grp) is tested, other threads run 'interference'
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    test_assert(s, !test_grp || group.all(test_grp));
+    test_assert(s, !test_grp || !group.all(!test_grp));
+    test_assert(s, !test_grp || !group.all(i < subwarp_size - 3 || !test_grp));
+    if (test_grp) {
+        test_assert(s, group.all(true));
+        test_assert(s, !group.all(false));
+        test_assert(s, !group.all(i < subwarp_size - 3));
+    } else {
+        test_assert(s, !group.all(false));
+        test_assert(s, !group.all(i < subwarp_size - 3));
+        test_assert(s, group.all(true));
+    }
+}
+
+// The first test launches a full warp, the second one only half a warp.
+TEST_F(CooperativeGroups, SubwarpAll) { test(cg_subwarp_all); }
+
+
+TEST_F(CooperativeGroups, SubwarpAll2) { test_subwarp(cg_subwarp_all); }
+
+
+__global__ void cg_subwarp_any(bool *s)
+{
+    auto grp = threadIdx.x / subwarp_size;
+    bool test_grp = grp == 1;
+    // only subwarp 1 (test_grp) is tested, other threads run 'interference'
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto i = group.thread_rank();
+    test_assert(s, !test_grp || group.any(test_grp));
+    test_assert(s, !test_grp || group.any(test_grp && i == 1));
+    test_assert(s, !test_grp || !group.any(!test_grp));
+    if (test_grp) {
+        test_assert(s, group.any(true));
+        test_assert(s, group.any(i == 1));
+        test_assert(s, !group.any(false));
+    } else {
+        test_assert(s, !group.any(false));
+        test_assert(s, group.any(true));
+        test_assert(s, group.any(i == 1));
+    }
+}
+
+// The first test launches a full warp, the second one only half a warp.
+TEST_F(CooperativeGroups, SubwarpAny) { test(cg_subwarp_any); }
+
+
+TEST_F(CooperativeGroups, SubwarpAny2) { test_subwarp(cg_subwarp_any); }
+
+
+__global__ void cg_subwarp_ballot(bool *s)
+{
+    auto grp = threadIdx.x / subwarp_size;
+    bool test_grp = grp == 1;
+    auto full_mask = (config::lane_mask_type{1} << subwarp_size) - 1;
+    // only subwarp 1 (test_grp) is tested, other threads run 'interference'
+    auto group =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    auto i = group.thread_rank();
+    test_assert(s, !test_grp || group.ballot(!test_grp) == 0);
+    test_assert(s, !test_grp || group.ballot(test_grp) == full_mask);
+    test_assert(s, !test_grp || group.ballot(i < 4 || !test_grp) == 0xf);
+    if (test_grp) {
+        test_assert(s, group.ballot(false) == 0);
+        test_assert(s, group.ballot(true) == full_mask);
+        test_assert(s, group.ballot(i < 4) == 0xf);
+    } else {
+        test_assert(s, group.ballot(true) == full_mask);
+        test_assert(s, group.ballot(i < 4) == 0xf);
+        test_assert(s, group.ballot(false) == 0);
+    }
+}
+
+// The first test launches a full warp, the second one only half a warp.
+TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); }
+
+
+TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); }
+
+
+template <typename ValueType>
+__global__ void cg_shuffle_sum(const int num, ValueType *__restrict__ value)
+{
+    auto group =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    for (int ind = 0; ind < num; ind++) {  // add the value shuffled from ind
+        value[group.thread_rank()] += group.shfl(value[ind], ind);
+    }
+}
+
+
+TEST_F(CooperativeGroups, ShuffleSumDouble)
+{
+    // bit pattern with nonzero upper and lower 32-bit halves
+    const int num = 4;
+    const uint64_t bits = 0x401022C90008B240;
+    double summand{};
+    std::memcpy(&summand, &bits, sizeof(summand));
+    gko::Array<double> value(ref, config::warp_size);
+    gko::Array<double> answer(ref, config::warp_size);
+    gko::Array<double> dvalue(hip);
+    for (int i = 0; i < value.get_num_elems(); i++) {
+        value.get_data()[i] = summand;
+        answer.get_data()[i] = summand * (1 << num);
+    }
+    dvalue = value;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum<double>), dim3(1),
+                       dim3(config::warp_size), 0, 0, num, dvalue.get_data());
+
+    value = dvalue;
+    GKO_ASSERT_ARRAY_EQ(value, answer);
+}
+
+
+TEST_F(CooperativeGroups, ShuffleSumComplexDouble)
+{
+    // real and imaginary parts both carry the same 64-bit pattern
+    const int num = 4;
+    const uint64_t bits = 0x401022C90008B240;
+    double part{};
+    std::memcpy(&part, &bits, sizeof(part));
+    gko::Array<std::complex<double>> value(ref, config::warp_size);
+    gko::Array<std::complex<double>> answer(ref, config::warp_size);
+    gko::Array<std::complex<double>> dvalue(hip);
+    for (int i = 0; i < value.get_num_elems(); i++) {
+        value.get_data()[i] = std::complex<double>{part, part};
+        answer.get_data()[i] =
+            std::complex<double>{part * (1 << num), part * (1 << num)};
+    }
+    dvalue = value;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum<thrust::complex<double>>),
+                       dim3(1), dim3(config::warp_size), 0, 0, num,
+                       as_hip_type(dvalue.get_data()));
+
+    value = dvalue;
+    GKO_ASSERT_ARRAY_EQ(value, answer);
+}
+
+
+}  // namespace
diff --git a/hip/test/components/fill_array.hip.cpp b/hip/test/components/fill_array.hip.cpp
new file mode 100644
index 00000000000..1c7bfda89d0
--- /dev/null
+++ b/hip/test/components/fill_array.hip.cpp
@@ -0,0 +1,89 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// TODO remove when the HIP includes are fixed
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include "core/components/fill_array.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils/assertions.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+/**
+ * Fixture for the HIP fill_array test: a host array whose entries are all
+ * preset to 1234.0 and a device array of the same length.
+ */
+class FillArray : public ::testing::Test {
+protected:
+    using value_type = double;
+
+    FillArray()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::HipExecutor::create(0, ref)),
+          total_size(6344),
+          vals(ref, total_size),
+          dvals(exec, total_size)
+    {
+        // NOTE(review): the std fill algorithms live in <algorithm>, which
+        // this file does not include directly.
+        auto first = vals.get_data();
+        std::fill(first, first + total_size, 1234.0);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> exec;
+    gko::size_type total_size;
+    // host array holding the expected content
+    gko::Array<value_type> vals;
+    // device array the test fills
+    gko::Array<value_type> dvals;
+};
+
+
+TEST_F(FillArray, EqualsReference)
+{
+    // fill the device array with the value the host array was preset to
+    auto dptr = dvals.get_data();
+    gko::kernels::hip::components::fill_array(exec, dptr, total_size, 1234.0);
+
+    GKO_ASSERT_ARRAY_EQ(vals, dvals);
+}
+
+
+}  // namespace
diff --git a/hip/test/components/merging_kernels.hip.cpp b/hip/test/components/merging_kernels.hip.cpp
new file mode 100644
index 00000000000..466c31a48b3
--- /dev/null
+++ b/hip/test/components/merging_kernels.hip.cpp
@@ -0,0 +1,306 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// TODO remove when the HIP includes are fixed
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include "hip/components/merging.hip.hpp"
+
+
+#include <algorithm>
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+using namespace gko::kernels::hip;
+using namespace gko::kernels::hip::group;
+
+
+/**
+ * Fixture for the HIP merging kernel tests.
+ *
+ * It owns two host input arrays with device copies, device output buffers
+ * for the merged data and the three index arrays, and host arrays used to
+ * build and hold the reference results.
+ */
+class Merging : public ::testing::Test {
+protected:
+    // The initializers follow the member declaration order, which is the
+    // order in which the members are actually constructed.
+    Merging()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          rng(123456),
+          rng_runs{100},
+          max_size{1637},
+          sizes{0,  1,  2,   3,   4,   10,  15,   16,
+                31, 34, 102, 242, 534, 956, 1239, 1637},
+          data1(ref, max_size),
+          data2(ref, max_size),
+          idxs1(ref),
+          idxs2(ref),
+          idxs3(ref),
+          refidxs1(ref),
+          refidxs2(ref),
+          refidxs3(ref),
+          outdata(ref, 2 * max_size),
+          refdata(ref, 2 * max_size),
+          ddata1(hip),
+          ddata2(hip),
+          didxs1(hip, 2 * max_size),
+          didxs2(hip, 2 * max_size),
+          didxs3(hip, 2 * max_size),
+          drefidxs1(hip, 2 * max_size),
+          drefidxs2(hip, 2 * max_size),
+          drefidxs3(hip, 2 * max_size),
+          doutdata(hip, 2 * max_size)
+    {}
+
+    /**
+     * Regenerates both sorted input arrays and copies them to the device.
+     *
+     * @param rng_run  index of the current run; it selects the corner cases:
+     *                 run 0 leaves both inputs zero, run 1 randomizes only
+     *                 data2, run 2 randomizes only data1, and all later
+     *                 runs randomize both.
+     */
+    void init_data(int rng_run)
+    {
+        std::uniform_int_distribution<gko::int32> dist(0, max_size);
+        std::fill_n(data1.get_data(), max_size, 0);
+        std::fill_n(data2.get_data(), max_size, 0);
+        for (int i = 0; i < max_size; ++i) {
+            // here we also want to test some corner cases
+            // first two runs: zero data1
+            if (rng_run > 1) data1.get_data()[i] = dist(rng);
+            // first and third run: zero data2
+            if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng);
+        }
+        // the merge kernels are fed sorted sequences
+        std::sort(data1.get_data(), data1.get_data() + max_size);
+        std::sort(data2.get_data(), data2.get_data() + max_size);
+
+        ddata1 = data1;
+        ddata2 = data2;
+    }
+
+    /**
+     * Compares the device output against a host-side reference merge.
+     *
+     * The reference is built by concatenating the first `size` entries of
+     * data1 and data2 and sorting the result.
+     *
+     * @param size  number of entries taken from each input array
+     * @param eq_size  number of leading output entries that are compared
+     *
+     * NOTE(review): ASSERT_TRUE only leaves this helper on failure, it does
+     * not abort the calling test body.
+     */
+    void assert_eq_ref(int size, int eq_size)
+    {
+        outdata = doutdata;
+        auto out_ptr = outdata.get_const_data();
+        auto out_end = out_ptr + eq_size;
+        auto ref_ptr = refdata.get_data();
+        std::copy_n(data1.get_const_data(), size, ref_ptr);
+        std::copy_n(data2.get_const_data(), size, ref_ptr + size);
+        std::sort(ref_ptr, ref_ptr + 2 * size);
+
+        ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    std::default_random_engine rng;
+
+    // number of data sets every test iterates over
+    int rng_runs;
+    // length of each input array
+    int max_size;
+    // per-input merge sizes exercised by the full-merge tests
+    std::vector<int> sizes;
+    gko::Array<gko::int32> data1;
+    gko::Array<gko::int32> data2;
+    gko::Array<gko::int32> idxs1;
+    gko::Array<gko::int32> idxs2;
+    gko::Array<gko::int32> idxs3;
+    gko::Array<gko::int32> refidxs1;
+    gko::Array<gko::int32> refidxs2;
+    gko::Array<gko::int32> refidxs3;
+    gko::Array<gko::int32> outdata;
+    gko::Array<gko::int32> refdata;
+    gko::Array<gko::int32> ddata1;
+    gko::Array<gko::int32> ddata2;
+    gko::Array<gko::int32> didxs1;
+    gko::Array<gko::int32> didxs2;
+    gko::Array<gko::int32> didxs3;
+    gko::Array<gko::int32> drefidxs1;
+    gko::Array<gko::int32> drefidxs2;
+    gko::Array<gko::int32> drefidxs3;
+    gko::Array<gko::int32> doutdata;
+};
+
+
+// Each thread of the warp passes one element of a and one of b into a single
+// merge step and stores the smaller of the two values it gets back.
+__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b,
+                                gko::int32 *c)
+{
+    auto tile = tiled_partition<config::warp_size>(this_thread_block());
+    auto lane = tile.thread_rank();
+    auto step =
+        group_merge_step<config::warp_size>(a[lane], b[lane], tile);
+    c[lane] = min(step.a_val, step.b_val);
+}
+
+TEST_F(Merging, MergeStep)
+{
+    // One merge step per data set: a single block of config::warp_size
+    // threads, checked against the first config::warp_size reference entries.
+    for (int run = 0; run < rng_runs; ++run) {
+        init_data(run);
+        auto a = ddata1.get_const_data();
+        auto b = ddata2.get_const_data();
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1),
+                           dim3(config::warp_size), 0, 0, a, b,
+                           doutdata.get_data());
+
+        assert_eq_ref(config::warp_size, config::warp_size);
+    }
+}
+
+
+// Merges the first `size` entries of a and b with one warp and writes the
+// smaller value of every valid merge position to c.
+__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size,
+                           gko::int32 *c)
+{
+    auto tile = tiled_partition<config::warp_size>(this_thread_block());
+    auto store_min = [&](int a_idx, gko::int32 a_val, int b_idx,
+                         gko::int32 b_val, int i, bool valid) {
+        if (valid) {
+            c[i] = min(a_val, b_val);
+        }
+        // always returns true to the merge routine
+        return true;
+    };
+    group_merge<config::warp_size>(a, size, b, size, tile, store_min);
+}
+
+TEST_F(Merging, FullMerge)
+{
+    // For every data set, merge prefixes of all configured sizes with a
+    // single warp and compare all 2 * size outputs with the reference.
+    for (int run = 0; run < rng_runs; ++run) {
+        init_data(run);
+        for (auto size : sizes) {
+            auto a = ddata1.get_const_data();
+            auto b = ddata2.get_const_data();
+            hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1),
+                               dim3(config::warp_size), 0, 0, a, b, size,
+                               doutdata.get_data());
+
+            assert_eq_ref(size, 2 * size);
+        }
+    }
+}
+
+
+// Merges the first `size` entries of a and b sequentially and writes the
+// smaller value of every merge position to c.
+__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b,
+                                      int size, gko::int32 *c)
+{
+    auto store_min = [&](int a_idx, gko::int32 a_val, int b_idx,
+                         gko::int32 b_val, int i) {
+        c[i] = min(a_val, b_val);
+        // always returns true to the merge routine
+        return true;
+    };
+    sequential_merge(a, size, b, size, store_min);
+}
+
+TEST_F(Merging, SequentialFullMerge)
+{
+    // Same sweep as the warp-level merge test, but the kernel is launched
+    // with a single thread.
+    for (int run = 0; run < rng_runs; ++run) {
+        init_data(run);
+        for (auto size : sizes) {
+            auto a = ddata1.get_const_data();
+            auto b = ddata2.get_const_data();
+            hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1),
+                               dim3(1), 0, 0, a, b, size, doutdata.get_data());
+
+            assert_eq_ref(size, 2 * size);
+        }
+    }
+}
+
+
+// Records the indices reported by both merge variants: thread 0 runs the
+// sequential merge and fills the ref* arrays, then the whole warp runs the
+// group merge and fills aidxs/bidxs/cidxs as well as the merged values in c.
+__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b,
+                                int size, gko::int32 *c, gko::int32 *aidxs,
+                                gko::int32 *bidxs, gko::int32 *cidxs,
+                                gko::int32 *refaidxs, gko::int32 *refbidxs,
+                                gko::int32 *refcidxs)
+{
+    if (threadIdx.x == 0) {
+        auto record_reference = [&](int a_idx, gko::int32 a_val, int b_idx,
+                                    gko::int32 b_val, int i) {
+            refaidxs[i] = a_idx;
+            refbidxs[i] = b_idx;
+            refcidxs[i] = i;
+            return true;
+        };
+        sequential_merge(a, size, b, size, record_reference);
+    }
+    auto tile = tiled_partition<config::warp_size>(this_thread_block());
+    auto record_group = [&](int a_idx, gko::int32 a_val, int b_idx,
+                            gko::int32 b_val, int i, bool valid) {
+        // only valid merge positions are stored
+        if (valid) {
+            aidxs[i] = a_idx;
+            bidxs[i] = b_idx;
+            cidxs[i] = i;
+            c[i] = min(a_val, b_val);
+        }
+        return true;
+    };
+    group_merge<config::warp_size>(a, size, b, size, tile, record_group);
+}
+
+TEST_F(Merging, FullMergeIdxs)
+{
+    // Besides the merged values, the three index arrays written by the group
+    // merge have to match those written by the sequential merge.
+    for (int run = 0; run < rng_runs; ++run) {
+        init_data(run);
+        for (auto size : sizes) {
+            hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1),
+                               dim3(config::warp_size), 0, 0,
+                               ddata1.get_const_data(), ddata2.get_const_data(),
+                               size, doutdata.get_data(), didxs1.get_data(),
+                               didxs2.get_data(), didxs3.get_data(),
+                               drefidxs1.get_data(), drefidxs2.get_data(),
+                               drefidxs3.get_data());
+
+            assert_eq_ref(size, 2 * size);
+
+            // copy every index buffer back to the host
+            idxs1 = didxs1;
+            idxs2 = didxs2;
+            idxs3 = didxs3;
+            refidxs1 = drefidxs1;
+            refidxs2 = drefidxs2;
+            refidxs3 = drefidxs3;
+
+            // compare the first 2 * size entries of each pair
+            auto idxs1_ptr = idxs1.get_const_data();
+            auto refidxs1_ptr = refidxs1.get_const_data();
+            ASSERT_TRUE(
+                std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr));
+
+            auto idxs2_ptr = idxs2.get_const_data();
+            auto refidxs2_ptr = refidxs2.get_const_data();
+            ASSERT_TRUE(
+                std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr));
+
+            auto idxs3_ptr = idxs3.get_const_data();
+            auto refidxs3_ptr = refidxs3.get_const_data();
+            ASSERT_TRUE(
+                std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr));
+        }
+    }
+}
+
+
+}  // namespace
diff --git a/hip/test/components/precision_conversion.hip.cpp b/hip/test/components/precision_conversion.hip.cpp
new file mode 100644
index 00000000000..a7b9713b871
--- /dev/null
+++ b/hip/test/components/precision_conversion.hip.cpp
@@ -0,0 +1,173 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+/**
+ * Fixture for the precision conversion tests of gko::Array on HIP.
+ *
+ * It provides random real and complex single precision data on the host and
+ * on the device, plus a single double precision value whose bit pattern is
+ * paired with the expected float and the expected double after converting
+ * back.
+ *
+ * NOTE(review): std::complex is used here, but this file does not include
+ * <complex> directly.
+ */
+class PrecisionConversion : public ::testing::Test {
+protected:
+    // The initializers follow the member declaration order, which is the
+    // order in which the members are actually constructed.
+    PrecisionConversion()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::HipExecutor::create(0, ref)),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec),
+          vals2(ref, 1),
+          dvals2(exec),
+          expected_float(ref, 1),
+          expected_double(ref, 1),
+          cvals(ref, total_size),
+          dcvals(exec)
+    {
+        auto maxval = 1e10f;
+        std::uniform_real_distribution<float> dist(-maxval, maxval);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+            cvals.get_data()[i] = {dist(rand), dist(rand)};
+        }
+        dvals = vals;
+        dcvals = cvals;
+        // raw bit patterns: the double input, the float it is expected to
+        // convert to, and the double expected after converting back
+        gko::uint64 rawdouble{0x4218888000889111ULL};
+        gko::uint32 rawfloat{0x50c44400UL};
+        gko::uint64 rawrounded{0x4218888000000000ULL};
+        std::memcpy(vals2.get_data(), &rawdouble, sizeof(double));
+        std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float));
+        std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double));
+        dvals2 = vals2;
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    // random single precision values (host / device copy)
+    gko::Array<float> vals;
+    gko::Array<float> dvals;
+    // single double precision value used by the rounding test
+    gko::Array<double> vals2;
+    gko::Array<double> dvals2;
+    gko::Array<float> expected_float;
+    gko::Array<double> expected_double;
+    // random complex single precision values (host / device copy)
+    gko::Array<std::complex<float>> cvals;
+    gko::Array<std::complex<float>> dcvals;
+};
+
+
+TEST_F(PrecisionConversion, ConvertsReal)
+{
+    // float -> double -> float must reproduce the device input
+    gko::Array<double> widened;
+    widened = dvals;
+
+    gko::Array<float> dout;
+    dout = widened;
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealViaRef)
+{
+    // same round trip, but the double precision intermediate is created on
+    // the reference executor
+    gko::Array<double> widened{ref};
+    widened = dvals;
+
+    gko::Array<float> dout;
+    dout = widened;
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplex)
+{
+    // complex<float> -> complex<double> -> complex<float> must reproduce the
+    // device input
+    gko::Array<std::complex<double>> widened;
+    widened = dcvals;
+
+    gko::Array<std::complex<float>> dout;
+    dout = widened;
+
+    GKO_ASSERT_ARRAY_EQ(dcvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConversionRounds)
+{
+    // double -> float has to yield the expected float bit pattern ...
+    gko::Array<float> dtmp;
+    dtmp = dvals2;
+
+    // ... and converting that float back the expected double bit pattern
+    gko::Array<double> dout;
+    dout = dtmp;
+
+    GKO_ASSERT_ARRAY_EQ(dtmp, expected_float);
+    GKO_ASSERT_ARRAY_EQ(dout, expected_double);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealFromRef)
+{
+    // the round trip starts from the host array and is compared with the
+    // device copy of the same data
+    gko::Array<double> widened;
+    widened = vals;
+
+    gko::Array<float> dout;
+    dout = widened;
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplexFromRef)
+{
+    // the complex round trip starts from the host array and is compared with
+    // the device copy of the same data
+    gko::Array<std::complex<double>> widened;
+    widened = cvals;
+
+    gko::Array<std::complex<float>> dout;
+    dout = widened;
+
+    GKO_ASSERT_ARRAY_EQ(dcvals, dout);
+}
+
+
+}  // namespace
diff --git a/hip/test/components/prefix_sum.hip.cpp b/hip/test/components/prefix_sum.hip.cpp
new file mode 100644
index 00000000000..96f91522d06
--- /dev/null
+++ b/hip/test/components/prefix_sum.hip.cpp
@@ -0,0 +1,95 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+/**
+ * Fixture comparing the HIP prefix_sum kernel with the reference kernel on
+ * total_size random integers drawn from [0, 1000].
+ */
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = gko::int32;
+
+    PrefixSum()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::HipExecutor::create(0, ref)),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec)
+    {
+        std::uniform_int_distribution<index_type> dist(0, 1000);
+        auto host_vals = vals.get_data();
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            host_vals[i] = dist(rand);
+        }
+        // the device array starts as a copy of the host data
+        dvals = vals;
+    }
+
+    // Runs both kernels in place on the first `size` entries and compares
+    // the complete host and device arrays afterwards.
+    void test(gko::size_type size)
+    {
+        auto host_vals = vals.get_data();
+        auto device_vals = dvals.get_data();
+        gko::kernels::reference::components::prefix_sum(ref, host_vals, size);
+        gko::kernels::hip::components::prefix_sum(exec, device_vals, size);
+
+        GKO_ASSERT_ARRAY_EQ(vals, dvals);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<index_type> vals;
+    gko::Array<index_type> dvals;
+};
+
+
+TEST_F(PrefixSum, SmallEqualsReference)
+{
+    // only the first 100 entries are scanned
+    test(100);
+}
+
+
+TEST_F(PrefixSum, BigEqualsReference)
+{
+    // the complete array is scanned
+    test(total_size);
+}
+
+
+}  // namespace
diff --git a/hip/test/components/searching_kernels.hip.cpp b/hip/test/components/searching_kernels.hip.cpp
new file mode 100644
index 00000000000..e55855e40c3
--- /dev/null
+++ b/hip/test/components/searching_kernels.hip.cpp
@@ -0,0 +1,253 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+// TODO remove when the HIP includes are fixed
+#include <hip/hip_runtime.h>
+// force-top: off
+
+
+#include "hip/components/searching.hip.hpp"
+
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+using namespace gko::kernels::hip;
+using namespace gko::kernels::hip::group;
+
+
+/**
+ * Fixture for the HIP searching kernel tests.
+ *
+ * The test kernels report through a single bool that is set to true before
+ * the launch and cleared by the kernel when one of its checks fails.
+ */
+class Searching : public ::testing::Test {
+protected:
+    Searching()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          result(ref, 1),
+          dresult(hip),
+          sizes(14203)
+    {
+        // sizes = 0, 1, 2, ...
+        std::iota(sizes.begin(), sizes.end(), 0);
+    }
+
+    // Launches `kernel` with `num_blocks` blocks of config::warp_size
+    // threads, passing the device flag, `offset` and `size`, and asserts
+    // that the flag is still true afterwards.
+    template <typename Kernel>
+    void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1)
+    {
+        result.get_data()[0] = true;
+        dresult = result;
+        const dim3 grid(num_blocks);
+        const dim3 block(config::warp_size);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), grid, block, 0, 0,
+                           dresult.get_data(), offset, size);
+        // copy the flag back to the host
+        result = dresult;
+        auto success = *result.get_const_data();
+
+        ASSERT_TRUE(success);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    // host / device copy of the success flag
+    gko::Array<bool> result;
+    gko::Array<bool> dresult;
+    // search range sizes used by the warp-level search tests
+    std::vector<int> sizes;
+};
+
+
+// Clears the success flag when the checked predicate does not hold; a
+// satisfied predicate leaves the flag untouched.
+__device__ void test_assert(bool *success, bool predicate)
+{
+    if (predicate) {
+        return;
+    }
+    *success = false;
+}
+
+
+// Every thread runs binary_search on [offset, offset + size) twice: once
+// with threadIdx.x + offset as partition point and once with the position
+// right after it. The predicates also check that only indices inside the
+// range are probed.
+__global__ void test_binary_search(bool *success, int offset, int size)
+{
+    const auto first_point = threadIdx.x + offset;
+    const auto second_point = threadIdx.x + offset + 1;
+
+    auto result = binary_search(offset, size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= first_point;
+    });
+    auto result2 = binary_search(offset, size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= second_point;
+    });
+
+    test_assert(success, result == first_point);
+    test_assert(success, result2 == second_point);
+}
+
+TEST_F(Searching, BinaryNoOffset)
+{
+    // search range [0, config::warp_size)
+    const int offset = 0;
+    run_test(test_binary_search, offset, config::warp_size);
+}
+
+TEST_F(Searching, BinaryOffset)
+{
+    // search range [5, 5 + config::warp_size)
+    const int offset = 5;
+    run_test(test_binary_search, offset, config::warp_size);
+}
+
+
+// binary_search on an empty range starting at `offset`: any predicate call
+// clears the flag, and the result is checked against `offset`.
+__global__ void test_empty_binary_search(bool *success, int offset, int)
+{
+    auto never_called = [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, false);
+        return false;
+    };
+    auto result = binary_search(offset, 0, never_called);
+    test_assert(success, result == offset);
+}
+
+TEST_F(Searching, BinaryEmptyNoOffset)
+{
+    // empty range starting at 0
+    const int offset = 0;
+    run_test(test_empty_binary_search, offset, 0);
+}
+
+TEST_F(Searching, BinaryEmptyOffset)
+{
+    // empty range starting at 5
+    const int offset = 5;
+    run_test(test_empty_binary_search, offset, 0);
+}
+
+
+// Every thread runs synchronous_binary_search on [0, size) twice: once with
+// threadIdx.x as partition point and once with the position right after it.
+// The predicates also check that only indices inside the range are probed.
+__global__ void test_sync_binary_search(bool *success, int, int size)
+{
+    const auto first_point = threadIdx.x;
+    const auto second_point = threadIdx.x + 1;
+
+    auto result = synchronous_binary_search(size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= 0 && i < size);
+        return i >= first_point;
+    });
+    auto result2 = synchronous_binary_search(size, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= 0 && i < size);
+        return i >= second_point;
+    });
+
+    test_assert(success, result == first_point);
+    test_assert(success, result2 == second_point);
+}
+
+TEST_F(Searching, SyncBinary)
+{
+    // the kernel ignores the offset argument; the range is [0, warp_size)
+    const int unused_offset = 0;
+    run_test(test_sync_binary_search, unused_offset, config::warp_size);
+}
+
+
+// synchronous_binary_search on an empty range: any predicate call clears
+// the flag, and the result is checked against 0.
+__global__ void test_empty_sync_binary_search(bool *success, int, int)
+{
+    auto never_called = [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, false);
+        return false;
+    };
+    auto result = synchronous_binary_search(0, never_called);
+    test_assert(success, result == 0);
+}
+
+TEST_F(Searching, EmptySyncBinary)
+{
+    // the kernel ignores both the offset and the size argument
+    const int unused_offset = 0;
+    run_test(test_empty_sync_binary_search, unused_offset, config::warp_size);
+}
+
+
+// Each block runs group_ary_search with one warp on [offset, offset + size),
+// using blockIdx.x + offset as partition point. The predicate also checks
+// that only indices inside the range are probed.
+__global__ void test_warp_ary_search(bool *success, int offset, int size)
+{
+    const auto partition_point = blockIdx.x + offset;
+    auto tile = tiled_partition<config::warp_size>(this_thread_block());
+    auto result = group_ary_search(offset, size, tile, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= partition_point;
+    });
+    test_assert(success, result == partition_point);
+}
+
+TEST_F(Searching, WarpAryNoOffset)
+{
+    // size + 1 blocks are launched per size, one per value of blockIdx.x
+    // from 0 to size
+    const int offset = 0;
+    for (auto size : sizes) {
+        run_test(test_warp_ary_search, offset, size, size + 1);
+    }
+}
+
+TEST_F(Searching, WarpAryOffset)
+{
+    // same sweep as without offset, but the range starts at 134
+    const int offset = 134;
+    for (auto size : sizes) {
+        run_test(test_warp_ary_search, offset, size, size + 1);
+    }
+}
+
+
+// Each block runs group_wide_search with one warp on [offset, offset + size),
+// using blockIdx.x + offset as partition point. The predicate also checks
+// that only indices inside the range are probed.
+__global__ void test_warp_wide_search(bool *success, int offset, int size)
+{
+    const auto partition_point = blockIdx.x + offset;
+    auto tile = tiled_partition<config::warp_size>(this_thread_block());
+    auto result = group_wide_search(offset, size, tile, [&](int i) {
+        // don't access out-of-bounds!
+        test_assert(success, i >= offset && i < offset + size);
+        return i >= partition_point;
+    });
+    test_assert(success, result == partition_point);
+}
+
+TEST_F(Searching, WarpWideNoOffset)
+{
+    // size + 1 blocks are launched per size, one per value of blockIdx.x
+    // from 0 to size
+    const int offset = 0;
+    for (auto size : sizes) {
+        run_test(test_warp_wide_search, offset, size, size + 1);
+    }
+}
+
+TEST_F(Searching, WarpWideOffset)
+{
+    // same sweep as without offset, but the range starts at 142
+    const int offset = 142;
+    for (auto size : sizes) {
+        run_test(test_warp_wide_search, offset, size, size + 1);
+    }
+}
+
+
+}  // namespace
diff --git a/hip/test/components/sorting_kernels.hip.cpp b/hip/test/components/sorting_kernels.hip.cpp
new file mode 100644
index 00000000000..ca30186096c
--- /dev/null
+++ b/hip/test/components/sorting_kernels.hip.cpp
@@ -0,0 +1,146 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "hip/components/sorting.hip.hpp"
+
+
+#include <algorithm>
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+using gko::kernels::hip::bitonic_sort;
+using gko::kernels::hip::config;
+
+
+// number of elements sorted by the block-wide (shared memory) test
+constexpr int num_elements = 2048;
+// number of elements every thread holds in its local array
+constexpr int num_local = 4;
+// number of threads needed so that each element belongs to one thread
+constexpr int num_threads = num_elements / num_local;
+
+
+// Runs bitonic_sort<num_elements, num_local> on `data`, handing it a shared
+// memory buffer with num_elements entries.
+//
+// Each thread copies num_local consecutive values into a local array, calls
+// the sort and writes its local values back to the same positions.
+__global__ void test_sort_shared(gko::int32 *data)
+{
+    gko::int32 local[num_local];
+    __shared__ gko::int32 sh_local[num_elements];
+    // index of the first element handled by this thread
+    const auto base = threadIdx.x * num_local;
+    for (int k = 0; k < num_local; ++k) {
+        local[k] = data[base + k];
+    }
+    bitonic_sort<num_elements, num_local>(local, sh_local);
+    for (int k = 0; k < num_local; ++k) {
+        data[base + k] = local[k];
+    }
+}
+
+
+// Runs bitonic_sort<config::warp_size * num_local, num_local> on `data`.
+// No shared memory buffer is provided for this size: a null pointer is
+// passed in its place.
+//
+// Each thread copies num_local consecutive values into a local array, calls
+// the sort and writes its local values back to the same positions.
+__global__ void test_sort_warp(gko::int32 *data)
+{
+    gko::int32 local[num_local];
+    // index of the first element handled by this thread
+    const auto base = threadIdx.x * num_local;
+    for (int k = 0; k < num_local; ++k) {
+        local[k] = data[base + k];
+    }
+    bitonic_sort<config::warp_size * num_local, num_local>(
+        local, static_cast<gko::int32 *>(nullptr));
+    for (int k = 0; k < num_local; ++k) {
+        data[base + k] = local[k];
+    }
+}
+
+
+// Fixture for the bitonic sort kernel tests.
+//
+// The constructor fills `ref_shared` with num_elements pseudo-random values
+// (fixed seed, so every run uses the same data), copies the unsorted values
+// to the device array `ddata` and then prepares two host references:
+//  - ref_shared: all num_elements values sorted,
+//  - ref_warp:   only the first config::warp_size * num_local values sorted,
+//                the remaining values stay in their original order.
+class Sorting : public ::testing::Test {
+protected:
+    Sorting()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          rng(123456),
+          ref_shared(ref, num_elements),
+          ref_warp(ref),
+          ddata(hip)
+    {
+        // we want some duplicate elements, so the value range is only half
+        // as large as the number of elements
+        std::uniform_int_distribution<gko::int32> dist(0, num_elements / 2);
+        std::generate_n(ref_shared.get_data(), num_elements,
+                        [&] { return dist(rng); });
+        // the device copy and the warp reference must be taken before the
+        // host data gets sorted
+        ddata = gko::Array<gko::int32>{hip, ref_shared};
+        ref_warp = ref_shared;
+        std::sort(ref_shared.get_data(), ref_shared.get_data() + num_elements);
+        std::sort(ref_warp.get_data(),
+                  ref_warp.get_data() + (config::warp_size * num_local));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    std::default_random_engine rng;
+    // fully sorted reference data
+    gko::Array<gko::int32> ref_shared;
+    // reference data where only the first warp-sized chunk is sorted
+    gko::Array<gko::int32> ref_warp;
+    // unsorted input data on the device
+    gko::Array<gko::int32> ddata;
+};
+
+
+// Sorts the first config::warp_size * num_local elements on the device with
+// test_sort_warp (one block of config::warp_size threads) and compares them
+// with the host reference.
+TEST_F(Sorting, HipBitonicSortWarp)
+{
+    const auto num_sorted = num_local * config::warp_size;
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_warp), dim3(1),
+                       dim3(config::warp_size), 0, 0, ddata.get_data());
+    // move the result to the reference executor so it can be read here
+    ddata.set_executor(ref);
+    const auto sorted_begin = ddata.get_const_data();
+    const auto expected_begin = ref_warp.get_const_data();
+
+    ASSERT_TRUE(
+        std::equal(sorted_begin, sorted_begin + num_sorted, expected_begin));
+}
+
+
+// Sorts all num_elements elements on the device with test_sort_shared (one
+// block of num_threads threads) and compares them with the host reference.
+TEST_F(Sorting, HipBitonicSortShared)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_shared), dim3(1),
+                       dim3(num_threads), 0, 0, ddata.get_data());
+    // move the result to the reference executor so it can be read here
+    ddata.set_executor(ref);
+    const auto sorted_begin = ddata.get_const_data();
+    const auto expected_begin = ref_shared.get_const_data();
+
+    ASSERT_TRUE(
+        std::equal(sorted_begin, sorted_begin + num_elements, expected_begin));
+}
+
+
+}  // namespace
diff --git a/hip/test/factorization/CMakeLists.txt b/hip/test/factorization/CMakeLists.txt
new file mode 100644
index 00000000000..da6c40ca680
--- /dev/null
+++ b/hip/test/factorization/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Register the HIP factorization tests.
+# ilu_kernels is the only plain `.cpp` test source in this directory and is
+# registered through the special-linkage helper; the other tests are
+# `.hip.cpp` sources and use the regular HIP test helper.
+ginkgo_create_hip_test_special_linkage(ilu_kernels)
+ginkgo_create_hip_test(par_ict_kernels)
+ginkgo_create_hip_test(par_ilu_kernels)
+ginkgo_create_hip_test(par_ilut_kernels)
diff --git a/hip/test/factorization/ilu_kernels.cpp b/hip/test/factorization/ilu_kernels.cpp
new file mode 100644
index 00000000000..b0bffcdd430
--- /dev/null
+++ b/hip/test/factorization/ilu_kernels.cpp
@@ -0,0 +1,121 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/ilu.hpp>
+
+
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/par_ilu.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+// Fixture providing the ani4 test matrix in CSR format, once on the
+// reference executor (csr_ref) and once on the HIP executor (csr_hip).
+class Ilu : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    std::shared_ptr<Csr> csr_ref;
+    std::shared_ptr<Csr> csr_hip;
+
+    Ilu()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref))
+    {}
+
+    // Reads the matrix from its file and copies it to the device. The test
+    // fails right away if the file cannot be opened.
+    void SetUp() override
+    {
+        const std::string file_name(gko::matrices::location_ani4_mtx);
+        std::ifstream input_file(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        csr_ref = gko::read<Csr>(input_file, ref);
+        csr_hip = Csr::create(hip);
+        csr_hip->copy_from(gko::lend(csr_ref));
+    }
+};
+
+
+// Compares the factors of an Ilu factorization generated on the HIP
+// executor with the factors of a ParIlu factorization generated on the
+// reference executor: the values have to agree up to 1e-14 and the sparsity
+// patterns have to be identical.
+TEST_F(Ilu, ComputeILUIsEquivalentToRef)
+{
+    auto ref_factory = gko::factorization::ParIlu<>::build().on(ref);
+    auto hip_factory = gko::factorization::Ilu<>::build().on(hip);
+
+    auto ref_fact = ref_factory->generate(csr_ref);
+    auto hip_fact = hip_factory->generate(csr_hip);
+
+    GKO_ASSERT_MTX_NEAR(ref_fact->get_l_factor(), hip_fact->get_l_factor(),
+                        1e-14);
+    GKO_ASSERT_MTX_NEAR(ref_fact->get_u_factor(), hip_fact->get_u_factor(),
+                        1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_l_factor(),
+                               hip_fact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_u_factor(),
+                               hip_fact->get_u_factor());
+}
+
+
+// Checks that the strategies passed to the factory end up in the generated
+// factors: merge_path for L and load_balance for U.
+TEST_F(Ilu, SetsCorrectStrategy)
+{
+    auto l_strategy = std::make_shared<Csr::merge_path>();
+    auto u_strategy = std::make_shared<Csr::load_balance>(hip);
+
+    auto hip_fact = gko::factorization::Ilu<>::build()
+                        .with_l_strategy(l_strategy)
+                        .with_u_strategy(u_strategy)
+                        .on(hip)
+                        ->generate(csr_hip);
+
+    ASSERT_EQ(hip_fact->get_l_factor()->get_strategy()->get_name(),
+              "merge_path");
+    ASSERT_EQ(hip_fact->get_u_factor()->get_strategy()->get_name(),
+              "load_balance");
+}
+
+
+}  // namespace
diff --git a/hip/test/factorization/par_ict_kernels.hip.cpp b/hip/test/factorization/par_ict_kernels.hip.cpp
new file mode 100644
index 00000000000..b8858dadaa4
--- /dev/null
+++ b/hip/test/factorization/par_ict_kernels.hip.cpp
@@ -0,0 +1,177 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+// Fixture for the ParICT kernel tests.
+//
+// It provides two sets of matrices, each on the reference executor and
+// mirrored on the HIP executor (prefix `d`):
+//  - mtx / mtx_l:         a random 500 x 500 matrix and a random lower
+//                         triangular matrix, generated in the constructor,
+//  - mtx_ani / mtx_l_ani: the ani4 matrix read from file and a lower
+//                         triangular matrix initialized from it in SetUp().
+class ParIct : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    // The initializer list follows the declaration order of the members,
+    // which is the order the members are initialized in anyway.
+    ParIct()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          mtx_size(500, 500),
+          rand_engine(6780)
+    {
+        mtx = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<>(10, mtx_size[1]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+
+        // the ani matrices are only created here, they get filled in SetUp()
+        dmtx_ani = Csr::create(hip);
+        dmtx_l_ani = Csr::create(hip);
+        dmtx = Csr::create(hip);
+        dmtx->copy_from(lend(mtx));
+        dmtx_l = Csr::create(hip);
+        dmtx_l->copy_from(lend(mtx_l));
+    }
+
+    // Reads the ani4 matrix, sorts it by column index, builds mtx_l_ani
+    // from it with the reference kernels and copies both matrices to the
+    // device. The test fails right away if the file cannot be opened.
+    void SetUp() override
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        mtx_ani = gko::read<Csr>(input_file, ref);
+        mtx_ani->sort_by_column_index();
+
+        {
+            mtx_l_ani = Csr::create(ref, mtx_ani->get_size());
+            gko::matrix::CsrBuilder<value_type, index_type> l_builder(
+                lend(mtx_l_ani));
+            gko::kernels::reference::factorization::initialize_row_ptrs_l(
+                ref, lend(mtx_ani), mtx_l_ani->get_row_ptrs());
+            // the last row pointer is the number of stored elements of L
+            auto l_nnz =
+                mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]];
+            l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+            l_builder.get_value_array().resize_and_reset(l_nnz);
+            gko::kernels::reference::factorization::initialize_l(
+                ref, lend(mtx_ani), lend(mtx_l_ani), true);
+        }
+        dmtx_ani->copy_from(lend(mtx_ani));
+        dmtx_l_ani->copy_from(lend(mtx_l_ani));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+
+    const gko::dim<2> mtx_size;
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx;
+    std::unique_ptr<Csr> mtx_ani;
+    std::unique_ptr<Csr> mtx_l_ani;
+    std::unique_ptr<Csr> mtx_l;
+
+    std::unique_ptr<Csr> dmtx;
+    std::unique_ptr<Csr> dmtx_ani;
+    std::unique_ptr<Csr> dmtx_l_ani;
+    std::unique_ptr<Csr> dmtx_l;
+};
+
+
+// Runs add_candidates on the reference and the HIP executor with the same
+// inputs (L * L^T, the system matrix and L) and expects results with
+// identical sparsity pattern and values that agree up to 1e-14.
+TEST_F(ParIct, KernelAddCandidatesIsEquivalentToRef)
+{
+    // compute L * L^T on the host and mirror it on the device
+    auto mtx_lt = mtx_l->transpose();
+    auto mtx_llt = Csr::create(ref, mtx_size);
+    mtx_l->apply(lend(mtx_lt), lend(mtx_llt));
+    auto dmtx_llt = Csr::create(hip, mtx_size);
+    dmtx_llt->copy_from(lend(mtx_llt));
+
+    // output matrices of the two kernels
+    auto res_mtx_l = Csr::create(ref, mtx_size);
+    auto dres_mtx_l = Csr::create(hip, mtx_size);
+
+    gko::kernels::reference::par_ict_factorization::add_candidates(
+        ref, lend(mtx_llt), lend(mtx), lend(mtx_l), lend(res_mtx_l));
+    gko::kernels::hip::par_ict_factorization::add_candidates(
+        hip, lend(dmtx_llt), lend(dmtx), lend(dmtx_l), lend(dres_mtx_l));
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14);
+}
+
+
+// Compares compute_factor on the reference and the HIP executor. The
+// reference kernel is called once, the HIP kernel 20 times in a row, and
+// the resulting factors only have to agree up to 1e-2.
+TEST_F(ParIct, KernelComputeFactorIsEquivalentToRef)
+{
+    constexpr int num_hip_calls = 20;
+    const auto size = mtx_ani->get_size();
+    // COO copies of L, passed to the kernels as additional argument
+    auto mtx_l_coo = Coo::create(ref, size);
+    mtx_l_ani->convert_to(lend(mtx_l_coo));
+    auto dmtx_l_coo = Coo::create(hip, size);
+    dmtx_l_coo->copy_from(lend(mtx_l_coo));
+
+    gko::kernels::reference::par_ict_factorization::compute_factor(
+        ref, lend(mtx_ani), lend(mtx_l_ani), lend(mtx_l_coo));
+    for (int call = 0; call < num_hip_calls; ++call) {
+        gko::kernels::hip::par_ict_factorization::compute_factor(
+            hip, lend(dmtx_ani), lend(dmtx_l_ani), lend(dmtx_l_coo));
+    }
+
+    GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2);
+}
+
+
+}  // namespace
diff --git a/hip/test/factorization/par_ilu_kernels.hip.cpp b/hip/test/factorization/par_ilu_kernels.hip.cpp
new file mode 100644
index 00000000000..96dffed19e1
--- /dev/null
+++ b/hip/test/factorization/par_ilu_kernels.hip.cpp
@@ -0,0 +1,349 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilu_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+// Fixture for the ParILU kernel tests.
+//
+// SetUp() provides the ani4 matrix with added diagonal elements on the
+// reference executor (csr_ref) and on the HIP executor (csr_hip). The helper
+// functions run the same kernel sequence on both executors so that the
+// tests only have to compare the results.
+class ParIlu : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Dense = gko::matrix::Dense<value_type>;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    std::ranlux48 rand_engine;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    std::shared_ptr<const Csr> csr_ref;
+    std::shared_ptr<const Csr> csr_hip;
+
+    ParIlu()
+        : rand_engine(19),
+          ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          csr_ref(nullptr),
+          csr_hip(nullptr)
+    {}
+
+    // Reads the ani4 matrix, copies it to the device and adds the diagonal
+    // elements on each executor with the respective kernel. The test fails
+    // right away if the file cannot be opened.
+    void SetUp() override
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        auto csr_ref_temp = gko::read<Csr>(input_file, ref);
+        auto csr_hip_temp = Csr::create(hip);
+        csr_hip_temp->copy_from(gko::lend(csr_ref_temp));
+        // Make sure there are diagonal elements present
+        gko::kernels::reference::factorization::add_diagonal_elements(
+            ref, gko::lend(csr_ref_temp), false);
+        gko::kernels::hip::factorization::add_diagonal_elements(
+            hip, gko::lend(csr_hip_temp), false);
+        csr_ref = gko::give(csr_ref_temp);
+        csr_hip = gko::give(csr_hip_temp);
+    }
+
+    // Generates a random num_rows x num_cols matrix of type Mtx on the
+    // reference executor, using the fixture's random engine.
+    template <typename Mtx>
+    std::unique_ptr<Mtx> gen_mtx(index_type num_rows, index_type num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<index_type>(0, num_cols - 1),
+            std::normal_distribution<value_type>(0.0, 1.0), rand_engine, ref);
+    }
+
+    // Generates a random CSR matrix with gen_mtx and then swaps randomly
+    // chosen entries (value together with column index) within every row,
+    // so the entries of a row are in general no longer sorted by column.
+    std::unique_ptr<Csr> gen_unsorted_mtx(index_type num_rows,
+                                          index_type num_cols)
+    {
+        using std::swap;
+        auto mtx = gen_mtx<Csr>(num_rows, num_cols);
+        auto values = mtx->get_values();
+        auto col_idxs = mtx->get_col_idxs();
+        const auto row_ptrs = mtx->get_const_row_ptrs();
+        for (int row = 0; row < num_rows; ++row) {
+            const auto row_start = row_ptrs[row];
+            const auto row_end = row_ptrs[row + 1];
+            const int num_row_elements = row_end - row_start;
+            // Rows with fewer than two entries cannot be shuffled (the swap
+            // loop below would not run for them). Skip them before the
+            // distribution is built: for an empty row its range
+            // [row_start, row_end - 1] would be invalid, and the
+            // distribution requires lower bound <= upper bound.
+            if (num_row_elements < 2) {
+                continue;
+            }
+            auto idx_dist = std::uniform_int_distribution<index_type>(
+                row_start, row_end - 1);
+            for (int i = 0; i < num_row_elements / 2; ++i) {
+                auto idx1 = idx_dist(rand_engine);
+                auto idx2 = idx_dist(rand_engine);
+                if (idx1 != idx2) {
+                    swap(values[idx1], values[idx2]);
+                    swap(col_idxs[idx1], col_idxs[idx2]);
+                }
+            }
+        }
+        return mtx;
+    }
+
+    // Calls initialize_row_ptrs_l_u for csr_ref on the reference executor
+    // and for csr_hip on the HIP executor. The `*_ref` pointers are passed
+    // to the reference kernel, the `*_hip` pointers to the HIP kernel.
+    void initialize_row_ptrs(index_type *l_row_ptrs_ref,
+                             index_type *u_row_ptrs_ref,
+                             index_type *l_row_ptrs_hip,
+                             index_type *u_row_ptrs_hip)
+    {
+        gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
+            ref, gko::lend(csr_ref), l_row_ptrs_ref, u_row_ptrs_ref);
+        gko::kernels::hip::factorization::initialize_row_ptrs_l_u(
+            hip, gko::lend(csr_hip), l_row_ptrs_hip, u_row_ptrs_hip);
+    }
+
+    // Creates the L and U matrices on both executors and fills them with
+    // initialize_l_u. The four output parameters are overwritten with the
+    // newly created matrices.
+    void initialize_lu(std::unique_ptr<Csr> *l_ref, std::unique_ptr<Csr> *u_ref,
+                       std::unique_ptr<Csr> *l_hip, std::unique_ptr<Csr> *u_hip)
+    {
+        auto num_row_ptrs = csr_ref->get_size()[0] + 1;
+        gko::Array<index_type> l_row_ptrs_ref{ref, num_row_ptrs};
+        gko::Array<index_type> u_row_ptrs_ref{ref, num_row_ptrs};
+        gko::Array<index_type> l_row_ptrs_hip{hip, num_row_ptrs};
+        gko::Array<index_type> u_row_ptrs_hip{hip, num_row_ptrs};
+
+        initialize_row_ptrs(
+            l_row_ptrs_ref.get_data(), u_row_ptrs_ref.get_data(),
+            l_row_ptrs_hip.get_data(), u_row_ptrs_hip.get_data());
+        // Since `initialize_row_ptrs` was already tested, it is expected that
+        // `*_ref` and `*_hip` contain identical values
+        // (the last row pointer is the number of stored elements)
+        auto l_nnz = l_row_ptrs_ref.get_const_data()[num_row_ptrs - 1];
+        auto u_nnz = u_row_ptrs_ref.get_const_data()[num_row_ptrs - 1];
+
+        *l_ref = Csr::create(ref, csr_ref->get_size(), l_nnz);
+        *u_ref = Csr::create(ref, csr_ref->get_size(), u_nnz);
+        *l_hip = Csr::create(hip, csr_hip->get_size(), l_nnz);
+        *u_hip = Csr::create(hip, csr_hip->get_size(), u_nnz);
+        // Copy the already initialized `row_ptrs` to the new matrices
+        ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(),
+                  (*l_ref)->get_row_ptrs());
+        ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(),
+                  (*u_ref)->get_row_ptrs());
+        hip->copy(num_row_ptrs, l_row_ptrs_hip.get_data(),
+                  (*l_hip)->get_row_ptrs());
+        hip->copy(num_row_ptrs, u_row_ptrs_hip.get_data(),
+                  (*u_hip)->get_row_ptrs());
+
+        gko::kernels::reference::factorization::initialize_l_u(
+            ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref));
+        gko::kernels::hip::factorization::initialize_l_u(
+            hip, gko::lend(csr_hip), gko::lend(*l_hip), gko::lend(*u_hip));
+    }
+
+    // Transfers ownership from `from` to a unique_ptr<ToType> by means of a
+    // static_cast. No run-time check is performed, so the caller has to
+    // make sure that the object really is a ToType.
+    template <typename ToType, typename FromType>
+    static std::unique_ptr<ToType> static_unique_ptr_cast(
+        std::unique_ptr<FromType> &&from)
+    {
+        return std::unique_ptr<ToType>{static_cast<ToType *>(from.release())};
+    }
+
+    // Initializes L and U with initialize_lu and then runs
+    // compute_l_u_factors on both executors. The kernels receive the system
+    // matrix in COO format and the transposed U factor; afterwards the
+    // transposed result is transposed back and stored in *u_ref / *u_hip.
+    // `iterations` is forwarded to the kernels unchanged.
+    // NOTE(review): 0 presumably selects the kernels' default number of
+    // iterations - confirm against the kernel documentation.
+    void compute_lu(std::unique_ptr<Csr> *l_ref, std::unique_ptr<Csr> *u_ref,
+                    std::unique_ptr<Csr> *l_hip, std::unique_ptr<Csr> *u_hip,
+                    gko::size_type iterations = 0)
+    {
+        auto coo_ref = Coo::create(ref);
+        csr_ref->convert_to(gko::lend(coo_ref));
+        auto coo_hip = Coo::create(hip);
+        csr_hip->convert_to(gko::lend(coo_hip));
+        initialize_lu(l_ref, u_ref, l_hip, u_hip);
+        // transpose() returns a LinOp, cast it back to the Csr type
+        auto u_transpose_lin_op_ref = (*u_ref)->transpose();
+        auto u_transpose_csr_ref =
+            static_unique_ptr_cast<Csr>(std::move(u_transpose_lin_op_ref));
+        auto u_transpose_lin_op_hip = (*u_hip)->transpose();
+        auto u_transpose_csr_hip =
+            static_unique_ptr_cast<Csr>(std::move(u_transpose_lin_op_hip));
+
+        gko::kernels::reference::par_ilu_factorization::compute_l_u_factors(
+            ref, iterations, gko::lend(coo_ref), gko::lend(*l_ref),
+            gko::lend(u_transpose_csr_ref));
+        gko::kernels::hip::par_ilu_factorization::compute_l_u_factors(
+            hip, iterations, gko::lend(coo_hip), gko::lend(*l_hip),
+            gko::lend(u_transpose_csr_hip));
+        auto u_lin_op_ref = u_transpose_csr_ref->transpose();
+        *u_ref = static_unique_ptr_cast<Csr>(std::move(u_lin_op_ref));
+        auto u_lin_op_hip = u_transpose_csr_hip->transpose();
+        *u_hip = static_unique_ptr_cast<Csr>(std::move(u_lin_op_hip));
+    }
+};
+
+
+// add_diagonal_elements, called with `true` as last argument on a random
+// square matrix from gen_mtx, has to produce exactly the same matrix on
+// both executors, and the reference result has to be sorted by column.
+TEST_F(ParIlu, HipKernelAddDiagonalElementsSortedEquivalentToRef)
+{
+    const index_type num_rows = 600;
+    const index_type num_cols = 600;
+    auto mtx_ref = gen_mtx<Csr>(num_rows, num_cols);
+    // device copy of the unmodified input
+    auto mtx_hip = Csr::create(hip);
+    mtx_hip->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        ref, gko::lend(mtx_ref), true);
+    gko::kernels::hip::factorization::add_diagonal_elements(
+        hip, gko::lend(mtx_hip), true);
+    hip->synchronize();
+
+    ASSERT_TRUE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_hip, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_hip);
+}
+
+
+// add_diagonal_elements, called with `false` as last argument on a random
+// matrix from gen_unsorted_mtx, has to produce exactly the same matrix on
+// both executors, and the reference result has to stay unsorted.
+TEST_F(ParIlu, HipKernelAddDiagonalElementsUnsortedEquivalentToRef)
+{
+    const index_type num_rows = 600;
+    const index_type num_cols = 600;
+    auto mtx_ref = gen_unsorted_mtx(num_rows, num_cols);
+    // device copy of the unmodified input
+    auto mtx_hip = Csr::create(hip);
+    mtx_hip->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        ref, gko::lend(mtx_ref), false);
+    gko::kernels::hip::factorization::add_diagonal_elements(
+        hip, gko::lend(mtx_hip), false);
+    hip->synchronize();
+
+    ASSERT_FALSE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_hip, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_hip);
+}
+
+
+// Same check as for the sorted square case, but with a non-square
+// 600 x 500 input matrix.
+TEST_F(ParIlu, HipKernelAddDiagonalElementsNonSquareEquivalentToRef)
+{
+    const index_type num_rows = 600;
+    const index_type num_cols = 500;
+    auto mtx_ref = gen_mtx<Csr>(num_rows, num_cols);
+    // device copy of the unmodified input
+    auto mtx_hip = Csr::create(hip);
+    mtx_hip->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        ref, gko::lend(mtx_ref), true);
+    gko::kernels::hip::factorization::add_diagonal_elements(
+        hip, gko::lend(mtx_hip), true);
+    hip->synchronize();
+
+    ASSERT_TRUE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_hip, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_hip);
+}
+
+
+// The row pointers of L and U computed on the HIP executor have to be
+// equal to the ones computed on the reference executor.
+TEST_F(ParIlu, KernelInitializeRowPtrsLUEquivalentToRef)
+{
+    // one row pointer per row plus the final end pointer
+    const auto num_row_ptrs = csr_ref->get_size()[0] + 1;
+    gko::Array<index_type> l_row_ptrs_array_ref{ref, num_row_ptrs};
+    gko::Array<index_type> u_row_ptrs_array_ref{ref, num_row_ptrs};
+    gko::Array<index_type> l_row_ptrs_array_hip{hip, num_row_ptrs};
+    gko::Array<index_type> u_row_ptrs_array_hip{hip, num_row_ptrs};
+
+    initialize_row_ptrs(
+        l_row_ptrs_array_ref.get_data(), u_row_ptrs_array_ref.get_data(),
+        l_row_ptrs_array_hip.get_data(), u_row_ptrs_array_hip.get_data());
+
+    GKO_ASSERT_ARRAY_EQ(l_row_ptrs_array_ref, l_row_ptrs_array_hip);
+    GKO_ASSERT_ARRAY_EQ(u_row_ptrs_array_ref, u_row_ptrs_array_hip);
+}
+
+
+// The initial L and U factors created by initialize_lu have to agree on
+// both executors: values up to 1e-14, sparsity patterns exactly.
+TEST_F(ParIlu, KernelInitializeParILUIsEquivalentToRef)
+{
+    // outputs of initialize_lu, empty until the call below
+    std::unique_ptr<Csr> l_ref;
+    std::unique_ptr<Csr> u_ref;
+    std::unique_ptr<Csr> l_hip;
+    std::unique_ptr<Csr> u_hip;
+
+    initialize_lu(&l_ref, &u_ref, &l_hip, &u_hip);
+
+    GKO_ASSERT_MTX_NEAR(l_ref, l_hip, 1e-14);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_hip, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_hip);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_hip);
+}
+
+
+// The factors computed by compute_lu with its default iteration argument
+// have to agree on both executors: values up to 5e-2, sparsity patterns
+// exactly.
+TEST_F(ParIlu, KernelComputeParILUIsEquivalentToRef)
+{
+    // outputs of compute_lu, empty until the call below
+    std::unique_ptr<Csr> l_ref;
+    std::unique_ptr<Csr> u_ref;
+    std::unique_ptr<Csr> l_hip;
+    std::unique_ptr<Csr> u_hip;
+
+    compute_lu(&l_ref, &u_ref, &l_hip, &u_hip);
+
+    GKO_ASSERT_MTX_NEAR(l_ref, l_hip, 5e-2);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_hip, 5e-2);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_hip);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_hip);
+}
+
+
+// With 200 iterations the factors computed on both executors have to agree
+// much more closely than in the default case: values up to 1e-14, sparsity
+// patterns exactly.
+TEST_F(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef)
+{
+    const gko::size_type iterations = 200;
+    // outputs of compute_lu, empty until the call below
+    std::unique_ptr<Csr> l_ref;
+    std::unique_ptr<Csr> u_ref;
+    std::unique_ptr<Csr> l_hip;
+    std::unique_ptr<Csr> u_hip;
+
+    compute_lu(&l_ref, &u_ref, &l_hip, &u_hip, iterations);
+
+    GKO_ASSERT_MTX_NEAR(l_ref, l_hip, 1e-14);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_hip, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_hip);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_hip);
+}
+
+
+}  // namespace
diff --git a/hip/test/factorization/par_ilut_kernels.hip.cpp b/hip/test/factorization/par_ilut_kernels.hip.cpp
new file mode 100644
index 00000000000..38fb5eb205f
--- /dev/null
+++ b/hip/test/factorization/par_ilut_kernels.hip.cpp
@@ -0,0 +1,547 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/factorization/par_ilu_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+// Fixture holding host (reference) matrices and their HIP copies.
+class ParIlut : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Dense = gko::matrix::Dense<value_type>;
+    using ComplexDense = gko::matrix::Dense<std::complex<value_type>>;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using ComplexCsr = gko::matrix::Csr<std::complex<value_type>, index_type>;
+
+    ParIlut()
+        : ref(gko::ReferenceExecutor::create()),
+          hip(gko::HipExecutor::create(0, ref)),
+          mtx_size(500, 700),  // square matrices use mtx_size[0] only
+          rand_engine(1337)    // fixed seed
+    {
+        mtx1 = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<>(10, mtx_size[1]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx2 = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<>(0, mtx_size[1]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_square = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[0],
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l2 = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], true,
+            std::uniform_int_distribution<>(1, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_l_complex =
+            gko::test::generate_random_lower_triangular_matrix<ComplexCsr>(
+                mtx_size[0], mtx_size[0], false,
+                std::uniform_int_distribution<>(10, mtx_size[0]),
+                std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_u = gko::test::generate_random_upper_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<>(10, mtx_size[0]),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+        mtx_u_complex =
+            gko::test::generate_random_upper_triangular_matrix<ComplexCsr>(
+                mtx_size[0], mtx_size[0], false,
+                std::uniform_int_distribution<>(10, mtx_size[0]),
+                std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+
+        // Device-side counterparts; the *_ani handles are filled in SetUp().
+        dmtx1 = Csr::create(hip);
+        dmtx1->copy_from(mtx1.get());
+        dmtx2 = Csr::create(hip);
+        dmtx2->copy_from(mtx2.get());
+        dmtx_square = Csr::create(hip);
+        dmtx_square->copy_from(mtx_square.get());
+        dmtx_ani = Csr::create(hip);
+        dmtx_l_ani = Csr::create(hip);
+        dmtx_u_ani = Csr::create(hip);
+        dmtx_ut_ani = Csr::create(hip);
+        dmtx_l = Csr::create(hip);
+        dmtx_l->copy_from(mtx_l.get());
+        dmtx_l2 = Csr::create(hip);
+        dmtx_l2->copy_from(mtx_l2.get());
+        dmtx_u = Csr::create(hip);
+        dmtx_u->copy_from(mtx_u.get());
+        dmtx_l_complex = ComplexCsr::create(hip);
+        dmtx_l_complex->copy_from(mtx_l_complex.get());
+        dmtx_u_complex = ComplexCsr::create(hip);
+        dmtx_u_complex->copy_from(mtx_u_complex.get());
+    }
+
+    void SetUp() override
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        mtx_ani = gko::read<Csr>(input_file, ref);
+        mtx_ani->sort_by_column_index();
+
+        {
+            // Split mtx_ani into L and U: row pointers first, then storage.
+            mtx_l_ani = Csr::create(ref, mtx_ani->get_size());
+            mtx_u_ani = Csr::create(ref, mtx_ani->get_size());
+            gko::matrix::CsrBuilder<value_type, index_type> l_builder(
+                mtx_l_ani.get());
+            gko::matrix::CsrBuilder<value_type, index_type> u_builder(
+                mtx_u_ani.get());
+            gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
+                ref, mtx_ani.get(), mtx_l_ani->get_row_ptrs(),
+                mtx_u_ani->get_row_ptrs());
+            const auto num_rows = mtx_ani->get_size()[0];
+            auto l_nnz = mtx_l_ani->get_const_row_ptrs()[num_rows];
+            auto u_nnz = mtx_u_ani->get_const_row_ptrs()[num_rows];
+            l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+            l_builder.get_value_array().resize_and_reset(l_nnz);
+            u_builder.get_col_idx_array().resize_and_reset(u_nnz);
+            u_builder.get_value_array().resize_and_reset(u_nnz);
+            gko::kernels::reference::factorization::initialize_l_u(
+                ref, mtx_ani.get(), mtx_l_ani.get(), mtx_u_ani.get());
+            mtx_ut_ani = Csr::create(ref, mtx_ani->get_size(),
+                                     mtx_u_ani->get_num_stored_elements());
+            gko::kernels::reference::csr::transpose(ref, mtx_u_ani.get(),
+                                                    mtx_ut_ani.get());
+        }
+        dmtx_ani->copy_from(mtx_ani.get());
+        dmtx_l_ani->copy_from(mtx_l_ani.get());
+        dmtx_u_ani->copy_from(mtx_u_ani.get());
+        dmtx_ut_ani->copy_from(mtx_ut_ani.get());
+    }
+
+    template <typename Mtx>
+    void test_select(const std::unique_ptr<Mtx> &mtx,
+                     const std::unique_ptr<Mtx> &dmtx, index_type rank,
+                     value_type tolerance = 0.0)
+    {
+        using ValueType = typename Mtx::value_type;
+
+        gko::remove_complex<ValueType> res{};
+        gko::remove_complex<ValueType> dres{};
+        gko::Array<ValueType> tmp(ref);
+        gko::Array<gko::remove_complex<ValueType>> tmp2(ref);
+        gko::Array<ValueType> dtmp(hip);
+        gko::Array<gko::remove_complex<ValueType>> dtmp2(hip);
+
+        gko::kernels::reference::par_ilut_factorization::threshold_select(
+            ref, mtx.get(), rank, tmp, tmp2, res);
+        gko::kernels::hip::par_ilut_factorization::threshold_select(
+            hip, dmtx.get(), rank, dtmp, dtmp2, dres);
+
+        ASSERT_NEAR(res, dres, tolerance);
+    }
+
+    template <typename Mtx,
+              typename Coo = gko::matrix::Coo<typename Mtx::value_type,
+                                              typename Mtx::index_type>>
+    void test_filter(const std::unique_ptr<Mtx> &mtx,
+                     const std::unique_ptr<Mtx> &dmtx, value_type threshold,
+                     bool lower)
+    {
+        auto res = Mtx::create(ref, mtx_size);
+        auto dres = Mtx::create(hip, mtx_size);
+        auto res_coo = Coo::create(ref, mtx_size);
+        auto dres_coo = Coo::create(hip, mtx_size);
+        auto local_mtx = gko::as<Mtx>(lower ? mtx->clone() : mtx->transpose());
+        auto local_dmtx =
+            gko::as<Mtx>(lower ? dmtx->clone() : dmtx->transpose());
+
+        gko::kernels::reference::par_ilut_factorization::threshold_filter(
+            ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower);
+        gko::kernels::hip::par_ilut_factorization::threshold_filter(
+            hip, local_dmtx.get(), threshold, dres.get(), dres_coo.get(),
+            lower);
+
+        GKO_ASSERT_MTX_NEAR(res, dres, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+        GKO_ASSERT_MTX_NEAR(res, res_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo);
+        GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo);
+    }
+
+    template <typename Mtx,
+              typename Coo = gko::matrix::Coo<typename Mtx::value_type,
+                                              typename Mtx::index_type>>
+    void test_filter_approx(const std::unique_ptr<Mtx> &mtx,
+                            const std::unique_ptr<Mtx> &dmtx, index_type rank,
+                            value_type tolerance = 0.0)
+    {
+        auto res = Mtx::create(ref, mtx_size);
+        auto dres = Mtx::create(hip, mtx_size);
+        auto res_coo = Coo::create(ref, mtx_size);
+        auto dres_coo = Coo::create(hip, mtx_size);
+        using ValueType = typename Mtx::value_type;
+
+        gko::Array<ValueType> tmp(ref);
+        gko::Array<ValueType> dtmp(hip);
+        gko::remove_complex<ValueType> threshold{};
+        gko::remove_complex<ValueType> dthreshold{};
+
+        gko::kernels::reference::par_ilut_factorization::
+            threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold,
+                                    res.get(), res_coo.get());
+        gko::kernels::hip::par_ilut_factorization::threshold_filter_approx(
+            hip, dmtx.get(), rank, dtmp, dthreshold, dres.get(),
+            dres_coo.get());
+
+        GKO_ASSERT_MTX_NEAR(res, dres, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+        GKO_ASSERT_MTX_NEAR(res, res_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo);
+        GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo);
+        ASSERT_NEAR(threshold, dthreshold, tolerance);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+
+    const gko::dim<2> mtx_size;
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx1;
+    std::unique_ptr<Csr> mtx2;
+    std::unique_ptr<Csr> mtx_square;
+    std::unique_ptr<Csr> mtx_ani;
+    std::unique_ptr<Csr> mtx_l_ani;
+    std::unique_ptr<Csr> mtx_u_ani;
+    std::unique_ptr<Csr> mtx_ut_ani;
+    std::unique_ptr<Csr> mtx_l;
+    std::unique_ptr<Csr> mtx_l2;
+    std::unique_ptr<ComplexCsr> mtx_l_complex;
+    std::unique_ptr<Csr> mtx_u;
+    std::unique_ptr<ComplexCsr> mtx_u_complex;
+
+    std::unique_ptr<Csr> dmtx1;
+    std::unique_ptr<Csr> dmtx2;
+    std::unique_ptr<Csr> dmtx_square;
+    std::unique_ptr<Csr> dmtx_ani;
+    std::unique_ptr<Csr> dmtx_l_ani;
+    std::unique_ptr<Csr> dmtx_u_ani;
+    std::unique_ptr<Csr> dmtx_ut_ani;
+    std::unique_ptr<Csr> dmtx_l;
+    std::unique_ptr<Csr> dmtx_l2;
+    std::unique_ptr<ComplexCsr> dmtx_l_complex;
+    std::unique_ptr<Csr> dmtx_u;
+    std::unique_ptr<ComplexCsr> dmtx_u_complex;
+};
+
+// threshold_select at rank nnz / 3, rank 0 ("Min") and rank nnz - 1 ("Max").
+TEST_F(ParIlut, KernelThresholdSelectIsEquivalentToRef)
+{
+    test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 3);
+}
+
+
+TEST_F(ParIlut, KernelThresholdSelectMinIsEquivalentToRef)
+{
+    test_select(mtx_l, dmtx_l, 0);  // tolerance defaults to 0.0
+}
+
+
+TEST_F(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef)
+{
+    test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdSelectIsEquivalentToRef)
+{
+    test_select(mtx_l_complex, dmtx_l_complex,
+                mtx_l_complex->get_num_stored_elements() / 3, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdSelectMinIsEquivalentToRef)
+{
+    test_select(mtx_l_complex, dmtx_l_complex, 0, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdSelectMaxLowerIsEquivalentToRef)
+{
+    test_select(mtx_l_complex, dmtx_l_complex,
+                mtx_l_complex->get_num_stored_elements() - 1, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef)
+{
+    Coo *no_coo = nullptr;  // typed null pointer for the COO argument
+    auto host_out = Csr::create(ref, mtx_size);
+    auto dev_out = Csr::create(hip, mtx_size);
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter(
+        ref, mtx_l.get(), 0.5, host_out.get(), no_coo, true);
+    gko::kernels::hip::par_ilut_factorization::threshold_filter(
+        hip, dmtx_l.get(), 0.5, dev_out.get(), no_coo, true);
+
+    GKO_ASSERT_MTX_NEAR(host_out, dev_out, 0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(host_out, dev_out);
+}
+
+// test_filter thresholds: 0.5, 0 ("None") and 1e6 ("All"); last arg is lower.
+TEST_F(ParIlut, KernelThresholdFilterLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0.5, true);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0.5, false);  // false: filters the transpose
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterNoneLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0, true);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterNoneUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 0, false);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterAllLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 1e6, true);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterAllUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l, dmtx_l, 1e6, false);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0.5, true);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterNoneLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0, true);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterAllLowerIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 1e6, true);
+}
+
+// Complex "Upper" (lower = false) cases build only with hipSPARSE >= 1.4.
+#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \
+    ((hipsparseVersionMajor > 1) ||                                     \
+     (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4))
+TEST_F(ParIlut, KernelComplexThresholdFilterUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0.5, false);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterNoneUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 0, false);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterAllUpperIsEquivalentToRef)
+{
+    test_filter(mtx_l_complex, dmtx_l_complex, 1e6, false);
+}
+#endif  // hipsparse version >= 1.4
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
+{
+    // Only threshold_filter_approx runs here; the COO output pointer is null.
+    auto res = Csr::create(ref, mtx_size);
+    auto dres = Csr::create(hip, mtx_size);
+    Coo *null_coo = nullptr;
+    gko::Array<value_type> tmp(ref);
+    gko::Array<value_type> dtmp(hip);
+    gko::remove_complex<value_type> threshold{};
+    gko::remove_complex<value_type> dthreshold{};
+    index_type rank{};  // value-initialized, i.e. rank 0
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter_approx(
+        ref, mtx_l.get(), rank, tmp, threshold, res.get(), null_coo);
+    gko::kernels::hip::par_ilut_factorization::threshold_filter_approx(
+        hip, dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), null_coo);
+
+    GKO_ASSERT_MTX_NEAR(res, dres, 0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+    ASSERT_EQ(threshold, dthreshold);
+}
+
+// threshold_filter_approx at ranks nnz / 2, 0 ("None") and nnz - 1 ("All").
+TEST_F(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 2);
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l, dmtx_l, 0);  // tolerance defaults to 0.0
+}
+
+
+TEST_F(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterApproxLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l_complex, dmtx_l_complex,
+                       mtx_l_complex->get_num_stored_elements() / 2,
+                       r<value_type>::value);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterApproxNoneLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l_complex, dmtx_l_complex, 0, r<value_type>::value);
+}
+
+
+TEST_F(ParIlut, KernelComplexThresholdFilterApproxAllLowerIsEquivalentToRef)
+{
+    test_filter_approx(mtx_l_complex, dmtx_l_complex,
+                       mtx_l_complex->get_num_stored_elements() - 1,
+                       r<value_type>::value);
+}
+
+
+TEST_F(ParIlut, KernelAddCandidatesIsEquivalentToRef)
+{
+    const auto dims = mtx_square->get_size();
+    auto lu = Csr::create(ref, dims);
+    mtx_l2->apply(mtx_u.get(), lu.get());  // lu feeds both kernels below
+    auto dlu = Csr::create(hip, dims);
+    dlu->copy_from(lu.get());
+    auto new_l = Csr::create(ref, dims);
+    auto new_u = Csr::create(ref, dims);
+    auto dnew_l = Csr::create(hip, dims);
+    auto dnew_u = Csr::create(hip, dims);
+
+    gko::kernels::reference::par_ilut_factorization::add_candidates(
+        ref, lu.get(), mtx_square.get(), mtx_l2.get(), mtx_u.get(),
+        new_l.get(), new_u.get());
+    gko::kernels::hip::par_ilut_factorization::add_candidates(
+        hip, dlu.get(), dmtx_square.get(), dmtx_l2.get(), dmtx_u.get(),
+        dnew_l.get(), dnew_u.get());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(new_l, dnew_l);
+    GKO_ASSERT_MTX_EQ_SPARSITY(new_u, dnew_u);
+    GKO_ASSERT_MTX_NEAR(new_l, dnew_l, 1e-14);
+    GKO_ASSERT_MTX_NEAR(new_u, dnew_u, 1e-14);
+}
+
+
+TEST_F(ParIlut, KernelComputeLUIsEquivalentToRef)
+{
+    const auto dims = mtx_ani->get_size();
+    auto l_coo = Coo::create(ref, dims);
+    mtx_l_ani->convert_to(l_coo.get());
+    auto u_coo = Coo::create(ref, dims);
+    mtx_u_ani->convert_to(u_coo.get());
+    auto dl_coo = Coo::create(hip, dims);
+    dl_coo->copy_from(l_coo.get());
+    auto du_coo = Coo::create(hip, dims);
+    du_coo->copy_from(u_coo.get());
+
+    gko::kernels::reference::par_ilut_factorization::compute_l_u_factors(
+        ref, mtx_ani.get(), mtx_l_ani.get(), l_coo.get(), mtx_u_ani.get(),
+        u_coo.get(), mtx_ut_ani.get());
+    for (int sweep = 0; sweep < 20; ++sweep) {  // 20 HIP calls vs. one on ref
+        gko::kernels::hip::par_ilut_factorization::compute_l_u_factors(
+            hip, dmtx_ani.get(), dmtx_l_ani.get(), dl_coo.get(),
+            dmtx_u_ani.get(), du_coo.get(), dmtx_ut_ani.get());
+    }
+    auto dutt = gko::as<Csr>(dmtx_ut_ani->transpose());
+
+    GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2);
+    GKO_ASSERT_MTX_NEAR(mtx_u_ani, dmtx_u_ani, 1e-2);
+    GKO_ASSERT_MTX_NEAR(dmtx_u_ani, dutt, 0);  // U^T stays in sync with U
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/CMakeLists.txt b/hip/test/matrix/CMakeLists.txt
new file mode 100644
index 00000000000..5f8b7251566
--- /dev/null
+++ b/hip/test/matrix/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Registers one HIP test per matrix format via ginkgo_create_hip_test, in
+# this order: coo_kernels, csr_kernels, dense_kernels, ell_kernels,
+# hybrid_kernels, sellp_kernels (names stay greppable through this comment).
+foreach(matrix_format coo csr dense ell hybrid sellp)
+    ginkgo_create_hip_test(${matrix_format}_kernels)
+endforeach()
diff --git a/hip/test/matrix/coo_kernels.hip.cpp b/hip/test/matrix/coo_kernels.hip.cpp
new file mode 100644
index 00000000000..aa0f5373161
--- /dev/null
+++ b/hip/test/matrix/coo_kernels.hip.cpp
@@ -0,0 +1,262 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/coo.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/matrix/coo_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing gko::matrix::Coo on the reference and HIP executors.
+class Coo : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Coo<>;
+    using Vec = gko::matrix::Dense<>;
+
+    Coo() : rand_engine(42) {}
+
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown() override
+    {
+        if (hip != nullptr) {  // unset if SetUp() bailed out early
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    std::unique_ptr<Vec> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Vec>(
+            num_rows, num_cols, std::uniform_int_distribution<>(1, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    void set_up_apply_data(int num_vectors = 1)  // columns of y and expected
+    {
+        mtx = Mtx::create(ref);
+        mtx->copy_from(gen_mtx(532, 231));
+        expected = gen_mtx(532, num_vectors);
+        y = gen_mtx(231, num_vectors);
+        alpha = gko::initialize<Vec>({2.0}, ref);
+        beta = gko::initialize<Vec>({-1.0}, ref);
+        dmtx = Mtx::create(hip);
+        dmtx->copy_from(mtx.get());
+        dresult = Vec::create(hip);
+        dresult->copy_from(expected.get());  // same start values as host
+        dy = Vec::create(hip);
+        dy->copy_from(y.get());
+        dalpha = Vec::create(hip);
+        dalpha->copy_from(alpha.get());
+        dbeta = Vec::create(hip);
+        dbeta->copy_from(beta.get());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec> beta;
+
+    std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec> dbeta;
+};
+
+// Single-column cases: set_up_apply_data() defaults num_vectors to 1.
+TEST_F(Coo, SimpleApplyIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, AdvancedApplyIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, SimpleApplyAddIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    mtx->apply2(y.get(), expected.get());
+    dmtx->apply2(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, AdvancedApplyAddIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    mtx->apply2(alpha.get(), y.get(), expected.get());
+    dmtx->apply2(dalpha.get(), dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+// Multi-column cases: the argument is the column count of y and expected.
+TEST_F(Coo, SimpleApplyToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(3);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, AdvancedApplyToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, SimpleApplyAddToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(3);
+
+    mtx->apply2(y.get(), expected.get());
+    dmtx->apply2(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, SimpleApplyAddToLargeDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(33);
+
+    mtx->apply2(y.get(), expected.get());
+    dmtx->apply2(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, AdvancedApplyAddToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(3);
+
+    mtx->apply2(alpha.get(), y.get(), expected.get());
+    dmtx->apply2(dalpha.get(), dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, AdvancedApplyAddToLargeDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(33);  // 33 columns: the "Large" dense case
+
+    mtx->apply2(alpha.get(), y.get(), expected.get());
+    dmtx->apply2(dalpha.get(), dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Coo, ConvertToDenseIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto host_dense = Vec::create(ref);  // Vec is gko::matrix::Dense<>
+    auto dev_dense = Vec::create(hip);
+
+    mtx->convert_to(host_dense.get());
+    dmtx->convert_to(dev_dense.get());
+
+    GKO_ASSERT_MTX_NEAR(host_dense.get(), dev_dense.get(), 1e-14);
+}
+
+
+TEST_F(Coo, ConvertToCsrIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto via_dense = Vec::create(ref);
+    auto host_csr = gko::matrix::Csr<>::create(ref);
+    auto dev_csr = gko::matrix::Csr<>::create(hip);
+
+    mtx->convert_to(via_dense.get());  // host reference goes COO->Dense->CSR
+    via_dense->convert_to(host_csr.get());
+    dmtx->convert_to(dev_csr.get());
+
+    GKO_ASSERT_MTX_NEAR(host_csr.get(), dev_csr.get(), 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
new file mode 100644
index 00000000000..a1b2adfd794
--- /dev/null
+++ b/hip/test/matrix/csr_kernels.hip.cpp
@@ -0,0 +1,701 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/matrix/csr_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Csr : public ::testing::Test {  // fixture: Csr on ref vs. HIP executor
+protected:
+    using Mtx = gko::matrix::Csr<>;
+    using Vec = gko::matrix::Dense<>;
+
+    Csr() : mtx_size(532, 231), rand_engine(42) {}  // fixed size and seed
+
+    void SetUp()  // creates `ref` and `hip`; requires at least one HIP device
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown()  // asserts that hip->synchronize() does not throw
+    {
+        if (hip != nullptr) {  // hip stays null if SetUp's assertion failed
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    template <typename MtxType>  // random num_rows x num_cols MtxType on `ref`
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row)
+    {
+        return gko::test::generate_random_matrix<MtxType>(
+            num_rows, num_cols,  // next: presumably nnz-per-row dist — confirm
+            std::uniform_int_distribution<>(min_nnz_row, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
+                           int num_vectors = 1)  // fills the members below
+    {
+        mtx = Mtx::create(ref, strategy);
+        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));  // 532x231
+        square_mtx = Mtx::create(ref, strategy);
+        square_mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[0], 1));
+        expected = gen_mtx<Vec>(mtx_size[0], num_vectors, 1);
+        y = gen_mtx<Vec>(mtx_size[1], num_vectors, 1);
+        alpha = gko::initialize<Vec>({2.0}, ref);
+        beta = gko::initialize<Vec>({-1.0}, ref);
+        dmtx = Mtx::create(hip, strategy);  // from here: HIP copies of the above
+        dmtx->copy_from(mtx.get());
+        square_dmtx = Mtx::create(hip, strategy);
+        square_dmtx->copy_from(square_mtx.get());
+        dresult = Vec::create(hip);
+        dresult->copy_from(expected.get());  // dresult starts equal to expected
+        dy = Vec::create(hip);
+        dy->copy_from(y.get());
+        dalpha = Vec::create(hip);
+        dalpha->copy_from(alpha.get());
+        dbeta = Vec::create(hip);
+        dbeta->copy_from(beta.get());
+    }
+
+    struct matrix_pair {  // return type of gen_unsorted_mtx()
+        std::unique_ptr<Mtx> ref;  // matrix generated on the reference executor
+        std::unique_ptr<Mtx> hip;  // copy of `ref` created on the HIP executor
+    };
+
+    matrix_pair gen_unsorted_mtx()  // random Csr with entries swapped per row
+    {
+        constexpr int min_nnz_per_row = 2;  // Must be at least 2
+        auto local_mtx_ref =
+            gen_mtx<Mtx>(mtx_size[0], mtx_size[1], min_nnz_per_row);
+        for (size_t row = 0; row < mtx_size[0]; ++row) {
+            const auto row_ptrs = local_mtx_ref->get_const_row_ptrs();
+            const auto start_row = row_ptrs[row];
+            auto col_idx = local_mtx_ref->get_col_idxs() + start_row;
+            auto vals = local_mtx_ref->get_values() + start_row;
+            const auto nnz_in_this_row = row_ptrs[row + 1] - row_ptrs[row];
+            auto swap_idx_dist =
+                std::uniform_int_distribution<>(0, nnz_in_this_row - 1);
+            // perform ceil(nnz_in_this_row / 2) random swaps within this row
+            for (size_t perm = 0; perm < nnz_in_this_row; perm += 2) {
+                const auto idx1 = swap_idx_dist(rand_engine);
+                const auto idx2 = swap_idx_dist(rand_engine);
+                std::swap(col_idx[idx1], col_idx[idx2]);  // swap index and
+                std::swap(vals[idx1], vals[idx2]);        // value together
+            }
+        }
+        auto local_mtx_hip = Mtx::create(hip);
+        local_mtx_hip->copy_from(local_mtx_ref.get());
+
+        return {std::move(local_mtx_ref), std::move(local_mtx_hip)};
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    const gko::dim<2> mtx_size;  // size used for `mtx` (set in constructor)
+    std::ranlux48 rand_engine;   // shared by every gen_mtx call
+
+    std::unique_ptr<Mtx> mtx;  // reference-side data
+    std::unique_ptr<Mtx> square_mtx;
+    std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec> beta;
+
+    std::unique_ptr<Mtx> dmtx;  // HIP-side copies of the members above
+    std::unique_ptr<Mtx> square_dmtx;
+    std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec> dbeta;
+};
+
+
+TEST_F(Csr, StrategyAfterCopyIsEquivalentToRef)
+{  // dmtx (filled via copy_from(mtx) in the fixture) reports mtx's strategy
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(hip));
+
+    ASSERT_EQ(mtx->get_strategy()->get_name(),
+              dmtx->get_strategy()->get_name());  // compared by name only
+}
+
+
+TEST_F(Csr, SimpleApplyIsEquivalentToRefWithLoadBalance)
+{  // two-argument apply() with the load_balance strategy, one column
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(hip));
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithLoadBalance)
+{  // four-argument apply() with the fixture's alpha (2.0) and beta (-1.0)
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(hip));
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyIsEquivalentToRefWithHipsparse)
+{  // two-argument apply() with the sparselib strategy, one column
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithHipsparse)
+{  // four-argument apply() (alpha 2.0, beta -1.0) with sparselib strategy
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyIsEquivalentToRefWithMergePath)
+{  // two-argument apply() with the merge_path strategy, one column
+    set_up_apply_data(std::make_shared<Mtx::merge_path>());
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithMergePath)
+{  // four-argument apply() (alpha 2.0, beta -1.0) with merge_path strategy
+    set_up_apply_data(std::make_shared<Mtx::merge_path>());
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyIsEquivalentToRefWithClassical)
+{  // two-argument apply() with the classical strategy, one column
+    set_up_apply_data(std::make_shared<Mtx::classical>());
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithClassical)
+{  // four-argument apply() (alpha 2.0, beta -1.0) with classical strategy
+    set_up_apply_data(std::make_shared<Mtx::classical>());
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyIsEquivalentToRefWithAutomatical)
+{  // two-argument apply() with the automatical strategy, one column
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithLoadBalance)
+{  // two-argument apply(), load_balance strategy, y/expected with 3 columns
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(hip), 3);
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithLoadBalance)
+{  // four-argument apply(), load_balance strategy, 3-column y/expected
+    set_up_apply_data(std::make_shared<Mtx::load_balance>(hip), 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithClassical)
+{  // two-argument apply(), classical strategy, y/expected with 3 columns
+    set_up_apply_data(std::make_shared<Mtx::classical>(), 3);
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithClassical)
+{  // four-argument apply(), classical strategy, 3-column y/expected
+    set_up_apply_data(std::make_shared<Mtx::classical>(), 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithMergePath)
+{  // two-argument apply(), merge_path strategy, y/expected with 3 columns
+    set_up_apply_data(std::make_shared<Mtx::merge_path>(), 3);
+
+    mtx->apply(y.get(), expected.get());   // reference executor
+    dmtx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath)
+{  // four-argument apply(), merge_path strategy, 3-column y/expected
+    set_up_apply_data(std::make_shared<Mtx::merge_path>(), 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());  // ref
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);  // outputs must agree
+}
+
+
+TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
+{  // four-argument apply() of mtx to its own transpose, output in a Csr
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+    auto trans = mtx->transpose();     // transpose on the reference side
+    auto d_trans = dmtx->transpose();  // transpose on the HIP side
+
+    mtx->apply(alpha.get(), trans.get(), beta.get(), square_mtx.get());
+    dmtx->apply(dalpha.get(), d_trans.get(), dbeta.get(), square_dmtx.get());
+
+    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
+    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());  // HIP output only
+}
+
+
+TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
+{  // two-argument apply() of mtx to its own transpose, output in a Csr
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+    auto trans = mtx->transpose();     // transpose on the reference side
+    auto d_trans = dmtx->transpose();  // transpose on the HIP side
+
+    mtx->apply(trans.get(), square_mtx.get());
+    dmtx->apply(d_trans.get(), square_dmtx.get());
+
+    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
+    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());  // HIP output only
+}
+
+
+TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
+{  // four-argument apply() with an Identity operand and a Csr output
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+    auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);  // min_nnz_row = 0
+    auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);  // min_nnz_row = 0
+    auto da = Mtx::create(hip);
+    auto db = Mtx::create(hip);
+    da->copy_from(a.get());  // HIP copy of a
+    db->copy_from(b.get());  // HIP copy of b
+    auto id = gko::matrix::Identity<Mtx::value_type>::create(ref, mtx_size[1]);
+    auto did = gko::matrix::Identity<Mtx::value_type>::create(hip, mtx_size[1]);
+
+    a->apply(alpha.get(), id.get(), beta.get(), b.get());       // reference
+    da->apply(dalpha.get(), did.get(), dbeta.get(), db.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(b, db, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(b, db);
+    ASSERT_TRUE(db->is_sorted_by_column_index());  // HIP output only
+}
+
+
+TEST_F(Csr, TransposeIsEquivalentToRef)
+{  // transpose() on the HIP side must equal the reference transpose exactly
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+
+    auto trans = mtx->transpose();     // reference executor
+    auto d_trans = dmtx->transpose();  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
+                        static_cast<Mtx *>(trans.get()), 0.0);  // tolerance 0
+}
+
+
+TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
+{  // convert_to(Dense) on the HIP side must match the reference conversion
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto dense_mtx = gko::matrix::Dense<>::create(ref);   // target on `ref`
+    auto ddense_mtx = gko::matrix::Dense<>::create(hip);  // target on `hip`
+
+    mtx->convert_to(dense_mtx.get());
+    dmtx->convert_to(ddense_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, MoveToDenseIsEquivalentToRef)
+{  // move_to(Dense) on the HIP side must match the reference result
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto dense_mtx = gko::matrix::Dense<>::create(ref);   // target on `ref`
+    auto ddense_mtx = gko::matrix::Dense<>::create(hip);  // target on `hip`
+
+    mtx->move_to(dense_mtx.get());    // mtx is not used after this point
+    dmtx->move_to(ddense_mtx.get());  // dmtx is not used after this point
+
+    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, ConvertToEllIsEquivalentToRef)
+{  // convert_to(Ell) on the HIP side must match the reference conversion
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto ell_mtx = gko::matrix::Ell<>::create(ref);   // target on `ref`
+    auto dell_mtx = gko::matrix::Ell<>::create(hip);  // target on `hip`
+
+    mtx->convert_to(ell_mtx.get());
+    dmtx->convert_to(dell_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, MoveToEllIsEquivalentToRef)
+{  // move_to(Ell) on the HIP side must match the reference result
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto ell_mtx = gko::matrix::Ell<>::create(ref);   // target on `ref`
+    auto dell_mtx = gko::matrix::Ell<>::create(hip);  // target on `hip`
+
+    mtx->move_to(ell_mtx.get());    // mtx is not used after this point
+    dmtx->move_to(dell_mtx.get());  // dmtx is not used after this point
+
+    GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef)
+{  // convert_to(SparsityCsr) on the HIP side must match the reference one
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref);    // on ref
+    auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip);  // on hip
+
+    mtx->convert_to(sparsity_mtx.get());
+    dmtx->convert_to(d_sparsity_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef)
+{  // move_to(SparsityCsr) on the HIP side must match the reference result
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref);    // on ref
+    auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip);  // on hip
+
+    mtx->move_to(sparsity_mtx.get());     // mtx is not used afterwards
+    dmtx->move_to(d_sparsity_mtx.get());  // dmtx is not used afterwards
+
+    GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, ConvertsEmptyToSellp)
+{  // converts a Csr created without size or data on `hip` into a Sellp
+    auto dempty_mtx = Mtx::create(hip);  // only the executor is passed
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
+
+    dempty_mtx->convert_to(dsellp_mtx.get());
+
+    ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());  // size must convert to false
+}
+
+
+TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef)
+{  // calls the reference and HIP kernels directly and compares the outputs
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    gko::size_type max_nnz_per_row;   // written by the reference kernel
+    gko::size_type dmax_nnz_per_row;  // written by the HIP kernel
+
+    gko::kernels::reference::csr::calculate_max_nnz_per_row(ref, mtx.get(),
+                                                            &max_nnz_per_row);
+    gko::kernels::hip::csr::calculate_max_nnz_per_row(hip, dmtx.get(),
+                                                      &dmax_nnz_per_row);
+
+    ASSERT_EQ(max_nnz_per_row, dmax_nnz_per_row);  // exact equality
+}
+
+
+TEST_F(Csr, ConvertToCooIsEquivalentToRef)
+{  // convert_to(Coo) on the HIP side must match the reference conversion
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto coo_mtx = gko::matrix::Coo<>::create(ref);   // target on `ref`
+    auto dcoo_mtx = gko::matrix::Coo<>::create(hip);  // target on `hip`
+
+    mtx->convert_to(coo_mtx.get());
+    dmtx->convert_to(dcoo_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, MoveToCooIsEquivalentToRef)
+{  // move_to(Coo) on the HIP side must match the reference result
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto coo_mtx = gko::matrix::Coo<>::create(ref);   // target on `ref`
+    auto dcoo_mtx = gko::matrix::Coo<>::create(hip);  // target on `hip`
+
+    mtx->move_to(coo_mtx.get());    // mtx is not used after this point
+    dmtx->move_to(dcoo_mtx.get());  // dmtx is not used after this point
+
+    GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, ConvertToSellpIsEquivalentToRef)
+{  // convert_to(Sellp) on the HIP side must match the reference conversion
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto sellp_mtx = gko::matrix::Sellp<>::create(ref);   // target on `ref`
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);  // target on `hip`
+
+    mtx->convert_to(sellp_mtx.get());
+    dmtx->convert_to(dsellp_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, MoveToSellpIsEquivalentToRef)
+{  // move_to(Sellp) on the HIP side must match the reference result
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto sellp_mtx = gko::matrix::Sellp<>::create(ref);   // target on `ref`
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);  // target on `hip`
+
+    mtx->move_to(sellp_mtx.get());    // mtx is not used after this point
+    dmtx->move_to(dsellp_mtx.get());  // dmtx is not used after this point
+
+    GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, CalculateTotalColsIsEquivalentToRef)
+{  // calls the reference and HIP kernels directly and compares the outputs
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    gko::size_type total_cols;   // written by the reference kernel
+    gko::size_type dtotal_cols;  // written by the HIP kernel
+
+    gko::kernels::reference::csr::calculate_total_cols(  // NOTE(review): the
+        ref, mtx.get(), &total_cols, 2, gko::matrix::default_slice_size);
+    gko::kernels::hip::csr::calculate_total_cols(  // `2` meaning: confirm
+        hip, dmtx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size);
+
+    ASSERT_EQ(total_cols, dtotal_cols);  // exact equality
+}
+
+
+TEST_F(Csr, CalculatesNonzerosPerRow)
+{  // calls the reference and HIP kernels directly and compares the arrays
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    gko::Array<gko::size_type> row_nnz(ref, mtx->get_size()[0]);    // per row
+    gko::Array<gko::size_type> drow_nnz(hip, dmtx->get_size()[0]);  // per row
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row(ref, mtx.get(),
+                                                             &row_nnz);
+    gko::kernels::hip::csr::calculate_nonzeros_per_row(hip, dmtx.get(),
+                                                       &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ConvertToHybridIsEquivalentToRef)
+{  // convert_to(Hybrid) on the HIP side must match the reference conversion
+    using Hybrid_type = gko::matrix::Hybrid<>;
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto hybrid_mtx = Hybrid_type::create(  // both use column_limit(2)
+        ref, std::make_shared<Hybrid_type::column_limit>(2));
+    auto dhybrid_mtx = Hybrid_type::create(
+        hip, std::make_shared<Hybrid_type::column_limit>(2));
+
+    mtx->convert_to(hybrid_mtx.get());
+    dmtx->convert_to(dhybrid_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, MoveToHybridIsEquivalentToRef)
+{  // move_to(Hybrid) on the HIP side must match the reference result
+    using Hybrid_type = gko::matrix::Hybrid<>;
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    auto hybrid_mtx = Hybrid_type::create(  // both use column_limit(2)
+        ref, std::make_shared<Hybrid_type::column_limit>(2));
+    auto dhybrid_mtx = Hybrid_type::create(
+        hip, std::make_shared<Hybrid_type::column_limit>(2));
+
+    mtx->move_to(hybrid_mtx.get());    // mtx is not used after this point
+    dmtx->move_to(dhybrid_mtx.get());  // dmtx is not used after this point
+
+    GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
+{  // is_sorted_by_column_index() must return the same answer on ref and HIP
+    set_up_apply_data(std::make_shared<Mtx::sparselib>());
+    bool is_sorted_hip{};  // value-initialized to false
+    bool is_sorted_ref{};  // value-initialized to false
+
+    is_sorted_ref = mtx->is_sorted_by_column_index();
+    is_sorted_hip = dmtx->is_sorted_by_column_index();
+
+    ASSERT_EQ(is_sorted_ref, is_sorted_hip);  // only agreement is checked
+}
+
+
+TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef)
+{  // same check as the sorted case, on the pair from gen_unsorted_mtx()
+    auto uns_mtx = gen_unsorted_mtx();  // .ref and .hip hold the same matrix
+    bool is_sorted_hip{};  // value-initialized to false
+    bool is_sorted_ref{};  // value-initialized to false
+
+    is_sorted_ref = uns_mtx.ref->is_sorted_by_column_index();
+    is_sorted_hip = uns_mtx.hip->is_sorted_by_column_index();
+
+    ASSERT_EQ(is_sorted_ref, is_sorted_hip);  // only agreement is checked
+}
+
+
+TEST_F(Csr, SortSortedMatrixIsEquivalentToRef)
+{  // sort_by_column_index() on the fixture matrices: ref and HIP must agree
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+
+    mtx->sort_by_column_index();   // reference executor
+    dmtx->sort_by_column_index();  // HIP executor
+
+    // Values must be unchanged, therefore, tolerance is `0`
+    GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0);
+}
+
+
+TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)
+{  // sort_by_column_index() on the pair from gen_unsorted_mtx()
+    auto uns_mtx = gen_unsorted_mtx();  // .ref and .hip hold the same matrix
+
+    uns_mtx.ref->sort_by_column_index();  // reference executor
+    uns_mtx.hip->sort_by_column_index();  // HIP executor
+
+    // Values must be unchanged, therefore, tolerance is `0`
+    GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.hip, 0);
+}
+
+
+TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
+{  // sets one shared `automatical` object on two differently shaped matrices
+    auto automatical = std::make_shared<Mtx::automatical>(hip);
+    auto row_len_limit = std::max(automatical->nvidia_row_len_limit,
+                                  automatical->amd_row_len_limit);
+    auto load_balance_mtx = Mtx::create(ref);
+    auto classical_mtx = Mtx::create(ref);
+    load_balance_mtx->copy_from(  // 1 row, min_nnz_row = row_len_limit + 1
+        gen_mtx<Vec>(1, row_len_limit + 1000, row_len_limit + 1));
+    classical_mtx->copy_from(gen_mtx<Vec>(50, 50, 1));  // 50 x 50 matrix
+    auto load_balance_mtx_d = Mtx::create(hip);
+    auto classical_mtx_d = Mtx::create(hip);
+    load_balance_mtx_d->copy_from(load_balance_mtx.get());  // HIP copy
+    classical_mtx_d->copy_from(classical_mtx.get());        // HIP copy
+
+    load_balance_mtx_d->set_strategy(automatical);  // same object for both
+    classical_mtx_d->set_strategy(automatical);
+
+    EXPECT_EQ("load_balance", load_balance_mtx_d->get_strategy()->get_name());
+    EXPECT_EQ("classical", classical_mtx_d->get_strategy()->get_name());
+    ASSERT_NE(load_balance_mtx_d->get_strategy().get(),
+              classical_mtx_d->get_strategy().get());  // distinct objects
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp
new file mode 100644
index 00000000000..96261c4ab46
--- /dev/null
+++ b/hip/test/matrix/dense_kernels.hip.cpp
@@ -0,0 +1,536 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+
+
+#include "core/matrix/dense_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Dense : public ::testing::Test {  // fixture: Dense on ref vs. HIP
+protected:
+    using itype = int;
+    using vtype = double;
+    using Mtx = gko::matrix::Dense<vtype>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<vtype>>;
+    using Arr = gko::Array<itype>;
+
+    Dense() : rand_engine(15) {}  // fixed seed for rand_engine
+
+    void SetUp()  // creates `ref` and `hip`; requires at least one HIP device
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown()  // asserts that hip->synchronize() does not throw
+    {
+        if (hip != nullptr) {  // hip stays null if SetUp's assertion failed
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    template <typename MtxType>  // random num_rows x num_cols matrix on `ref`
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<MtxType>(
+            num_rows, num_cols,  // next distribution always yields num_cols
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
+    }
+
+    void set_up_vector_data(gko::size_type num_vecs,
+                            bool different_alpha = false)  // x, y, alpha, ...
+    {
+        x = gen_mtx<Mtx>(1000, num_vecs);
+        y = gen_mtx<Mtx>(1000, num_vecs);
+        if (different_alpha) {
+            alpha = gen_mtx<Mtx>(1, num_vecs);  // random 1 x num_vecs alpha
+        } else {
+            alpha = gko::initialize<Mtx>({2.0}, ref);  // the single value 2.0
+        }
+        dx = Mtx::create(hip);  // from here: HIP copies of x, y and alpha
+        dx->copy_from(x.get());
+        dy = Mtx::create(hip);
+        dy->copy_from(y.get());
+        dalpha = Mtx::create(hip);
+        dalpha->copy_from(alpha.get());
+        expected = Mtx::create(ref, gko::dim<2>{1, num_vecs});  // 1 x num_vecs
+        dresult = Mtx::create(hip, gko::dim<2>{1, num_vecs});   // 1 x num_vecs
+    }
+
+    void set_up_apply_data()  // data for apply/transpose/conversion tests
+    {
+        x = gen_mtx<Mtx>(65, 25);
+        y = gen_mtx<Mtx>(25, 35);
+        expected = gen_mtx<Mtx>(65, 35);
+        alpha = gko::initialize<Mtx>({2.0}, ref);
+        beta = gko::initialize<Mtx>({-1.0}, ref);
+        dx = Mtx::create(hip);  // from here: HIP copies of the data above
+        dx->copy_from(x.get());
+        dy = Mtx::create(hip);
+        dy->copy_from(y.get());
+        dresult = Mtx::create(hip);
+        dresult->copy_from(expected.get());  // dresult starts equal to expected
+        dalpha = Mtx::create(hip);
+        dalpha->copy_from(alpha.get());
+        dbeta = Mtx::create(hip);
+        dbeta->copy_from(beta.get());
+
+        std::vector<itype> tmp(x->get_size()[0], 0);  // one entry per row of x
+        auto rng = std::default_random_engine{};  // separate from rand_engine
+        std::iota(tmp.begin(), tmp.end(), 0);      // 0, 1, ..., rows - 1
+        std::shuffle(tmp.begin(), tmp.end(), rng);  // random row permutation
+        std::vector<itype> tmp2(x->get_size()[1], 0);  // one entry per column
+        std::iota(tmp2.begin(), tmp2.end(), 0);
+        std::shuffle(tmp2.begin(), tmp2.end(), rng);  // random col permutation
+        rpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp.begin(), tmp.end()});
+        drpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{hip, tmp.begin(), tmp.end()});
+        cpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp2.begin(), tmp2.end()});
+        dcpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{hip, tmp2.begin(), tmp2.end()});
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;  // shared by every gen_mtx call
+
+    std::unique_ptr<Mtx> x;  // reference-side data
+    std::unique_ptr<Mtx> y;
+    std::unique_ptr<Mtx> alpha;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> expected;
+    std::unique_ptr<Mtx> dresult;  // d-prefixed members are created on `hip`
+    std::unique_ptr<Mtx> dx;
+    std::unique_ptr<Mtx> dy;
+    std::unique_ptr<Mtx> dalpha;
+    std::unique_ptr<Mtx> dbeta;
+    std::unique_ptr<Arr> rpermute_idxs;   // row permutation on `ref`
+    std::unique_ptr<Arr> drpermute_idxs;  // row permutation on `hip`
+    std::unique_ptr<Arr> cpermute_idxs;   // column permutation on `ref`
+    std::unique_ptr<Arr> dcpermute_idxs;  // column permutation on `hip`
+};
+
+
+TEST_F(Dense, SingleVectorHipScaleIsEquivalentToRef)
+{  // scale() with one column; the HIP result is copied to `ref` to compare
+    set_up_vector_data(1);
+    auto result = Mtx::create(ref);  // host-side holder for the HIP result
+
+    x->scale(alpha.get());       // reference executor
+    dx->scale(dalpha.get());     // HIP executor
+    result->copy_from(dx.get());  // copy the HIP result into `result`
+
+    GKO_ASSERT_MTX_NEAR(result, x, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleVectorHipScaleIsEquivalentToRef)
+{  // scale() with 20 columns and the single alpha value 2.0
+    set_up_vector_data(20);
+
+    x->scale(alpha.get());    // reference executor
+    dx->scale(dalpha.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);  // scaled matrices must agree
+}
+
+
+TEST_F(Dense, MultipleVectorHipScaleWithDifferentAlphaIsEquivalentToRef)
+{  // scale() with 20 columns and a random 1 x 20 alpha
+    set_up_vector_data(20, true);  // true selects the 1 x num_vecs alpha
+
+    x->scale(alpha.get());    // reference executor
+    dx->scale(dalpha.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);  // scaled matrices must agree
+}
+
+
+TEST_F(Dense, SingleVectorHipAddScaledIsEquivalentToRef)
+{  // add_scaled() with one column and the single alpha value 2.0
+    set_up_vector_data(1);
+
+    x->add_scaled(alpha.get(), y.get());     // reference executor
+    dx->add_scaled(dalpha.get(), dy.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);  // updated matrices must agree
+}
+
+
+TEST_F(Dense, MultipleVectorHipAddScaledIsEquivalentToRef)
+{  // add_scaled() with 20 columns and the single alpha value 2.0
+    set_up_vector_data(20);
+
+    x->add_scaled(alpha.get(), y.get());     // reference executor
+    dx->add_scaled(dalpha.get(), dy.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);  // updated matrices must agree
+}
+
+
+TEST_F(Dense, MultipleVectorHipAddScaledWithDifferentAlphaIsEquivalentToRef)
+{  // add_scaled() with 20 columns and a random 1 x 20 alpha
+    set_up_vector_data(20, true);  // true selects the 1 x num_vecs alpha
+
+    x->add_scaled(alpha.get(), y.get());     // reference executor
+    dx->add_scaled(dalpha.get(), dy.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);  // updated matrices must agree
+}
+
+
+TEST_F(Dense, SingleVectorHipComputeDotIsEquivalentToRef)
+{  // compute_dot() with one column; results land in 1 x 1 matrices
+    set_up_vector_data(1);
+
+    x->compute_dot(y.get(), expected.get());   // reference executor
+    dx->compute_dot(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleVectorHipComputeDotIsEquivalentToRef)
+{  // compute_dot() with 20 columns; results land in 1 x 20 matrices
+    set_up_vector_data(20);
+
+    x->compute_dot(y.get(), expected.get());   // reference executor
+    dx->compute_dot(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Dense, HipComputeNorm2IsEquivalentToRef)
+{  // compute_norm2() with 20 columns, written into 1 x 20 NormVectors
+    set_up_vector_data(20);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};  // 1 x (columns of x)
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->hip, norm_size);
+
+    x->compute_norm2(norm_expected.get());  // reference executor
+    dx->compute_norm2(dnorm.get());         // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14);
+}
+
+
+TEST_F(Dense, SimpleApplyIsEquivalentToRef)
+{  // two-argument apply(): x is 65 x 25, y is 25 x 35, output is 65 x 35
+    set_up_apply_data();
+
+    x->apply(y.get(), expected.get());   // reference executor
+    dx->apply(dy.get(), dresult.get());  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
+{  // four-argument apply() with the fixture's alpha (2.0) and beta (-1.0)
+    set_up_apply_data();
+
+    x->apply(alpha.get(), y.get(), beta.get(), expected.get());     // ref
+    dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());  // HIP
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Dense, IsTransposable)
+{  // transpose() on the HIP side must equal the reference transpose exactly
+    set_up_apply_data();
+
+    auto trans = x->transpose();    // reference executor
+    auto dtrans = dx->transpose();  // HIP executor
+
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(dtrans.get()),
+                        static_cast<Mtx *>(trans.get()), 0);  // tolerance 0
+}
+
+
+TEST_F(Dense, ConvertToCooIsEquivalentToRef)
+{  // convert_to(Coo) on the HIP side must match the reference conversion
+    set_up_apply_data();
+    auto coo_mtx = gko::matrix::Coo<>::create(ref);   // target on `ref`
+    auto dcoo_mtx = gko::matrix::Coo<>::create(hip);  // target on `hip`
+
+    x->convert_to(coo_mtx.get());
+    dx->convert_to(dcoo_mtx.get());
+
+    ASSERT_EQ(dcoo_mtx->get_num_stored_elements(),
+              coo_mtx->get_num_stored_elements());  // same stored count
+    GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Dense, MoveToCooIsEquivalentToRef)
+{  // move_to(Coo) on the HIP side must match the reference result
+    set_up_apply_data();
+    auto coo_mtx = gko::matrix::Coo<>::create(ref);   // target on `ref`
+    auto dcoo_mtx = gko::matrix::Coo<>::create(hip);  // target on `hip`
+
+    x->move_to(coo_mtx.get());    // x is not used after this point
+    dx->move_to(dcoo_mtx.get());  // dx is not used after this point
+
+    ASSERT_EQ(dcoo_mtx->get_num_stored_elements(),
+              coo_mtx->get_num_stored_elements());  // same stored count
+    GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Dense, ConvertToCsrIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(hip);
+    // convert_to: x fills the Csr created on ref, dx the one created on hip.
+    x->convert_to(csr_mtx.get());
+    dx->convert_to(dcsr_mtx.get());
+    // The two Csr matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Dense, MoveToCsrIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(hip);
+    // move_to: x fills the Csr created on ref, dx the one created on hip.
+    x->move_to(csr_mtx.get());
+    dx->move_to(dcsr_mtx.get());
+    // The two Csr matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Dense, ConvertToEllIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto ell_mtx = gko::matrix::Ell<>::create(ref);
+    auto dell_mtx = gko::matrix::Ell<>::create(hip);
+    // convert_to: x fills the Ell created on ref, dx the one created on hip.
+    x->convert_to(ell_mtx.get());
+    dx->convert_to(dell_mtx.get());
+    // The two Ell matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Dense, MoveToEllIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto ell_mtx = gko::matrix::Ell<>::create(ref);
+    auto dell_mtx = gko::matrix::Ell<>::create(hip);
+    // move_to: x fills the Ell created on ref, dx the one created on hip.
+    x->move_to(ell_mtx.get());
+    dx->move_to(dell_mtx.get());
+    // The two Ell matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Dense, ConvertToSellpIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
+    // convert_to: x fills the Sellp created on ref, dx the one on hip.
+    x->convert_to(sellp_mtx.get());
+    dx->convert_to(dsellp_mtx.get());
+    // The two Sellp matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14);
+}
+
+
+TEST_F(Dense, MoveToSellpIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
+    // move_to: x fills the Sellp created on ref, dx the one on hip.
+    x->move_to(sellp_mtx.get());
+    dx->move_to(dsellp_mtx.get());
+    // The two Sellp matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14);
+}
+
+
+TEST_F(Dense, ConvertsEmptyToSellp)
+{
+    auto dempty_mtx = Mtx::create(hip);
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
+    // Convert a Dense created without a size argument; no fixture data used.
+    dempty_mtx->convert_to(dsellp_mtx.get());
+    // Value read back through the slice_sets pointer is 0; size is falsy.
+    ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());
+}
+
+
+TEST_F(Dense, CountNNZIsEquivalentToRef)
+{
+    set_up_apply_data();
+    gko::size_type nnz;
+    gko::size_type dnnz;
+    // Call the reference and HIP count_nonzeros kernels directly on x / dx.
+    gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz);
+    gko::kernels::hip::dense::count_nonzeros(hip, dx.get(), &dnnz);
+    // Both kernels must have written the same count.
+    ASSERT_EQ(nnz, dnnz);
+}
+
+
+TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
+{
+    set_up_apply_data();
+    gko::Array<gko::size_type> nnz_per_row(ref);
+    nnz_per_row.resize_and_reset(x->get_size()[0]);
+    gko::Array<gko::size_type> dnnz_per_row(hip);
+    dnnz_per_row.resize_and_reset(dx->get_size()[0]);
+    // One counter per row of x / dx; fill them with the ref and HIP kernels.
+    gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(),
+                                                               &nnz_per_row);
+    gko::kernels::hip::dense::calculate_nonzeros_per_row(hip, dx.get(),
+                                                         &dnnz_per_row);
+    // Copy the HIP counts to ref; the index is size_type, not a signed int.
+    auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
+    for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
+        ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
+    }
+}
+
+
+TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef)
+{
+    set_up_apply_data();
+    gko::size_type max_nnz;
+    gko::size_type dmax_nnz;
+    // Call the reference and HIP kernels directly; each writes one size_type.
+    gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(),
+                                                              &max_nnz);
+    gko::kernels::hip::dense::calculate_max_nnz_per_row(hip, dx.get(),
+                                                        &dmax_nnz);
+    // The two values must be equal.
+    ASSERT_EQ(max_nnz, dmax_nnz);
+}
+
+
+TEST_F(Dense, CalculateTotalColsIsEquivalentToRef)
+{
+    set_up_apply_data();
+    gko::size_type total_cols;
+    gko::size_type dtotal_cols;
+    // Extra args: 2 (presumably stride factor) and the default slice size.
+    gko::kernels::reference::dense::calculate_total_cols(
+        ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size);
+    gko::kernels::hip::dense::calculate_total_cols(
+        hip, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size);
+    // The two values must be equal.
+    ASSERT_EQ(total_cols, dtotal_cols);
+}
+
+
+TEST_F(Dense, IsRowPermutable)
+{
+    set_up_apply_data();
+    // Row-permute x with rpermute_idxs and dx with drpermute_idxs.
+    auto r_permute = x->row_permute(rpermute_idxs.get());
+    auto dr_permute = dx->row_permute(drpermute_idxs.get());
+    // Results are cast to Mtx * and compared with tolerance 0.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
+                        static_cast<Mtx *>(dr_permute.get()), 0);
+}
+
+
+TEST_F(Dense, IsColPermutable)
+{
+    set_up_apply_data();
+    // Column-permute x with cpermute_idxs and dx with dcpermute_idxs.
+    auto c_permute = x->column_permute(cpermute_idxs.get());
+    auto dc_permute = dx->column_permute(dcpermute_idxs.get());
+    // Results are cast to Mtx * and compared with tolerance 0.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
+                        static_cast<Mtx *>(dc_permute.get()), 0);
+}
+
+
+TEST_F(Dense, IsInverseRowPermutable)
+{
+    set_up_apply_data();
+    // Inverse row permutation of x (rpermute_idxs) and dx (drpermute_idxs).
+    auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get());
+    auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get());
+    // Results are cast to Mtx * and compared with tolerance 0.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
+                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
+}
+
+
+TEST_F(Dense, IsInverseColPermutable)
+{
+    set_up_apply_data();
+    // Inverse column permutation of x (cpermute_idxs) and dx (dcpermute_idxs).
+    auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get());
+    auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get());
+    // Results are cast to Mtx * and compared with tolerance 0.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
+                        static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp
new file mode 100644
index 00000000000..c28285ae885
--- /dev/null
+++ b/hip/test/matrix/ell_kernels.hip.cpp
@@ -0,0 +1,347 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/matrix/ell_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Ell : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Ell<>;
+    using Vec = gko::matrix::Dense<>;
+    // Fixed seed (42) for the random engine that gen_mtx() draws from.
+    Ell() : rand_engine(42) {}
+    // Asserts that a HIP device exists, then creates ref and hip (device 0).
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+    // Synchronizes hip (when set) and asserts that doing so does not throw.
+    void TearDown()
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+    // Random num_rows x num_cols Dense on ref, drawn from rand_engine.
+    std::unique_ptr<Vec> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Vec>(
+            num_rows, num_cols, std::uniform_int_distribution<>(1, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+    // Builds mtx/expected/y/alpha/beta on ref, then copies each one to hip.
+    void set_up_apply_data(int num_rows = 532, int num_cols = 231,
+                           int num_vectors = 1,
+                           int num_stored_elements_per_row = 0, int stride = 0)
+    {
+        mtx = Mtx::create(ref, gko::dim<2>{}, num_stored_elements_per_row,
+                          stride);
+        mtx->copy_from(gen_mtx(num_rows, num_cols));
+        expected = gen_mtx(num_rows, num_vectors);
+        y = gen_mtx(num_cols, num_vectors);
+        alpha = gko::initialize<Vec>({2.0}, ref);
+        beta = gko::initialize<Vec>({-1.0}, ref);
+        dmtx = Mtx::create(hip);
+        dmtx->copy_from(mtx.get());
+        dresult = Vec::create(hip);
+        dresult->copy_from(expected.get());
+        dy = Vec::create(hip);
+        dy->copy_from(y.get());
+        dalpha = Vec::create(hip);
+        dalpha->copy_from(alpha.get());
+        dbeta = Vec::create(hip);
+        dbeta->copy_from(beta.get());
+    }
+
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+    // Operands created on ref by set_up_apply_data().
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec> beta;
+    // Counterparts created on hip and filled with copy_from() of the above.
+    std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec> dbeta;
+};
+
+
+TEST_F(Ell, SimpleApplyIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Default fixture data (532 x 231 matrix, one vector column).
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Default fixture data; four-argument apply with alpha = 2.0, beta = -1.0.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+    // 532 x 231 matrix, 1 column, 300 stored elements per row, stride 600.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // Strided data (300 stored elements per row, stride 600); tol. 1e-14.
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+    // As the strided case, but y/expected have 3 columns instead of 1.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+    // 3 vector columns, 300 stored elements per row, stride 600; scaled apply.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000);
+    // 10 rows x 10000 columns; presumably the shape that hits the atomic path.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedByAtomicApplyIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000);
+    // 10 rows x 10000 columns; four-argument apply with alpha and beta.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyByAtomicToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000, 3);
+    // 10 rows x 10000 columns, with 3 vector columns in y/expected.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedByAtomicToDenseMatrixApplyIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000, 3);
+    // 10 rows x 10000 columns, 3 vector columns; apply with alpha and beta.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyOnSmallMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10);
+    // A single row with 10 columns, one vector column.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10, 3);
+    // A single row with 10 columns, 3 vector columns; apply with alpha/beta.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10, 3);
+    // A single row with 10 columns, 3 vector columns in y/expected.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10);
+    // A single row with 10 columns; four-argument apply with alpha and beta.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, ConvertToDenseIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Empty Dense targets: one created on ref, one created on hip.
+    auto dense_mtx = gko::matrix::Dense<>::create(ref);
+    auto ddense_mtx = gko::matrix::Dense<>::create(hip);
+    // mtx converts into the ref target, dmtx into the hip target.
+    mtx->convert_to(dense_mtx.get());
+    dmtx->convert_to(ddense_mtx.get());
+    // The two Dense matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Ell, ConvertToCsrIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Empty Csr targets: one created on ref, one created on hip.
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(hip);
+    // mtx converts into the ref target, dmtx into the hip target.
+    mtx->convert_to(csr_mtx.get());
+    dmtx->convert_to(dcsr_mtx.get());
+    // The two Csr matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Host-side counters: one entry per row of mtx, allocated on ref.
+    gko::Array<gko::size_type> nnz_per_row;
+    nnz_per_row.set_executor(ref);
+    nnz_per_row.resize_and_reset(mtx->get_size()[0]);
+    // Device-side counters: one entry per row of dmtx, allocated on hip.
+    gko::Array<gko::size_type> dnnz_per_row;
+    dnnz_per_row.set_executor(hip);
+    dnnz_per_row.resize_and_reset(dmtx->get_size()[0]);
+    // Fill both arrays by calling the reference and HIP kernels directly.
+    gko::kernels::reference::ell::calculate_nonzeros_per_row(ref, mtx.get(),
+                                                             &nnz_per_row);
+    gko::kernels::hip::ell::calculate_nonzeros_per_row(hip, dmtx.get(),
+                                                       &dnnz_per_row);
+    // Copy the HIP counts to ref; the index is size_type, not a signed int.
+    auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
+    for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
+        ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
+    }
+}
+
+
+TEST_F(Ell, CountNNZIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Output slots written by the two kernels below.
+    gko::size_type nnz;
+    gko::size_type dnnz;
+    // Call the reference and HIP count_nonzeros kernels directly.
+    gko::kernels::reference::ell::count_nonzeros(ref, mtx.get(), &nnz);
+    gko::kernels::hip::ell::count_nonzeros(hip, dmtx.get(), &dnnz);
+    // Both kernels must have written the same count.
+    ASSERT_EQ(nnz, dnnz);
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/hybrid_kernels.hip.cpp b/hip/test/matrix/hybrid_kernels.hip.cpp
new file mode 100644
index 00000000000..83d2cc37c86
--- /dev/null
+++ b/hip/test/matrix/hybrid_kernels.hip.cpp
@@ -0,0 +1,222 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/hybrid.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/matrix/hybrid_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Hybrid : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Hybrid<>;
+    using Vec = gko::matrix::Dense<>;
+    // Fixed seed (42) for the random engine that gen_mtx() draws from.
+    Hybrid() : rand_engine(42) {}
+    // Asserts that a HIP device exists, then creates ref and hip (device 0).
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+    // Synchronizes hip (when set) and asserts that doing so does not throw.
+    void TearDown()
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+    // Random Dense on ref; min_nnz_row is the int distribution's lower bound.
+    std::unique_ptr<Vec> gen_mtx(int num_rows, int num_cols, int min_nnz_row)
+    {
+        return gko::test::generate_random_matrix<Vec>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(min_nnz_row, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+    // 532 x 231 matrix (given strategy) plus operands on ref, mirrored on hip.
+    void set_up_apply_data(int num_vectors = 1,
+                           std::shared_ptr<Mtx::strategy_type> strategy =
+                               std::make_shared<Mtx::automatic>())
+    {
+        mtx = Mtx::create(ref, strategy);
+        mtx->copy_from(gen_mtx(532, 231, 1));
+        expected = gen_mtx(532, num_vectors, 1);
+        y = gen_mtx(231, num_vectors, 1);
+        alpha = gko::initialize<Vec>({2.0}, ref);
+        beta = gko::initialize<Vec>({-1.0}, ref);
+        dmtx = Mtx::create(hip, strategy);
+        dmtx->copy_from(mtx.get());
+        dresult = Vec::create(hip);
+        dresult->copy_from(expected.get());
+        dy = Vec::create(hip);
+        dy->copy_from(y.get());
+        dalpha = Vec::create(hip);
+        dalpha->copy_from(alpha.get());
+        dbeta = Vec::create(hip);
+        dbeta->copy_from(beta.get());
+    }
+
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+    // Operands created on ref by set_up_apply_data().
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec> beta;
+    // Counterparts created on hip and filled with copy_from() of the above.
+    std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec> dbeta;
+};
+
+
+TEST_F(Hybrid, SubMatrixExecutorAfterCopyIsEquivalentToExcutor)
+{
+    set_up_apply_data();
+    // dmtx was filled via copy_from(mtx) in the fixture; inspect its parts.
+    auto coo_mtx = dmtx->get_coo();
+    auto ell_mtx = dmtx->get_ell();
+    // Both parts, and dmtx itself, must report hip as their executor.
+    ASSERT_EQ(coo_mtx->get_executor(), hip);
+    ASSERT_EQ(ell_mtx->get_executor(), hip);
+    ASSERT_EQ(dmtx->get_executor(), hip);
+}
+
+
+TEST_F(Hybrid, SimpleApplyIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Default fixture data: one vector column, Mtx::automatic strategy.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Hybrid, AdvancedApplyIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // Default fixture data; four-argument apply with alpha = 2.0, beta = -1.0.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Hybrid, SimpleApplyToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(3);
+    // y/expected have 3 columns; strategy stays at the default.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Hybrid, AdvancedApplyToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(3);
+    // y/expected have 3 columns; four-argument apply with alpha and beta.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // The HIP result must agree with the reference result (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Hybrid, CountNonzerosIsEquivalentToRef)
+{
+    set_up_apply_data();
+    gko::size_type nonzeros;
+    gko::size_type dnonzeros;
+    // Call the reference and HIP count_nonzeros kernels directly.
+    gko::kernels::reference::hybrid::count_nonzeros(ref, mtx.get(), &nonzeros);
+    gko::kernels::hip::hybrid::count_nonzeros(hip, dmtx.get(), &dnonzeros);
+    // Both kernels must have written the same count.
+    ASSERT_EQ(nonzeros, dnonzeros);
+}
+
+
+TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef)
+{
+    set_up_apply_data(1, std::make_shared<Mtx::column_limit>(2));
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(hip);
+    // Both hybrids use column_limit(2); convert each into a Csr on its side.
+    mtx->convert_to(csr_mtx.get());
+    dmtx->convert_to(dcsr_mtx.get());
+    // The two Csr matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Hybrid, MoveToCsrIsEquivalentToRef)
+{
+    set_up_apply_data(1, std::make_shared<Mtx::column_limit>(2));
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(hip);
+    // Both hybrids use column_limit(2); move each into a Csr on its side.
+    mtx->move_to(csr_mtx.get());
+    dmtx->move_to(dcsr_mtx.get());
+    // The two Csr matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/sellp_kernels.hip.cpp b/hip/test/matrix/sellp_kernels.hip.cpp
new file mode 100644
index 00000000000..410b8f58a19
--- /dev/null
+++ b/hip/test/matrix/sellp_kernels.hip.cpp
@@ -0,0 +1,297 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/sellp.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/matrix/sellp_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Sellp : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Sellp<>;
+    using Vec = gko::matrix::Dense<>;
+
+    Sellp() : rand_engine(42) {}
+
+    // Asserts that a HIP device exists, then creates ref and hip (device 0).
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    // Synchronizes hip (when set) and asserts that doing so does not throw.
+    void TearDown()
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    // Random num_rows x num_cols Dense on ref, drawn from rand_engine.
+    std::unique_ptr<Vec> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Vec>(
+            num_rows, num_cols, std::uniform_int_distribution<>(1, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Single right-hand side. NOTE(review): slice_size, stride_factor and
+    // total_cols are accepted but never read, so every test runs with the
+    // settings Mtx::create(ref) picks -- confirm whether to forward them.
+    void set_up_apply_vector(
+        int slice_size = gko::matrix::default_slice_size,
+        int stride_factor = gko::matrix::default_stride_factor,
+        int total_cols = 0)
+    {
+        set_up_apply_data(1);
+    }
+
+    // 64 right-hand sides; the three parameters are likewise never read.
+    void set_up_apply_matrix(
+        int slice_size = gko::matrix::default_slice_size,
+        int stride_factor = gko::matrix::default_stride_factor,
+        int total_cols = 0)
+    {
+        set_up_apply_data(64);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec> beta;
+
+    std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec> dbeta;
+
+private:
+    // Shared body of both set_up_* entry points; the gen_mtx() calls stay in
+    // their original order so rand_engine yields the same data as before.
+    void set_up_apply_data(int num_rhs)
+    {
+        mtx = Mtx::create(ref);
+        mtx->copy_from(gen_mtx(532, 231));
+        expected = gen_mtx(532, num_rhs);
+        y = gen_mtx(231, num_rhs);
+        alpha = gko::initialize<Vec>({2.0}, ref);
+        beta = gko::initialize<Vec>({-1.0}, ref);
+        dmtx = Mtx::create(hip);
+        dmtx->copy_from(mtx.get());
+        dresult = Vec::create(hip);
+        dresult->copy_from(expected.get());
+        dy = Vec::create(hip);
+        dy->copy_from(y.get());
+        dalpha = Vec::create(hip);
+        dalpha->copy_from(alpha.get());
+        dbeta = Vec::create(hip);
+        dbeta->copy_from(beta.get());
+    }
+};
+
+
+TEST_F(Sellp, SimpleApplyIsEquivalentToRef)
+{
+    set_up_apply_vector();
+    // Two-argument apply on the reference and on the HIP operands.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp, AdvancedApplyIsEquivalentToRef)
+{
+    set_up_apply_vector();
+    // Four-argument apply with alpha = 2.0 and beta = -1.0 on both sides.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp, SimpleApplyWithSliceSizeAndStrideFactorIsEquivalentToRef)
+{
+    set_up_apply_vector(32, 2);
+    // NOTE(review): the fixture never reads these arguments (32, 2).
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp, AdvancedApplyWithSliceSizeAndStrideFActorIsEquivalentToRef)
+{
+    set_up_apply_vector(32, 2);
+    // NOTE(review): the fixture never reads these arguments (32, 2).
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp, SimpleApplyMultipleRHSIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+    // y/expected have 64 columns here; two-argument apply on both sides.
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp, AdvancedApplyMultipleRHSIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+    // y/expected have 64 columns here; four-argument apply with alpha/beta.
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp,
+       SimpleApplyMultipleRHSWithSliceSizeAndStrideFactorIsEquivalentToRef)
+{
+    set_up_apply_matrix(32, 2);
+    // NOTE(review): the fixture never reads these arguments (32, 2).
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp,
+       AdvancedApplyMultipleRHSWithSliceSizeAndStrideFActorIsEquivalentToRef)
+{
+    set_up_apply_matrix(32, 2);
+    // NOTE(review): the fixture never reads these arguments (32, 2).
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    // Copy the HIP result into a Vec on ref, then compare (tolerance 1e-14).
+    auto result = Vec::create(ref);
+    result->copy_from(dresult.get());
+    GKO_ASSERT_MTX_NEAR(result, expected, 1e-14);
+}
+
+
+TEST_F(Sellp, ConvertToDenseIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+    // Empty Dense targets: one created on ref, one created on hip.
+    auto dense_mtx = gko::matrix::Dense<>::create(ref);
+    auto ddense_mtx = gko::matrix::Dense<>::create(hip);
+    // mtx converts into the ref target, dmtx into the hip target.
+    mtx->convert_to(dense_mtx.get());
+    dmtx->convert_to(ddense_mtx.get());
+    // The two Dense matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Sellp, ConvertToCsrIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+    // Empty Csr targets: one created on ref, one created on hip.
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(hip);
+    // mtx converts into the ref target, dmtx into the hip target.
+    mtx->convert_to(csr_mtx.get());
+    dmtx->convert_to(dcsr_mtx.get());
+    // The two Csr matrices must agree with tolerance 1e-14.
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Sellp, CountNonzerosIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+    // Output slots written by the two kernels below.
+    gko::size_type nnz;
+    gko::size_type dnnz;
+    // Call the reference and HIP count_nonzeros kernels directly.
+    gko::kernels::reference::sellp::count_nonzeros(ref, mtx.get(), &nnz);
+    gko::kernels::hip::sellp::count_nonzeros(hip, dmtx.get(), &dnnz);
+    // Both kernels must have written the same count.
+    ASSERT_EQ(nnz, dnnz);
+}
+
+
+}  // namespace
diff --git a/hip/test/preconditioner/CMakeLists.txt b/hip/test/preconditioner/CMakeLists.txt
new file mode 100644
index 00000000000..6f974174421
--- /dev/null
+++ b/hip/test/preconditioner/CMakeLists.txt
@@ -0,0 +1,2 @@
+ginkgo_create_hip_test_special_linkage(jacobi_kernels)  # jacobi_kernels.cpp
+ginkgo_create_hip_test(isai_kernels)  # isai_kernels.hip.cpp
diff --git a/hip/test/preconditioner/isai_kernels.hip.cpp b/hip/test/preconditioner/isai_kernels.hip.cpp
new file mode 100644
index 00000000000..88f67c0adb3
--- /dev/null
+++ b/hip/test/preconditioner/isai_kernels.hip.cpp
@@ -0,0 +1,326 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/preconditioner/isai_kernels.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+enum struct matrix_type { lower, upper };  // which triangle gets generated
+class Isai : public ::testing::Test {
+protected:
+    using value_type = double;
+    using index_type = gko::int32;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    Isai() : rand_engine(42) {}
+
+    // Requires at least one HIP device, then creates both executors.
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    // Clones a matrix living on `ref` and overwrites all stored values with -1
+    // (invalid data, to catch potential errors); any other executor: nullptr.
+    std::unique_ptr<Csr> clone_allocations(const Csr *csr_mtx)
+    {
+        if (csr_mtx->get_executor() != ref) {
+            return {nullptr};
+        }
+        const auto num_elems = csr_mtx->get_num_stored_elements();
+        auto sparsity = csr_mtx->clone();
+        auto begin_values = sparsity->get_values();
+        auto end_values = begin_values + num_elems;
+        std::fill(begin_values, end_values, -gko::one<value_type>());
+        return sparsity;
+    }
+
+    // Fills mtx with a random n x n triangular matrix (lower or upper per
+    // `type`), inverse with its -1-valued clone, and d_* with HIP copies.
+    void initialize_data(matrix_type type, gko::size_type n,
+                         gko::size_type row_limit)
+    {
+        const bool for_lower_tm = type == matrix_type::lower;
+        auto nz_dist = std::uniform_int_distribution<index_type>(1, row_limit);
+        auto val_dist = std::uniform_real_distribution<value_type>(-1., 1.);
+        mtx = gko::test::generate_random_triangular_matrix<Csr>(
+            n, n, true, for_lower_tm, nz_dist, val_dist, rand_engine, ref,
+            gko::dim<2>{n, n});
+        inverse = clone_allocations(mtx.get());
+
+        d_mtx = Csr::create(hip);
+        d_mtx->copy_from(mtx.get());
+        d_inverse = Csr::create(hip);
+        d_inverse->copy_from(inverse.get());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+    std::default_random_engine rand_engine;  // seeded with 42 in the ctor
+
+    std::unique_ptr<Csr> mtx;
+    std::unique_ptr<Csr> inverse;
+
+    std::unique_ptr<Csr> d_mtx;
+    std::unique_ptr<Csr> d_inverse;
+};
+
+
+TEST_F(Isai, HipIsaiGenerateLinverseShortIsEquivalentToRef)
+{
+    constexpr bool is_lower = true;
+    initialize_data(matrix_type::lower, 536, 31);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(hip, num_rows + 1);
+    gko::Array<index_type> da2(da1);
+    // Run the reference kernel and the HIP kernel on their own copies.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), is_lower);
+    gko::kernels::hip::isai::generate_tri_inverse(
+        hip, lend(d_mtx), lend(d_inverse), da1.get_data(), da2.get_data(),
+        is_lower);
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_EQ(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, HipIsaiGenerateUinverseShortIsEquivalentToRef)
+{
+    constexpr bool is_lower = false;
+    initialize_data(matrix_type::upper, 615, 31);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(hip, num_rows + 1);
+    gko::Array<index_type> da2(da1);
+    // Run the reference kernel and the HIP kernel on their own copies.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), is_lower);
+    gko::kernels::hip::isai::generate_tri_inverse(
+        hip, lend(d_mtx), lend(d_inverse), da1.get_data(), da2.get_data(),
+        is_lower);
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_EQ(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, HipIsaiGenerateLinverseLongIsEquivalentToRef)
+{
+    constexpr bool is_lower = true;
+    initialize_data(matrix_type::lower, 554, 64);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(hip, num_rows + 1);
+    gko::Array<index_type> da2(da1);
+    // Run the reference kernel and the HIP kernel on their own copies.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), is_lower);
+    gko::kernels::hip::isai::generate_tri_inverse(
+        hip, lend(d_mtx), lend(d_inverse), da1.get_data(), da2.get_data(),
+        is_lower);
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_GT(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, HipIsaiGenerateUinverseLongIsEquivalentToRef)
+{
+    constexpr bool is_lower = false;
+    initialize_data(matrix_type::upper, 695, 64);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::Array<index_type> da1(hip, num_rows + 1);
+    gko::Array<index_type> da2(da1);
+    // Run the reference kernel and the HIP kernel on their own copies.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), is_lower);
+    gko::kernels::hip::isai::generate_tri_inverse(
+        hip, lend(d_mtx), lend(d_inverse), da1.get_data(), da2.get_data(),
+        is_lower);
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_GT(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, HipIsaiGenerateExcessLinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 518, 40);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), true);
+    gko::Array<index_type> da1(hip, a1);
+    gko::Array<index_type> da2(hip, a2);
+    const auto e_dim = a1.get_const_data()[num_rows];
+    const auto e_nnz = a2.get_const_data()[num_rows];
+    const gko::dim<2> excess_size(e_dim, e_dim);
+    const gko::dim<2> rhs_size(e_dim, 1);
+    auto excess = Csr::create(ref, excess_size, e_nnz);
+    auto e_rhs = Dense::create(ref, rhs_size);
+    auto dexcess = Csr::create(hip, excess_size, e_nnz);
+    auto de_rhs = Dense::create(hip, rhs_size);
+    gko::kernels::reference::isai::generate_excess_system(
+        ref, lend(mtx), lend(inverse), a1.get_const_data(), a2.get_const_data(),
+        lend(excess), lend(e_rhs));
+    gko::kernels::hip::isai::generate_excess_system(
+        hip, lend(d_mtx), lend(d_inverse), da1.get_const_data(),
+        da2.get_const_data(), lend(dexcess), lend(de_rhs));
+    GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess);
+    GKO_ASSERT_MTX_NEAR(excess, dexcess, 0);
+    GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, HipIsaiGenerateExcessUinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 673, 51);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), false);
+    gko::Array<index_type> da1(hip, a1);
+    gko::Array<index_type> da2(hip, a2);
+    const auto e_dim = a1.get_const_data()[num_rows];
+    const auto e_nnz = a2.get_const_data()[num_rows];
+    const gko::dim<2> excess_size(e_dim, e_dim);
+    const gko::dim<2> rhs_size(e_dim, 1);
+    auto excess = Csr::create(ref, excess_size, e_nnz);
+    auto e_rhs = Dense::create(ref, rhs_size);
+    auto dexcess = Csr::create(hip, excess_size, e_nnz);
+    auto de_rhs = Dense::create(hip, rhs_size);
+    gko::kernels::reference::isai::generate_excess_system(
+        ref, lend(mtx), lend(inverse), a1.get_const_data(), a2.get_const_data(),
+        lend(excess), lend(e_rhs));
+    gko::kernels::hip::isai::generate_excess_system(
+        hip, lend(d_mtx), lend(d_inverse), da1.get_const_data(),
+        da2.get_const_data(), lend(dexcess), lend(de_rhs));
+    GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess);
+    GKO_ASSERT_MTX_NEAR(excess, dexcess, 0);
+    GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, HipIsaiScatterExcessSolutionLIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 572, 52);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), true);
+    gko::Array<index_type> da1(hip, a1);
+    const auto e_dim = a1.get_const_data()[num_rows];
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    std::fill_n(e_rhs->get_values(), e_dim, 123456);
+    auto de_rhs = Dense::create(hip);
+    de_rhs->copy_from(e_rhs.get());
+    // replace the -1 placeholder values on the device with the host inverse
+    d_inverse->copy_from(inverse.get());
+    // scatter the constant-valued right-hand side on both executors
+    gko::kernels::reference::isai::scatter_excess_solution(
+        ref, a1.get_const_data(), lend(e_rhs), lend(inverse));
+    gko::kernels::hip::isai::scatter_excess_solution(
+        hip, da1.get_const_data(), lend(de_rhs), lend(d_inverse));
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, HipIsaiScatterExcessSolutionUIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 702, 45);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    gko::Array<index_type> a2(a1);
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, lend(mtx), lend(inverse), a1.get_data(), a2.get_data(), false);
+    gko::Array<index_type> da1(hip, a1);
+    const auto e_dim = a1.get_const_data()[num_rows];
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    std::fill_n(e_rhs->get_values(), e_dim, 123456);
+    auto de_rhs = Dense::create(hip);
+    de_rhs->copy_from(e_rhs.get());
+    // replace the -1 placeholder values on the device with the host inverse
+    d_inverse->copy_from(inverse.get());
+    // scatter the constant-valued right-hand side on both executors
+    gko::kernels::reference::isai::scatter_excess_solution(
+        ref, a1.get_const_data(), lend(e_rhs), lend(inverse));
+    gko::kernels::hip::isai::scatter_excess_solution(
+        hip, da1.get_const_data(), lend(de_rhs), lend(d_inverse));
+
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+}  // namespace
diff --git a/hip/test/preconditioner/jacobi_kernels.cpp b/hip/test/preconditioner/jacobi_kernels.cpp
new file mode 100644
index 00000000000..f1863a6b42f
--- /dev/null
+++ b/hip/test/preconditioner/jacobi_kernels.cpp
@@ -0,0 +1,847 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/preconditioner/jacobi.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Jacobi : public ::testing::Test {
+protected:
+    using Bj = gko::preconditioner::Jacobi<>;
+    using Mtx = gko::matrix::Csr<>;
+    using Vec = gko::matrix::Dense<>;
+    using mtx_data = gko::matrix_data<>;
+    // Requires at least one HIP device, then creates both executors.
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+    // Synchronizes the HIP executor (if it was created); must not throw.
+    void TearDown()
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+    // Sets up mtx, the ref/HIP Jacobi factories and the vectors b, x, d_b, d_x.
+    void initialize_data(
+        std::initializer_list<gko::int32> block_pointers,
+        std::initializer_list<gko::precision_reduction> block_precisions,
+        std::initializer_list<double> condition_numbers,
+        gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1,
+        double accuracy = 0.1)
+    {
+        std::ranlux48 engine(42);
+        const auto dim = *(end(block_pointers) - 1);  // last block pointer
+        if (condition_numbers.size() == 0) {  // random matrix on ref
+            mtx = gko::test::generate_random_matrix<Mtx>(
+                dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz),
+                std::normal_distribution<>(0.0, 1.0), engine, ref);
+        } else {  // block-diagonal matrix built from mtx_data::cond blocks
+            std::vector<mtx_data> blocks;
+            for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) {
+                const auto size =
+                    begin(block_pointers)[i + 1] - begin(block_pointers)[i];
+                const auto cond = begin(condition_numbers)[i];
+                blocks.push_back(mtx_data::cond(
+                    size, cond, std::normal_distribution<>(-1, 1), engine));
+            }
+            mtx = Mtx::create(ref);
+            mtx->read(mtx_data::diag(begin(blocks), end(blocks)));
+        }
+        gko::Array<gko::int32> block_ptrs(ref, block_pointers);
+        gko::Array<gko::precision_reduction> block_prec(ref, block_precisions);
+        if (block_prec.get_num_elems() == 0) {  // no storage optimization
+            bj_factory =  // ref factory: block stride set to the HIP warp size
+                Bj::build()
+                    .with_max_block_size(max_block_size)
+                    .with_block_pointers(block_ptrs)
+                    .with_max_block_stride(gko::uint32(hip->get_warp_size()))
+                    .on(ref);
+            d_bj_factory = Bj::build()  // HIP factory: stride left unset
+                               .with_max_block_size(max_block_size)
+                               .with_block_pointers(block_ptrs)
+                               .on(hip);
+        } else {  // explicit per-block precisions plus accuracy
+            bj_factory =  // ref factory: block stride set to the HIP warp size
+                Bj::build()
+                    .with_max_block_size(max_block_size)
+                    .with_block_pointers(block_ptrs)
+                    .with_max_block_stride(gko::uint32(hip->get_warp_size()))
+                    .with_storage_optimization(block_prec)
+                    .with_accuracy(accuracy)
+                    .on(ref);
+            d_bj_factory = Bj::build()  // HIP factory: stride left unset
+                               .with_max_block_size(max_block_size)
+                               .with_block_pointers(block_ptrs)
+                               .with_storage_optimization(block_prec)
+                               .with_accuracy(accuracy)
+                               .on(hip);
+        }
+        b = gko::test::generate_random_matrix<Vec>(
+            dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs),
+            std::normal_distribution<>(0.0, 1.0), engine, ref);
+        d_b = Vec::create(hip);
+        d_b->copy_from(b.get());
+        x = gko::test::generate_random_matrix<Vec>(
+            dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs),
+            std::normal_distribution<>(0.0, 1.0), engine, ref);
+        d_x = Vec::create(hip);
+        d_x->copy_from(x.get());
+    }
+
+    const gko::precision_reduction dp{};      // "FullPrecision" tests
+    const gko::precision_reduction sp{0, 1};  // "ReducedPrecision" tests
+    const gko::precision_reduction hp{0, 2};  // "QuarteredPrecision" tests
+    const gko::precision_reduction tp{1, 0};  // "CustomReducedPrecision"
+    const gko::precision_reduction qp{2, 0};  // "CustomQuarteredPrecision"
+    const gko::precision_reduction up{1, 1};  // used in the adaptive mix
+    const gko::precision_reduction ap{gko::precision_reduction::autodetect()};
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> hip;
+    std::shared_ptr<Mtx> mtx;
+    std::unique_ptr<Vec> x;
+    std::unique_ptr<Vec> b;
+    std::unique_ptr<Vec> d_x;
+    std::unique_ptr<Vec> d_b;
+
+    std::unique_ptr<Bj::Factory> bj_factory;
+    std::unique_ptr<Bj::Factory> d_bj_factory;
+};
+
+
+TEST_F(Jacobi, HipFindNaturalBlocksEquivalentToRef)
+{
+    /* both executors must report the same number of blocks for:
+        1   1
+        1   1
+        1       1
+        1       1
+     */
+    auto mat = share(Mtx::create(ref));
+    mat->read({{4, 4},
+               {{0, 0, 1.0},
+                {0, 1, 1.0},
+                {1, 0, 1.0},
+                {1, 1, 1.0},
+                {2, 0, 1.0},
+                {2, 2, 1.0},
+                {3, 0, 1.0},
+                {3, 2, 1.0}}});
+    auto ref_factory = Bj::build().with_max_block_size(3u).on(ref);
+    auto hip_factory = Bj::build().with_max_block_size(3u).on(hip);
+    auto bj = ref_factory->generate(mat);
+    auto d_bj = hip_factory->generate(mat);
+    ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks());
+    // TODO: actually check if the results are the same
+}
+
+
+TEST_F(Jacobi, HipExecutesSupervariableAgglomerationEquivalentToRef)
+{
+    /* both executors must report the same number of blocks for:
+        1   1
+        1   1
+                1   1
+                1   1
+                        1
+     */
+    auto mat = share(Mtx::create(ref));
+    mat->read({{5, 5},
+               {{0, 0, 1.0},
+                {0, 1, 1.0},
+                {1, 0, 1.0},
+                {1, 1, 1.0},
+                {2, 2, 1.0},
+                {2, 3, 1.0},
+                {3, 2, 1.0},
+                {3, 3, 1.0},
+                {4, 4, 1.0}}});
+    auto ref_factory = Bj::build().with_max_block_size(3u).on(ref);
+    auto hip_factory = Bj::build().with_max_block_size(3u).on(hip);
+    auto bj = ref_factory->generate(mat);
+    auto d_bj = hip_factory->generate(mat);
+    ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks());
+    // TODO: actually check if the results are the same
+}
+
+
+TEST_F(Jacobi, HipFindNaturalBlocksInLargeMatrixEquivalentToRef)
+{
+    /* diagonal block passed to data::diag({550, 550}, ...):
+        1   1
+        1   1
+        1       1
+        1       1
+        1       1
+        1       1
+     */
+    using data = gko::matrix_data<double, int>;
+    auto mat = share(Mtx::create(ref));
+    mat->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0, 0.0},
+                                      {1.0, 1.0, 0.0, 0.0, 0.0, 0.0},
+                                      {1.0, 0.0, 1.0, 0.0, 0.0, 0.0},
+                                      {1.0, 0.0, 1.0, 0.0, 0.0, 0.0},
+                                      {1.0, 0.0, 1.0, 0.0, 0.0, 0.0},
+                                      {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}}));
+    auto ref_factory = Bj::build().with_max_block_size(3u).on(ref);
+    auto hip_factory = Bj::build().with_max_block_size(3u).on(hip);
+    auto bj = ref_factory->generate(mat);
+    auto d_bj = hip_factory->generate(mat);
+    ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks());
+    // TODO: actually check if the results are the same
+}
+
+
+TEST_F(Jacobi,
+       HipExecutesSupervariableAgglomerationInLargeMatrixEquivalentToRef)
+{
+    /* diagonal block passed to data::diag({550, 550}, ...):
+        1   1
+        1   1
+                1   1
+                1   1
+                        1
+     */
+    using data = gko::matrix_data<double, int>;
+    auto mat = share(Mtx::create(ref));
+    mat->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0},
+                                      {1.0, 1.0, 0.0, 0.0, 0.0},
+                                      {0.0, 0.0, 1.0, 1.0, 0.0},
+                                      {0.0, 0.0, 1.0, 1.0, 0.0},
+                                      {0.0, 0.0, 0.0, 0.0, 1.0}}));
+    auto ref_factory = Bj::build().with_max_block_size(3u).on(ref);
+    auto hip_factory = Bj::build().with_max_block_size(3u).on(hip);
+    auto bj = ref_factory->generate(mat);
+    auto d_bj = hip_factory->generate(mat);
+    ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks());
+    // TODO: actually check if the results are the same
+}
+
+
+TEST_F(Jacobi,
+       HipExecutesSupervarAgglomerationEquivalentToRefFor150NonzerowsPerRow)
+{
+    /* example matrix duplicated 50 times:
+        1   1       1
+        1   1       1
+        1       1   1
+        1       1   1
+                1        1
+     */
+    using data = gko::matrix_data<double, int>;
+    auto mat = share(Mtx::create(ref));
+    mat->read({{50, 50},
+               {{1.0, 1.0, 0.0, 1.0, 0.0},
+                {1.0, 1.0, 0.0, 1.0, 0.0},
+                {1.0, 0.0, 1.0, 1.0, 0.0},
+                {1.0, 0.0, 1.0, 1.0, 0.0},
+                {0.0, 0.0, 1.0, 0.0, 1.0}}});
+
+    auto ref_factory = Bj::build().with_max_block_size(3u).on(ref);
+    auto hip_factory = Bj::build().with_max_block_size(3u).on(hip);
+    auto bj = ref_factory->generate(mat);
+    auto d_bj = hip_factory->generate(mat);
+    ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks());
+    // TODO: actually check if the results are the same
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithBlockSize32)
+{
+    initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110);
+    // one preconditioner per executor, generated from the same matrix
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // both results are already Bj-typed, so they can be compared directly
+    GKO_ASSERT_MTX_NEAR(hip_bj.get(), ref_bj.get(), 1e-13);
+}
+
+
+#if GINKGO_HIP_PLATFORM_HCC
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithBlockSize64)
+{
+    initialize_data({0, 64, 128, 192, 256}, {}, {}, 64, 100, 110);
+    // one preconditioner per executor, generated from the same matrix
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // both results are already Bj-typed, so they can be compared directly
+    GKO_ASSERT_MTX_NEAR(hip_bj.get(), ref_bj.get(), 1e-13);
+}
+#endif
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithDifferentBlockSize)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32,
+                    97, 99);
+    // one preconditioner per executor, generated from the same matrix
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // both results are already Bj-typed, so they can be compared directly
+    GKO_ASSERT_MTX_NEAR(hip_bj.get(), ref_bj.get(), 1e-13);
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    // one preconditioner per executor, generated from the same matrix
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // both results are already Bj-typed, so they can be compared directly
+    GKO_ASSERT_MTX_NEAR(hip_bj.get(), ref_bj.get(), 1e-13);
+}
+
+
+TEST_F(Jacobi, HipTransposedPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // overwrite the HIP preconditioner with the reference one before transposing
+    hip_bj->copy_from(ref_bj.get());
+    auto hip_transposed = gko::as<Bj>(hip_bj->transpose());
+    auto ref_transposed = gko::as<Bj>(ref_bj->transpose());
+    GKO_ASSERT_MTX_NEAR(hip_transposed, ref_transposed, 1e-14);
+}
+
+
+TEST_F(Jacobi, HipConjTransposedPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // overwrite the HIP preconditioner with the reference one before transposing
+    hip_bj->copy_from(ref_bj.get());
+    auto hip_transposed = gko::as<Bj>(hip_bj->conj_transpose());
+    auto ref_transposed = gko::as<Bj>(ref_bj->conj_transpose());
+    GKO_ASSERT_MTX_NEAR(hip_transposed, ref_transposed, 1e-14);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithBlockSize32)
+{
+    initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // apply each preconditioner to its copy of b, writing into x / d_x
+    ref_bj->apply(lend(b), lend(x));
+    hip_bj->apply(lend(d_b), lend(d_x));
+    // the HIP output has to match the reference output
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+#if GINKGO_HIP_PLATFORM_HCC
+TEST_F(Jacobi, HipApplyEquivalentToRefWithBlockSize64)
+{
+    initialize_data({0, 64, 128, 192, 256}, {}, {}, 64, 100, 111);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // apply each preconditioner to its copy of b, writing into x / d_x
+    ref_bj->apply(lend(b), lend(x));
+    hip_bj->apply(lend(d_b), lend(d_x));
+    // the HIP output has to match the reference output
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+#endif
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithDifferentBlockSize)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32,
+                    97, 99);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // apply each preconditioner to its copy of b, writing into x / d_x
+    ref_bj->apply(lend(b), lend(x));
+    hip_bj->apply(lend(d_b), lend(d_x));
+    // the HIP output has to match the reference output
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRef)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // apply each preconditioner to its copy of b, writing into x / d_x
+    ref_bj->apply(lend(b), lend(x));
+    hip_bj->apply(lend(d_b), lend(d_x));
+    // the HIP output has to match the reference output
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRef)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    // scalars alpha = 2 and beta = -1, once per executor
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, hip);
+    auto d_beta = gko::initialize<Vec>({-1.0}, hip);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // four-argument apply (alpha, b, beta, x) on both executors
+    ref_bj->apply(lend(alpha), lend(b), lend(beta), lend(x));
+    hip_bj->apply(lend(d_alpha), lend(d_b), lend(d_beta), lend(d_x));
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRef)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99, 5);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // b and x were created with num_rhs = 5 columns
+    ref_bj->apply(lend(b), lend(x));
+    hip_bj->apply(lend(d_b), lend(d_x));
+    // the HIP output has to match the reference output
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, HipLinearCombinationApplyToMultipleVectorsEquivalentToRef)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99, 5);
+    // scalars alpha = 2 and beta = -1, once per executor
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, hip);
+    auto d_beta = gko::initialize<Vec>({-1.0}, hip);
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // four-argument apply on b and x, which hold num_rhs = 5 columns
+    ref_bj->apply(lend(alpha), lend(b), lend(beta), lend(x));
+    hip_bj->apply(lend(d_alpha), lend(d_b), lend(d_beta), lend(d_x));
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99);
+    // the HIP result is cloned to ref so it can be indexed from host code
+    auto bj = bj_factory->generate(mtx);
+    auto d_bj = clone(ref, d_bj_factory->generate(mtx));
+    const gko::size_type num_blocks = bj->get_num_blocks();
+    for (gko::size_type i = 0; i < num_blocks; ++i) {
+        EXPECT_NEAR(bj->get_conditioning()[i], d_bj->get_conditioning()[i],
+                    1e-9);
+    }
+}
+
+
+TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef)
+{
+    initialize_data(
+        {0, 2, 14, 27, 40, 51, 61, 70, 80, 92, 100},
+        {ap, ap, ap, ap, ap, ap, ap, ap, ap, ap},
+        {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 13, 97,
+        99, 1, 0.2);
+    // the HIP result is cloned to ref so it can be indexed from host code
+    auto bj = bj_factory->generate(mtx);
+    auto d_bj = gko::clone(ref, d_bj_factory->generate(mtx));
+    const gko::size_type num_blocks = bj->get_num_blocks();
+    auto bj_prec =
+        bj->get_parameters().storage_optimization.block_wise.get_const_data();
+    auto d_bj_prec =
+        d_bj->get_parameters().storage_optimization.block_wise.get_const_data();
+    for (gko::size_type i = 0; i < num_blocks; ++i) {
+        EXPECT_EQ(bj_prec[i], d_bj_prec[i]);
+    }
+}
+
+
+TEST_F(Jacobi, AvoidsPrecisionsThatOverflow)
+{
+    auto hip_mtx = Mtx::create(hip);
+    // clang-format off
+    hip_mtx->read(mtx_data::diag({
+        // perfectly conditioned block, small value difference,
+        // can use fp16 (5, 10)
+        {{2.0, 1.0},
+         {1.0, 2.0}},
+        // perfectly conditioned block (scaled orthogonal),
+        // with large value difference, need fp16 (7, 8)
+        {{1e-8, -1e-16},
+         {1e-16,  1e-8}}
+    }));
+    // clang-format on
+    gko::Array<gko::int32> block_ptrs(hip, {0, 2, 4});
+    auto bj =
+        Bj::build()
+            .with_max_block_size(13u)
+            .with_block_pointers(block_ptrs)
+            .with_storage_optimization(gko::precision_reduction::autodetect())
+            .with_accuracy(0.1)
+            .on(hip)
+            ->generate(give(hip_mtx));
+
+    // both blocks are in the same group, both need (7, 8)
+    auto h_bj = clone(ref, bj);
+    auto prec =
+        h_bj->get_parameters().storage_optimization.block_wise.get_const_data();
+    EXPECT_EQ(prec[0], gko::precision_reduction(1, 1));
+    ASSERT_EQ(prec[1], gko::precision_reduction(1, 1));
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithFullPrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99);
+    // one preconditioner per executor, generated from the same matrix
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // every block is requested with dp, so the tolerance is 1e-13
+    GKO_ASSERT_MTX_NEAR(hip_bj.get(), ref_bj.get(), 1e-13);
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithReducedPrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97,
+                    99);
+    // NOTE(review): 11 precisions are passed for 10 blocks - confirm intent
+    auto ref_bj = bj_factory->generate(mtx);
+    auto hip_bj = d_bj_factory->generate(mtx);
+    // every block is requested with sp, so the tolerance is 1e-7
+    GKO_ASSERT_MTX_NEAR(hip_bj.get(), ref_bj.get(), 1e-7);
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithCustomReducedPrecision)
+{
+    // all entries of the precision list are `tp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    GKO_ASSERT_MTX_NEAR(lend(hip_precond), lend(ref_precond), 1e-6);
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithQuarteredPrecision)
+{
+    // all entries of the precision list are `hp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    GKO_ASSERT_MTX_NEAR(lend(hip_precond), lend(ref_precond), 1e-3);
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithCustomQuarteredPrecision)
+{
+    // all entries of the precision list are `qp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    GKO_ASSERT_MTX_NEAR(lend(hip_precond), lend(ref_precond), 1e-1);
+}
+
+
+TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    // the precision list mixes sp, dp, tp, qp, hp and up entries
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    GKO_ASSERT_MTX_NEAR(lend(hip_precond), lend(ref_precond), 1e-1);
+}
+
+
+TEST_F(Jacobi, HipTransposedPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+    // start both transposes from the same data: copy the device result over
+    ref_precond->copy_from(hip_precond.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(hip_precond->transpose()),
+                        gko::as<Bj>(ref_precond->transpose()), 1e-14);
+}
+
+
+TEST_F(Jacobi,
+       HipConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+    // start both operations from the same data: copy the device result over
+    ref_precond->copy_from(hip_precond.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(hip_precond->conj_transpose()),
+                        gko::as<Bj>(ref_precond->conj_transpose()), 1e-14);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithFullPrecision)
+{
+    // all entries of the precision list are `dp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithReducedPrecision)
+{
+    // all entries of the precision list are `sp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithCustomReducedPrecision)
+{
+    // all entries of the precision list are `tp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-5);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithQuarteredPrecision)
+{
+    // all entries of the precision list are `hp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithCustomReducedAndReducedPrecision)
+{
+    // all entries of the precision list are `up`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {up, up, up, up, up, up, up, up, up, up, up}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithCustomQuarteredPrecision)
+{
+    // all entries of the precision list are `qp`
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6);
+}
+
+
+TEST_F(Jacobi, HipApplyEquivalentToRefWithAdaptivePrecision)
+{
+    // the precision list mixes sp, dp, tp, qp, hp and up entries
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1);
+}
+
+
+TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97,
+                    99);
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, hip);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_beta = gko::initialize<Vec>({-1.0}, hip);
+    auto bj = bj_factory->generate(mtx);
+    auto d_bj = d_bj_factory->generate(mtx);
+
+    // four-argument apply, so that the alpha / beta scalars are really used
+    bj->apply(alpha.get(), b.get(), beta.get(), x.get());
+    d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6);
+}
+
+
+TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRefWithFullPrecision)
+{
+    // all entries of the precision list are `dp`; last argument is 5
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99, 5);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRefWithReducedPrecision)
+{
+    // all entries of the precision list are `sp`; last argument is 5
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, 99, 5);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6);
+}
+
+
+TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision)
+{
+    // mixed sp, dp, tp, qp, hp and up entries; last argument is 5
+    initialize_data(
+        {0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+        {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99, 5);
+    auto ref_precond = bj_factory->generate(mtx);
+    auto hip_precond = d_bj_factory->generate(mtx);
+
+    ref_precond->apply(b.get(), x.get());
+    hip_precond->apply(d_b.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1);
+}
+
+
+TEST_F(
+    Jacobi,
+    HipLinearCombinationApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97,
+                    99, 5);
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, hip);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_beta = gko::initialize<Vec>({-1.0}, hip);
+    auto bj = bj_factory->generate(mtx);
+    auto d_bj = d_bj_factory->generate(mtx);
+
+    // four-argument apply, so that the alpha / beta scalars are really used
+    bj->apply(alpha.get(), b.get(), beta.get(), x.get());
+    d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get());
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/CMakeLists.txt b/hip/test/solver/CMakeLists.txt
new file mode 100644
index 00000000000..3ec7956cf65
--- /dev/null
+++ b/hip/test/solver/CMakeLists.txt
@@ -0,0 +1,9 @@
+# HIP solver kernel tests.
+# Every name below is passed to ginkgo_create_hip_test_special_linkage(),
+# one call per name, in the order listed.
+foreach(solver_test
+        bicg_kernels bicgstab_kernels cg_kernels
+        cgs_kernels fcg_kernels gmres_kernels
+        ir_kernels lower_trs_kernels upper_trs_kernels)
+    ginkgo_create_hip_test_special_linkage(${solver_test})
+endforeach()
diff --git a/hip/test/solver/bicg_kernels.cpp b/hip/test/solver/bicg_kernels.cpp
new file mode 100644
index 00000000000..67fda77f84b
--- /dev/null
+++ b/hip/test/solver/bicg_kernels.cpp
@@ -0,0 +1,357 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicg.hpp>
+
+
+#include <fstream>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/bicg_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+class Bicg : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    Bicg() : rand_engine(30) {}
+
+    // Creates both executors and reads the matrix file named by
+    // gko::matrices::location_ani1_mtx into csr_ref and csr_hip.
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+
+        const std::string mtx_path(gko::matrices::location_ani1_mtx);
+        std::ifstream mtx_stream(mtx_path, std::ios::in);
+        if (!mtx_stream) {
+            FAIL() << "Could not find the file \"" << mtx_path
+                   << "\", which is required for this test.\n";
+        }
+        csr_ref = gko::read<Csr>(mtx_stream, ref);
+        auto d_csr = Csr::create(hip);
+        d_csr->copy_from(gko::lend(csr_ref));
+        csr_hip = gko::give(d_csr);
+    }
+
+    // Synchronizes the HIP executor, provided SetUp() managed to create it.
+    void TearDown()
+    {
+        if (hip) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    // Builds a num_rows x num_cols matrix with generate_random_matrix on ref.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        std::uniform_int_distribution<> nonzero_dist(num_cols, num_cols);
+        std::normal_distribution<> value_dist(-1.0, 1.0);
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols, nonzero_dist, value_dist, rand_engine, ref);
+    }
+
+    // Generates the host vectors and copies each of them to the HIP device.
+    void initialize_data()
+    {
+        const int n_rows = 597;
+        const int n_cols = 43;
+        b = gen_mtx(n_rows, n_cols);
+        r = gen_mtx(n_rows, n_cols);
+        z = gen_mtx(n_rows, n_cols);
+        p = gen_mtx(n_rows, n_cols);
+        q = gen_mtx(n_rows, n_cols);
+        r2 = gen_mtx(n_rows, n_cols);
+        z2 = gen_mtx(n_rows, n_cols);
+        p2 = gen_mtx(n_rows, n_cols);
+        q2 = gen_mtx(n_rows, n_cols);
+        x = gen_mtx(n_rows, n_cols);
+        beta = gen_mtx(1, n_cols);
+        prev_rho = gen_mtx(1, n_cols);
+        rho = gen_mtx(1, n_cols);
+        stop_status.reset(new gko::Array<gko::stopping_status>(ref, n_cols));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+
+        // copies one host matrix into a newly created matrix on `hip`
+        auto to_hip = [this](const std::unique_ptr<Mtx> &host_mtx) {
+            auto device_mtx = Mtx::create(hip);
+            device_mtx->copy_from(host_mtx.get());
+            return device_mtx;
+        };
+        d_b = to_hip(b);
+        d_r = to_hip(r);
+        d_z = to_hip(z);
+        d_p = to_hip(p);
+        d_q = to_hip(q);
+        d_r2 = to_hip(r2);
+        d_z2 = to_hip(z2);
+        d_p2 = to_hip(p2);
+        d_q2 = to_hip(q2);
+        d_x = to_hip(x);
+        d_beta = to_hip(beta);
+        d_prev_rho = to_hip(prev_rho);
+        d_rho = to_hip(rho);
+        d_stop_status.reset(new gko::Array<gko::stopping_status>(hip, n_cols));
+        *d_stop_status = *stop_status;
+    }
+
+    // Mirrors the strictly lower triangle of mtx into its upper triangle.
+    void make_symetric(Mtx *mtx)
+    {
+        for (int row = 0; row < mtx->get_size()[0]; ++row) {
+            for (int col = row + 1; col < mtx->get_size()[1]; ++col) {
+                mtx->at(row, col) = mtx->at(col, row);
+            }
+        }
+    }
+
+    // Sets each diagonal entry to the sum of absolute values in its row.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (int row = 0; row < mtx->get_size()[0]; ++row) {
+            auto abs_row_sum = gko::zero<Mtx::value_type>();
+            for (int col = 0; col < mtx->get_size()[1]; ++col) {
+                abs_row_sum += abs(mtx->at(row, col));
+            }
+            mtx->at(row, row) = abs_row_sum;
+        }
+    }
+
+    // Applies make_symetric and then make_diag_dominant to mtx.
+    void make_spd(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+
+    // data on the reference executor, filled by initialize_data()
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> z;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> q;
+    std::unique_ptr<Mtx> r2;
+    std::unique_ptr<Mtx> z2;
+    std::unique_ptr<Mtx> p2;
+    std::unique_ptr<Mtx> q2;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> prev_rho;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+    // HIP device counterparts, filled by initialize_data()
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_z;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_q;
+    std::unique_ptr<Mtx> d_r2;
+    std::unique_ptr<Mtx> d_z2;
+    std::unique_ptr<Mtx> d_p2;
+    std::unique_ptr<Mtx> d_q2;
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_prev_rho;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+    std::shared_ptr<const Csr> csr_ref;
+    std::shared_ptr<const Csr> csr_hip;
+};
+
+
+TEST_F(Bicg, HipBicgInitializeIsEquivalentToRef)
+{
+    initialize_data();
+    gko::kernels::reference::bicg::initialize(
+        ref, b.get(), r.get(), z.get(), p.get(), q.get(),
+        prev_rho.get(), rho.get(), r2.get(), z2.get(), p2.get(), q2.get(),
+        stop_status.get());
+    gko::kernels::hip::bicg::initialize(
+        hip, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
+        d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(),
+        d_q2.get(), d_stop_status.get());
+    // compare every vector handed to the kernel except b, plus the statuses
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Bicg, HipBicgStep1IsEquivalentToRef)
+{
+    initialize_data();
+    // same step_1 kernel on the host data and on its device copies
+    gko::kernels::reference::bicg::step_1(
+        ref, p.get(), z.get(), p2.get(), z2.get(), rho.get(), prev_rho.get(),
+        stop_status.get());
+    gko::kernels::hip::bicg::step_1(
+        hip, d_p.get(), d_z.get(), d_p2.get(), d_z2.get(), d_rho.get(),
+        d_prev_rho.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14);
+}
+
+
+TEST_F(Bicg, HipBicgStep2IsEquivalentToRef)
+{
+    initialize_data();
+    // same step_2 kernel on the host data and on its device copies
+    gko::kernels::reference::bicg::step_2(
+        ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(),
+        beta.get(), rho.get(), stop_status.get());
+    gko::kernels::hip::bicg::step_2(
+        hip, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(),
+        d_q2.get(), d_beta.get(), d_rho.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14);
+}
+
+
+TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
+{
+    // 50 x 50 matrix run through make_spd, plus two 50 x 3 matrices
+    auto host_mtx = gen_mtx(50, 50);
+    make_spd(host_mtx.get());
+    auto host_x = gen_mtx(50, 3);
+    auto host_b = gen_mtx(50, 3);
+    auto dev_mtx = Mtx::create(hip);
+    dev_mtx->copy_from(host_mtx.get());
+    auto dev_x = Mtx::create(hip);
+    dev_x->copy_from(host_x.get());
+    auto dev_b = Mtx::create(hip);
+    dev_b->copy_from(host_b.get());
+    auto ref_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto hip_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(hip),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(hip))
+            .on(hip);
+    auto ref_solver = ref_factory->generate(std::move(host_mtx));
+    auto hip_solver = hip_factory->generate(std::move(dev_mtx));
+
+    ref_solver->apply(host_b.get(), host_x.get());
+    hip_solver->apply(dev_b.get(), dev_x.get());
+    GKO_ASSERT_MTX_NEAR(dev_x, host_x, 1e-14);
+}
+
+
+TEST_F(Bicg, ApplyWithSuiteSparseMatrixIsEquivalentToRef)
+{
+    // 36 x 1 vectors for the matrix loaded into csr_ref / csr_hip by SetUp()
+    auto host_x = gen_mtx(36, 1);
+    auto host_b = gen_mtx(36, 1);
+    auto dev_x = Mtx::create(hip);
+    dev_x->copy_from(host_x.get());
+    auto dev_b = Mtx::create(hip);
+    dev_b->copy_from(host_b.get());
+    auto ref_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto hip_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(hip),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(hip))
+            .on(hip);
+    auto ref_solver = ref_factory->generate(std::move(csr_ref));
+    auto hip_solver = hip_factory->generate(std::move(csr_hip));
+
+    ref_solver->apply(host_b.get(), host_x.get());
+    hip_solver->apply(dev_b.get(), dev_x.get());
+    GKO_ASSERT_MTX_NEAR(dev_x, host_x, 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/bicgstab_kernels.cpp b/hip/test/solver/bicgstab_kernels.cpp
new file mode 100644
index 00000000000..999b40bebaa
--- /dev/null
+++ b/hip/test/solver/bicgstab_kernels.cpp
@@ -0,0 +1,357 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicgstab.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/bicgstab_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture for comparing the HIP Bicgstab kernels and solver with the
+// reference implementation. All input data is generated with rand_engine,
+// which is seeded with 30 in the constructor.
+class Bicgstab : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    using Solver = gko::solver::Bicgstab<>;
+
+    Bicgstab() : rand_engine(30) {}
+
+    // Creates both executors, a 123 x 123 matrix processed by
+    // make_diag_dominant together with its device copy, and one solver
+    // factory per executor using identical stopping criteria.
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+
+        mtx = gen_mtx(123, 123);
+        make_diag_dominant(mtx.get());
+        d_mtx = Mtx::create(hip);
+        d_mtx->copy_from(mtx.get());
+
+        // same builder chain for both executors, only `exec` differs
+        auto make_factory = [](std::shared_ptr<const gko::Executor> exec) {
+            return Solver::build()
+                .with_criteria(
+                    gko::stop::Iteration::build().with_max_iters(246u).on(exec),
+                    gko::stop::ResidualNormReduction<>::build()
+                        .with_reduction_factor(1e-15)
+                        .on(exec))
+                .on(exec);
+        };
+        hip_bicgstab_factory = make_factory(hip);
+        ref_bicgstab_factory = make_factory(ref);
+    }
+
+    // Synchronizes the HIP executor, provided SetUp() managed to create it.
+    void TearDown()
+    {
+        if (hip) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    // Builds a num_rows x num_cols matrix with generate_random_matrix on ref.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        std::uniform_int_distribution<> nonzero_dist(num_cols, num_cols);
+        std::normal_distribution<> value_dist(0.0, 1.0);
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols, nonzero_dist, value_dist, rand_engine, ref);
+    }
+
+    // Generates the host vectors, resets the host stopping statuses and
+    // copies everything to the HIP device.
+    void initialize_data()
+    {
+        const int n_rows = 597;
+        const int n_cols = 17;
+        x = gen_mtx(n_rows, n_cols);
+        b = gen_mtx(n_rows, n_cols);
+        r = gen_mtx(n_rows, n_cols);
+        z = gen_mtx(n_rows, n_cols);
+        p = gen_mtx(n_rows, n_cols);
+        rr = gen_mtx(n_rows, n_cols);
+        s = gen_mtx(n_rows, n_cols);
+        t = gen_mtx(n_rows, n_cols);
+        y = gen_mtx(n_rows, n_cols);
+        v = gen_mtx(n_rows, n_cols);
+        prev_rho = gen_mtx(1, n_cols);
+        rho = gen_mtx(1, n_cols);
+        alpha = gen_mtx(1, n_cols);
+        beta = gen_mtx(1, n_cols);
+        gamma = gen_mtx(1, n_cols);
+        omega = gen_mtx(1, n_cols);
+
+        stop_status.reset(new gko::Array<gko::stopping_status>(ref, n_cols));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+
+        // copies one host matrix into a newly created matrix on `hip`
+        auto to_hip = [this](const std::unique_ptr<Mtx> &host_mtx) {
+            auto device_mtx = Mtx::create(hip);
+            device_mtx->copy_from(host_mtx.get());
+            return device_mtx;
+        };
+        d_x = to_hip(x);
+        d_b = to_hip(b);
+        d_r = to_hip(r);
+        d_z = to_hip(z);
+        d_p = to_hip(p);
+        d_rr = to_hip(rr);
+        d_s = to_hip(s);
+        d_t = to_hip(t);
+        d_y = to_hip(y);
+        d_v = to_hip(v);
+        d_prev_rho = to_hip(prev_rho);
+        d_rho = to_hip(rho);
+        d_alpha = to_hip(alpha);
+        d_beta = to_hip(beta);
+        d_gamma = to_hip(gamma);
+        d_omega = to_hip(omega);
+
+        // plain assignment is used for the statuses because copy_from is not
+        // a public member function of Array
+        d_stop_status.reset(new gko::Array<gko::stopping_status>(hip));
+        *d_stop_status = *stop_status;
+    }
+
+    // Sets each diagonal entry to the sum of absolute values in its row.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (int row = 0; row < mtx->get_size()[0]; ++row) {
+            auto abs_row_sum = gko::zero<Mtx::value_type>();
+            for (int col = 0; col < mtx->get_size()[1]; ++col) {
+                abs_row_sum += abs(mtx->at(row, col));
+            }
+            mtx->at(row, row) = abs_row_sum;
+        }
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+
+    // system matrix (host / device) and solver factories, set in SetUp()
+    std::shared_ptr<Mtx> mtx;
+    std::shared_ptr<Mtx> d_mtx;
+    std::unique_ptr<Solver::Factory> hip_bicgstab_factory;
+    std::unique_ptr<Solver::Factory> ref_bicgstab_factory;
+
+    // data on the reference executor, filled by initialize_data()
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> z;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> rr;
+    std::unique_ptr<Mtx> s;
+    std::unique_ptr<Mtx> t;
+    std::unique_ptr<Mtx> y;
+    std::unique_ptr<Mtx> v;
+    std::unique_ptr<Mtx> prev_rho;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<Mtx> alpha;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> gamma;
+    std::unique_ptr<Mtx> omega;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+
+    // HIP device counterparts, filled by initialize_data()
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_z;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_t;
+    std::unique_ptr<Mtx> d_s;
+    std::unique_ptr<Mtx> d_y;
+    std::unique_ptr<Mtx> d_v;
+    std::unique_ptr<Mtx> d_rr;
+    std::unique_ptr<Mtx> d_prev_rho;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<Mtx> d_alpha;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_gamma;
+    std::unique_ptr<Mtx> d_omega;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+};
+
+
+TEST_F(Bicgstab, HipBicgstabInitializeIsEquivalentToRef)
+{
+    initialize_data();
+    // same initialize kernel on the host data and on its device copies
+    gko::kernels::reference::bicgstab::initialize(
+        ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(),
+        z.get(), v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(),
+        beta.get(), gamma.get(), omega.get(), stop_status.get());
+    gko::kernels::hip::bicgstab::initialize(
+        hip, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(),
+        d_t.get(), d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(),
+        d_rho.get(), d_alpha.get(), d_beta.get(), d_gamma.get(),
+        d_omega.get(), d_stop_status.get());
+
+    GKO_EXPECT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_y, y, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_t, t, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_s, s, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_rr, rr, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_v, v, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_alpha, alpha, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_beta, beta, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14);
+    GKO_EXPECT_MTX_NEAR(d_omega, omega, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Bicgstab, HipBicgstabStep1IsEquivalentToRef)
+{
+    initialize_data();
+    // same step_1 kernel on the host data and on its device copies
+    gko::kernels::reference::bicgstab::step_1(
+        ref, r.get(), p.get(), v.get(), rho.get(), prev_rho.get(),
+        alpha.get(), omega.get(), stop_status.get());
+    gko::kernels::hip::bicgstab::step_1(
+        hip, d_r.get(), d_p.get(), d_v.get(), d_rho.get(),
+        d_prev_rho.get(), d_alpha.get(), d_omega.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+}
+
+
+TEST_F(Bicgstab, HipBicgstabStep2IsEquivalentToRef)
+{
+    initialize_data();
+    // same step_2 kernel on the host data and on its device copies
+    gko::kernels::reference::bicgstab::step_2(
+        ref, r.get(), s.get(), v.get(), rho.get(), alpha.get(), beta.get(),
+        stop_status.get());
+    gko::kernels::hip::bicgstab::step_2(
+        hip, d_r.get(), d_s.get(), d_v.get(), d_rho.get(), d_alpha.get(),
+        d_beta.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_s, s, 1e-14);
+}
+
+
+TEST_F(Bicgstab, HipBicgstabStep3IsEquivalentToRef)
+{
+    initialize_data();
+    // same step_3 kernel on the host data and on its device copies
+    gko::kernels::reference::bicgstab::step_3(
+        ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(),
+        alpha.get(), beta.get(), gamma.get(), omega.get(), stop_status.get());
+    gko::kernels::hip::bicgstab::step_3(
+        hip, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(),
+        d_z.get(), d_alpha.get(), d_beta.get(), d_gamma.get(),
+        d_omega.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_omega, omega, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+}
+
+
+TEST_F(Bicgstab, HipBicgstabApplyOneRHSIsEquivalentToRef)
+{
+    const int num_rows = 123;
+    const int num_rhs = 1;
+    auto ref_solver = ref_bicgstab_factory->generate(mtx);
+    auto hip_solver = hip_bicgstab_factory->generate(d_mtx);
+    auto host_b = gen_mtx(num_rows, num_rhs);
+    auto host_x = gen_mtx(num_rows, num_rhs);
+    auto dev_b = Mtx::create(hip);
+    dev_b->copy_from(host_b.get());
+    auto dev_x = Mtx::create(hip);
+    dev_x->copy_from(host_x.get());
+
+    ref_solver->apply(host_b.get(), host_x.get());
+    hip_solver->apply(dev_b.get(), dev_x.get());
+    // both vectors have to agree between the device and the reference run
+    GKO_ASSERT_MTX_NEAR(dev_b, host_b, 1e-13);
+    GKO_ASSERT_MTX_NEAR(dev_x, host_x, 1e-13);
+}
+
+
+TEST_F(Bicgstab, HipBicgstabApplyMultipleRHSIsEquivalentToRef)
+{
+    const int num_rows = 123;
+    const int num_rhs = 16;
+    auto hip_solver = hip_bicgstab_factory->generate(d_mtx);
+    auto ref_solver = ref_bicgstab_factory->generate(mtx);
+    auto host_b = gen_mtx(num_rows, num_rhs);
+    auto host_x = gen_mtx(num_rows, num_rhs);
+    auto dev_b = Mtx::create(hip);
+    dev_b->copy_from(host_b.get());
+    auto dev_x = Mtx::create(hip);
+    dev_x->copy_from(host_x.get());
+
+    ref_solver->apply(host_b.get(), host_x.get());
+    hip_solver->apply(dev_b.get(), dev_x.get());
+    // both matrices have to agree between the device and the reference run
+    GKO_ASSERT_MTX_NEAR(dev_b, host_b, 1e-13);
+    GKO_ASSERT_MTX_NEAR(dev_x, host_x, 1e-13);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/cg_kernels.cpp b/hip/test/solver/cg_kernels.cpp
new file mode 100644
index 00000000000..db472f22000
--- /dev/null
+++ b/hip/test/solver/cg_kernels.cpp
@@ -0,0 +1,272 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/cg.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/cg_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing the HIP CG kernels and solver with the reference ones;
+// the kernel operands are kept on the host and mirrored on the HIP device.
+class Cg : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    Cg() : rand_engine(30) {}
+    // Creates both executors; requires at least one HIP device.
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+    // Synchronizes the device; `hip` is still null if SetUp() stopped early.
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+    // Returns a random num_rows x num_cols matrix created on `ref`.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+    // Fills all kernel operands with random data and copies them to `hip`.
+    void initialize_data()
+    {
+        int m = 597;
+        int n = 43;
+        b = gen_mtx(m, n);
+        r = gen_mtx(m, n);
+        z = gen_mtx(m, n);
+        p = gen_mtx(m, n);
+        q = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        beta = gen_mtx(1, n);
+        prev_rho = gen_mtx(1, n);
+        rho = gen_mtx(1, n);
+        stop_status.reset(new gko::Array<gko::stopping_status>(ref, n));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+        // Mirror every host object on the HIP device.
+        d_b = Mtx::create(hip);
+        d_b->copy_from(b.get());
+        d_r = Mtx::create(hip);
+        d_r->copy_from(r.get());
+        d_z = Mtx::create(hip);
+        d_z->copy_from(z.get());
+        d_p = Mtx::create(hip);
+        d_p->copy_from(p.get());
+        d_q = Mtx::create(hip);
+        d_q->copy_from(q.get());
+        d_x = Mtx::create(hip);
+        d_x->copy_from(x.get());
+        d_beta = Mtx::create(hip);
+        d_beta->copy_from(beta.get());
+        d_prev_rho = Mtx::create(hip);
+        d_prev_rho->copy_from(prev_rho.get());
+        d_rho = Mtx::create(hip);
+        d_rho->copy_from(rho.get());
+        d_stop_status.reset(new gko::Array<gko::stopping_status>(hip, n));
+        *d_stop_status = *stop_status;
+    }
+    // Copies the lower triangle of `mtx` onto its upper triangle.
+    void make_symetric(Mtx *mtx)
+    {
+        for (size_t i = 0; i < mtx->get_size()[0]; ++i) {
+            for (size_t j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+    // Sets each diagonal entry to the sum of absolute values of its row.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (size_t i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (size_t j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+    // Makes `mtx` symmetric first and diagonally dominant afterwards.
+    void make_spd(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_diag_dominant(mtx);
+    }
+    // Executors: host reference and HIP device.
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+    // Host-side kernel operands, set by initialize_data().
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> z;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> q;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> prev_rho;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+    // Device-side copies of the operands above.
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_z;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_q;
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_prev_rho;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+};
+
+
+TEST_F(Cg, HipCgInitializeIsEquivalentToRef)
+{
+    initialize_data();
+    // Run initialize with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::cg::initialize(ref, b.get(), r.get(), z.get(),
+                                            p.get(), q.get(), prev_rho.get(),
+                                            rho.get(), stop_status.get());
+    gko::kernels::hip::cg::initialize(hip, d_b.get(), d_r.get(), d_z.get(),
+                                      d_p.get(), d_q.get(), d_prev_rho.get(),
+                                      d_rho.get(), d_stop_status.get());
+    // The compared operands and the stopping statuses must match afterwards.
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Cg, HipCgStep1IsEquivalentToRef)
+{
+    initialize_data();
+    // Run step_1 with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::cg::step_1(ref, p.get(), z.get(), rho.get(),
+                                        prev_rho.get(), stop_status.get());
+    gko::kernels::hip::cg::step_1(hip, d_p.get(), d_z.get(), d_rho.get(),
+                                  d_prev_rho.get(), d_stop_status.get());
+    // p and z must agree between both executors (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+}
+
+
+TEST_F(Cg, HipCgStep2IsEquivalentToRef)
+{
+    initialize_data();
+    gko::kernels::reference::cg::step_2(ref, x.get(), r.get(), p.get(), q.get(),
+                                        beta.get(), rho.get(),
+                                        stop_status.get());
+    gko::kernels::hip::cg::step_2(hip, d_x.get(), d_r.get(), d_p.get(),
+                                  d_q.get(), d_beta.get(), d_rho.get(),
+                                  d_stop_status.get());
+    // Both step_2 kernels got the same data, so x, r, p and q must match.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+}
+
+
+TEST_F(Cg, ApplyIsEquivalentToRef)
+{
+    // Random 50x50 system passed through make_spd(), three right-hand sides.
+    auto system_mtx = gen_mtx(50, 50);
+    make_spd(system_mtx.get());
+    auto sol = gen_mtx(50, 3);
+    auto rhs = gen_mtx(50, 3);
+    auto d_system_mtx = Mtx::create(hip);
+    auto d_sol = Mtx::create(hip);
+    auto d_rhs = Mtx::create(hip);
+    d_system_mtx->copy_from(system_mtx.get());
+    d_sol->copy_from(sol.get());
+    d_rhs->copy_from(rhs.get());
+    auto ref_factory =
+        gko::solver::Cg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto hip_factory =
+        gko::solver::Cg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(hip),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(hip))
+            .on(hip);
+    auto ref_solver = ref_factory->generate(std::move(system_mtx));
+    auto hip_solver = hip_factory->generate(std::move(d_system_mtx));
+    ref_solver->apply(rhs.get(), sol.get());
+    hip_solver->apply(d_rhs.get(), d_sol.get());
+    // Both executors must end up with the same solution.
+    GKO_ASSERT_MTX_NEAR(d_sol, sol, 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/cgs_kernels.cpp b/hip/test/solver/cgs_kernels.cpp
new file mode 100644
index 00000000000..ff676c2dffc
--- /dev/null
+++ b/hip/test/solver/cgs_kernels.cpp
@@ -0,0 +1,349 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/cgs.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/cgs_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing the HIP CGS kernels and solver with the reference ones;
+// the kernel operands are kept on the host and mirrored on the HIP device.
+class Cgs : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    using Solver = gko::solver::Cgs<>;
+
+    Cgs() : rand_engine(30) {}
+    // Creates the executors, the test system and one factory per executor.
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+        // Diagonally dominant 123x123 system, mirrored on the device.
+        mtx = gen_mtx(123, 123);
+        make_diag_dominant(mtx.get());
+        d_mtx = Mtx::create(hip);
+        d_mtx->copy_from(mtx.get());
+        hip_cgs_factory =
+            Solver::build()
+                .with_criteria(
+                    gko::stop::Iteration::build().with_max_iters(246u).on(hip),
+                    gko::stop::ResidualNormReduction<>::build()
+                        .with_reduction_factor(1e-15)
+                        .on(hip))
+                .on(hip);
+        ref_cgs_factory =
+            Solver::build()
+                .with_criteria(
+                    gko::stop::Iteration::build().with_max_iters(246u).on(ref),
+                    gko::stop::ResidualNormReduction<>::build()
+                        .with_reduction_factor(1e-15)
+                        .on(ref))
+                .on(ref);
+    }
+    // Synchronizes the device; `hip` is still null if SetUp() stopped early.
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+    // Returns a random num_rows x num_cols matrix created on `ref`.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
+    }
+    // Fills all kernel operands with random data and copies them to `hip`.
+    void initialize_data()
+    {
+        int m = 597;
+        int n = 43;
+        b = gen_mtx(m, n);
+        r = gen_mtx(m, n);
+        r_tld = gen_mtx(m, n);
+        p = gen_mtx(m, n);
+        q = gen_mtx(m, n);
+        u = gen_mtx(m, n);
+        u_hat = gen_mtx(m, n);
+        v_hat = gen_mtx(m, n);
+        t = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        alpha = gen_mtx(1, n);
+        beta = gen_mtx(1, n);
+        gamma = gen_mtx(1, n);
+        rho = gen_mtx(1, n);
+        rho_prev = gen_mtx(1, n);
+        stop_status.reset(new gko::Array<gko::stopping_status>(ref, n));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+        // Mirror every host object on the HIP device.
+        d_b = Mtx::create(hip);
+        d_b->copy_from(b.get());
+        d_r = Mtx::create(hip);
+        d_r->copy_from(r.get());
+        d_r_tld = Mtx::create(hip);
+        d_r_tld->copy_from(r_tld.get());
+        d_p = Mtx::create(hip);
+        d_p->copy_from(p.get());
+        d_q = Mtx::create(hip);
+        d_q->copy_from(q.get());
+        d_u = Mtx::create(hip);
+        d_u->copy_from(u.get());
+        d_u_hat = Mtx::create(hip);
+        d_u_hat->copy_from(u_hat.get());
+        d_v_hat = Mtx::create(hip);
+        d_v_hat->copy_from(v_hat.get());
+        d_t = Mtx::create(hip);
+        d_t->copy_from(t.get());
+        d_x = Mtx::create(hip);
+        d_x->copy_from(x.get());
+        d_alpha = Mtx::create(hip);
+        d_alpha->copy_from(alpha.get());
+        d_beta = Mtx::create(hip);
+        d_beta->copy_from(beta.get());
+        d_gamma = Mtx::create(hip);
+        d_gamma->copy_from(gamma.get());
+        d_rho_prev = Mtx::create(hip);
+        d_rho_prev->copy_from(rho_prev.get());
+        d_rho = Mtx::create(hip);
+        d_rho->copy_from(rho.get());
+        d_stop_status.reset(new gko::Array<gko::stopping_status>(hip, n));
+        // Array has no public copy_from(), so the host statuses are copied
+        // to the device array with the overloaded assignment operator.
+        *d_stop_status = *stop_status;
+    }
+    // Sets each diagonal entry to the sum of absolute values of its row.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (size_t i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (size_t j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+    // Executors: host reference and HIP device.
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+    // System matrix (host and device copy) and the solver factories.
+    std::shared_ptr<Mtx> mtx;
+    std::shared_ptr<Mtx> d_mtx;
+    std::unique_ptr<Solver::Factory> hip_cgs_factory;
+    std::unique_ptr<Solver::Factory> ref_cgs_factory;
+    // Host-side kernel operands, set by initialize_data().
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> r_tld;
+    std::unique_ptr<Mtx> t;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> q;
+    std::unique_ptr<Mtx> u;
+    std::unique_ptr<Mtx> u_hat;
+    std::unique_ptr<Mtx> v_hat;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> alpha;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> gamma;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<Mtx> rho_prev;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+    // Device-side copies of the operands above.
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_r_tld;
+    std::unique_ptr<Mtx> d_t;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_q;
+    std::unique_ptr<Mtx> d_u;
+    std::unique_ptr<Mtx> d_u_hat;
+    std::unique_ptr<Mtx> d_v_hat;
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_alpha;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_gamma;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<Mtx> d_rho_prev;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+};
+
+
+TEST_F(Cgs, HipCgsInitializeIsEquivalentToRef)
+{
+    initialize_data();
+    // Run initialize with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::cgs::initialize(
+        ref, b.get(), r.get(), r_tld.get(), p.get(), q.get(), u.get(),
+        u_hat.get(), v_hat.get(), t.get(), alpha.get(), beta.get(), gamma.get(),
+        rho_prev.get(), rho.get(), stop_status.get());
+    gko::kernels::hip::cgs::initialize(
+        hip, d_b.get(), d_r.get(), d_r_tld.get(), d_p.get(), d_q.get(),
+        d_u.get(), d_u_hat.get(), d_v_hat.get(), d_t.get(), d_alpha.get(),
+        d_beta.get(), d_gamma.get(), d_rho_prev.get(), d_rho.get(),
+        d_stop_status.get());
+    // The compared operands and the stopping statuses must match afterwards.
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r_tld, r_tld, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_u, u, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_u_hat, u_hat, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_v_hat, v_hat, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho_prev, rho_prev, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_gamma, gamma, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Cgs, HipCgsStep1IsEquivalentToRef)
+{
+    initialize_data();
+    // Run step_1 with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::cgs::step_1(ref, r.get(), u.get(), p.get(),
+                                         q.get(), beta.get(), rho.get(),
+                                         rho_prev.get(), stop_status.get());
+    gko::kernels::hip::cgs::step_1(hip, d_r.get(), d_u.get(), d_p.get(),
+                                   d_q.get(), d_beta.get(), d_rho.get(),
+                                   d_rho_prev.get(), d_stop_status.get());
+    // beta, u and p must agree between both executors (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_u, u, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+}
+
+
+TEST_F(Cgs, HipCgsStep2IsEquivalentToRef)
+{
+    initialize_data();
+    // Run step_2 with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::cgs::step_2(ref, u.get(), v_hat.get(), q.get(),
+                                         t.get(), alpha.get(), rho.get(),
+                                         gamma.get(), stop_status.get());
+    gko::kernels::hip::cgs::step_2(hip, d_u.get(), d_v_hat.get(), d_q.get(),
+                                   d_t.get(), d_alpha.get(), d_rho.get(),
+                                   d_gamma.get(), d_stop_status.get());
+    // alpha, t and q must agree between both executors (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+}
+
+
+TEST_F(Cgs, HipCgsStep3IsEquivalentToRef)
+{
+    initialize_data();
+    // Run step_3 with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::cgs::step_3(ref, t.get(), u_hat.get(), r.get(),
+                                         x.get(), alpha.get(),
+                                         stop_status.get());
+    gko::kernels::hip::cgs::step_3(hip, d_t.get(), d_u_hat.get(), d_r.get(),
+                                   d_x.get(), d_alpha.get(),
+                                   d_stop_status.get());
+    // x and r must agree between both executors (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+}
+
+
+TEST_F(Cgs, HipCgsApplyOneRHSIsEquivalentToRef)
+{
+    const int num_rows = 123;
+    const int num_rhs = 1;
+    auto ref_solver = ref_cgs_factory->generate(mtx);
+    auto hip_solver = hip_cgs_factory->generate(d_mtx);
+    auto rhs = gen_mtx(num_rows, num_rhs);
+    auto sol = gen_mtx(num_rows, num_rhs);
+    auto d_rhs = Mtx::create(hip);
+    d_rhs->copy_from(rhs.get());
+    auto d_sol = Mtx::create(hip);
+    d_sol->copy_from(sol.get());
+    // Solve the single right-hand side on the host and on the device.
+    ref_solver->apply(rhs.get(), sol.get());
+    hip_solver->apply(d_rhs.get(), d_sol.get());
+    // Right-hand side and solution must agree between both executors.
+    GKO_ASSERT_MTX_NEAR(d_rhs, rhs, 1e-13);
+    GKO_ASSERT_MTX_NEAR(d_sol, sol, 1e-13);
+}
+
+
+TEST_F(Cgs, HipCgsApplyMultipleRHSIsEquivalentToRef)
+{
+    const int num_rows = 123;
+    const int num_rhs = 16;
+    auto hip_solver = hip_cgs_factory->generate(d_mtx);
+    auto ref_solver = ref_cgs_factory->generate(mtx);
+    auto rhs = gen_mtx(num_rows, num_rhs);
+    auto sol = gen_mtx(num_rows, num_rhs);
+    auto d_rhs = Mtx::create(hip);
+    d_rhs->copy_from(rhs.get());
+    auto d_sol = Mtx::create(hip);
+    d_sol->copy_from(sol.get());
+    // Solve all 16 right-hand sides on the host and on the device.
+    ref_solver->apply(rhs.get(), sol.get());
+    hip_solver->apply(d_rhs.get(), d_sol.get());
+    // Right-hand sides and solutions must agree between both executors.
+    GKO_ASSERT_MTX_NEAR(d_rhs, rhs, 1e-13);
+    GKO_ASSERT_MTX_NEAR(d_sol, sol, 1e-13);
+}
+
+}  // namespace
diff --git a/hip/test/solver/fcg_kernels.cpp b/hip/test/solver/fcg_kernels.cpp
new file mode 100644
index 00000000000..7771cf9b03c
--- /dev/null
+++ b/hip/test/solver/fcg_kernels.cpp
@@ -0,0 +1,285 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/fcg.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/fcg_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing the HIP FCG kernels and solver with the reference ones;
+// the kernel operands are kept on the host and mirrored on the HIP device.
+class Fcg : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    using Solver = gko::solver::Fcg<>;
+
+    Fcg() : rand_engine(30) {}
+    // Creates both executors; requires at least one HIP device.
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+    // Synchronizes the device; `hip` is still null if SetUp() stopped early.
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+    // Returns a random num_rows x num_cols matrix created on `ref`.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
+    }
+    // Fills all kernel operands with random data and copies them to `hip`.
+    void initialize_data()
+    {
+        int m = 597;
+        int n = 43;
+        b = gen_mtx(m, n);
+        r = gen_mtx(m, n);
+        t = gen_mtx(m, n);
+        z = gen_mtx(m, n);
+        p = gen_mtx(m, n);
+        q = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        beta = gen_mtx(1, n);
+        prev_rho = gen_mtx(1, n);
+        rho = gen_mtx(1, n);
+        rho_t = gen_mtx(1, n);
+        stop_status.reset(new gko::Array<gko::stopping_status>(ref, n));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+        // Mirror every host object on the HIP device.
+        d_b = Mtx::create(hip);
+        d_b->copy_from(b.get());
+        d_r = Mtx::create(hip);
+        d_r->copy_from(r.get());
+        d_t = Mtx::create(hip);
+        d_t->copy_from(t.get());
+        d_z = Mtx::create(hip);
+        d_z->copy_from(z.get());
+        d_p = Mtx::create(hip);
+        d_p->copy_from(p.get());
+        d_q = Mtx::create(hip);
+        d_q->copy_from(q.get());
+        d_x = Mtx::create(hip);
+        d_x->copy_from(x.get());
+        d_beta = Mtx::create(hip);
+        d_beta->copy_from(beta.get());
+        d_prev_rho = Mtx::create(hip);
+        d_prev_rho->copy_from(prev_rho.get());
+        d_rho_t = Mtx::create(hip);
+        d_rho_t->copy_from(rho_t.get());
+        d_rho = Mtx::create(hip);
+        d_rho->copy_from(rho.get());
+        d_stop_status.reset(new gko::Array<gko::stopping_status>(hip, n));
+        *d_stop_status = *stop_status;
+    }
+    // Copies the lower triangle of `mtx` onto its upper triangle.
+    void make_symetric(Mtx *mtx)
+    {
+        for (size_t i = 0; i < mtx->get_size()[0]; ++i) {
+            for (size_t j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+    // Sets each diagonal entry to the sum of absolute values of its row.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (size_t i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (size_t j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+    // Makes `mtx` symmetric first and diagonally dominant afterwards.
+    void make_spd(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_diag_dominant(mtx);
+    }
+    // Executors: host reference and HIP device.
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+    // Host-side kernel operands, set by initialize_data().
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> t;
+    std::unique_ptr<Mtx> z;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> q;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> prev_rho;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<Mtx> rho_t;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+    // Device-side copies of the operands above.
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_t;
+    std::unique_ptr<Mtx> d_z;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_q;
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_prev_rho;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<Mtx> d_rho_t;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+};
+
+
+TEST_F(Fcg, HipFcgInitializeIsEquivalentToRef)
+{
+    initialize_data();
+    // Run initialize with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::fcg::initialize(
+        ref, b.get(), r.get(), z.get(), p.get(), q.get(), t.get(),
+        prev_rho.get(), rho.get(), rho_t.get(), stop_status.get());
+    gko::kernels::hip::fcg::initialize(
+        hip, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_t.get(),
+        d_prev_rho.get(), d_rho.get(), d_rho_t.get(), d_stop_status.get());
+    // The compared operands and the stopping statuses must match afterwards.
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho_t, rho_t, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Fcg, HipFcgStep1IsEquivalentToRef)
+{
+    initialize_data();
+    // Run step_1 with the reference kernel and with the HIP kernel.
+    gko::kernels::reference::fcg::step_1(ref, p.get(), z.get(), rho_t.get(),
+                                         prev_rho.get(), stop_status.get());
+    gko::kernels::hip::fcg::step_1(hip, d_p.get(), d_z.get(), d_rho_t.get(),
+                                   d_prev_rho.get(), d_stop_status.get());
+    // p and z must agree between both executors (tolerance 1e-14).
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+}
+
+
+TEST_F(Fcg, HipFcgStep2IsEquivalentToRef)
+{
+    initialize_data();
+    gko::kernels::reference::fcg::step_2(ref, x.get(), r.get(), t.get(),
+                                         p.get(), q.get(), beta.get(),
+                                         rho.get(), stop_status.get());
+    gko::kernels::hip::fcg::step_2(hip, d_x.get(), d_r.get(), d_t.get(),
+                                   d_p.get(), d_q.get(), d_beta.get(),
+                                   d_rho.get(), d_stop_status.get());
+    // Both step_2 kernels got the same data, so x, r and t must match.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14);
+}
+
+
+TEST_F(Fcg, ApplyIsEquivalentToRef)
+{
+    // Random 50x50 system passed through make_spd(), three right-hand sides.
+    auto system_mtx = gen_mtx(50, 50);
+    make_spd(system_mtx.get());
+    auto sol = gen_mtx(50, 3);
+    auto rhs = gen_mtx(50, 3);
+    auto d_system_mtx = Mtx::create(hip);
+    auto d_sol = Mtx::create(hip);
+    auto d_rhs = Mtx::create(hip);
+    d_system_mtx->copy_from(system_mtx.get());
+    d_sol->copy_from(sol.get());
+    d_rhs->copy_from(rhs.get());
+    auto ref_factory =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto hip_factory =
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(hip),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(hip))
+            .on(hip);
+    auto ref_solver = ref_factory->generate(std::move(system_mtx));
+    auto hip_solver = hip_factory->generate(std::move(d_system_mtx));
+    ref_solver->apply(rhs.get(), sol.get());
+    hip_solver->apply(d_rhs.get(), d_sol.get());
+    // Both executors must end up with the same solution.
+    GKO_ASSERT_MTX_NEAR(d_sol, sol, 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/gmres_kernels.cpp b/hip/test/solver/gmres_kernels.cpp
new file mode 100644
index 00000000000..d16c781cb1e
--- /dev/null
+++ b/hip/test/solver/gmres_kernels.cpp
@@ -0,0 +1,300 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/gmres.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/gmres_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing the HIP GMRES kernels against the reference kernels on
+// identical host and device operands, which initialize_data() sets up.
+class Gmres : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using norm_type = gko::remove_complex<value_type>;
+    using NormVector = gko::matrix::Dense<norm_type>;
+    template <typename T>
+    using Dense = typename gko::matrix::Dense<T>;
+
+    // The fixed seed keeps the generated data identical across runs.
+    Gmres() : rand_engine(30) {}
+
+    // Fails the test early if no HIP device is present, then creates the
+    // reference executor and a HIP executor for device 0.
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    // Synchronizes the device, if one was created, and requires that this
+    // does not throw.
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    // Returns a random num_rows x num_cols Dense<ValueType> on `ref` with
+    // values drawn from normal_distribution(-1.0, 1.0). The integer
+    // distribution always yields num_cols (presumably the nonzeros per row).
+    template <typename ValueType = value_type, typename IndexType = index_type>
+    std::unique_ptr<Dense<ValueType>> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Dense<ValueType>>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<IndexType>(num_cols, num_cols),
+            std::normal_distribution<ValueType>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Creates the kernel operands for `nrhs` right-hand sides on `ref` and
+    // mirrors them onto `hip`. All gen_mtx calls draw from rand_engine, so
+    // their order determines the data and must stay unchanged.
+    void initialize_data(int nrhs = 43)
+    {
+        int m = 597;
+        x = gen_mtx(m, nrhs);
+        y = gen_mtx(gko::solver::default_krylov_dim, nrhs);
+        before_preconditioner = Mtx::create_with_config_of(x.get());
+        b = gen_mtx(m, nrhs);
+        krylov_bases = gen_mtx(m * (gko::solver::default_krylov_dim + 1), nrhs);
+        hessenberg = gen_mtx(gko::solver::default_krylov_dim + 1,
+                             gko::solver::default_krylov_dim * nrhs);
+        hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, nrhs);
+        residual = gen_mtx(m, nrhs);
+        residual_norm = gen_mtx<norm_type>(1, nrhs);
+        residual_norm_collection =
+            gen_mtx(gko::solver::default_krylov_dim + 1, nrhs);
+        givens_sin = gen_mtx(gko::solver::default_krylov_dim, nrhs);
+        givens_cos = gen_mtx(gko::solver::default_krylov_dim, nrhs);
+        stop_status.reset(new gko::Array<gko::stopping_status>(ref, nrhs));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+        final_iter_nums.reset(new gko::Array<gko::size_type>(ref, nrhs));
+        for (size_t i = 0; i < final_iter_nums->get_num_elems(); ++i) {
+            final_iter_nums->get_data()[i] = 5;
+        }
+
+        // Device mirrors of the host operands above.
+        d_x = gko::clone(hip, x);
+        d_before_preconditioner = Mtx::create_with_config_of(d_x.get());
+        d_y = gko::clone(hip, y);
+        d_b = gko::clone(hip, b);
+        d_krylov_bases = gko::clone(hip, krylov_bases);
+        d_hessenberg = gko::clone(hip, hessenberg);
+        d_hessenberg_iter = gko::clone(hip, hessenberg_iter);
+        d_residual = gko::clone(hip, residual);
+        d_residual_norm = gko::clone(hip, residual_norm);
+        d_residual_norm_collection = gko::clone(hip, residual_norm_collection);
+        d_givens_sin = gko::clone(hip, givens_sin);
+        d_givens_cos = gko::clone(hip, givens_cos);
+        d_stop_status.reset(
+            new gko::Array<gko::stopping_status>(hip, *stop_status));
+        d_final_iter_nums.reset(
+            new gko::Array<gko::size_type>(hip, *final_iter_nums));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+
+    // Host-side operands, allocated on `ref`.
+    std::unique_ptr<Mtx> before_preconditioner;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> y;
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> krylov_bases;
+    std::unique_ptr<Mtx> hessenberg;
+    std::unique_ptr<Mtx> hessenberg_iter;
+    std::unique_ptr<Mtx> residual;
+    std::unique_ptr<NormVector> residual_norm;
+    std::unique_ptr<Mtx> residual_norm_collection;
+    std::unique_ptr<Mtx> givens_sin;
+    std::unique_ptr<Mtx> givens_cos;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+    std::unique_ptr<gko::Array<gko::size_type>> final_iter_nums;
+
+    // Device-side mirrors, allocated on `hip`.
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_before_preconditioner;
+    std::unique_ptr<Mtx> d_y;
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_krylov_bases;
+    std::unique_ptr<Mtx> d_hessenberg;
+    std::unique_ptr<Mtx> d_hessenberg_iter;
+    std::unique_ptr<Mtx> d_residual;
+    std::unique_ptr<NormVector> d_residual_norm;
+    std::unique_ptr<Mtx> d_residual_norm_collection;
+    std::unique_ptr<Mtx> d_givens_sin;
+    std::unique_ptr<Mtx> d_givens_cos;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+    std::unique_ptr<gko::Array<gko::size_type>> d_final_iter_nums;
+};
+
+
+TEST_F(Gmres, HipGmresInitialize1IsEquivalentToRef)
+{
+    initialize_data();
+    // Run the same kernel on the mirrored host and device operands.
+    gko::kernels::reference::gmres::initialize_1(
+        ref, lend(b), lend(residual), lend(givens_sin), lend(givens_cos),
+        lend(stop_status), gko::solver::default_krylov_dim);
+    gko::kernels::hip::gmres::initialize_1(
+        hip, lend(d_b), lend(d_residual), lend(d_givens_sin),
+        lend(d_givens_cos), lend(d_stop_status),
+        gko::solver::default_krylov_dim);
+
+    GKO_ASSERT_MTX_NEAR(d_residual, residual, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Gmres, HipGmresInitialize2IsEquivalentToRef)
+{
+    initialize_data();
+    // Run the same kernel on the mirrored host and device operands.
+    gko::kernels::reference::gmres::initialize_2(
+        ref, lend(residual), lend(residual_norm),
+        lend(residual_norm_collection), lend(krylov_bases),
+        lend(final_iter_nums), gko::solver::default_krylov_dim);
+    gko::kernels::hip::gmres::initialize_2(
+        hip, lend(d_residual), lend(d_residual_norm),
+        lend(d_residual_norm_collection), lend(d_krylov_bases),
+        lend(d_final_iter_nums), gko::solver::default_krylov_dim);
+
+    GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection,
+                        1e-14);
+    GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
+}
+
+
+TEST_F(Gmres, HipGmresStep1IsEquivalentToRef)
+{
+    initialize_data();
+    const int iter = 5;
+    // Run step_1 for the same iteration on the host and device operands.
+    gko::kernels::reference::gmres::step_1(
+        ref, x->get_size()[0], lend(givens_sin), lend(givens_cos),
+        lend(residual_norm), lend(residual_norm_collection), lend(krylov_bases),
+        lend(hessenberg_iter), iter, lend(final_iter_nums), lend(stop_status));
+    gko::kernels::hip::gmres::step_1(
+        hip, d_x->get_size()[0], lend(d_givens_sin), lend(d_givens_cos),
+        lend(d_residual_norm), lend(d_residual_norm_collection),
+        lend(d_krylov_bases), lend(d_hessenberg_iter), iter,
+        lend(d_final_iter_nums), lend(d_stop_status));
+
+    GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection,
+                        1e-14);
+    GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
+}
+
+
+TEST_F(Gmres, HipGmresStep1OnSingleRHSIsEquivalentToRef)
+{
+    initialize_data(1);
+    const int iter = 5;
+    // Same comparison as the multi-rhs step_1 test, with one right-hand side.
+    gko::kernels::reference::gmres::step_1(
+        ref, x->get_size()[0], lend(givens_sin), lend(givens_cos),
+        lend(residual_norm), lend(residual_norm_collection), lend(krylov_bases),
+        lend(hessenberg_iter), iter, lend(final_iter_nums), lend(stop_status));
+    gko::kernels::hip::gmres::step_1(
+        hip, d_x->get_size()[0], lend(d_givens_sin), lend(d_givens_cos),
+        lend(d_residual_norm), lend(d_residual_norm_collection),
+        lend(d_krylov_bases), lend(d_hessenberg_iter), iter,
+        lend(d_final_iter_nums), lend(d_stop_status));
+
+    GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection,
+                        1e-14);
+    GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
+}
+
+
+TEST_F(Gmres, HipGmresStep2IsEquivalentToRef)
+{
+    initialize_data();
+
+    gko::kernels::reference::gmres::step_2(ref, residual_norm_collection.get(),
+                                           krylov_bases.get(), hessenberg.get(),
+                                           y.get(), before_preconditioner.get(),
+                                           final_iter_nums.get());
+    gko::kernels::hip::gmres::step_2(hip, d_residual_norm_collection.get(),
+                                     d_krylov_bases.get(), d_hessenberg.get(),
+                                     d_y.get(), d_before_preconditioner.get(),
+                                     d_final_iter_nums.get());
+    // x is never passed to step_2, so comparing it cannot detect a kernel bug;
+    // compare y and before_preconditioner, which are passed, instead.
+    GKO_ASSERT_MTX_NEAR(d_y, y, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_before_preconditioner, before_preconditioner, 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/ir_kernels.cpp b/hip/test/solver/ir_kernels.cpp
new file mode 100644
index 00000000000..0e5791cd7cf
--- /dev/null
+++ b/hip/test/solver/ir_kernels.cpp
@@ -0,0 +1,259 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/ir.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+
+
+#include "core/solver/ir_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+// Fixture comparing gko::solver::Ir on the reference and the HIP executor.
+class Ir : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    Ir() : rand_engine(30) {}  // fixed seed: same data on every run
+    // Fails the test early when no HIP device is available.
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+    std::ranlux48 rand_engine;
+};
+
+
+TEST_F(Ir, InitializeIsEquivalentToRef)
+{
+    auto stop_status = gko::Array<gko::stopping_status>(ref, 43);
+    for (size_t i = 0; i < stop_status.get_num_elems(); ++i) {
+        stop_status.get_data()[i].reset();
+    }
+    auto d_stop_status = gko::Array<gko::stopping_status>(hip, stop_status);
+
+    gko::kernels::reference::ir::initialize(ref, &stop_status);
+    gko::kernels::hip::ir::initialize(hip, &d_stop_status);
+    // Copy the device result back to `ref` for an element-wise comparison.
+    auto tmp = gko::Array<gko::stopping_status>(ref, d_stop_status);
+    for (size_t i = 0; i < stop_status.get_num_elems(); ++i) {
+        ASSERT_EQ(stop_status.get_const_data()[i], tmp.get_const_data()[i]);
+    }
+}
+
+
+TEST_F(Ir, ApplyIsEquivalentToRef)
+{
+    // Identical operands on the host and on the device.
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(hip, mtx);
+    auto d_x = clone(hip, x);
+    auto d_b = clone(hip, b);
+
+    // Accuracy is irrelevant here: Richardson iteration will not converge for
+    // a random matrix. Two iterations merely have to give the same result on
+    // both executors.
+    auto make_factory = [](std::shared_ptr<const gko::Executor> exec) {
+        return gko::solver::Ir<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .on(exec);
+    };
+    auto ref_factory = make_factory(ref);
+    auto hip_factory = make_factory(hip);
+    auto solver = ref_factory->generate(std::move(mtx));
+    auto d_solver = hip_factory->generate(std::move(d_mtx));
+
+    solver->apply(b.get(), x.get());
+    d_solver->apply(d_b.get(), d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef)
+{
+    // Identical operands on the host and on the device; the gen_mtx order is
+    // kept since all calls draw from the fixture's random engine.
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(hip, mtx);
+    auto d_x = clone(hip, x);
+    auto d_b = clone(hip, b);
+
+    // Inner solver: GMRES stopped after a single iteration.
+    auto make_inner = [](std::shared_ptr<const gko::Executor> exec) {
+        return gko::solver::Gmres<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .on(exec);
+    };
+
+    // Outer solver: IR stopped after two iterations, using the GMRES factory
+    // above as its inner solver.
+    auto make_factory = [&](std::shared_ptr<const gko::Executor> exec) {
+        return gko::solver::Ir<>::build()
+            .with_solver(make_inner(exec))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .on(exec);
+    };
+
+    auto ref_factory = make_factory(ref);
+    auto hip_factory = make_factory(hip);
+    auto solver = ref_factory->generate(std::move(mtx));
+    auto d_solver = hip_factory->generate(std::move(d_mtx));
+
+    solver->apply(b.get(), x.get());
+    d_solver->apply(d_b.get(), d_x.get());
+
+    // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres
+    // iteration gets amplified by the difference in IR.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+TEST_F(Ir, RichardsonApplyIsEquivalentToRef)
+{
+    // Identical operands on the host and on the device; the gen_mtx order is
+    // kept since all calls draw from the fixture's random engine.
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(hip, mtx);
+    auto d_x = clone(hip, x);
+    auto d_b = clone(hip, b);
+
+    // Accuracy is irrelevant here: Richardson iteration will not converge for
+    // a random matrix. Two iterations merely have to give the same result on
+    // both executors.
+    auto make_factory = [](std::shared_ptr<const gko::Executor> exec) {
+        return gko::solver::Ir<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .with_relaxation_factor(0.9)
+            .on(exec);
+    };
+    auto ref_factory = make_factory(ref);
+    auto hip_factory = make_factory(hip);
+    auto solver = ref_factory->generate(std::move(mtx));
+    auto d_solver = hip_factory->generate(std::move(d_mtx));
+
+    solver->apply(b.get(), x.get());
+    d_solver->apply(d_b.get(), d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef)
+{
+    // Identical operands on the host and on the device; the gen_mtx order is
+    // kept since all calls draw from the fixture's random engine.
+    auto mtx = gen_mtx(50, 50);
+    auto x = gen_mtx(50, 3);
+    auto b = gen_mtx(50, 3);
+    auto d_mtx = clone(hip, mtx);
+    auto d_x = clone(hip, x);
+    auto d_b = clone(hip, b);
+
+    // Inner solver: GMRES stopped after a single iteration.
+    auto make_inner = [](std::shared_ptr<const gko::Executor> exec) {
+        return gko::solver::Gmres<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .on(exec);
+    };
+
+    // Outer solver: IR with relaxation factor 0.9, stopped after two
+    // iterations, using the GMRES factory above as its inner solver.
+    auto make_factory = [&](std::shared_ptr<const gko::Executor> exec) {
+        return gko::solver::Ir<>::build()
+            .with_solver(make_inner(exec))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .with_relaxation_factor(0.9)
+            .on(exec);
+    };
+
+    auto ref_factory = make_factory(ref);
+    auto hip_factory = make_factory(hip);
+    auto solver = ref_factory->generate(std::move(mtx));
+    auto d_solver = hip_factory->generate(std::move(d_mtx));
+
+    solver->apply(b.get(), x.get());
+    d_solver->apply(d_b.get(), d_x.get());
+
+    // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres
+    // iteration gets amplified by the difference in IR.
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/lower_trs_kernels.cpp b/hip/test/solver/lower_trs_kernels.cpp
new file mode 100644
index 00000000000..b497b525020
--- /dev/null
+++ b/hip/test/solver/lower_trs_kernels.cpp
@@ -0,0 +1,167 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/lower_trs.hpp>
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/solver/lower_trs_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing gko::solver::LowerTrs on the reference and HIP executors.
+class LowerTrs : public ::testing::Test {
+protected:
+    using CsrMtx = gko::matrix::Csr<double, gko::int32>;
+    using Mtx = gko::matrix::Dense<>;
+
+    LowerTrs() : rand_engine(30) {}
+
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    // Random dense num_rows x num_cols matrix on `ref`.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Random lower triangular matrix on `ref`.
+    std::unique_ptr<Mtx> gen_l_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_lower_triangular_matrix<Mtx>(
+            num_rows, num_cols, false,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Builds an m x m triangular system with n right-hand sides on `ref` and
+    // mirrors it onto `hip`. The generator calls share rand_engine: keep order.
+    void initialize_data(int m, int n)
+    {
+        mtx = gen_l_mtx(m, m);
+        b = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        csr_mtx = CsrMtx::create(ref);
+        mtx->convert_to(csr_mtx.get());
+        d_csr_mtx = gko::clone(hip, csr_mtx);
+        d_x = gko::clone(hip, x);
+        b2 = gko::clone(ref, b);
+        d_b2 = gko::clone(hip, b);
+    }
+
+    std::shared_ptr<Mtx> b;
+    std::shared_ptr<Mtx> b2;
+    std::shared_ptr<Mtx> x;
+    std::shared_ptr<Mtx> mtx;
+    std::shared_ptr<CsrMtx> csr_mtx;
+    std::shared_ptr<Mtx> d_b2;
+    std::shared_ptr<Mtx> d_x;
+    std::shared_ptr<CsrMtx> d_csr_mtx;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+    std::ranlux48 rand_engine;
+};
+
+
+TEST_F(LowerTrs, HipLowerTrsFlagCheckIsCorrect)
+{
+    const bool expected_flag = true;
+    bool trans_flag = false;
+    // The call is expected to flip trans_flag from false to true.
+    gko::kernels::hip::lower_trs::should_perform_transpose(hip, trans_flag);
+    ASSERT_EQ(expected_flag, trans_flag);
+}
+
+
+TEST_F(LowerTrs, HipSingleRhsApplyIsEquivalentToRef)
+{
+    initialize_data(50, 1);
+    auto ref_factory = gko::solver::LowerTrs<>::build().on(ref);
+    auto hip_factory = gko::solver::LowerTrs<>::build().on(hip);
+    auto solver = ref_factory->generate(csr_mtx);
+    auto d_solver = hip_factory->generate(d_csr_mtx);
+    // Solve the same system (one right-hand side) on both executors.
+    solver->apply(gko::lend(b2), gko::lend(x));
+    d_solver->apply(gko::lend(d_b2), gko::lend(d_x));
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(LowerTrs, HipMultipleRhsApplyIsEquivalentToRef)
+{
+    initialize_data(50, 3);
+    auto ref_factory =
+        gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref);
+    auto hip_factory =
+        gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(hip);
+    auto solver = ref_factory->generate(csr_mtx);
+    auto d_solver = hip_factory->generate(d_csr_mtx);
+    // Solve the same system (three right-hand sides) on both executors.
+    solver->apply(gko::lend(b2), gko::lend(x));
+    d_solver->apply(gko::lend(d_b2), gko::lend(d_x));
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/solver/upper_trs_kernels.cpp b/hip/test/solver/upper_trs_kernels.cpp
new file mode 100644
index 00000000000..ba55bc6325c
--- /dev/null
+++ b/hip/test/solver/upper_trs_kernels.cpp
@@ -0,0 +1,167 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/upper_trs.hpp>
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/solver/upper_trs_kernels.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+// Fixture comparing gko::solver::UpperTrs on the reference and HIP executors.
+class UpperTrs : public ::testing::Test {
+protected:
+    using CsrMtx = gko::matrix::Csr<double, gko::int32>;
+    using Mtx = gko::matrix::Dense<>;
+
+    UpperTrs() : rand_engine(30) {}
+
+    void SetUp() override
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown() override
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    // Random dense num_rows x num_cols matrix on `ref`.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Random upper triangular matrix on `ref`.
+    std::unique_ptr<Mtx> gen_u_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_upper_triangular_matrix<Mtx>(
+            num_rows, num_cols, false,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Builds an m x m triangular system with n right-hand sides on `ref` and
+    // mirrors it onto `hip`. The generator calls share rand_engine: keep order.
+    void initialize_data(int m, int n)
+    {
+        mtx = gen_u_mtx(m, m);
+        b = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        csr_mtx = CsrMtx::create(ref);
+        mtx->convert_to(csr_mtx.get());
+        d_csr_mtx = gko::clone(hip, csr_mtx);
+        d_x = gko::clone(hip, x);
+        b2 = gko::clone(ref, b);
+        d_b2 = gko::clone(hip, b);
+    }
+
+    std::shared_ptr<Mtx> b;
+    std::shared_ptr<Mtx> b2;
+    std::shared_ptr<Mtx> x;
+    std::shared_ptr<Mtx> mtx;
+    std::shared_ptr<CsrMtx> csr_mtx;
+    std::shared_ptr<Mtx> d_b2;
+    std::shared_ptr<Mtx> d_x;
+    std::shared_ptr<CsrMtx> d_csr_mtx;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+    std::ranlux48 rand_engine;
+};
+
+
+TEST_F(UpperTrs, HipUpperTrsFlagCheckIsCorrect)
+{
+    const bool expected_flag = true;
+    bool trans_flag = false;
+    // The call is expected to flip trans_flag from false to true.
+    gko::kernels::hip::upper_trs::should_perform_transpose(hip, trans_flag);
+    ASSERT_EQ(expected_flag, trans_flag);
+}
+
+
+TEST_F(UpperTrs, HipSingleRhsApplyIsEquivalentToRef)
+{
+    initialize_data(50, 1);
+    auto ref_factory = gko::solver::UpperTrs<>::build().on(ref);
+    auto hip_factory = gko::solver::UpperTrs<>::build().on(hip);
+    auto solver = ref_factory->generate(csr_mtx);
+    auto d_solver = hip_factory->generate(d_csr_mtx);
+    // Solve the same system (one right-hand side) on both executors.
+    solver->apply(gko::lend(b2), gko::lend(x));
+    d_solver->apply(gko::lend(d_b2), gko::lend(d_x));
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(UpperTrs, HipMultipleRhsApplyIsEquivalentToRef)
+{
+    initialize_data(50, 3);
+    auto ref_factory =
+        gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref);
+    auto hip_factory =
+        gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(hip);
+    auto solver = ref_factory->generate(csr_mtx);
+    auto d_solver = hip_factory->generate(d_csr_mtx);
+    // Solve the same system (three right-hand sides) on both executors.
+    solver->apply(gko::lend(b2), gko::lend(x));
+    d_solver->apply(gko::lend(d_b2), gko::lend(d_x));
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+}  // namespace
diff --git a/hip/test/stop/CMakeLists.txt b/hip/test/stop/CMakeLists.txt
new file mode 100644
index 00000000000..844f7037768
--- /dev/null
+++ b/hip/test/stop/CMakeLists.txt
@@ -0,0 +1,2 @@
+ginkgo_create_hip_test(criterion_kernels)  # sibling source: criterion_kernels.hip.cpp
+ginkgo_create_hip_test_special_linkage(residual_norm_kernels)  # sibling source: residual_norm_kernels.cpp (plain .cpp)
diff --git a/hip/test/stop/criterion_kernels.hip.cpp b/hip/test/stop/criterion_kernels.hip.cpp
new file mode 100644
index 00000000000..92935ea4867
--- /dev/null
+++ b/hip/test/stop/criterion_kernels.hip.cpp
@@ -0,0 +1,111 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/stop/criterion.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+constexpr gko::size_type test_iterations = 10;
+
+
+class Criterion : public ::testing::Test {
+protected:
+    Criterion()
+    {
+        this->ref_ = gko::ReferenceExecutor::create();
+        this->hip_ = gko::HipExecutor::create(0, this->ref_);
+        // Criterion is an abstract class, so the fixture works with a
+        // concrete iteration-count stopping criterion instead.
+        this->factory_ = gko::stop::Iteration::build()
+                             .with_max_iters(test_iterations)
+                             .on(this->hip_);
+    }
+
+    std::unique_ptr<gko::stop::Iteration::Factory> factory_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+    std::shared_ptr<const gko::HipExecutor> hip_;
+};
+
+
+TEST_F(Criterion, SetsOneStopStatus)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto criterion = factory_->generate(nullptr, nullptr, nullptr);
+    gko::Array<gko::stopping_status> status(ref_, 1);
+    status.get_data()[0].reset();
+
+    // Move the status to the HIP executor for the check, then back to read it.
+    status.set_executor(hip_);
+    criterion->update()
+        .num_iterations(test_iterations)
+        .check(stopping_id, true, &status, &changed);
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_stopped());
+}
+
+
+TEST_F(Criterion, SetsMultipleStopStatuses)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto criterion = factory_->generate(nullptr, nullptr, nullptr);
+    gko::Array<gko::stopping_status> status(ref_, 3);
+    for (gko::size_type i = 0; i < 3; ++i) {
+        status.get_data()[i].reset();
+    }
+
+    // Move the statuses to the HIP executor for the check, then back to read.
+    status.set_executor(hip_);
+    criterion->update()
+        .num_iterations(test_iterations)
+        .check(stopping_id, true, &status, &changed);
+    status.set_executor(ref_);
+    for (gko::size_type i = 0; i < 3; ++i) {
+        ASSERT_TRUE(status.get_data()[i].has_stopped());
+    }
+}
+
+
+}  // namespace
diff --git a/hip/test/stop/residual_norm_kernels.cpp b/hip/test/stop/residual_norm_kernels.cpp
new file mode 100644
index 00000000000..42c505da601
--- /dev/null
+++ b/hip/test/stop/residual_norm_kernels.cpp
@@ -0,0 +1,369 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+constexpr double tol = 1.0e-14;
+
+
+class ResidualNormReduction : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+
+    // The factory is built on the HIP executor with `tol` as reduction factor.
+    ResidualNormReduction()
+    {
+        this->ref_ = gko::ReferenceExecutor::create();
+        this->hip_ = gko::HipExecutor::create(0, this->ref_);
+        this->factory_ = gko::stop::ResidualNormReduction<>::build()
+                             .with_reduction_factor(tol).on(this->hip_);
+    }
+
+    std::unique_ptr<gko::stop::ResidualNormReduction<>::Factory> factory_;
+    std::shared_ptr<const gko::HipExecutor> hip_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+};
+
+
+TEST_F(ResidualNormReduction, WaitsTillResidualGoal)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto host_res = gko::initialize<Mtx>({100.0}, ref_);
+    auto dev_res = Mtx::create(hip_);
+    dev_res->copy_from(host_res.get());
+    std::shared_ptr<gko::LinOp> host_rhs = gko::initialize<Mtx>({10.0}, ref_);
+    std::shared_ptr<gko::LinOp> dev_rhs = Mtx::create(hip_);
+    dev_rhs->copy_from(host_rhs.get());
+    auto criterion =
+        factory_->generate(nullptr, dev_rhs, nullptr, dev_res.get());
+    gko::Array<gko::stopping_status> status(ref_, 1);
+    status.get_data()[0].reset();
+    status.set_executor(hip_);
+    // Evaluates the criterion on whatever dev_res currently holds.
+    auto stops = [&] {
+        return criterion->update()
+            .residual_norm(dev_res.get())
+            .check(stopping_id, true, &status, &changed);
+    };
+    // The starting residual must not trigger a stop.
+    ASSERT_FALSE(stops());
+
+    // This value is expected to still miss the goal: no stop, no change.
+    host_res->at(0) = tol * 1.1e+2;
+    dev_res->copy_from(host_res.get());
+    ASSERT_FALSE(stops());
+    status.set_executor(ref_);
+    ASSERT_FALSE(status.get_data()[0].has_converged());
+    status.set_executor(hip_);
+    ASSERT_FALSE(changed);
+
+    // This value is expected to meet the goal: stop, converged, changed.
+    host_res->at(0) = tol * 0.9e+2;
+    dev_res->copy_from(host_res.get());
+    ASSERT_TRUE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto host_res = gko::initialize<Mtx>({{100.0, 100.0}}, ref_);
+    auto dev_res = Mtx::create(hip_);
+    dev_res->copy_from(host_res.get());
+    std::shared_ptr<gko::LinOp> host_rhs =
+        gko::initialize<Mtx>({{10.0, 10.0}}, ref_);
+    std::shared_ptr<gko::LinOp> dev_rhs = Mtx::create(hip_);
+    dev_rhs->copy_from(host_rhs.get());
+    auto criterion =
+        factory_->generate(nullptr, dev_rhs, nullptr, dev_res.get());
+    gko::Array<gko::stopping_status> status(ref_, 2);
+    status.get_data()[0].reset();
+    status.get_data()[1].reset();
+    status.set_executor(hip_);
+    // Evaluates the criterion on whatever dev_res currently holds.
+    auto stops = [&] {
+        return criterion->update()
+            .residual_norm(dev_res.get())
+            .check(stopping_id, true, &status, &changed);
+    };
+    // The starting residuals must not trigger a stop.
+    ASSERT_FALSE(stops());
+
+    // Only entry (0, 0) is lowered: no overall stop, but status 0 converges.
+    host_res->at(0, 0) = tol * 0.9e+2;
+    dev_res->copy_from(host_res.get());
+    ASSERT_FALSE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    status.set_executor(hip_);
+    ASSERT_TRUE(changed);
+
+    // Entry (0, 1) is lowered as well: the check now reports a stop.
+    host_res->at(0, 1) = tol * 0.9e+2;
+    dev_res->copy_from(host_res.get());
+    ASSERT_TRUE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[1].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+class RelativeResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+
+    // The factory is built on the HIP executor with `tol` as tolerance.
+    RelativeResidualNorm()
+    {
+        this->ref_ = gko::ReferenceExecutor::create();
+        this->hip_ = gko::HipExecutor::create(0, this->ref_);
+        this->factory_ = gko::stop::RelativeResidualNorm<>::build()
+                             .with_tolerance(tol).on(this->hip_);
+    }
+
+    std::unique_ptr<gko::stop::RelativeResidualNorm<>::Factory> factory_;
+    std::shared_ptr<const gko::HipExecutor> hip_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+};
+
+
+TEST_F(RelativeResidualNorm, WaitsTillResidualGoal)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto host_res = gko::initialize<Mtx>({100.0}, ref_);
+    auto dev_res = Mtx::create(hip_);
+    dev_res->copy_from(host_res.get());
+    std::shared_ptr<gko::LinOp> host_rhs = gko::initialize<Mtx>({10.0}, ref_);
+    std::shared_ptr<gko::LinOp> dev_rhs = Mtx::create(hip_);
+    dev_rhs->copy_from(host_rhs.get());
+    auto criterion =
+        factory_->generate(nullptr, dev_rhs, nullptr, dev_res.get());
+    gko::Array<gko::stopping_status> status(ref_, 1);
+    status.get_data()[0].reset();
+    status.set_executor(hip_);
+    // Evaluates the criterion on whatever dev_res currently holds.
+    auto stops = [&] {
+        return criterion->update()
+            .residual_norm(dev_res.get())
+            .check(stopping_id, true, &status, &changed);
+    };
+    // The starting residual must not trigger a stop.
+    ASSERT_FALSE(stops());
+
+    // This value is expected to still miss the goal: no stop, no change.
+    host_res->at(0) = tol * 1.1e+1;
+    dev_res->copy_from(host_res.get());
+    ASSERT_FALSE(stops());
+    status.set_executor(ref_);
+    ASSERT_FALSE(status.get_data()[0].has_converged());
+    status.set_executor(hip_);
+    ASSERT_FALSE(changed);
+
+    // This value is expected to meet the goal: stop, converged, changed.
+    host_res->at(0) = tol * 0.9e+1;
+    dev_res->copy_from(host_res.get());
+    ASSERT_TRUE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+TEST_F(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto host_res = gko::initialize<Mtx>({{100.0, 100.0}}, ref_);
+    auto dev_res = Mtx::create(hip_);
+    dev_res->copy_from(host_res.get());
+    std::shared_ptr<gko::LinOp> host_rhs =
+        gko::initialize<Mtx>({{10.0, 10.0}}, ref_);
+    std::shared_ptr<gko::LinOp> dev_rhs = Mtx::create(hip_);
+    dev_rhs->copy_from(host_rhs.get());
+    auto criterion =
+        factory_->generate(nullptr, dev_rhs, nullptr, dev_res.get());
+    gko::Array<gko::stopping_status> status(ref_, 2);
+    status.get_data()[0].reset();
+    status.get_data()[1].reset();
+    status.set_executor(hip_);
+    // Evaluates the criterion on whatever dev_res currently holds.
+    auto stops = [&] {
+        return criterion->update()
+            .residual_norm(dev_res.get())
+            .check(stopping_id, true, &status, &changed);
+    };
+    // The starting residuals must not trigger a stop.
+    ASSERT_FALSE(stops());
+
+    // Only entry (0, 0) is lowered: no overall stop, but status 0 converges.
+    host_res->at(0, 0) = tol * 0.9e+1;
+    dev_res->copy_from(host_res.get());
+    ASSERT_FALSE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    status.set_executor(hip_);
+    ASSERT_TRUE(changed);
+
+    // Entry (0, 1) is lowered as well: the check now reports a stop.
+    host_res->at(0, 1) = tol * 0.9e+1;
+    dev_res->copy_from(host_res.get());
+    ASSERT_TRUE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[1].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+class AbsoluteResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+
+    // The factory is built on the HIP executor with `tol` as tolerance.
+    AbsoluteResidualNorm()
+    {
+        this->ref_ = gko::ReferenceExecutor::create();
+        this->hip_ = gko::HipExecutor::create(0, this->ref_);
+        this->factory_ = gko::stop::AbsoluteResidualNorm<>::build()
+                             .with_tolerance(tol).on(this->hip_);
+    }
+
+    std::unique_ptr<gko::stop::AbsoluteResidualNorm<>::Factory> factory_;
+    std::shared_ptr<const gko::HipExecutor> hip_;
+    std::shared_ptr<gko::ReferenceExecutor> ref_;
+};
+
+
+TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoal)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto host_res = gko::initialize<Mtx>({100.0}, ref_);
+    auto dev_res = Mtx::create(hip_);
+    dev_res->copy_from(host_res.get());
+    std::shared_ptr<gko::LinOp> host_rhs = gko::initialize<Mtx>({10.0}, ref_);
+    std::shared_ptr<gko::LinOp> dev_rhs = Mtx::create(hip_);
+    dev_rhs->copy_from(host_rhs.get());
+    auto criterion =
+        factory_->generate(nullptr, dev_rhs, nullptr, dev_res.get());
+    gko::Array<gko::stopping_status> status(ref_, 1);
+    status.get_data()[0].reset();
+    status.set_executor(hip_);
+    // Evaluates the criterion on whatever dev_res currently holds.
+    auto stops = [&] {
+        return criterion->update()
+            .residual_norm(dev_res.get())
+            .check(stopping_id, true, &status, &changed);
+    };
+    // The starting residual must not trigger a stop.
+    ASSERT_FALSE(stops());
+
+    // This value is expected to still miss the goal: no stop, no change.
+    host_res->at(0) = tol * 1.1;
+    dev_res->copy_from(host_res.get());
+    ASSERT_FALSE(stops());
+    status.set_executor(ref_);
+    ASSERT_FALSE(status.get_data()[0].has_converged());
+    status.set_executor(hip_);
+    ASSERT_FALSE(changed);
+
+    // This value is expected to meet the goal: stop, converged, changed.
+    host_res->at(0) = tol * 0.9;
+    dev_res->copy_from(host_res.get());
+    ASSERT_TRUE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    constexpr gko::uint8 stopping_id{1};
+    bool changed = false;
+    auto host_res = gko::initialize<Mtx>({{100.0, 100.0}}, ref_);
+    auto dev_res = Mtx::create(hip_);
+    dev_res->copy_from(host_res.get());
+    std::shared_ptr<gko::LinOp> host_rhs =
+        gko::initialize<Mtx>({{10.0, 10.0}}, ref_);
+    std::shared_ptr<gko::LinOp> dev_rhs = Mtx::create(hip_);
+    dev_rhs->copy_from(host_rhs.get());
+    auto criterion =
+        factory_->generate(nullptr, dev_rhs, nullptr, dev_res.get());
+    gko::Array<gko::stopping_status> status(ref_, 2);
+    status.get_data()[0].reset();
+    status.get_data()[1].reset();
+    status.set_executor(hip_);
+    // Evaluates the criterion on whatever dev_res currently holds.
+    auto stops = [&] {
+        return criterion->update()
+            .residual_norm(dev_res.get())
+            .check(stopping_id, true, &status, &changed);
+    };
+    // The starting residuals must not trigger a stop.
+    ASSERT_FALSE(stops());
+
+    // Only entry (0, 0) is lowered: no overall stop, but status 0 converges.
+    host_res->at(0, 0) = tol * 0.9;
+    dev_res->copy_from(host_res.get());
+    ASSERT_FALSE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    status.set_executor(hip_);
+    ASSERT_TRUE(changed);
+
+    // Entry (0, 1) is lowered as well: the check now reports a stop.
+    host_res->at(0, 1) = tol * 0.9;
+    dev_res->copy_from(host_res.get());
+    ASSERT_TRUE(stops());
+    status.set_executor(ref_);
+    ASSERT_TRUE(status.get_data()[1].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+}  // namespace
diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp
new file mode 100644
index 00000000000..8d524b0b615
--- /dev/null
+++ b/hip/test/utils.hip.hpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_TEST_UTILS_HIP_HPP_
+#define GKO_HIP_TEST_UTILS_HIP_HPP_
+
+
+#include "core/test/utils.hpp"
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace {
+
+
+// kept alive at namespace scope to prevent device reset after each test
+auto no_reset_exec =
+    gko::HipExecutor::create(0, gko::ReferenceExecutor::create(), true);
+
+
+}  // namespace
+
+
+#endif  // GKO_HIP_TEST_UTILS_HIP_HPP_
diff --git a/hip/test/utils/CMakeLists.txt b/hip/test/utils/CMakeLists.txt
new file mode 100644
index 00000000000..a6c52f65d9c
--- /dev/null
+++ b/hip/test/utils/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_hip_test(assertions_test)  # sibling source: assertions_test.hip.cpp
diff --git a/hip/test/utils/assertions_test.hip.cpp b/hip/test/utils/assertions_test.hip.cpp
new file mode 100644
index 00000000000..2d5c67addc1
--- /dev/null
+++ b/hip/test/utils/assertions_test.hip.cpp
@@ -0,0 +1,84 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/test/utils/assertions.hpp"
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class MatricesNear : public ::testing::Test {
+protected:
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        this->ref = gko::ReferenceExecutor::create();
+        this->hip = gko::HipExecutor::create(0, this->ref);
+    }
+    // hip is still null if the device-count assertion in SetUp failed.
+    void TearDown()
+    {
+        if (this->hip) {
+            ASSERT_NO_THROW(this->hip->synchronize());
+        }
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+};
+
+
+TEST_F(MatricesNear, CanPassHipMatrix)
+{
+    auto dense = gko::initialize<gko::matrix::Dense<>>(
+        {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, ref);
+    auto host_csr = gko::matrix::Csr<>::create(ref);
+    host_csr->copy_from(dense.get());
+    auto hip_csr = gko::matrix::Csr<>::create(hip);
+    hip_csr->copy_from(std::move(host_csr));
+    // Both macro flavours must accept a matrix created on the HIP executor.
+    GKO_EXPECT_MTX_NEAR(hip_csr, dense, 0.0);
+    GKO_ASSERT_MTX_NEAR(hip_csr, dense, 0.0);
+}
+
+
+}  // namespace
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 00000000000..004b7b359ad
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,6 @@
+if(GINKGO_CHECK_CIRCULAR_DEPS)
+    add_library(ginkgo_public_api INTERFACE)  # dummy target: include paths only
+    target_include_directories(ginkgo_public_api INTERFACE
+        "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
+    ginkgo_check_headers(ginkgo_public_api)
+endif()
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index aeb8cb30712..fc5bae0b225 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,6 +59,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS
 
 
+/* What is HIP compiled for, hcc or nvcc? */
+// clang-format off
+#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
+
+
+#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@
+// clang-format on
+
+
 /* Is PAPI SDE available for Logging? */
 // clang-format off
 #define GKO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@
diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp
index 58a3407d3fc..2db193027a1 100644
--- a/include/ginkgo/core/base/abstract_factory.hpp
+++ b/include/ginkgo/core/base/abstract_factory.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp
index 1084ad324bb..5d53eeb58bb 100644
--- a/include/ginkgo/core/base/array.hpp
+++ b/include/ginkgo/core/base/array.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,15 +30,18 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_BASE_ARRAY_H_
-#define GKO_CORE_BASE_ARRAY_H_
+#ifndef GKO_CORE_BASE_ARRAY_HPP_
+#define GKO_CORE_BASE_ARRAY_HPP_
 
 
+#include <algorithm>
+#include <iterator>
 #include <memory>
 #include <utility>
 
 
 #include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
@@ -47,6 +50,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
+namespace detail {
+
+
+/**
+ * @internal
+ *
+ * Converts `size` elements of type `SourceType` stored at `src` on `exec`
+ * into `TargetType` values that are written to `dst`.
+ */
+template <typename SourceType, typename TargetType>
+void convert_data(std::shared_ptr<const Executor> exec, size_type size,
+                  const SourceType *src, TargetType *dst);
+
+
+}  // namespace detail
+
+
 /**
  * An Array is a container which encapsulates fixed-sized arrays, stored on the
  * Executor tied to the Array.
@@ -178,11 +198,8 @@ class Array {
           RandomAccessIterator end)
         : Array(exec)
     {
-        Array tmp(exec->get_master(), end - begin);
-        int i = 0;
-        for (auto it = begin; it != end; ++it, ++i) {
-            tmp.data_[i] = *it;
-        }
+        Array tmp(exec->get_master(), std::distance(begin, end));
+        std::copy(begin, end, tmp.data_.get());
         *this = std::move(tmp);
     }
 
@@ -255,7 +272,8 @@ class Array {
      * Creates an Array from existing memory.
      *
      * The Array does not take ownership of the memory, and will not deallocate
-     * it once it goes out of scope.
+     * it once it goes out of scope. This array type cannot use the function
+     * `resize_and_reset` since it does not own the data it should resize.
      *
      * @param exec  executor where `data` is located
      * @param num_elems  number of elements in `data`
@@ -270,7 +288,10 @@ class Array {
     }
 
     /**
-     * Copies data from another array.
+     * Copies data from another array or view. In the case of an array target,
+     * the array is resized to match the source's size. In the case of a view
+     * target, if the dimensions are not compatible a gko::OutOfBoundsError is
+     * thrown.
      *
      * This does not invoke the constructors of the elements, instead they are
      * copied as POD types.
@@ -292,17 +313,39 @@ class Array {
             data_ = data_manager{nullptr, other.data_.get_deleter()};
         }
         if (other.get_executor() == nullptr) {
-            this->resize_and_reset(0);
+            this->clear();
             return *this;
         }
-        this->resize_and_reset(other.get_num_elems());
-        exec_->copy_from(other.get_executor().get(), num_elems_,
+
+        if (this->is_owning()) {
+            this->resize_and_reset(other.get_num_elems());
+        } else {
+            GKO_ENSURE_COMPATIBLE_BOUNDS(other.get_num_elems(),
+                                         this->num_elems_);
+        }
+        exec_->copy_from(other.get_executor().get(), other.get_num_elems(),
                          other.get_const_data(), this->get_data());
         return *this;
     }
 
     /**
-     * Moves data from another array.
+     * Moves data from another array or view. Only the pointer and deleter type
+     * change, a copy only happens when targeting another executor's data. This
+     * means that in the following situation:
+     * ```cpp
+     *   gko::Array<int> a; // an existing array or view
+     *   gko::Array<int> b; // an existing array or view
+     *   b = std::move(a);
+     * ```
+     * Depending on whether `a` and `b` are array or view, this happens:
+     * + `a` and `b` are views, `b` becomes the only valid view of `a`;
+     * + `a` and `b` are arrays, `b` becomes the only valid array of `a`;
+     * + `a` is a view and `b` is an array, `b` frees its data and becomes the
+     *    only valid view of `a`;
+     * + `a` is an array and `b` is a view, `b` becomes the only valid array
+     *    of `a`.
+     *
+     * In all the previous cases, `a` becomes invalid (e.g., a `nullptr`).
      *
      * This does not invoke the constructors of the elements, instead they are
      * copied as POD types.
@@ -324,22 +367,70 @@ class Array {
             data_ = data_manager{nullptr, other.data_.get_deleter()};
         }
         if (other.get_executor() == nullptr) {
-            this->resize_and_reset(0);
+            this->clear();
             return *this;
         }
-        if (exec_ == other.get_executor() &&
-            data_.get_deleter().target_type() != typeid(view_deleter)) {
-            // same device and not a view, only move the pointer
+        if (exec_ == other.get_executor()) {
+            // same device, only move the pointer
             using std::swap;
             swap(data_, other.data_);
             swap(num_elems_, other.num_elems_);
+            other.clear();
         } else {
-            // different device or a view, copy the data
+            // different device, copy the data
             *this = other;
         }
         return *this;
     }
 
+    /**
+     * Assigns the contents of an Array holding a different value type,
+     * converting every element to `ValueType`. An owning target is resized to
+     * the size of `other`; a view target keeps its storage and a
+     * gko::OutOfBoundsError is thrown if `other` does not fit into it.
+     *
+     * Element constructors are not invoked, the values are handled as POD
+     * types.
+     *
+     * This keeps its executor. If it has none yet, it takes over the executor
+     * of `other`; if `other` has no executor, this is cleared instead.
+     *
+     * @tparam OtherValueType  the value type of `other`
+     * @param other  the Array to copy from
+     *
+     * @return this
+     */
+    template <typename OtherValueType>
+    xstd::enable_if_t<!std::is_same<ValueType, OtherValueType>::value, Array>
+        &operator=(const Array<OtherValueType> &other)
+    {
+        if (exec_ == nullptr) {
+            exec_ = other.get_executor();
+            data_ = data_manager{nullptr, default_deleter{exec_}};
+        }
+        if (other.get_executor() == nullptr) {
+            this->clear();
+            return *this;
+        }
+        const auto count = other.get_num_elems();
+        if (!this->is_owning()) {
+            GKO_ENSURE_COMPATIBLE_BOUNDS(count, num_elems_);
+        } else {
+            this->resize_and_reset(count);
+        }
+        // data from another executor is copied to exec_ before converting
+        const bool same_exec = (exec_ == other.get_executor());
+        Array<OtherValueType> staged{exec_};
+        if (!same_exec) {
+            staged = other;
+        }
+        detail::convert_data(
+            exec_, count,
+            same_exec ? other.get_const_data() : staged.get_const_data(),
+            this->get_data());
+        return *this;
+    }
+
     /**
      * Deallocates all data used by the Array.
      *
@@ -355,6 +446,8 @@ class Array {
 
     /**
      * Resizes the array so it is able to hold the specified number of elements.
+     * For a view and other non-owning Array types, this throws an exception
+     * since these types cannot be resized.
      *
      * All data stored in the array will be lost.
      *
@@ -372,11 +465,16 @@ class Array {
             throw gko::NotSupported(__FILE__, __LINE__, __func__,
                                     "gko::Executor (nullptr)");
         }
-        num_elems_ = num_elems;
-        if (num_elems > 0) {
+        if (!this->is_owning()) {
+            throw gko::NotSupported(__FILE__, __LINE__, __func__,
+                                    "Non owning gko::Array cannot be resized.");
+        }
+
+        if (num_elems > 0 && this->is_owning()) {
+            num_elems_ = num_elems;
             data_.reset(exec_->alloc<value_type>(num_elems));
         } else {
-            data_.reset(nullptr);
+            this->clear();
         }
     }
 
@@ -433,7 +531,28 @@ class Array {
         data_ = std::move(tmp.data_);
     }
 
+    /**
+     * Tells whether this Array owns its data or not.
+     *
+     * Views do not own their data and this has multiple implications. They
+     * cannot be resized since the data is not owned by the Array which stores a
+     * view. It is also unclear whether custom deleter types are owning types as
+     * they could be a user-created view-type, therefore only proper Arrays
+     * which use the `default_deleter` are considered owning types.
+     *
+     * @return whether this Array can be resized or not.
+     */
+    bool is_owning() const
+    {
+        return data_.get_deleter().target_type() == typeid(default_deleter);
+    }
+
+
 private:
+    // Allow other Array types to access private members
+    template <typename OtherValueType>
+    friend class Array;
+
     using data_manager =
         std::unique_ptr<value_type[], std::function<void(value_type[])>>;
 
@@ -446,4 +565,4 @@ class Array {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_BASE_ARRAY_H_
+#endif  // GKO_CORE_BASE_ARRAY_HPP_
diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp
index 330a11f8cd3..908013a3e57 100644
--- a/include/ginkgo/core/base/combination.hpp
+++ b/include/ginkgo/core/base/combination.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -53,12 +53,14 @@ namespace gko {
  */
 template <typename ValueType = default_precision>
 class Combination : public EnableLinOp<Combination<ValueType>>,
-                    public EnableCreateMethod<Combination<ValueType>> {
+                    public EnableCreateMethod<Combination<ValueType>>,
+                    public Transposable {
     friend class EnablePolymorphicObject<Combination, LinOp>;
     friend class EnableCreateMethod<Combination>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Combination<ValueType>;
 
     /**
      * Returns a list of coefficients of the combination.
@@ -82,6 +84,10 @@ class Combination : public EnableLinOp<Combination<ValueType>>,
         return operators_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
 protected:
     /**
      * Creates an empty linear combination (0x0 operator).
diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp
index 740c3e235f9..4a7ecc7874f 100644
--- a/include/ginkgo/core/base/composition.hpp
+++ b/include/ginkgo/core/base/composition.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -48,18 +48,27 @@ namespace gko {
  * The Composition class can be used to compose linear operators `op1, op2, ...,
  * opn` and obtain the operator `op1 * op2 * ... * opn`.
  *
+ * All LinOps of the Composition must operate on Dense inputs.
+ * For an operator `op_k` that requires an initial guess for its `apply`,
+ * Composition provides either
+ * * the output of the previous `op_{k+1}->apply` if `op_k` has square dimension
+ * * zero if `op_k` is rectangular
+ * as an initial guess.
+ *
  * @tparam ValueType  precision of input and result vectors
  *
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
 class Composition : public EnableLinOp<Composition<ValueType>>,
-                    public EnableCreateMethod<Composition<ValueType>> {
+                    public EnableCreateMethod<Composition<ValueType>>,
+                    public Transposable {
     friend class EnablePolymorphicObject<Composition, LinOp>;
     friend class EnableCreateMethod<Composition>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Composition<ValueType>;
 
     /**
      * Returns a list of operators of the composition.
@@ -72,6 +81,10 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
         return operators_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
 protected:
     /**
      * Creates an empty operator composition (0x0 operator).
@@ -79,7 +92,7 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
      * @param exec  Executor associated to the composition
      */
     explicit Composition(std::shared_ptr<const Executor> exec)
-        : EnableLinOp<Composition>(exec)
+        : EnableLinOp<Composition>(exec), storage_{exec}
     {}
 
     /**
@@ -101,6 +114,7 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
               }
               return (*begin)->get_executor();
           }()),
+          storage_{(*begin)->get_executor()},
           operators_(begin, end)
     {
         this->set_size(gko::dim<2>{operators_.front()->get_size()[0],
@@ -138,7 +152,8 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
      */
     explicit Composition(std::shared_ptr<const LinOp> oper)
         : EnableLinOp<Composition>(oper->get_executor(), oper->get_size()),
-          operators_{oper}
+          operators_{oper},
+          storage_{oper->get_executor()}
     {}
 
     void apply_impl(const LinOp *b, LinOp *x) const override;
@@ -148,18 +163,7 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
 
 private:
     std::vector<std::shared_ptr<const LinOp>> operators_;
-
-    // TODO: solve race conditions when multithreading
-    mutable struct cache_struct {
-        cache_struct() = default;
-        ~cache_struct() = default;
-        cache_struct(const cache_struct &other) {}
-        cache_struct &operator=(const cache_struct &other) { return *this; }
-
-        // TODO: reduce the amount of intermediate vectors we need (careful --
-        //       not all of them are of the same size)
-        std::vector<std::unique_ptr<LinOp>> intermediate;
-    } cache_;
+    mutable Array<ValueType> storage_;
 };
 
 
diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp
index 8c0155aee44..c0256df30dc 100644
--- a/include/ginkgo/core/base/dim.hpp
+++ b/include/ginkgo/core/base/dim.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -97,7 +97,8 @@ struct dim {
     constexpr GKO_ATTRIBUTES const dimension_type &operator[](
         const size_type &dimension) const noexcept
     {
-        return GKO_ASSERT(dimension < dimensionality), *(&first_ + dimension);
+        return GKO_ASSERT(dimension < dimensionality),
+               dimension == 0 ? first_ : rest_[dimension - 1];
     }
 
     /**
@@ -106,7 +107,8 @@ struct dim {
     GKO_ATTRIBUTES dimension_type &operator[](
         const size_type &dimension) noexcept
     {
-        return GKO_ASSERT(dimension < dimensionality), *(&first_ + dimension);
+        return GKO_ASSERT(dimension < dimensionality),
+               dimension == 0 ? first_ : rest_[dimension - 1];
     }
 
     /**
@@ -173,12 +175,12 @@ struct dim<1u, DimensionType> {
     constexpr GKO_ATTRIBUTES const dimension_type &operator[](
         const size_type &dimension) const noexcept
     {
-        return *(&first_ + dimension);
+        return GKO_ASSERT(dimension == 0), first_;
     }
 
     GKO_ATTRIBUTES dimension_type &operator[](const size_type &dimension)
     {
-        return *(&first_ + dimension);
+        return GKO_ASSERT(dimension == 0), first_;
     }
 
     constexpr GKO_ATTRIBUTES operator bool() const
diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp
index 1855a9dc5c7..78fe81a617e 100644
--- a/include/ginkgo/core/base/exception.hpp
+++ b/include/ginkgo/core/base/exception.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,17 +30,17 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_EXCEPTION_HPP_
-#define GKO_CORE_EXCEPTION_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
+#ifndef GKO_CORE_BASE_EXCEPTION_HPP_
+#define GKO_CORE_BASE_EXCEPTION_HPP_
 
 
 #include <exception>
 #include <string>
 
 
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 
 
@@ -88,9 +88,9 @@ class Error : public std::exception {
     /**
      * Initializes an error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param what The error message
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param what  The error message
      */
     Error(const std::string &file, int line, const std::string &what)
         : what_(file + ":" + std::to_string(line) + ": " + what)
@@ -116,9 +116,9 @@ class NotImplemented : public Error {
     /**
      * Initializes a NotImplemented error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the not-yet implemented function
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the not-yet implemented function
      */
     NotImplemented(const std::string &file, int line, const std::string &func)
         : Error(file, line, func + " is not implemented")
@@ -135,10 +135,10 @@ class NotCompiled : public Error {
     /**
      * Initializes a NotCompiled error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the function that has not been compiled
-     * @param module The name of the module which contains the function
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the function that has not been compiled
+     * @param module  The name of the module which contains the function
      */
     NotCompiled(const std::string &file, int line, const std::string &func,
                 const std::string &module)
@@ -158,10 +158,10 @@ class NotSupported : public Error {
     /**
      * Initializes a NotSupported error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the function where the error occured
-     * @param obj_type The object type on which the requested operation
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the function where the error occurred
+     * @param obj_type  The object type on which the requested operation
                        cannot be performed.
      */
     NotSupported(const std::string &file, int line, const std::string &func,
@@ -181,10 +181,10 @@ class CudaError : public Error {
     /**
      * Initializes a CUDA error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the CUDA routine that failed
-     * @param error_code The resulting CUDA error code
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the CUDA routine that failed
+     * @param error_code  The resulting CUDA error code
      */
     CudaError(const std::string &file, int line, const std::string &func,
               int64 error_code)
@@ -204,10 +204,10 @@ class CublasError : public Error {
     /**
      * Initializes a cuBLAS error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the cuBLAS routine that failed
-     * @param error_code The resulting cuBLAS error code
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the cuBLAS routine that failed
+     * @param error_code  The resulting cuBLAS error code
      */
     CublasError(const std::string &file, int line, const std::string &func,
                 int64 error_code)
@@ -227,10 +227,10 @@ class CusparseError : public Error {
     /**
      * Initializes a cuSPARSE error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the cuSPARSE routine that failed
-     * @param error_code The resulting cuSPARSE error code
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the cuSPARSE routine that failed
+     * @param error_code  The resulting cuSPARSE error code
      */
     CusparseError(const std::string &file, int line, const std::string &func,
                   int64 error_code)
@@ -242,6 +242,76 @@ class CusparseError : public Error {
 };
 
 
+/**
+ * HipError is thrown when a HIP routine throws a non-zero error code.
+ */
+class HipError : public Error {
+public:
+    /**
+     * Initializes a HIP error.
+     *
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the HIP routine that failed
+     * @param error_code  The resulting HIP error code
+     */
+    HipError(const std::string &file, int line, const std::string &func,
+             int64 error_code)
+        : Error(file, line, func + ": " + get_error(error_code))
+    {}
+
+private:
+    static std::string get_error(int64 error_code);
+};
+
+
+/**
+ * HipblasError is thrown when a hipBLAS routine throws a non-zero error code.
+ */
+class HipblasError : public Error {
+public:
+    /**
+     * Initializes a hipBLAS error.
+     *
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the hipBLAS routine that failed
+     * @param error_code  The resulting hipBLAS error code
+     */
+    HipblasError(const std::string &file, int line, const std::string &func,
+                 int64 error_code)
+        : Error(file, line, func + ": " + get_error(error_code))
+    {}
+
+private:
+    static std::string get_error(int64 error_code);
+};
+
+
+/**
+ * HipsparseError is thrown when a hipSPARSE routine throws a non-zero error
+ * code.
+ */
+class HipsparseError : public Error {
+public:
+    /**
+     * Initializes a hipSPARSE error.
+     *
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the hipSPARSE routine that failed
+     * @param error_code  The resulting hipSPARSE error code
+     */
+    HipsparseError(const std::string &file, int line, const std::string &func,
+                   int64 error_code)
+        : Error(file, line, func + ": " + get_error(error_code))
+    {}
+
+private:
+    static std::string get_error(int64 error_code);
+};
+
+
 /**
  * DimensionMismatch is thrown if an operation is being applied to LinOps of
  * incompatible size.
@@ -251,16 +321,16 @@ class DimensionMismatch : public Error {
     /**
      * Initializes a dimension mismatch error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The function name where the error occurred
-     * @param first_name The name of the first operator
-     * @param first_rows The output dimension of the first operator
-     * @param first_cols The input dimension of the first operator
-     * @param second_name The name of the second operator
-     * @param second_rows The output dimension of the second operator
-     * @param second_cols The input dimension of the second operator
-     * @param clarification An additional message describing the error further
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The function name where the error occurred
+     * @param first_name  The name of the first operator
+     * @param first_rows  The output dimension of the first operator
+     * @param first_cols  The input dimension of the first operator
+     * @param second_name  The name of the second operator
+     * @param second_rows  The output dimension of the second operator
+     * @param second_cols  The input dimension of the second operator
+     * @param clarification  An additional message describing the error further
      */
     DimensionMismatch(const std::string &file, int line,
                       const std::string &func, const std::string &first_name,
@@ -286,13 +356,13 @@ class BadDimension : public Error {
     /**
      * Initializes a bad dimension error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The function name where the error occurred
-     * @param op_name The name of the operator
-     * @param op_num_rows The row dimension of the operator
-     * @param op_num_cols The column dimension of the operator
-     * @param clarification An additional message further describing the error
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The function name where the error occurred
+     * @param op_name  The name of the operator
+     * @param op_num_rows  The row dimension of the operator
+     * @param op_num_cols  The column dimension of the operator
+     * @param clarification  An additional message further describing the error
      */
     BadDimension(const std::string &file, int line, const std::string &func,
                  const std::string &op_name, size_type op_num_rows,
@@ -313,12 +383,12 @@ class ValueMismatch : public Error {
     /**
      * Initializes a value mismatch error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The function name where the error occurred
-     * @param val1 The first value to be compared.
-     * @param val2 The second value to be compared.
-     * @param clarification An additional message further describing the error
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The function name where the error occurred
+     * @param val1  The first value to be compared.
+     * @param val2  The second value to be compared.
+     * @param clarification  An additional message further describing the error
      */
     ValueMismatch(const std::string &file, int line, const std::string &func,
                   size_type val1, size_type val2,
@@ -338,10 +408,10 @@ class AllocationError : public Error {
     /**
      * Initializes an allocation error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param device The device on which the error occurred
-     * @param bytes The size of the memory block whose allocation failed.
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param device  The device on which the error occurred
+     * @param bytes  The size of the memory block whose allocation failed.
      */
     AllocationError(const std::string &file, int line,
                     const std::string &device, size_type bytes)
@@ -384,10 +454,10 @@ class StreamError : public Error {
     /**
      * Initializes a file access error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the function that tried to access the file
-     * @param message The error message
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the function that tried to access the file
+     * @param message  The error message
      */
     StreamError(const std::string &file, int line, const std::string &func,
                 const std::string &message)
@@ -405,9 +475,9 @@ class KernelNotFound : public Error {
     /**
      * Initializes a KernelNotFound error.
      *
-     * @param file The name of the offending source file
-     * @param line The source code line number where the error occurred
-     * @param func The name of the function where the error occurred
+     * @param file  The name of the offending source file
+     * @param line  The source code line number where the error occurred
+     * @param func  The name of the function where the error occurred
      */
     KernelNotFound(const std::string &file, int line, const std::string &func)
         : Error(file, line, func + ": unable to find an eligible kernel")
@@ -418,4 +488,4 @@ class KernelNotFound : public Error {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_EXCEPTION_HPP_
+#endif  // GKO_CORE_BASE_EXCEPTION_HPP_
diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp
index 41225f06c98..774ff3fda07 100644
--- a/include/ginkgo/core/base/exception_helpers.hpp
+++ b/include/ginkgo/core/base/exception_helpers.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_EXCEPTION_HELPERS_HPP_
-#define GKO_CORE_EXCEPTION_HELPERS_HPP_
+#ifndef GKO_CORE_BASE_EXCEPTION_HELPERS_HPP_
+#define GKO_CORE_BASE_EXCEPTION_HELPERS_HPP_
+
+
+#include <typeinfo>
 
 
 #include <ginkgo/core/base/dim.hpp>
@@ -39,9 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/name_demangling.hpp>
 
 
-#include <typeinfo>
-
-
 namespace gko {
 
 
@@ -88,6 +88,34 @@ namespace gko {
                   "semi-colon warnings")
 
 
+namespace detail {
+
+
+template <typename T, typename T2 = void>
+struct dynamic_type_helper {
+    static const std::type_info &get(const T &obj) { return typeid(obj); }
+};
+
+template <typename T>
+struct dynamic_type_helper<T,
+                           typename std::enable_if<std::is_pointer<T>::value ||
+                                                   have_ownership<T>()>::type> {
+    static const std::type_info &get(const T &obj)
+    {
+        return obj ? typeid(*obj) : typeid(nullptr);
+    }
+};
+
+template <typename T>
+const std::type_info &get_dynamic_type(const T &obj)
+{
+    return dynamic_type_helper<T>::get(obj);
+}
+
+
+}  // namespace detail
+
+
 /**
  * Throws a NotSupported exception.
  * This macro sets the correct information about the location of the error
@@ -95,14 +123,14 @@ namespace gko {
  *
  * @param _obj  the object referenced by NotSupported exception
  */
-#define GKO_NOT_SUPPORTED(_obj)                                              \
-    {                                                                        \
-        throw ::gko::NotSupported(                                           \
-            __FILE__, __LINE__, __func__,                                    \
-            ::gko::name_demangling::get_type_name(typeid(_obj)));            \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
+#define GKO_NOT_SUPPORTED(_obj)                                                \
+    {                                                                          \
+        throw ::gko::NotSupported(__FILE__, __LINE__, __func__,                \
+                                  ::gko::name_demangling::get_type_name(       \
+                                      ::gko::detail::get_dynamic_type(_obj))); \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
                   "semi-colon warnings")
 
 
@@ -280,7 +308,7 @@ inline dim<2> get_size(const dim<2> &size) { return size; }
 /**
  * Asserts that a cuBLAS library call completed without errors.
  *
- * @param _cuda_call  a library call expression
+ * @param _cublas_call  a library call expression
  */
 #define GKO_ASSERT_NO_CUBLAS_ERRORS(_cublas_call) \
     do {                                          \
@@ -294,7 +322,7 @@ inline dim<2> get_size(const dim<2> &size) { return size; }
 /**
  * Asserts that a cuSPARSE library call completed without errors.
  *
- * @param _cuda_call  a library call expression
+ * @param _cusparse_call  a library call expression
  */
 #define GKO_ASSERT_NO_CUSPARSE_ERRORS(_cusparse_call) \
     do {                                              \
@@ -305,6 +333,75 @@ inline dim<2> get_size(const dim<2> &size) { return size; }
     } while (false)
 
 
+/**
+ * Instantiates a HipError.
+ *
+ * @param _errcode  The error code returned from a HIP runtime API routine.
+ */
+#define GKO_HIP_ERROR(_errcode) \
+    ::gko::HipError(__FILE__, __LINE__, __func__, _errcode)
+
+
+/**
+ * Instantiates a HipblasError.
+ *
+ * @param _errcode  The error code returned from the HIPBLAS routine.
+ */
+#define GKO_HIPBLAS_ERROR(_errcode) \
+    ::gko::HipblasError(__FILE__, __LINE__, __func__, _errcode)
+
+
+/**
+ * Instantiates a HipsparseError.
+ *
+ * @param _errcode  The error code returned from the HIPSPARSE routine.
+ */
+#define GKO_HIPSPARSE_ERROR(_errcode) \
+    ::gko::HipsparseError(__FILE__, __LINE__, __func__, _errcode)
+
+
+/**
+ * Asserts that a HIP library call completed without errors.
+ *
+ * @param _hip_call  a library call expression
+ */
+#define GKO_ASSERT_NO_HIP_ERRORS(_hip_call) \
+    do {                                    \
+        auto _errcode = _hip_call;          \
+        if (_errcode != hipSuccess) {       \
+            throw GKO_HIP_ERROR(_errcode);  \
+        }                                   \
+    } while (false)
+
+
+/**
+ * Asserts that a HIPBLAS library call completed without errors.
+ *
+ * @param _hipblas_call  a library call expression
+ */
+#define GKO_ASSERT_NO_HIPBLAS_ERRORS(_hipblas_call) \
+    do {                                            \
+        auto _errcode = _hipblas_call;              \
+        if (_errcode != HIPBLAS_STATUS_SUCCESS) {   \
+            throw GKO_HIPBLAS_ERROR(_errcode);      \
+        }                                           \
+    } while (false)
+
+
+/**
+ * Asserts that a HIPSPARSE library call completed without errors.
+ *
+ * @param _hipsparse_call  a library call expression
+ */
+#define GKO_ASSERT_NO_HIPSPARSE_ERRORS(_hipsparse_call) \
+    do {                                                \
+        auto _errcode = _hipsparse_call;                \
+        if (_errcode != HIPSPARSE_STATUS_SUCCESS) {     \
+            throw GKO_HIPSPARSE_ERROR(_errcode);        \
+        }                                               \
+    } while (false)
+
+
 namespace detail {
 
 
@@ -357,6 +454,25 @@ inline T ensure_allocated_impl(T ptr, const std::string &file, int line,
                   "semi-colon warnings")
 
 
+/**
+ * Ensures that two dimensions have compatible bounds, in particular before a
+ * copy operation. This means the target should have at least as many elements
+ * as the source.
+ *
+ * @param _source  the source of the expected copy operation
+ * @param _target  the destination of the expected copy operation
+ *
+ * @throw OutOfBoundsError  if `_source > _target`
+ */
+#define GKO_ENSURE_COMPATIBLE_BOUNDS(_source, _target)                       \
+    if ((_source) > (_target)) {                                             \
+        throw ::gko::OutOfBoundsError(__FILE__, __LINE__, _source, _target); \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
 /**
  * Creates a StreamError exception.
  * This macro sets the correct information about the location of the error
@@ -389,4 +505,4 @@ inline T ensure_allocated_impl(T ptr, const std::string &file, int line,
 }  // namespace gko
 
 
-#endif  // GKO_CORE_EXCEPTION_HELPERS_HPP_
+#endif  // GKO_CORE_BASE_EXCEPTION_HELPERS_HPP_
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 115978f9b18..1df29abc59c 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_EXECUTOR_HPP_
-#define GKO_CORE_EXECUTOR_HPP_
+#ifndef GKO_CORE_BASE_EXECUTOR_HPP_
+#define GKO_CORE_BASE_EXECUTOR_HPP_
 
 
 #include <memory>
@@ -50,6 +50,10 @@ struct cublasContext;
 
 struct cusparseContext;
 
+struct hipblasContext;
+
+struct hipsparseContext;
+
 
 namespace gko {
 
@@ -107,6 +111,9 @@ class ExecutorBase;
  *     void run(const gko::CudaExecutor *exec) const override
  *     { os_ << "CUDA(" << exec->get_device_id() << ")"; }
  *
+ *     void run(const gko::HipExecutor *exec) const override
+ *     { os_ << "HIP(" << exec->get_device_id() << ")"; }
+ *
  *     // This is optional, if not overloaded, defaults to OmpExecutor overload
  *     void run(const gko::ReferenceExecutor *) const override
  *     { os_ << "Reference CPU"; }
@@ -134,6 +141,7 @@ class ExecutorBase;
  * auto omp = gko::OmpExecutor::create();
  * std::cout << *omp << std::endl
  *           << *gko::CudaExecutor::create(0, omp) << std::endl
+ *           << *gko::HipExecutor::create(0, omp) << std::endl
  *           << *gko::ReferenceExecutor::create() << std::endl;
  * ```
  *
@@ -142,15 +150,16 @@ class ExecutorBase;
  * ```
  * OMP
  * CUDA(0)
+ * HIP(0)
  * Reference CPU
  * ```
  *
  * One might feel that this code is too complicated for such a simple task.
  * Luckily, there is an overload of the Executor::run() method, which is
  * designed to facilitate writing simple operations like this one. The method
- * takes two closures as input: one which is run for OMP, and the other one for
- * CUDA executors. Using this method, there is no need to implement an Operation
- * subclass:
+ * takes three closures as input: one which is run for OMP, one for
+ * CUDA executors, and the last one for HIP executors. Using this method, there
+ * is no need to implement an Operation subclass:
  *
  * ```
  * std::ostream& operator<<(std::ostream &os, const gko::Executor &exec)
@@ -160,6 +169,10 @@ class ExecutorBase;
  *         [&]() { os << "CUDA("    // CUDA closure
  *                    << static_cast<gko::CudaExecutor&>(exec)
  *                         .get_device_id()
+ *                    << ")"; },
+ *         [&]() { os << "HIP("    // HIP closure
+ *                    << static_cast<gko::HipExecutor&>(exec)
+ *                         .get_device_id()
  *                    << ")"; });
  *     return os;
  * }
@@ -237,7 +250,8 @@ private:                                                                     \
  * kernel when the operation is executed.
  *
  * The kernels used to bind the operation are searched in `kernels::DEV_TYPE`
- * namespace, where `DEV_TYPE` is replaced by `omp`, `cuda` and `reference`.
+ * namespace, where `DEV_TYPE` is replaced by `omp`, `cuda`, `hip` and
+ * `reference`.
  *
  * @param _name  operation name
  * @param _kernel  kernel which will be bound to the operation
@@ -246,7 +260,7 @@ private:                                                                     \
  * -------
  *
  * ```c++
- * // define the omp, cuda and reference kernels which will be bound to the
+ * // define the omp, cuda, hip and reference kernels which will be bound to the
  * // operation
  * namespace kernels {
  * namespace omp {
@@ -259,6 +273,11 @@ private:                                                                     \
  *      // cuda code
  * }
  * }
+ * namespace hip {
+ * void my_kernel(int x) {
+ *      // hip code
+ * }
+ * }
  * namespace reference {
  * void my_kernel(int x) {
  *     // reference code
@@ -272,6 +291,7 @@ private:                                                                     \
  *     // create executors
  *     auto omp = OmpExecutor::create();
  *     auto cuda = CudaExecutor::create(omp, 0);
+ *     auto hip = HipExecutor::create(0, omp);
  *     auto ref = ReferenceExecutor::create();
  *
  *     // create the operation
@@ -279,6 +299,7 @@ private:                                                                     \
  *
  *     omp->run(op);  // run omp kernel
  *     cuda->run(op);  // run cuda kernel
+ *     hip->run(op);  // run hip kernel
  *     ref->run(op);  // run reference kernel
  * }
  * ```
@@ -308,6 +329,7 @@ private:                                                                     \
                                                                              \
         GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(OmpExecutor, omp, _kernel);    \
         GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(CudaExecutor, cuda, _kernel);  \
+        GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(HipExecutor, hip, _kernel);    \
         GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(ReferenceExecutor, reference,  \
                                               _kernel);                      \
                                                                              \
@@ -335,6 +357,8 @@ private:                                                                     \
  *      operations executed on an OpenMP-supporting device (e.g. host CPU);
  * +    CudaExecutor specifies that the data should be stored and the
  *      operations executed on the NVIDIA GPU accelerator;
+ * +    HipExecutor specifies that the data should be stored and the
+ *      operations executed on either an NVIDIA or AMD GPU accelerator;
  * +    ReferenceExecutor executes a non-optimized reference implementation,
  *      which can be used to debug the library.
  *
@@ -433,15 +457,19 @@ class Executor : public log::EnableLogging<Executor> {
      *
      * @tparam ClosureOmp  type of op_omp
      * @tparam ClosureCuda  type of op_cuda
+     * @tparam ClosureHip  type of op_hip
      *
      * @param op_omp  functor to run in case of a OmpExecutor or
      *                ReferenceExecutor
      * @param op_cuda  functor to run in case of a CudaExecutor
+     * @param op_hip  functor to run in case of a HipExecutor
      */
-    template <typename ClosureOmp, typename ClosureCuda>
-    void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda) const
+    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip>
+    void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda,
+             const ClosureHip &op_hip) const
     {
-        LambdaOperation<ClosureOmp, ClosureCuda> op(op_omp, op_cuda);
+        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip> op(op_omp, op_cuda,
+                                                                op_hip);
         this->run(op);
     }
 
@@ -508,6 +536,40 @@ class Executor : public log::EnableLogging<Executor> {
             reinterpret_cast<uintptr>(dest_ptr), num_elems * sizeof(T));
     }
 
+    /**
+     * Copies data within this Executor (source and destination executor match).
+     *
+     * @tparam T  datatype to copy
+     *
+     * @param num_elems  number of elements of type T to copy
+     * @param src_ptr  pointer to a block of memory containing the data to be
+     *                 copied
+     * @param dest_ptr  pointer to an allocated block of memory
+     *                  where the data will be copied to
+     */
+    template <typename T>
+    void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
+    {
+        this->copy_from(this, num_elems, src_ptr, dest_ptr);
+    }
+
+    /**
+     * Retrieves a single element at the given location from executor memory.
+     *
+     * @tparam T  datatype to copy
+     *
+     * @param ptr  the pointer to the element (in this executor) to be copied
+     *
+     * @return a copy of the value stored at ptr
+     */
+    template <typename T>
+    T copy_val_to_host(const T *ptr) const
+    {
+        T out{};  // local buffer receiving the copied value
+        this->get_master()->copy_from(this, 1, ptr, &out);
+        return out;
+    }
+
     /**
      * Returns the master OmpExecutor of this Executor.
      * @return the master OmpExecutor of this Executor.
@@ -577,16 +639,19 @@ class Executor : public log::EnableLogging<Executor> {
 
 private:
     /**
-     * The LambdaOperation class wraps two functor objects into an Operation.
+     * The LambdaOperation class wraps three functor objects into an
+     * Operation.
      *
-     * The first object is called by the OmpExecutor, while the other one by the
-     * CudaExecutor. When run on the ReferenceExecutor, the implementation will
-     * launch the CPU reference version.
+     * The first object is called by the OmpExecutor, the second one by the
+     * CudaExecutor and the last one by the HipExecutor. When run on the
+     * ReferenceExecutor, the implementation will launch the CPU reference
+     * version.
      *
      * @tparam ClosureOmp  the type of the first functor
      * @tparam ClosureCuda  the type of the second functor
+     * @tparam ClosureHip  the type of the third functor
      */
-    template <typename ClosureOmp, typename ClosureCuda>
+    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip>
     class LambdaOperation : public Operation {
     public:
         /**
@@ -595,9 +660,11 @@ class Executor : public log::EnableLogging<Executor> {
          * @param op_omp  a functor object which will be called by OmpExecutor
          *                and ReferenceExecutor
          * @param op_cuda  a functor object which will be called by CudaExecutor
+         * @param op_hip  a functor object which will be called by HipExecutor
          */
-        LambdaOperation(const ClosureOmp &op_omp, const ClosureCuda &op_cuda)
-            : op_omp_(op_omp), op_cuda_(op_cuda)
+        LambdaOperation(const ClosureOmp &op_omp, const ClosureCuda &op_cuda,
+                        const ClosureHip &op_hip)
+            : op_omp_(op_omp), op_cuda_(op_cuda), op_hip_(op_hip)
         {}
 
         void run(std::shared_ptr<const OmpExecutor>) const override
@@ -610,9 +677,15 @@ class Executor : public log::EnableLogging<Executor> {
             op_cuda_();
         }
 
+        void run(std::shared_ptr<const HipExecutor>) const override
+        {
+            op_hip_();
+        }
+
     private:
         ClosureOmp op_omp_;
         ClosureCuda op_cuda_;
+        ClosureHip op_hip_;
     };
 };
 
@@ -710,6 +783,43 @@ class ExecutorBase : public Executor {
 };
 
 
+/**
+ * Stores a boolean flag which controls whether the `DeviceReset` function
+ * should be called. Note that in any case, `DeviceReset` is called only after
+ * destroying the last Ginkgo executor. Therefore, it is sufficient to set this
+ * flag on the last living executor in Ginkgo. Setting this flag on an executor
+ * which is not destroyed last has no effect.
+ */
+class EnableDeviceReset {
+public:
+    /**
+     * Set the device reset capability.
+     *
+     * @param device_reset  whether to allow a device reset or not
+     */
+    void set_device_reset(bool device_reset) { device_reset_ = device_reset; }
+
+    /**
+     * Returns the current status of the device reset boolean for this executor.
+     *
+     * @return the current status of the device reset boolean for this executor.
+     */
+    bool get_device_reset() const { return device_reset_; }
+
+protected:
+    /**
+     * Instantiate an EnableDeviceReset class
+     *
+     * @param device_reset  the starting device_reset status. Defaults to false.
+     */
+    EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset}
+    {}
+
+private:
+    bool device_reset_{};
+};
+
+
 }  // namespace detail
 
 
@@ -803,7 +913,8 @@ using DefaultExecutor = ReferenceExecutor;
  * @ingroup Executor
  */
 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
-                     public std::enable_shared_from_this<CudaExecutor> {
+                     public std::enable_shared_from_this<CudaExecutor>,
+                     public detail::EnableDeviceReset {
     friend class detail::ExecutorBase<CudaExecutor>;
 
 public:
@@ -815,7 +926,8 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      * kernels
      */
     static std::shared_ptr<CudaExecutor> create(
-        int device_id, std::shared_ptr<Executor> master);
+        int device_id, std::shared_ptr<Executor> master,
+        bool device_reset = false);
 
     ~CudaExecutor() { decrease_num_execs(this->device_id_); }
 
@@ -838,9 +950,9 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     static int get_num_devices();
 
     /**
-     * Get the number of cores per SM of this executor.
+     * Get the number of warps per SM of this executor.
      */
-    int get_num_cores_per_sm() const noexcept { return num_cores_per_sm_; }
+    int get_num_warps_per_sm() const noexcept { return num_warps_per_sm_; }
 
     /**
      * Get the number of multiprocessor of this executor.
@@ -852,11 +964,14 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      */
     int get_num_warps() const noexcept
     {
-        constexpr uint32 warp_size = 32;
-        auto warps_per_sm = num_cores_per_sm_ / warp_size;
-        return num_multiprocessor_ * warps_per_sm;
+        return num_multiprocessor_ * num_warps_per_sm_;
     }
 
+    /**
+     * Get the warp size of this executor.
+     */
+    int get_warp_size() const noexcept { return warp_size_; }
+
     /**
      * Get the major verion of compute capability.
      */
@@ -889,13 +1004,16 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     void init_handles();
 
-    CudaExecutor(int device_id, std::shared_ptr<Executor> master)
-        : device_id_(device_id),
+    CudaExecutor(int device_id, std::shared_ptr<Executor> master,
+                 bool device_reset = false)
+        : EnableDeviceReset{device_reset},
+          device_id_(device_id),
           master_(master),
-          num_cores_per_sm_(0),
+          num_warps_per_sm_(0),
           num_multiprocessor_(0),
           major_(0),
-          minor_(0)
+          minor_(0),
+          warp_size_(0)
     {
         assert(device_id < max_devices && device_id >= 0);
         this->set_gpu_property();
@@ -930,10 +1048,11 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 private:
     int device_id_;
     std::shared_ptr<Executor> master_;
-    int num_cores_per_sm_;
+    int num_warps_per_sm_;
     int num_multiprocessor_;
     int major_;
     int minor_;
+    int warp_size_;
 
     template <typename T>
     using handle_manager = std::unique_ptr<T, std::function<void(T *)>>;
@@ -953,10 +1072,176 @@ using DefaultExecutor = CudaExecutor;
 }  // namespace kernels
 
 
+/**
+ * This is the Executor subclass which represents the HIP enhanced device.
+ *
+ * @ingroup exec_hip
+ * @ingroup Executor
+ */
+class HipExecutor : public detail::ExecutorBase<HipExecutor>,
+                    public std::enable_shared_from_this<HipExecutor>,
+                    public detail::EnableDeviceReset {
+    friend class detail::ExecutorBase<HipExecutor>;
+
+public:
+    /**
+     * Creates a new HipExecutor.
+     *
+     * @param device_id  the HIP device id of this device
+     * @param master  an executor on the host that is used to invoke the device
+     *                kernels
+     */
+    static std::shared_ptr<HipExecutor> create(int device_id,
+                                               std::shared_ptr<Executor> master,
+                                               bool device_reset = false);
+
+    ~HipExecutor() { decrease_num_execs(this->device_id_); }
+
+    std::shared_ptr<Executor> get_master() noexcept override;
+
+    std::shared_ptr<const Executor> get_master() const noexcept override;
+
+    void synchronize() const override;
+
+    void run(const Operation &op) const override;
+
+    /**
+     * Get the HIP device id of the device associated to this executor.
+     */
+    int get_device_id() const noexcept { return device_id_; }
+
+    /**
+     * Get the number of devices present on the system.
+     */
+    static int get_num_devices();
+
+    /**
+     * Get the number of warps per SM of this executor.
+     */
+    int get_num_warps_per_sm() const noexcept { return num_warps_per_sm_; }
+
+    /**
+     * Get the number of multiprocessors of this executor.
+     */
+    int get_num_multiprocessor() const noexcept { return num_multiprocessor_; }
+
+    /**
+     * Get the major version of compute capability.
+     */
+    int get_major_version() const noexcept { return major_; }
+
+    /**
+     * Get the minor version of compute capability.
+     */
+    int get_minor_version() const noexcept { return minor_; }
+
+    /**
+     * Get the number of warps of this executor.
+     */
+    int get_num_warps() const noexcept
+    {
+        return num_multiprocessor_ * num_warps_per_sm_;
+    }
+
+    /**
+     * Get the warp size of this executor.
+     */
+    int get_warp_size() const noexcept { return warp_size_; }
+
+    /**
+     * Get the hipblas handle for this executor
+     *
+     * @return  the hipblas handle (hipblasContext*) for this executor
+     */
+    hipblasContext *get_hipblas_handle() const { return hipblas_handle_.get(); }
+
+    /**
+     * Get the hipsparse handle for this executor
+     *
+     * @return the hipsparse handle (hipsparseContext*) for this executor
+     */
+    hipsparseContext *get_hipsparse_handle() const
+    {
+        return hipsparse_handle_.get();
+    }
+
+protected:
+    void set_gpu_property();
+
+    void init_handles();
+
+    HipExecutor(int device_id, std::shared_ptr<Executor> master,
+                bool device_reset = false)
+        : EnableDeviceReset{device_reset},
+          device_id_(device_id),
+          master_(master),
+          num_multiprocessor_(0),
+          num_warps_per_sm_(0),
+          major_(0),
+          minor_(0),
+          warp_size_(0)
+    {
+        assert(device_id < max_devices && device_id >= 0);
+        this->set_gpu_property();
+        this->init_handles();
+        increase_num_execs(device_id);
+    }
+
+    void *raw_alloc(size_type size) const override;
+
+    void raw_free(void *ptr) const noexcept override;
+
+    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
+
+    static void increase_num_execs(int device_id)
+    {
+        std::lock_guard<std::mutex> guard(mutex[device_id]);
+        num_execs[device_id]++;
+    }
+
+    static void decrease_num_execs(int device_id)
+    {
+        std::lock_guard<std::mutex> guard(mutex[device_id]);
+        num_execs[device_id]--;
+    }
+
+    static int get_num_execs(int device_id)
+    {
+        std::lock_guard<std::mutex> guard(mutex[device_id]);
+        return num_execs[device_id];
+    }
+
+private:
+    int device_id_;
+    std::shared_ptr<Executor> master_;
+    int num_multiprocessor_;
+    int num_warps_per_sm_;
+    int major_;
+    int minor_;
+    int warp_size_;
+
+    template <typename T>
+    using handle_manager = std::unique_ptr<T, std::function<void(T *)>>;
+    handle_manager<hipblasContext> hipblas_handle_;
+    handle_manager<hipsparseContext> hipsparse_handle_;
+
+    static constexpr int max_devices = 64;
+    static int num_execs[max_devices];
+    static std::mutex mutex[max_devices];
+};
+
+
+namespace kernels {
+namespace hip {
+using DefaultExecutor = HipExecutor;
+}  // namespace hip
+}  // namespace kernels
+
+
 #undef GKO_OVERRIDE_RAW_COPY_TO
 
 
 }  // namespace gko
 
 
-#endif  // GKO_CORE_EXECUTOR_HPP_
+#endif  // GKO_CORE_BASE_EXECUTOR_HPP_
diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp
index 4a85c73326d..b57a9c1cc1a 100644
--- a/include/ginkgo/core/base/lin_op.hpp
+++ b/include/ginkgo/core/base/lin_op.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef GKO_CORE_BASE_LIN_OP_HPP_
 #define GKO_CORE_BASE_LIN_OP_HPP_
 
+
+#include <memory>
+#include <utility>
+
+
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -44,10 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/log/logger.hpp>
 
 
-#include <memory>
-#include <utility>
-
-
 namespace gko {
 
 
@@ -220,6 +221,15 @@ class LinOp : public EnableAbstractPolymorphicObject<LinOp> {
      */
     const dim<2> &get_size() const noexcept { return size_; }
 
+    /**
+     * Returns true if the linear operator uses the data given in x as
+     * an initial guess. Returns false otherwise.
+     *
+     * @return true if the linear operator uses the data given in x as
+     *         an initial guess. Returns false otherwise.
+     */
+    virtual bool apply_uses_initial_guess() const { return false; }
+
 protected:
     /**
      * Creates a linear operator.
@@ -416,6 +426,89 @@ class Transposable {
 };
 
 
+/**
+ * Linear operators which support permutation should implement the
+ * Permutable interface.
+ *
+ * It provides four functionalities, the row permute, the
+ * column permute, the inverse row permute and the inverse column permute.
+ *
+ * The row permute returns the permutation of the linear operator after
+ * permuting the rows of the linear operator. For example, given a matrix A,
+ * the permuted matrix A' and the permutation array perm, the row i of the
+ * matrix A is the row perm[i] in the matrix A'. And similarly, for the inverse
+ * permutation, the row i in the matrix A' is the row perm[i] in the matrix A.
+ *
+ * The column permute returns the permutation of the linear operator after
+ * permuting the columns of the linear operator. The definitions of permute and
+ * inverse permute for the row_permute hold here as well.
+ *
+ * Example: Permuting a Csr matrix:
+ * ------------------------------------
+ *
+ * ```c++
+ * //Permuting an object of LinOp type.
+ * //The object you want to permute.
+ * auto op = matrix::Csr::create(exec);
+ * //Permute the object by first converting it to a Permutable type.
+ * auto perm = op->row_permute(permutation_indices);
+ * ```
+ */
+template <typename IndexType>
+class Permutable {
+public:
+    virtual ~Permutable() = default;
+
+    /**
+     * Returns a LinOp representing the row permutation of the Permutable
+     * object.
+     *
+     * @param permutation_indices  the array of indices containing the
+     * permutation order.
+     *
+     * @return a pointer to the new permuted object
+     */
+    virtual std::unique_ptr<LinOp> row_permute(
+        const Array<IndexType> *permutation_indices) const = 0;
+
+    /**
+     * Returns a LinOp representing the column permutation of the Permutable
+     * object.
+     *
+     * @param permutation_indices  the array of indices containing the
+     * permutation order.
+     *
+     * @return a pointer to the new column permuted object
+     */
+    virtual std::unique_ptr<LinOp> column_permute(
+        const Array<IndexType> *permutation_indices) const = 0;
+
+    /**
+     * Returns a LinOp representing the row permutation of the inverse permuted
+     * object.
+     *
+     * @param inverse_permutation_indices  the array of indices containing the
+     * inverse permutation order.
+     *
+     * @return a pointer to the new inverse permuted object
+     */
+    virtual std::unique_ptr<LinOp> inverse_row_permute(
+        const Array<IndexType> *inverse_permutation_indices) const = 0;
+
+    /**
+     * Returns a LinOp representing the column permutation of the inverse
+     * permuted object.
+     *
+     * @param inverse_permutation_indices  the array of indices containing the
+     * inverse permutation order.
+     *
+     * @return a pointer to the new inverse permuted object
+     */
+    virtual std::unique_ptr<LinOp> inverse_column_permute(
+        const Array<IndexType> *inverse_permutation_indices) const = 0;
+};
+
+
 /**
  * A LinOp implementing this interface can read its data from a matrix_data
  * structure.
@@ -762,7 +855,7 @@ public:                                                                      \
                   "semi-colon warnings")
 
 
-#ifndef __CUDACC__
+#if !(defined(__CUDACC__) || defined(__HIPCC__))
 /**
  * Creates a factory parameter in the factory parameters structure.
  *
@@ -787,18 +880,24 @@ public:                                                                      \
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
-#else  // __CUDACC__
+#else  // defined(__CUDACC__) || defined(__HIPCC__)
 // A workaround for the NVCC compiler - parameter pack expansion does not work
 // properly. You won't be able to use factories in code compiled with NVCC, but
 // at least this won't trigger a compiler error as soon as a header using it is
-// included.
-#define GKO_FACTORY_PARAMETER(_name, ...) \
-    mutable _name{__VA_ARGS__};           \
-                                          \
-    template <typename... Args>           \
-    auto with_##_name(Args &&... _value)  \
-        const->const ::gko::xstd::decay_t<decltype(*this)> &
-#endif  // __CUDACC__
+// included. To not get a linker error, we provide a dummy body.
+#define GKO_FACTORY_PARAMETER(_name, ...)                                    \
+    mutable _name{__VA_ARGS__};                                              \
+                                                                             \
+    template <typename... Args>                                              \
+    auto with_##_name(Args &&... _value)                                     \
+        const->const ::gko::xstd::decay_t<decltype(*this)> &                 \
+    {                                                                        \
+        return *this;                                                        \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)
 
 
 }  // namespace gko
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index e993b18d240..1b7a2f9f18f 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,17 +34,75 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_BASE_MATH_HPP_
 
 
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <limits>
+
+
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
 
-#include <cmath>
-#include <complex>
-#include <cstdlib>
+namespace gko {
 
 
-namespace gko {
+// HIP should not see std::abs or std::sqrt, we want the custom implementation.
+// Hence, provide the using declaration only for some cases
+namespace kernels {
+namespace reference {
+
+
+using std::abs;
+
+
+using std::sqrt;
+
+
+}  // namespace reference
+}  // namespace kernels
+
+
+namespace kernels {
+namespace omp {
+
+
+using std::abs;
+
+
+using std::sqrt;
+
+
+}  // namespace omp
+}  // namespace kernels
+
+
+namespace kernels {
+namespace cuda {
+
+
+using std::abs;
+
+
+using std::sqrt;
+
+
+}  // namespace cuda
+}  // namespace kernels
+
+
+namespace test {
+
+
+using std::abs;
+
+
+using std::sqrt;
+
+
+}  // namespace test
 
 
 // type manipulations
@@ -86,6 +144,29 @@ struct is_complex_impl<std::complex<T>>
 }  // namespace detail
 
 
+/**
+ * Access the underlying real type of a complex number.
+ *
+ * @tparam T  the type being checked.
+ */
+template <typename T>
+struct cpx_real_type {
+    /** The type. When the type is not complex, return the type itself.*/
+    using type = T;
+};
+
+/**
+ * Specialization for complex types.
+ *
+ * @copydoc cpx_real_type
+ */
+template <typename T>
+struct cpx_real_type<std::complex<T>> {
+    /** The type. When the type is complex, return the underlying value_type.*/
+    using type = typename std::complex<T>::value_type;
+};
+
+
 /**
  * Obtains a real counterpart of a std::complex type, and leaves the type
  * unchanged if it is not a complex type.
@@ -122,6 +203,26 @@ GKO_INLINE GKO_ATTRIBUTES constexpr bool is_complex()
 namespace detail {
 
 
+// singly linked list of all our supported precisions
+template <typename T>
+struct next_precision_impl {};
+
+template <>
+struct next_precision_impl<float> {
+    using type = double;
+};
+
+template <>
+struct next_precision_impl<double> {
+    using type = float;
+};
+
+template <typename T>
+struct next_precision_impl<std::complex<T>> {
+    using type = std::complex<typename next_precision_impl<T>::type>;
+};
+
+
 template <typename T>
 struct reduce_precision_impl {
     using type = T;
@@ -164,9 +265,24 @@ struct increase_precision_impl<half> {
 };
 
 
+template <typename T>
+struct infinity_impl {
+    // CUDA doesn't allow us to call std::numeric_limits functions
+    // so we need to store the value instead.
+    static constexpr auto value = std::numeric_limits<T>::infinity();
+};
+
+
 }  // namespace detail
 
 
+/**
+ * Obtains the next type in the singly-linked precision list.
+ */
+template <typename T>
+using next_precision = typename detail::next_precision_impl<T>::type;
+
+
 /**
  * Obtains the next type in the hierarchy with lower precision than T.
  */
@@ -295,6 +411,128 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den)
 }
 
 
+#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+
+
+/**
+ * Returns the additive identity for T.
+ *
+ * @return additive identity for T
+ */
+template <typename T>
+GKO_INLINE __host__ constexpr T zero()
+{
+    return T{};
+}
+
+
+/**
+ * Returns the additive identity for T.
+ *
+ * @return additive identity for T
+ *
+ * @note This version takes an unused reference argument to avoid
+ *       complicated calls like `zero<decltype(x)>()`. Instead, it allows
+ *       `zero(x)`.
+ */
+template <typename T>
+GKO_INLINE __host__ constexpr T zero(const T &)
+{
+    return zero<T>();
+}
+
+
+/**
+ * Returns the multiplicative identity for T.
+ *
+ * @return the multiplicative identity for T
+ */
+template <typename T>
+GKO_INLINE __host__ constexpr T one()
+{
+    return T(1);
+}
+
+
+/**
+ * Returns the multiplicative identity for T.
+ *
+ * @return the multiplicative identity for T
+ *
+ * @note This version takes an unused reference argument to avoid
+ *       complicated calls like `one<decltype(x)>()`. Instead, it allows
+ *       `one(x)`.
+ */
+template <typename T>
+GKO_INLINE __host__ constexpr T one(const T &)
+{
+    return one<T>();
+}
+
+
+/**
+ * Returns the additive identity for T (device, non-std::complex T only).
+ *
+ * @return additive identity for T
+ */
+template <typename T>
+GKO_INLINE __device__ constexpr xstd::enable_if_t<
+    !std::is_same<T, std::complex<remove_complex<T>>>::value, T>
+zero()
+{
+    return T{};
+}
+
+
+/**
+ * Returns the additive identity for T.
+ *
+ * @return additive identity for T
+ *
+ * @note This version takes an unused reference argument to avoid
+ *       complicated calls like `zero<decltype(x)>()`. Instead, it allows
+ *       `zero(x)`.
+ */
+template <typename T>
+GKO_INLINE __device__ constexpr T zero(const T &)
+{
+    return zero<T>();
+}
+
+
+/**
+ * Returns the multiplicative identity for T (device, non-std::complex T only).
+ *
+ * @return the multiplicative identity for T
+ */
+template <typename T>
+GKO_INLINE __device__ constexpr xstd::enable_if_t<
+    !std::is_same<T, std::complex<remove_complex<T>>>::value, T>
+one()
+{
+    return T(1);
+}
+
+
+/**
+ * Returns the multiplicative identity for T.
+ *
+ * @return the multiplicative identity for T
+ *
+ * @note This version takes an unused reference argument to avoid
+ *       complicated calls like `one<decltype(x)>()`. Instead, it allows
+ *       `one(x)`.
+ */
+template <typename T>
+GKO_INLINE __device__ constexpr T one(const T &)
+{
+    return one<T>();
+}
+
+
+#else
+
+
 /**
  * Returns the additive identity for T.
  *
@@ -303,7 +541,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den)
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES constexpr T zero()
 {
-    return T(0);
+    return T{};
 }
 
 
@@ -312,8 +550,9 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero()
  *
  * @return additive identity for T
  *
- * @note This version takes an unused reference argument to avoid complicated
- *       calls like `zero<decltype(x)>()`. Instead, it allows `zero(x)`.
+ * @note This version takes an unused reference argument to avoid
+ *       complicated calls like `zero<decltype(x)>()`. Instead, it allows
+ *       `zero(x)`.
  */
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T &)
@@ -339,8 +578,9 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one()
  *
  * @return the multiplicative identity for T
  *
- * @note This version takes an unused reference argument to avoid complicated
- *       calls like `one<decltype(x)>()`. Instead, it allows `one(x)`.
+ * @note This version takes an unused reference argument to avoid
+ *       complicated calls like `one<decltype(x)>()`. Instead, it allows
+ *       `one(x)`.
  */
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T &)
@@ -349,6 +589,12 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T &)
 }
 
 
+#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+
+
+#undef GKO_BIND_ZERO_ONE
+
+
 /**
  * Returns the absolute value of the object.
  *
@@ -365,9 +611,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T abs(const T &x)
 }
 
 
-using std::abs;  // use optimized abs functions for basic types
-
-
 /**
  * Returns the larger of the arguments.
  *
@@ -418,11 +661,21 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T min(const T &x, const T &y)
  * @return real part of the object (by default, the object itself)
  */
 template <typename T>
-GKO_ATTRIBUTES GKO_INLINE constexpr T real(const T &x)
+GKO_ATTRIBUTES
+    GKO_INLINE constexpr xstd::enable_if_t<!is_complex_s<T>::value, T>
+    real(const T &x)
 {
     return x;
 }
 
+template <typename T>
+GKO_ATTRIBUTES GKO_INLINE constexpr xstd::enable_if_t<is_complex_s<T>::value,
+                                                      remove_complex<T>>
+real(const T &x)
+{
+    return x.real();
+}
+
 
 /**
  * Returns the imaginary part of the object.
@@ -434,11 +687,21 @@ GKO_ATTRIBUTES GKO_INLINE constexpr T real(const T &x)
  * @return imaginary part of the object (by default, zero<T>())
  */
 template <typename T>
-GKO_ATTRIBUTES GKO_INLINE constexpr T imag(const T &)
+GKO_ATTRIBUTES
+    GKO_INLINE constexpr xstd::enable_if_t<!is_complex_s<T>::value, T>
+    imag(const T &)
 {
     return zero<T>();
 }
 
+template <typename T>
+GKO_ATTRIBUTES GKO_INLINE constexpr xstd::enable_if_t<is_complex_s<T>::value,
+                                                      remove_complex<T>>
+imag(const T &x)
+{
+    return x.imag();
+}
+
 
 /**
  * Returns the conjugate of an object.
@@ -448,13 +711,18 @@ GKO_ATTRIBUTES GKO_INLINE constexpr T imag(const T &)
  * @return  conjugate of the object (by default, the object itself)
  */
 template <typename T>
-GKO_ATTRIBUTES GKO_INLINE T conj(const T &x)
+GKO_ATTRIBUTES GKO_INLINE xstd::enable_if_t<!is_complex_s<T>::value, T> conj(
+    const T &x)
 {
     return x;
 }
 
-
-using std::sqrt;  // use standard sqrt functions for basic types
+template <typename T>
+GKO_ATTRIBUTES GKO_INLINE xstd::enable_if_t<is_complex_s<T>::value, T> conj(
+    const T &x)
+{
+    return T{x.real(), -x.imag()};
+}
 
 
 /**
@@ -485,8 +753,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T &x)
  * @return maximum of `hint` and the significant bit position of `n`
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr uint32 get_significant_bit(
-    const T &n, uint32 hint = 0u) noexcept
+constexpr uint32 get_significant_bit(const T &n, uint32 hint = 0u) noexcept
 {
     return (T{1} << (hint + 1)) > n ? hint : get_significant_bit(n, hint + 1u);
 }
@@ -504,29 +771,32 @@ GKO_INLINE GKO_ATTRIBUTES constexpr uint32 get_significant_bit(
  * @return the smallest power of `base` not smaller than `limit`
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T get_superior_power(
-    const T &base, const T &limit, const T &hint = T{1}) noexcept
+constexpr T get_superior_power(const T &base, const T &limit,
+                               const T &hint = T{1}) noexcept
 {
     return hint >= limit ? hint : get_superior_power(base, limit, hint * base);
 }
 
 
-#if !defined(__CUDA_ARCH__)
-
-
-// Since a lot of compiler in combination with CUDA seem to have difficulties
-// distinguishing between the CUDA `isfinite` and the `std::isfinite` when
-// it is put into the `gko` namespace, only enable `std::isfinite` when
-// compiling host code.
+/**
+ * Checks if a floating point number is finite, meaning it is
+ * neither +/- infinity nor NaN.
+ *
+ * @tparam T  type of the value to check
+ *
+ * @param value  value to check
+ *
+ * @return `true` if the value is finite, meaning it is neither
+ *         +/- infinity nor NaN.
+ */
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES xstd::enable_if_t<!is_complex_s<T>::value, bool>
-isfinite(const T &value)
+is_finite(const T &value)
 {
-    return std::isfinite(value);
+    constexpr T infinity{detail::infinity_impl<T>::value};
+    return abs(value) < infinity;
 }
 
-#endif  // defined(__CUDA_ARCH__)
-
 
 /**
  * Checks if all components of a complex value are finite, meaning they are
@@ -536,14 +806,14 @@ isfinite(const T &value)
  *
  * @param value  complex value to check
  *
- * returns `true` if both components of the given value are finite, meaning
+ * @return `true` if both components of the given value are finite, meaning
  *         they are neither +/- infinity nor NaN.
  */
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES xstd::enable_if_t<is_complex_s<T>::value, bool>
-isfinite(const T &value)
+is_finite(const T &value)
 {
-    return isfinite(value.real()) && isfinite(value.imag());
+    return is_finite(value.real()) && is_finite(value.imag());
 }
 
 
diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp
index 7513dcdc0d8..94c01461079 100644
--- a/include/ginkgo/core/base/matrix_data.hpp
+++ b/include/ginkgo/core/base/matrix_data.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_BASE_MATRIX_DATA_HPP_
 
 
+#include <algorithm>
+#include <numeric>
+#include <tuple>
+#include <vector>
+
+
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range.hpp>
@@ -42,12 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/utils.hpp>
 
 
-#include <algorithm>
-#include <numeric>
-#include <tuple>
-#include <vector>
-
-
 namespace gko {
 
 
diff --git a/include/ginkgo/core/base/mtx_io.hpp b/include/ginkgo/core/base/mtx_io.hpp
index a20fb67e572..8ebe65e973a 100644
--- a/include/ginkgo/core/base/mtx_io.hpp
+++ b/include/ginkgo/core/base/mtx_io.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/include/ginkgo/core/base/name_demangling.hpp b/include/ginkgo/core/base/name_demangling.hpp
index 418f1246367..fb73c7ac370 100644
--- a/include/ginkgo/core/base/name_demangling.hpp
+++ b/include/ginkgo/core/base/name_demangling.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,12 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_NAME_DEMANGLING_HPP
-#define GKO_CORE_NAME_DEMANGLING_HPP
+#ifndef GKO_CORE_BASE_NAME_DEMANGLING_HPP_
+#define GKO_CORE_BASE_NAME_DEMANGLING_HPP_
 
 
 #include <ginkgo/config.hpp>
 
+
 #ifdef GKO_HAVE_CXXABI_H
 #include <cxxabi.h>
 #endif  // GKO_HAVE_CXXABI_H
@@ -140,4 +141,4 @@ std::string get_enclosing_scope(const T &)
 }  // namespace gko
 
 
-#endif  //  GKO_CORE_NAME_DEMANGLING_HPP
+#endif  // GKO_CORE_BASE_NAME_DEMANGLING_HPP_
diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp
index 5c84ef7f8fc..16e4406605b 100644
--- a/include/ginkgo/core/base/perturbation.hpp
+++ b/include/ginkgo/core/base/perturbation.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp
index 918764fc615..d65e9a1cb31 100644
--- a/include/ginkgo/core/base/polymorphic_object.hpp
+++ b/include/ginkgo/core/base/polymorphic_object.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -290,24 +290,24 @@ class PolymorphicObject : public log::EnableLogging<PolymorphicObject> {
  * @see EnablePolymorphicObject for creating a concrete subclass of
  *      PolymorphicObject.
  */
-template <typename AbstactObject, typename PolymorphicBase = PolymorphicObject>
+template <typename AbstractObject, typename PolymorphicBase = PolymorphicObject>
 class EnableAbstractPolymorphicObject : public PolymorphicBase {
 public:
     using PolymorphicBase::PolymorphicBase;
 
-    std::unique_ptr<AbstactObject> create_default(
+    std::unique_ptr<AbstractObject> create_default(
         std::shared_ptr<const Executor> exec) const
     {
-        return std::unique_ptr<AbstactObject>{static_cast<AbstactObject *>(
+        return std::unique_ptr<AbstractObject>{static_cast<AbstractObject *>(
             this->create_default_impl(std::move(exec)).release())};
     }
 
-    std::unique_ptr<AbstactObject> create_default() const
+    std::unique_ptr<AbstractObject> create_default() const
     {
         return this->create_default(this->get_executor());
     }
 
-    std::unique_ptr<AbstactObject> clone(
+    std::unique_ptr<AbstractObject> clone(
         std::shared_ptr<const Executor> exec) const
     {
         auto new_op = this->create_default(exec);
@@ -315,25 +315,25 @@ class EnableAbstractPolymorphicObject : public PolymorphicBase {
         return new_op;
     }
 
-    std::unique_ptr<AbstactObject> clone() const
+    std::unique_ptr<AbstractObject> clone() const
     {
         return this->clone(this->get_executor());
     }
 
-    AbstactObject *copy_from(const PolymorphicObject *other)
+    AbstractObject *copy_from(const PolymorphicObject *other)
     {
-        return static_cast<AbstactObject *>(this->copy_from_impl(other));
+        return static_cast<AbstractObject *>(this->copy_from_impl(other));
     }
 
-    AbstactObject *copy_from(std::unique_ptr<PolymorphicObject> other)
+    AbstractObject *copy_from(std::unique_ptr<PolymorphicObject> other)
     {
-        return static_cast<AbstactObject *>(
+        return static_cast<AbstractObject *>(
             this->copy_from_impl(std::move(other)));
     }
 
-    AbstactObject *clear()
+    AbstractObject *clear()
     {
-        return static_cast<AbstactObject *>(this->clear_impl());
+        return static_cast<AbstractObject *>(this->clear_impl());
     }
 };
 
diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp
index e238f276fab..83bff8dd7d5 100644
--- a/include/ginkgo/core/base/range.hpp
+++ b/include/ginkgo/core/base/range.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -382,7 +382,7 @@ class range {
         return *this;
     }
 
-    GKO_ATTRIBUTES range(const range &other) = default;
+    range(const range &other) = default;
 
     /**
      * Returns the length of the specified dimension of the range.
diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp
index ec252007ee2..4040301f287 100644
--- a/include/ginkgo/core/base/range_accessors.hpp
+++ b/include/ginkgo/core/base/range_accessors.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_BASE_RANGE_ACCESSORS_HPP_
 
 
-#include <ginkgo/core/base/range.hpp>
+#include <array>
 
 
-#include <array>
+#include <ginkgo/core/base/range.hpp>
 
 
 namespace gko {
diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp
index 08a56b34cb6..34aa9cf05c3 100644
--- a/include/ginkgo/core/base/std_extensions.hpp
+++ b/include/ginkgo/core/base/std_extensions.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_STD_EXTENSIONS_HPP_
-#define GKO_CORE_STD_EXTENSIONS_HPP_
+#ifndef GKO_CORE_BASE_STD_EXTENSIONS_HPP_
+#define GKO_CORE_BASE_STD_EXTENSIONS_HPP_
 
 
 #include <memory>
@@ -79,8 +79,78 @@ template <typename T>
 using decay_t = typename std::decay<T>::type;
 
 
+/**
+ * constexpr helper which checks if lhs > rhs. This is helpful within template
+ * declarations since ">" cannot directly be used. Note that std::greater is
+ * available as constexpr only since C++14. This does not implement all the
+ * functionality of C++14, only what is needed so far.
+ *
+ * @tparam T  type of both values which are checked
+ *
+ * @param lhs  first operand; being `const T &&`, it binds to rvalues only
+ * @param rhs  second operand; being `const T &&`, it binds to rvalues only
+ *
+ * @return whether lhs > rhs
+ */
+template <typename T>
+constexpr bool greater(const T &&lhs, const T &&rhs)
+{
+    return lhs > rhs;
+}
+
+/**
+ * constexpr helper checking if lhs >= rhs, using the `>=` operator of T.
+ *
+ * @tparam T  type of both values which are checked
+ *
+ * @param lhs  first operand; being `const T &&`, it binds to rvalues only
+ * @param rhs  second operand; being `const T &&`, it binds to rvalues only
+ *
+ * @return whether lhs >= rhs
+ */
+template <typename T>
+constexpr bool greater_equal(const T &&lhs, const T &&rhs)
+{
+    return lhs >= rhs;
+}
+
+
+/**
+ * constexpr helper checking if lhs < rhs, evaluated as `!(lhs >= rhs)`.
+ *
+ * @tparam T  type of both values which are checked
+ * @param lhs  first operand
+ * @param rhs  second operand
+ * @return whether lhs < rhs
+ */
+template <typename T>
+constexpr bool less(const T &&lhs, const T &&rhs)
+{
+    // Compare directly: the named parameters are lvalues here and cannot be
+    // passed on to the `const T &&` parameters of greater_equal().
+    return !(lhs >= rhs);
+}
+
+
+/**
+ * constexpr helper checking if lhs <= rhs, evaluated as `!(lhs > rhs)`.
+ *
+ * @tparam T  type of both values which are checked
+ * @param lhs  first operand
+ * @param rhs  second operand
+ * @return whether lhs <= rhs
+ */
+template <typename T>
+constexpr bool less_equal(const T &&lhs, const T &&rhs)
+{
+    // Compare directly: the named parameters are lvalues here and cannot be
+    // passed on to the `const T &&` parameters of greater().
+    return !(lhs > rhs);
+}
+
+
 }  // namespace xstd
 }  // namespace gko
 
 
-#endif  // GKO_CORE_STD_EXTENSIONS_HPP_
+#endif  // GKO_CORE_BASE_STD_EXTENSIONS_HPP_
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 4c72c5f1a8c..6c2ab2a50d2 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,32 +30,36 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_TYPES_HPP_
-#define GKO_CORE_TYPES_HPP_
+#ifndef GKO_CORE_BASE_TYPES_HPP_
+#define GKO_CORE_BASE_TYPES_HPP_
 
 
 #include <cassert>
 #include <climits>
+#include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
+#include <type_traits>
 
 
-#include <complex>
-#include <type_traits>
+#ifdef __HIPCC__
+#include <hip/hip_runtime.h>
+#endif
 
 
 // Macros for handling different compilers / architectures uniformly
-
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #define GKO_ATTRIBUTES __host__ __device__
 #define GKO_INLINE __forceinline__
 #else
 #define GKO_ATTRIBUTES
 #define GKO_INLINE inline
-#endif  // __CUDACC__
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)
 
 
-#if defined(__CUDA_ARCH__) && defined(__APPLE__)
+#if (defined(__CUDA_ARCH__) && defined(__APPLE__)) || \
+    defined(__HIP_DEVICE_COMPILE__)
 
 #ifdef NDEBUG
 #define GKO_ASSERT(condition) ((void)0)
@@ -69,12 +73,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                          __FILE__, __LINE__, __func__)))
 #endif  // NDEBUG
 
-#else  // defined(__CUDA_ARCH__) && defined(__APPLE__)
+#else  // (defined(__CUDA_ARCH__) && defined(__APPLE__)) ||
+       // defined(__HIP_DEVICE_COMPILE__)
 
 // Handle assertions normally on other systems
 #define GKO_ASSERT(condition) assert(condition)
 
-#endif  // defined(__CUDA_ARCH__) && defined(__APPLE__)
+#endif  // (defined(__CUDA_ARCH__) && defined(__APPLE__)) ||
+        // defined(__HIP_DEVICE_COMPILE__)
 
 
 // Handle deprecated notices correctly on different systems
@@ -385,9 +391,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  */
 #define GKO_ENABLE_FOR_ALL_EXECUTORS(_enable_macro) \
     _enable_macro(OmpExecutor, omp);                \
+    _enable_macro(HipExecutor, hip);                \
     _enable_macro(CudaExecutor, cuda)
 
 
+/**
+ * Instantiates a template for each non-complex value type compiled by Ginkgo.
+ *
+ * @param _macro  A macro which expands the template instantiation
+ *                (not including the leading `template` specifier).
+ *                Should take one argument, which is replaced by the
+ *                value type.
+ */
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \
+    template _macro(float);                                     \
+    template _macro(double)
+
+
 /**
  * Instantiates a template for each value type compiled by Ginkgo.
  *
@@ -396,10 +416,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take one argument, which is replaced by the
  *                value type.
  */
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \
-    template _macro(float);                         \
-    template _macro(double);                        \
-    template _macro(std::complex<float>);           \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \
+    template _macro(std::complex<float>);                    \
     template _macro(std::complex<double>)
 
 
@@ -435,7 +454,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>, int64)
 
 
+/**
+ * Instantiates a template for each value type conversion pair compiled by
+ * Ginkgo.
+ *
+ * @param _macro  A macro which expands the template instantiation
+ *                (not including the leading `template` specifier).
+ *                Should take two arguments `src` and `dst`, which
+ *                are replaced by the source and destination value type.
+ */
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)       \
+    template _macro(float, double);                             \
+    template _macro(double, float);                             \
+    template _macro(std::complex<float>, std::complex<double>); \
+    template _macro(std::complex<double>, std::complex<float>)
+
+
 }  // namespace gko
 
 
-#endif  // GKO_CORE_TYPES_HPP_
+#endif  // GKO_CORE_BASE_TYPES_HPP_
diff --git a/include/ginkgo/core/base/utils.hpp b/include/ginkgo/core/base/utils.hpp
index 11434a30b9c..d3b81ffff23 100644
--- a/include/ginkgo/core/base/utils.hpp
+++ b/include/ginkgo/core/base/utils.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,16 +34,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_BASE_UTILS_HPP_
 
 
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-
 #include <functional>
 #include <memory>
 #include <type_traits>
 
 
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
 #ifndef NDEBUG
 #include <cstdio>
 #endif  // NDEBUG
@@ -290,7 +291,10 @@ inline typename std::decay<T>::type *as(U *obj)
     if (auto p = dynamic_cast<typename std::decay<T>::type *>(obj)) {
         return p;
     } else {
-        throw NotSupported(__FILE__, __LINE__, __func__, typeid(obj).name());
+        throw NotSupported(__FILE__, __LINE__,
+                           std::string{"gko::as<"} +
+                               name_demangling::get_type_name(typeid(T)) + ">",
+                           name_demangling::get_type_name(typeid(*obj)));
     }
 }
 
@@ -313,7 +317,88 @@ inline const typename std::decay<T>::type *as(const U *obj)
     if (auto p = dynamic_cast<const typename std::decay<T>::type *>(obj)) {
         return p;
     } else {
-        throw NotSupported(__FILE__, __LINE__, __func__, typeid(obj).name());
+        throw NotSupported(__FILE__, __LINE__,
+                           std::string{"gko::as<"} +
+                               name_demangling::get_type_name(typeid(T)) + ">",
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * Performs polymorphic type conversion of a unique_ptr.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the unique_ptr to the object which should be converted.
+ *             If successful, it will be reset to a nullptr.
+ *
+ * @return If successful, returns a unique_ptr to the subtype, otherwise throws
+ *         NotSupported.
+ */
+template <typename T, typename U>
+inline std::unique_ptr<typename std::decay<T>::type> as(
+    std::unique_ptr<U> &&obj)
+{
+    if (auto p = dynamic_cast<typename std::decay<T>::type *>(obj.get())) {
+        obj.release();
+        return std::unique_ptr<typename std::decay<T>::type>{p};
+    } else {
+        throw NotSupported(__FILE__, __LINE__, __func__,
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * Performs polymorphic type conversion of a shared_ptr.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the shared_ptr to the object which should be converted.
+ *
+ * @return If successful, returns a shared_ptr to the subtype, otherwise throws
+ *         NotSupported. This pointer shares ownership with the input pointer.
+ */
+template <typename T, typename U>
+inline std::shared_ptr<typename std::decay<T>::type> as(std::shared_ptr<U> obj)
+{
+    auto ptr = std::dynamic_pointer_cast<typename std::decay<T>::type>(obj);
+    if (ptr) {
+        return ptr;
+    } else {
+        throw NotSupported(__FILE__, __LINE__, __func__,
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * Performs polymorphic type conversion of a shared_ptr.
+ *
+ * This is the constant version of the function.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the shared_ptr to the object which should be converted.
+ *
+ * @return If successful, returns a shared_ptr to the subtype, otherwise throws
+ *         NotSupported. This pointer shares ownership with the input pointer.
+ */
+template <typename T, typename U>
+inline std::shared_ptr<const typename std::decay<T>::type> as(
+    std::shared_ptr<const U> obj)
+{
+    auto ptr =
+        std::dynamic_pointer_cast<const typename std::decay<T>::type>(obj);
+    if (ptr) {
+        return ptr;
+    } else {
+        throw NotSupported(__FILE__, __LINE__, __func__,
+                           name_demangling::get_type_name(typeid(*obj)));
     }
 }
 
diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp
index 580f28eb394..52731ab56e3 100644
--- a/include/ginkgo/core/base/version.hpp
+++ b/include/ginkgo/core/base/version.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,12 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_BASE_VERSION_HPP_
 
 
-#include <ginkgo/config.hpp>
-#include <ginkgo/core/base/types.hpp>
+#include <ostream>
 
 
-#include <ostream>
-#include <tuple>
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/types.hpp>
 
 
 namespace gko {
@@ -81,24 +80,41 @@ struct version {
     const char *const tag;
 };
 
+inline bool operator==(const version &first, const version &second)
+{
+    return first.major == second.major && first.minor == second.minor &&
+           first.patch == second.patch;
+}
+
+inline bool operator!=(const version &first, const version &second)
+{
+    return !(first == second);
+}
 
-#define GKO_ENABLE_VERSION_COMPARISON(_operator)                             \
-    inline bool operator _operator(const version &first,                     \
-                                   const version &second)                    \
-    {                                                                        \
-        return std::tie(first.major, first.minor, first.patch)               \
-            _operator std::tie(second.major, second.minor, second.patch);    \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-GKO_ENABLE_VERSION_COMPARISON(<);
-GKO_ENABLE_VERSION_COMPARISON(<=);
-GKO_ENABLE_VERSION_COMPARISON(==);
-GKO_ENABLE_VERSION_COMPARISON(!=);
-GKO_ENABLE_VERSION_COMPARISON(>=);
-GKO_ENABLE_VERSION_COMPARISON(>);
+// Orders versions lexicographically by (major, minor, patch): the first
+// component that differs decides the result. The `tag` member is not
+// compared.
+inline bool operator<(const version &first, const version &second)
+{
+    if (first.major != second.major) return first.major < second.major;
+    if (first.minor != second.minor) return first.minor < second.minor;
+    return first.patch < second.patch;
+}
+
+inline bool operator<=(const version &first, const version &second)
+{
+    return !(second < first);
+}
+
+inline bool operator>(const version &first, const version &second)
+{
+    return second < first;
+}
+
+inline bool operator>=(const version &first, const version &second)
+{
+    return !(first < second);
+}
 
 #undef GKO_ENABLE_VERSION_COMPARISON
 
@@ -138,7 +154,7 @@ inline std::ostream &operator<<(std::ostream &os, const version &ver)
  *     earlier version may have this implemented or fixed in a later version).
  *
  * This structure provides versions of different parts of Ginkgo: the headers,
- * the core and the kernel modules (reference, OpenMP, CUDA).
+ * the core and the kernel modules (reference, OpenMP, CUDA, HIP).
  * To obtain an instance of version_info filled with information about the
  * current version of Ginkgo, call the version_info::get() static method.
  */
@@ -189,6 +205,13 @@ class version_info {
      */
     version cuda_version;
 
+    /**
+     * Contains version information of the HIP module.
+     *
+     * This is the version of the static/shared library called "ginkgo_hip".
+     */
+    version hip_version;
+
 private:
     static constexpr version get_header_version() noexcept
     {
@@ -204,12 +227,15 @@ class version_info {
 
     static version get_cuda_version() noexcept;
 
+    static version get_hip_version() noexcept;
+
     version_info()
         : header_version{get_header_version()},
           core_version{get_core_version()},
           reference_version{get_reference_version()},
           omp_version{get_omp_version()},
-          cuda_version{get_cuda_version()}
+          cuda_version{get_cuda_version()},
+          hip_version{get_hip_version()}
     {}
 };
 
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
new file mode 100644
index 00000000000..1d92bc59c54
--- /dev/null
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -0,0 +1,153 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_ILU_HPP_
+#define GKO_CORE_FACTORIZATION_ILU_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+/**
+ * @brief The Factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+/**
+ * Represents an incomplete LU factorization -- ILU(0) -- of a sparse matrix.
+ *
+ * More specifically, it consists of a lower unitriangular factor $L$ and
+ * an upper triangular factor $U$ with sparsity pattern
+ * $\mathcal S(L + U)$ = $\mathcal S(A)$
+ * fulfilling $LU = A$ at every non-zero location of $A$.
+ *
+ * @tparam ValueType  Type of the values of all matrices used in this class
+ * @tparam IndexType  Type of the indices of all matrices used in this class
+ *
+ * @ingroup factor
+ * @ingroup LinOp
+ */
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32>
+class Ilu : public Composition<ValueType> {
+public:
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using matrix_type = matrix::Csr<ValueType, IndexType>;
+
+    std::shared_ptr<const matrix_type> get_l_factor() const
+    {
+        // Can be `static_cast` since the type is guaranteed in this class
+        return std::static_pointer_cast<const matrix_type>(
+            this->get_operators()[0]);
+    }
+
+    std::shared_ptr<const matrix_type> get_u_factor() const
+    {
+        // Can be `static_cast` since the type is guaranteed in this class
+        return std::static_pointer_cast<const matrix_type>(
+            this->get_operators()[1]);
+    }
+
+    // Remove the possibility of calling `create`, which was enabled by
+    // `Composition`
+    template <typename... Args>
+    static std::unique_ptr<Composition<ValueType>> create(Args &&... args) =
+        delete;
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * Strategy which will be used by the L matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(l_strategy, nullptr);
+
+        /**
+         * Strategy which will be used by the U matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(u_strategy, nullptr);
+    };
+    GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    Ilu(const Factory *factory, std::shared_ptr<const gko::LinOp> system_matrix)
+        : Composition<ValueType>{factory->get_executor()},
+          parameters_{factory->get_parameters()}
+    {
+        if (parameters_.l_strategy == nullptr) {
+            parameters_.l_strategy =
+                std::make_shared<typename matrix_type::classical>();
+        }
+        if (parameters_.u_strategy == nullptr) {
+            parameters_.u_strategy =
+                std::make_shared<typename matrix_type::classical>();
+        }
+        generate_l_u(system_matrix)->move_to(this);
+    }
+
+    /**
+     * Generates the incomplete LU factors, which will be returned as a
+     * composition of the lower (first element of the composition) and the
+     * upper factor (second element). The dynamic type of both L and U is
+     * matrix_type.
+     *
+     * @param system_matrix  the source matrix used to generate the factors.
+     *                       @note: system_matrix must be convertible to a Csr
+     *                              Matrix, otherwise, an exception is thrown.
+     * @return  A Composition, containing the incomplete LU factors for the
+     *          given system_matrix (first element is L, then U)
+     */
+    std::unique_ptr<Composition<ValueType>> generate_l_u(
+        const std::shared_ptr<const LinOp> &system_matrix) const;
+};
+
+
+}  // namespace factorization
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_ILU_HPP_
diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp
new file mode 100644
index 00000000000..d1f22f2f029
--- /dev/null
+++ b/include/ginkgo/core/factorization/par_ict.hpp
@@ -0,0 +1,253 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_PAR_ICT_HPP_
+#define GKO_CORE_FACTORIZATION_PAR_ICT_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+/**
+ * @brief The Factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+/**
+ * ParICT is an incomplete threshold-based Cholesky factorization which is
+ * computed in parallel.
+ *
+ * $L$ is a lower triangular matrix which approximates a given symmetric
+ * positive definite matrix $A$ with $A \approx LL^T$. Here, $L$ has a sparsity
+ * pattern that is improved iteratively based on its element-wise magnitude.
+ * The initial sparsity pattern is chosen based on the lower triangle of $A$.
+ *
+ * One iteration of the ParICT algorithm consists of the following steps:
+ *
+ * 1. Calculating the residual $R = A - LL^T$
+ * 2. Adding new non-zero locations from $R$ to $L$.
+ *    The new non-zero locations are initialized based on the corresponding
+ *    residual value.
+ * 3. Executing a fixed-point iteration on $L$ according to
+ * $
+ * F(L) =
+ * \begin{cases}
+ *     \frac{1}{l_{jj}}
+ *         \left(a_{ij}-\sum_{k=1}^{j-1}l_{ik}l_{jk}\right), \quad & i \neq j \\
+ *     \sqrt{a_{ij}-\sum_{k=1}^{j-1}l_{ik}l_{jk}}, \quad & i = j \\
+ * \end{cases}
+ * $
+ * 4. Removing the smallest entries (by magnitude) from $L$
+ * 5. Executing a fixed-point iteration on the (now sparser) $L$
+ *
+ * This ParICT algorithm thus improves the sparsity pattern and the
+ * approximation of $L$ simultaneously.
+ *
+ * The implementation follows the design of H. Anzt et al.,
+ * ParILUT - A Parallel Threshold ILU for GPUs, 2019 IEEE International
+ * Parallel and Distributed Processing Symposium (IPDPS), pp. 231–241.
+ *
+ * @tparam ValueType  Type of the values of all matrices used in this class
+ * @tparam IndexType  Type of the indices of all matrices used in this class
+ *
+ * @ingroup factor
+ * @ingroup LinOp
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class ParIct : public Composition<ValueType> {
+public:
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using matrix_type = matrix::Csr<ValueType, IndexType>;
+
+    std::shared_ptr<const matrix_type> get_l_factor() const
+    {
+        // Can be `static_cast` since the type is guaranteed in this class
+        return std::static_pointer_cast<const matrix_type>(
+            this->get_operators()[0]);
+    }
+
+    std::shared_ptr<const matrix_type> get_lt_factor() const
+    {
+        // Can be `static_cast` since the type is guaranteed in this class
+        return std::static_pointer_cast<const matrix_type>(
+            this->get_operators()[1]);
+    }
+
+    // Remove the possibility of calling `create`, which was enabled by
+    // `Composition`
+    template <typename... Args>
+    static std::unique_ptr<Composition<ValueType>> create(Args &&... args) =
+        delete;
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * The number of total iterations of ParICT that will be executed.
+         * The default value is 5.
+         */
+        size_type GKO_FACTORY_PARAMETER(iterations, 5);
+
+        /**
+         * @brief `true` means it is known that the matrix given to this
+         *        factory will be sorted first by row, then by column index,
+         *        `false` means it is unknown or not sorted, so an additional
+         *        sorting step will be performed during the factorization
+         *        (it will not change the matrix given).
+         *        The matrix must be sorted for this factorization to work.
+         *
+         * The `system_matrix`, which will be given to this factory, must be
+         * sorted (first by row, then by column) in order for the algorithm
+         * to work. If it is known that the matrix will be sorted, this
+         * parameter can be set to `true` to skip the sorting (therefore,
+         * shortening the runtime).
+         * However, if it is unknown or if the matrix is known to be not sorted,
+         * it must remain `false`, otherwise, the factorization might be
+         * incorrect.
+         */
+        bool GKO_FACTORY_PARAMETER(skip_sorting, false);
+
+        /**
+         * @brief `true` means the candidate selection will use an inexact
+         * selection algorithm. `false` means an exact selection algorithm will
+         * be used.
+         *
+         * Using the approximate selection algorithm can give a significant
+         * speed-up, but may in the worst case cause the algorithm to vastly
+         * exceed its `fill_in_limit`.
+         * The exact selection needs more time, but more closely fulfills the
+         * `fill_in_limit` except for pathological cases (many candidates with
+         * equal magnitude).
+         *
+         * The default behavior is to use approximate selection.
+         */
+        bool GKO_FACTORY_PARAMETER(approximate_select, true);
+
+        /**
+         * @brief `true` means the sample used for the selection algorithm will
+         *        be chosen deterministically. This is only relevant when using
+         *        `approximate_select`. It is mostly used for testing.
+         *
+         * The selection algorithm used for `approximate_select` uses a small
+         * sample of the input data to determine an approximate threshold.
+         * The choice of elements can either be randomized, i.e., we may use
+         * different elements during each execution, or deterministic, i.e., the
+         * element choices are always the same.
+         *
+         * Note that even though the threshold selection step may be made
+         * deterministic this way, the calculation of the IC factors can still
+         * be non-deterministic due to its asynchronous iterations.
+         *
+         * The default behavior is to use a random sample.
+         */
+        bool GKO_FACTORY_PARAMETER(deterministic_sample, false);
+
+        /**
+         * @brief the amount of fill-in that is allowed in L compared to
+         *        the lower triangle of A.
+         *
+         * The threshold for removing candidates from the intermediate L
+         * is set such that the resulting sparsity pattern has at most
+         * `fill_in_limit` times the number of non-zeros of the lower triangle
+         * of A.
+         *
+         * The default value `2.0` allows twice the number of non-zeros in
+         * L compared to the lower triangle of A.
+         */
+        double GKO_FACTORY_PARAMETER(fill_in_limit, 2.0);
+
+        /**
+         * Strategy which will be used by the L matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(l_strategy, nullptr);
+
+        /**
+         * Strategy which will be used by the L^T matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(lt_strategy, nullptr);
+    };
+    GKO_ENABLE_LIN_OP_FACTORY(ParIct, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    explicit ParIct(const Factory *factory,
+                    std::shared_ptr<const LinOp> system_matrix)
+        : Composition<ValueType>(factory->get_executor()),
+          parameters_{factory->get_parameters()}
+    {
+        if (parameters_.l_strategy == nullptr) {
+            parameters_.l_strategy =
+                std::make_shared<typename matrix_type::classical>();
+        }
+        if (parameters_.lt_strategy == nullptr) {
+            parameters_.lt_strategy =
+                std::make_shared<typename matrix_type::classical>();
+        }
+        generate_l_lt(std::move(system_matrix))->move_to(this);
+    }
+
+    /**
+     * Generates the incomplete LL^T factors, which will be returned as a
+     * composition of the lower (first element of the composition) and the
+     * upper factor (second element). The dynamic type of L and L^T is
+     * matrix_type
+     *
+     * @param system_matrix  the source matrix used to generate the factors.
+     *                       @note: system_matrix must be convertible to a Csr
+     *                              Matrix, otherwise, an exception is thrown.
+     * @return  A Composition, containing the incomplete LL^T factors for
+     *          the given system_matrix (first element is L, then L^T)
+     */
+    std::unique_ptr<Composition<ValueType>> generate_l_lt(
+        const std::shared_ptr<const LinOp> &system_matrix) const;
+};
+
+
+}  // namespace factorization
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_PAR_ICT_HPP_
diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp
index 8e42b4dc5a9..f3f4458a581 100644
--- a/include/ginkgo/core/factorization/par_ilu.hpp
+++ b/include/ginkgo/core/factorization/par_ilu.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -147,6 +147,20 @@ class ParIlu : public Composition<ValueType> {
          * incorrect.
          */
         bool GKO_FACTORY_PARAMETER(skip_sorting, false);
+
+        /**
+         * Strategy which will be used by the L matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename l_matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(l_strategy, nullptr);
+
+        /**
+         * Strategy which will be used by the U matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename u_matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(u_strategy, nullptr);
     };
     GKO_ENABLE_LIN_OP_FACTORY(ParIlu, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
@@ -157,7 +171,17 @@ class ParIlu : public Composition<ValueType> {
         : Composition<ValueType>(factory->get_executor()),
           parameters_{factory->get_parameters()}
     {
-        generate_l_u(system_matrix, parameters_.skip_sorting)->move_to(this);
+        if (parameters_.l_strategy == nullptr) {
+            parameters_.l_strategy =
+                std::make_shared<typename l_matrix_type::classical>();
+        }
+        if (parameters_.u_strategy == nullptr) {
+            parameters_.u_strategy =
+                std::make_shared<typename u_matrix_type::classical>();
+        }
+        generate_l_u(system_matrix, parameters_.skip_sorting,
+                     parameters_.l_strategy, parameters_.u_strategy)
+            ->move_to(this);
     }
 
     /**
@@ -167,17 +191,21 @@ class ParIlu : public Composition<ValueType> {
      * while the dynamic type of U is u_matrix_type.
      *
      * @param system_matrix  the source matrix used to generate the factors.
-     *                       @note: system_matrix must be convertable to a Csr
+     *                       @note: system_matrix must be convertible to a Csr
      *                              Matrix, otherwise, an exception is thrown.
      * @param skip_sorting  if set to `true`, the sorting will be skipped.
      *                      @note: If the matrix is not sorted, the
      *                             factorization fails.
+     * @param l_strategy  Strategy, which will be used by the L matrix.
+     * @param u_strategy  Strategy, which will be used by the U matrix.
      * @return  A Composition, containing the incomplete LU factors for the
      *          given system_matrix (first element is L, then U)
      */
     std::unique_ptr<Composition<ValueType>> generate_l_u(
-        const std::shared_ptr<const LinOp> &system_matrix,
-        bool skip_sorting) const;
+        const std::shared_ptr<const LinOp> &system_matrix, bool skip_sorting,
+        std::shared_ptr<typename l_matrix_type::strategy_type> l_strategy,
+        std::shared_ptr<typename u_matrix_type::strategy_type> u_strategy)
+        const;
 };
 
 
diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp
new file mode 100644
index 00000000000..364a7bd9351
--- /dev/null
+++ b/include/ginkgo/core/factorization/par_ilut.hpp
@@ -0,0 +1,258 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_FACTORIZATION_PAR_ILUT_HPP_
+#define GKO_CORE_FACTORIZATION_PAR_ILUT_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+/**
+ * @brief The Factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+/**
+ * ParILUT is an incomplete threshold-based LU factorization which is computed
+ * in parallel.
+ *
+ * $L$ is a lower unitriangular, while $U$ is an upper triangular matrix, which
+ * approximate a given matrix $A$ with $A \approx LU$. Here, $L$ and $U$ have
+ * a sparsity pattern that is improved iteratively based on their element-wise
+ * magnitude. The initial sparsity pattern is chosen based on the $ILU(0)$
+ * factorization of $A$.
+ *
+ * One iteration of the ParILUT algorithm consists of the following steps:
+ *
+ * 1. Calculating the residual $R = A - LU$
+ * 2. Adding new non-zero locations from $R$ to $L$ and $U$.
+ *    The new non-zero locations are initialized based on the corresponding
+ *    residual value.
+ * 3. Executing a fixed-point iteration on $L$ and $U$ according to
+ * $
+ * F(L, U) =
+ * \begin{cases}
+ *     \frac{1}{u_{jj}}
+ *         \left(a_{ij}-\sum_{k=1}^{j-1}l_{ik}u_{kj}\right), \quad & i>j \\
+ *     a_{ij}-\sum_{k=1}^{i-1}l_{ik}u_{kj}, \quad & i\leq j
+ * \end{cases}
+ * $
+ *    For a more detailed description of the fixed-point iteration, see
+ *    @ref ParIlu.
+ * 4. Removing the smallest entries (by magnitude) from $L$ and $U$
+ * 5. Executing a fixed-point iteration on the (now sparser) $L$ and $U$
+ *
+ * This ParILUT algorithm thus improves the sparsity pattern and the
+ * approximation of $L$ and $U$ simultaneously.
+ *
+ * The implementation follows the design of H. Anzt et al.,
+ * ParILUT - A Parallel Threshold ILU for GPUs, 2019 IEEE International
+ * Parallel and Distributed Processing Symposium (IPDPS), pp. 231–241.
+ *
+ * @tparam ValueType  Type of the values of all matrices used in this class
+ * @tparam IndexType  Type of the indices of all matrices used in this class
+ *
+ * @ingroup factor
+ * @ingroup LinOp
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class ParIlut : public Composition<ValueType> {
+public:
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using l_matrix_type = matrix::Csr<ValueType, IndexType>;
+    using u_matrix_type = matrix::Csr<ValueType, IndexType>;
+
+    std::shared_ptr<const l_matrix_type> get_l_factor() const
+    {
+        // Can be `static_cast` since the type is guaranteed in this class
+        return std::static_pointer_cast<const l_matrix_type>(
+            this->get_operators()[0]);
+    }
+
+    std::shared_ptr<const u_matrix_type> get_u_factor() const
+    {
+        // Can be `static_cast` since the type is guaranteed in this class
+        return std::static_pointer_cast<const u_matrix_type>(
+            this->get_operators()[1]);
+    }
+
+    // Remove the possibility of calling `create`, which was enabled by
+    // `Composition`
+    template <typename... Args>
+    static std::unique_ptr<Composition<ValueType>> create(Args &&... args) =
+        delete;
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * The number of total iterations of ParILUT that will be executed.
+         * The default value is 5.
+         */
+        size_type GKO_FACTORY_PARAMETER(iterations, 5);
+
+        /**
+         * @brief `true` means it is known that the matrix given to this
+         *        factory will be sorted first by row, then by column index,
+         *        `false` means it is unknown or not sorted, so an additional
+         *        sorting step will be performed during the factorization
+         *        (it will not change the matrix given).
+         *        The matrix must be sorted for this factorization to work.
+         *
+         * The `system_matrix`, which will be given to this factory, must be
+         * sorted (first by row, then by column) in order for the algorithm
+         * to work. If it is known that the matrix will be sorted, this
+         * parameter can be set to `true` to skip the sorting (therefore,
+         * shortening the runtime).
+         * However, if it is unknown or if the matrix is known to be not sorted,
+         * it must remain `false`, otherwise, the factorization might be
+         * incorrect.
+         */
+        bool GKO_FACTORY_PARAMETER(skip_sorting, false);
+
+        /**
+         * @brief `true` means the candidate selection will use an inexact
+         * selection algorithm. `false` means an exact selection algorithm will
+         * be used.
+         *
+         * Using the approximate selection algorithm can give a significant
+         * speed-up, but may in the worst case cause the algorithm to vastly
+         * exceed its `fill_in_limit`.
+         * The exact selection needs more time, but more closely fulfills the
+         * `fill_in_limit` except for pathological cases (many candidates with
+         * equal magnitude).
+         *
+         * The default behavior is to use approximate selection.
+         */
+        bool GKO_FACTORY_PARAMETER(approximate_select, true);
+
+        /**
+         * @brief `true` means the sample used for the selection algorithm will
+         *        be chosen deterministically. This is only relevant when using
+         *        `approximate_select`. It is mostly used for testing.
+         *
+         * The selection algorithm used for `approximate_select` uses a small
+         * sample of the input data to determine an approximate threshold.
+         * The choice of elements can either be randomized, i.e., we may use
+         * different elements during each execution, or deterministic, i.e., the
+         * element choices are always the same.
+         *
+         * Note that even though the threshold selection step may be made
+         * deterministic this way, the calculation of the ILU factors can still
+         * be non-deterministic due to its asynchronous iterations.
+         *
+         * The default behavior is to use a random sample.
+         */
+        bool GKO_FACTORY_PARAMETER(deterministic_sample, false);
+
+        /**
+         * @brief the amount of fill-in that is allowed in L and U compared to
+         *        the ILU(0) factorization.
+         *
+         * The threshold for removing candidates from the intermediate L and U
+         * is set such that the resulting sparsity pattern has at most
+         * `fill_in_limit` times the number of non-zeros of the ILU(0)
+         * factorization. This selection is executed separately for both
+         * factors L and U.
+         *
+         * The default value `2.0` allows twice the number of non-zeros in
+         * L and U compared to ILU(0).
+         */
+        double GKO_FACTORY_PARAMETER(fill_in_limit, 2.0);
+
+        /**
+         * Strategy which will be used by the L matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename l_matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(l_strategy, nullptr);
+
+        /**
+         * Strategy which will be used by the U matrix. The default value
+         * `nullptr` will result in the strategy `classical`.
+         */
+        std::shared_ptr<typename u_matrix_type::strategy_type>
+            GKO_FACTORY_PARAMETER(u_strategy, nullptr);
+    };
+    GKO_ENABLE_LIN_OP_FACTORY(ParIlut, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    explicit ParIlut(const Factory *factory,
+                     std::shared_ptr<const LinOp> system_matrix)
+        : Composition<ValueType>(factory->get_executor()),
+          parameters_{factory->get_parameters()}
+    {
+        if (parameters_.l_strategy == nullptr) {
+            parameters_.l_strategy =
+                std::make_shared<typename l_matrix_type::classical>();
+        }
+        if (parameters_.u_strategy == nullptr) {
+            parameters_.u_strategy =
+                std::make_shared<typename u_matrix_type::classical>();
+        }
+        generate_l_u(std::move(system_matrix))->move_to(this);
+    }
+
+    /**
+     * Generates the incomplete LU factors, which will be returned as a
+     * composition of the lower (first element of the composition) and the
+     * upper factor (second element). The dynamic type of L is l_matrix_type,
+     * while the dynamic type of U is u_matrix_type.
+     *
+     * @param system_matrix  the source matrix used to generate the factors.
+     *                       @note: system_matrix must be convertible to a Csr
+     *                              Matrix, otherwise, an exception is thrown.
+     * @return  A Composition, containing the incomplete LU factors for the
+     *          given system_matrix (first element is L, then U)
+     */
+    std::unique_ptr<Composition<ValueType>> generate_l_u(
+        const std::shared_ptr<const LinOp> &system_matrix) const;
+};
+
+
+}  // namespace factorization
+}  // namespace gko
+
+
+#endif  // GKO_CORE_FACTORIZATION_PAR_ILUT_HPP_
diff --git a/include/ginkgo/core/log/convergence.hpp b/include/ginkgo/core/log/convergence.hpp
index 5079f587137..64cd16ea1aa 100644
--- a/include/ginkgo/core/log/convergence.hpp
+++ b/include/ginkgo/core/log/convergence.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,12 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_LOG_CONVERGENCE_HPP_
 
 
-#include <ginkgo/core/log/logger.hpp>
-
-
 #include <memory>
 
 
+#include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index 9b0abe9ed47..90eb1a3063c 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_LOGGER_HPP_
-#define GKO_CORE_LOGGER_HPP_
+#ifndef GKO_CORE_LOG_LOGGER_HPP_
+#define GKO_CORE_LOG_LOGGER_HPP_
 
 
 #include <algorithm>
@@ -556,4 +556,4 @@ class EnableLogging : public Loggable {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_LOGGER_HPP_
+#endif  // GKO_CORE_LOG_LOGGER_HPP_
diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp
index 7d8279c023d..7b54e478f6e 100644
--- a/include/ginkgo/core/log/papi.hpp
+++ b/include/ginkgo/core/log/papi.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -46,18 +46,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <mutex>
 
 
+#include <papi.h>
+
+
 #include <ginkgo/core/base/polymorphic_object.hpp>
 #include <ginkgo/core/log/logger.hpp>
-#include "third_party/papi_sde/papi_sde_interface.h"
 
 
-#include <papi.h>
+#include "third_party/papi_sde/papi_sde_interface.h"
 
 
 namespace gko {
 namespace log {
 
 
+static size_type papi_logger_count = 0;
+static std::mutex papi_count_mutex;
+
+
 /**
  * Papi is a Logger which logs every event to the PAPI software. Thanks to this
  * logger, applications which interface with PAPI can access Ginkgo internal
@@ -179,7 +185,7 @@ class Papi : public Logger {
      */
     static std::shared_ptr<Papi> create(
         std::shared_ptr<const gko::Executor> exec,
-        const Logger::mask_type &enabled_events)
+        const Logger::mask_type &enabled_events = Logger::all_events_mask)
     {
         return std::shared_ptr<Papi>(new Papi(exec, enabled_events));
     }
@@ -200,11 +206,11 @@ class Papi : public Logger {
     {
         std::ostringstream os;
 
-        std::lock_guard<std::mutex> guard(count_mutex);
-        os << "ginkgo" << logger_count;
+        std::lock_guard<std::mutex> guard(papi_count_mutex);
+        os << "ginkgo" << papi_logger_count;
         name = os.str();
         papi_handle = papi_sde_init(name.c_str());
-        logger_count++;
+        papi_logger_count++;
     }
 
 private:
@@ -301,8 +307,6 @@ class Papi : public Logger {
     mutable papi_queue<LinOp> iteration_complete{&papi_handle,
                                                  "iteration_complete"};
 
-    static size_type logger_count;
-    std::mutex count_mutex;
 
     std::string name{"ginkgo"};
     papi_handle_t papi_handle;
@@ -314,4 +318,4 @@ class Papi : public Logger {
 
 
 #endif  // GKO_HAVE_PAPI_SDE
-#endif  // GKO_CORE_LOG_OSTREAM_HPP_
+#endif  // GKO_CORE_LOG_PAPI_HPP_
diff --git a/include/ginkgo/core/log/record.hpp b/include/ginkgo/core/log/record.hpp
index f675dda4495..0c791e5e278 100644
--- a/include/ginkgo/core/log/record.hpp
+++ b/include/ginkgo/core/log/record.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,13 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_LOG_RECORD_HPP_
 
 
-#include <ginkgo/core/log/logger.hpp>
-
-
 #include <deque>
 #include <memory>
 
 
+#include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 
diff --git a/include/ginkgo/core/log/stream.hpp b/include/ginkgo/core/log/stream.hpp
index 02fd4ae46f2..d46a0d07be0 100644
--- a/include/ginkgo/core/log/stream.hpp
+++ b/include/ginkgo/core/log/stream.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,13 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_LOG_STREAM_HPP_
 
 
-#include <ginkgo/core/log/logger.hpp>
-
-
 #include <fstream>
 #include <iostream>
 
 
+#include <ginkgo/core/log/logger.hpp>
+
+
 namespace gko {
 namespace log {
 
diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp
index 36468411a8a..e50c8c033d5 100644
--- a/include/ginkgo/core/matrix/coo.hpp
+++ b/include/ginkgo/core/matrix/coo.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -55,6 +55,10 @@ template <typename ValueType>
 class Dense;
 
 
+template <typename ValueType, typename IndexType>
+class CooBuilder;
+
+
 /**
  * COO stores a matrix in the coordinate matrix format.
  *
@@ -72,6 +76,7 @@ class Dense;
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
             public EnableCreateMethod<Coo<ValueType, IndexType>>,
+            public ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>,
             public ConvertibleTo<Csr<ValueType, IndexType>>,
             public ConvertibleTo<Dense<ValueType>>,
             public ReadableFromMatrixData<ValueType, IndexType>,
@@ -80,6 +85,7 @@ class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
     friend class EnablePolymorphicObject<Coo, LinOp>;
     friend class Csr<ValueType, IndexType>;
     friend class Dense<ValueType>;
+    friend class CooBuilder<ValueType, IndexType>;
 
 public:
     using EnableLinOp<Coo>::convert_to;
@@ -89,6 +95,13 @@ class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
     using index_type = IndexType;
     using mat_data = matrix_data<ValueType, IndexType>;
 
+    friend class Coo<next_precision<ValueType>, IndexType>;
+
+    void convert_to(
+        Coo<next_precision<ValueType>, IndexType> *result) const override;
+
+    void move_to(Coo<next_precision<ValueType>, IndexType> *result) override;
+
     void convert_to(Csr<ValueType, IndexType> *other) const override;
 
     void move_to(Csr<ValueType, IndexType> *other) override;
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 4dd2e5d862c..227b6b06a28 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -60,6 +60,22 @@ class Sellp;
 template <typename ValueType, typename IndexType>
 class SparsityCsr;
 
+template <typename ValueType, typename IndexType>
+class Csr;
+
+template <typename ValueType, typename IndexType>
+class CsrBuilder;
+
+
+namespace detail {
+
+
+template <typename ValueType = default_precision, typename IndexType = int32>
+void strategy_rebuild_helper(Csr<ValueType, IndexType> *result);
+
+
+}  // namespace detail
+
 
 /**
  * CSR is a matrix format which stores only the nonzero coefficients by
@@ -70,6 +86,28 @@ class SparsityCsr;
  * An additional column index array is used to identify the column of each
  * nonzero element.
  *
+ * The Csr LinOp supports different operations:
+ *
+ * ```cpp
+ * matrix::Csr *A, *B, *C;      // matrices
+ * matrix::Dense *b, *x;        // vectors (tall-and-skinny matrices)
+ * matrix::Dense *alpha, *beta; // scalars of dimension 1x1
+ * matrix::Identity *I;         // identity matrix
+ *
+ * // Applying to Dense matrices computes an SpMV/SpMM product
+ * A->apply(b, x)              // x = A*b
+ * A->apply(alpha, b, beta, x) // x = alpha*A*b + beta*x
+ *
+ * // Applying to Csr matrices computes a SpGEMM product of two sparse matrices
+ * A->apply(B, C)              // C = A*B
+ * A->apply(alpha, B, beta, C) // C = alpha*A*B + beta*C
+ *
+ * // Applying to an Identity matrix computes a SpGEAM sparse matrix addition
+ * A->apply(alpha, I, beta, B) // B = alpha*A + beta*B
+ * ```
+ * Both the SpGEMM and SpGEAM operation require the input matrices to be sorted
+ * by column index, otherwise the algorithms will produce incorrect results.
+ *
  * @tparam ValueType  precision of matrix elements
  * @tparam IndexType  precision of matrix indexes
  *
@@ -80,6 +118,7 @@ class SparsityCsr;
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             public EnableCreateMethod<Csr<ValueType, IndexType>>,
+            public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
             public ConvertibleTo<Dense<ValueType>>,
             public ConvertibleTo<Coo<ValueType, IndexType>>,
             public ConvertibleTo<Ell<ValueType, IndexType>>,
@@ -88,7 +127,8 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
             public ReadableFromMatrixData<ValueType, IndexType>,
             public WritableToMatrixData<ValueType, IndexType>,
-            public Transposable {
+            public Transposable,
+            public Permutable<IndexType> {
     friend class EnableCreateMethod<Csr>;
     friend class EnablePolymorphicObject<Csr, LinOp>;
     friend class Coo<ValueType, IndexType>;
@@ -97,30 +137,64 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
     friend class Hybrid<ValueType, IndexType>;
     friend class Sellp<ValueType, IndexType>;
     friend class SparsityCsr<ValueType, IndexType>;
+    friend class CsrBuilder<ValueType, IndexType>;
 
 public:
-    using EnableLinOp<Csr>::convert_to;
-    using EnableLinOp<Csr>::move_to;
-
     using value_type = ValueType;
     using index_type = IndexType;
+    using transposed_type = Csr<ValueType, IndexType>;
     using mat_data = matrix_data<ValueType, IndexType>;
 
     class automatical;
 
+    /**
+     * strategy_type decides which algorithm the Csr matrix uses.
+     *
+     * A concrete strategy should inherit from strategy_type and implement its
+     * `process` and `clac_size` functions and the corresponding device kernel.
+     */
     class strategy_type {
         friend class automatical;
 
     public:
+        /**
+         * Creates a strategy_type.
+         *
+         * @param name  the name of strategy
+         */
         strategy_type(std::string name) : name_(name) {}
 
+        /**
+         * Returns the name of strategy
+         *
+         * @return the name of strategy
+         */
         std::string get_name() { return name_; }
 
+        /**
+         * Computes srow according to row pointers.
+         *
+         * @param mtx_row_ptrs  the row pointers of the matrix
+         * @param mtx_srow  the srow of the matrix
+         */
         virtual void process(const Array<index_type> &mtx_row_ptrs,
                              Array<index_type> *mtx_srow) = 0;
 
+        /**
+         * Computes the srow size according to the number of nonzeros.
+         *
+         * @param nnz  the number of nonzeros
+         *
+         * @return the size of srow
+         */
         virtual int64_t clac_size(const int64_t nnz) = 0;
 
+        /**
+         * Copy a strategy. This is a workaround until strategies are revamped,
+         * since strategies like `automatical` do not work when actually shared.
+         */
+        virtual std::shared_ptr<strategy_type> copy() = 0;
+
     protected:
         void set_name(std::string name) { name_ = name; }
 
@@ -128,58 +202,183 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         std::string name_;
     };
 
+    /**
+     * classical is a strategy_type which uses the same number of threads on
+     * each row. Several threads each process a part of a row, and their
+     * partial results are then combined by a reduction. The number of threads
+     * per row depends on the maximum number of stored elements per row.
+     */
     class classical : public strategy_type {
     public:
-        classical() : strategy_type("classical") {}
+        /**
+         * Creates a classical strategy.
+         */
+        classical() : strategy_type("classical"), max_length_per_row_(0) {}
 
         void process(const Array<index_type> &mtx_row_ptrs,
-                     Array<index_type> *mtx_srow)
-        {}
+                     Array<index_type> *mtx_srow) override
+        {
+            auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
+            Array<index_type> row_ptrs_host(host_mtx_exec);
+            const bool is_mtx_on_host{host_mtx_exec ==
+                                      mtx_row_ptrs.get_executor()};
+            const index_type *row_ptrs{};
+            if (is_mtx_on_host) {
+                row_ptrs = mtx_row_ptrs.get_const_data();
+            } else {
+                row_ptrs_host = mtx_row_ptrs;
+                row_ptrs = row_ptrs_host.get_const_data();
+            }
+            auto num_rows = mtx_row_ptrs.get_num_elems() - 1;
+            max_length_per_row_ = 0;
+            for (index_type i = 1; i < num_rows + 1; i++) {
+                max_length_per_row_ = std::max(max_length_per_row_,
+                                               row_ptrs[i] - row_ptrs[i - 1]);
+            }
+        }
+
+        int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        index_type get_max_length_per_row() const noexcept
+        {
+            return max_length_per_row_;
+        }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<classical>();
+        }
 
-        int64_t clac_size(const int64_t nnz) { return 0; }
+    private:
+        index_type max_length_per_row_;
     };
 
+    /**
+     * merge_path is a strategy_type which uses the merge_path algorithm
+     * described by Merrill and Garland: Merge-Based Parallel Sparse
+     * Matrix-Vector Multiplication.
+     */
     class merge_path : public strategy_type {
     public:
+        /**
+         * Creates a merge_path strategy.
+         */
         merge_path() : strategy_type("merge_path") {}
 
         void process(const Array<index_type> &mtx_row_ptrs,
-                     Array<index_type> *mtx_srow)
+                     Array<index_type> *mtx_srow) override
         {}
 
-        int64_t clac_size(const int64_t nnz) { return 0; }
+        int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<merge_path>();
+        }
     };
 
+    /**
+     * cusparse is a strategy_type which uses the sparselib csr.
+     *
+     * @note cusparse is also known to the hip executor which converts between
+     *       cuda and hip.
+     */
     class cusparse : public strategy_type {
     public:
+        /**
+         * Creates a cusparse strategy.
+         */
         cusparse() : strategy_type("cusparse") {}
 
         void process(const Array<index_type> &mtx_row_ptrs,
-                     Array<index_type> *mtx_srow)
+                     Array<index_type> *mtx_srow) override
         {}
 
-        int64_t clac_size(const int64_t nnz) { return 0; }
+        int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<cusparse>();
+        }
     };
 
+    /**
+     * sparselib is a strategy_type which uses the sparselib csr.
+     *
+     * @note Uses cusparse in cuda and hipsparse in hip.
+     */
+    class sparselib : public strategy_type {
+    public:
+        /**
+         * Creates a sparselib strategy.
+         */
+        sparselib() : strategy_type("sparselib") {}
+
+        void process(const Array<index_type> &mtx_row_ptrs,
+                     Array<index_type> *mtx_srow) override
+        {}
+
+        int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<sparselib>();
+        }
+    };
+
+    /**
+     * load_balance is a strategy_type which uses the load balance algorithm.
+     */
     class load_balance : public strategy_type {
     public:
+        /**
+         * Creates a load_balance strategy.
+         */
         load_balance()
             : load_balance(std::move(
                   gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
         {}
 
+        /**
+         * Creates a load_balance strategy with CUDA executor.
+         *
+         * @param exec the CUDA executor
+         */
         load_balance(std::shared_ptr<const CudaExecutor> exec)
-            : load_balance(exec->get_num_warps())
+            : load_balance(exec->get_num_warps(), exec->get_warp_size())
+        {}
+
+        /**
+         * Creates a load_balance strategy with HIP executor.
+         *
+         * @param exec the HIP executor
+         */
+        load_balance(std::shared_ptr<const HipExecutor> exec)
+            : load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
         {}
 
-        load_balance(int64_t nwarps)
-            : strategy_type("load_balance"), nwarps_(nwarps)
+        /**
+         * Creates a load_balance strategy with specified parameters
+         *
+         * @param nwarps the number of warps in the executor
+         * @param warp_size the warp size of the executor
+         * @param cuda_strategy  whether the `cuda_strategy` needs to be used.
+         *
+         * @note The warp_size must be the size of a full warp. When using this
+         *       constructor, set_strategy needs to be called with correct
+         *       parameters which is replaced during the conversion.
+         */
+        load_balance(int64_t nwarps, int warp_size = 32,
+                     bool cuda_strategy = true)
+            : strategy_type("load_balance"),
+              nwarps_(nwarps),
+              warp_size_(warp_size),
+              cuda_strategy_(cuda_strategy)
         {}
 
         void process(const Array<index_type> &mtx_row_ptrs,
-                     Array<index_type> *mtx_srow)
+                     Array<index_type> *mtx_srow) override
         {
-            constexpr uint32 warp_size = 32;
             auto nwarps = mtx_srow->get_num_elems();
 
             if (nwarps > 0) {
@@ -212,8 +411,8 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                 const auto num_elems = row_ptrs[num_rows];
                 for (size_type i = 0; i < num_rows; i++) {
                     auto bucket =
-                        ceildiv((ceildiv(row_ptrs[i + 1], warp_size) * nwarps),
-                                ceildiv(num_elems, warp_size));
+                        ceildiv((ceildiv(row_ptrs[i + 1], warp_size_) * nwarps),
+                                ceildiv(num_elems, warp_size_));
                     if (bucket < nwarps) {
                         srow[bucket]++;
                     }
@@ -228,44 +427,121 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             }
         }
 
-        int64_t clac_size(const int64_t nnz)
+        int64_t clac_size(const int64_t nnz) override
         {
-            constexpr uint32 warp_size = 32;
-            int multiple = 8;
-            if (nnz >= 2000000) {
-                multiple = 128;
-            } else if (nnz >= 200000) {
-                multiple = 32;
+            if (warp_size_ > 0) {
+                int multiple = 8;
+                if (nnz >= 2e6) {
+                    multiple = 128;
+                } else if (nnz >= 2e5) {
+                    multiple = 32;
+                }
+
+#if GINKGO_HIP_PLATFORM_HCC
+                if (!cuda_strategy_) {
+                    multiple = 8;
+                    if (nnz >= 1e7) {
+                        multiple = 64;
+                    } else if (nnz >= 1e6) {
+                        multiple = 16;
+                    }
+                }
+#endif  // GINKGO_HIP_PLATFORM_HCC
+
+                auto nwarps = nwarps_ * multiple;
+                return min(ceildiv(nnz, warp_size_), int64_t(nwarps));
+            } else {
+                return 0;
             }
-            auto nwarps = nwarps_ * multiple;
-            return min(ceildiv(nnz, warp_size), static_cast<int64_t>(nwarps));
+        }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<load_balance>(nwarps_, warp_size_,
+                                                  cuda_strategy_);
         }
 
     private:
         int64_t nwarps_;
+        int warp_size_;
+        bool cuda_strategy_;
     };
 
     class automatical : public strategy_type {
     public:
+        /* Use imbalance strategy when the maximum number of nonzero per row is
+         * more than 1024 on NVIDIA hardware */
+        const index_type nvidia_row_len_limit = 1024;
+        /* Use imbalance strategy when the matrix has more than 1e6 nonzeros
+         * on NVIDIA hardware */
+        const index_type nvidia_nnz_limit = 1e6;
+        /* Use imbalance strategy when the maximum number of nonzero per row is
+         * more than 768 on AMD hardware */
+        const index_type amd_row_len_limit = 768;
+        /* Use imbalance strategy when the matrix has more than 1e8 nonzeros
+         * on AMD hardware */
+        const index_type amd_nnz_limit = 1e8;
+
+        /**
+         * Creates an automatical strategy.
+         */
         automatical()
             : automatical(std::move(
                   gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
         {}
 
+        /**
+         * Creates an automatical strategy with CUDA executor.
+         *
+         * @param exec the CUDA executor
+         */
         automatical(std::shared_ptr<const CudaExecutor> exec)
-            : automatical(exec->get_num_warps())
+            : automatical(exec->get_num_warps(), exec->get_warp_size())
+        {}
+
+        /**
+         * Creates an automatical strategy with HIP executor.
+         *
+         * @param exec the HIP executor
+         */
+        automatical(std::shared_ptr<const HipExecutor> exec)
+            : automatical(exec->get_num_warps(), exec->get_warp_size(), false)
         {}
 
-        automatical(int64_t nwarps)
-            : strategy_type("automatical"), nwarps_(nwarps)
+        /**
+         * Creates an automatical strategy with specified parameters
+         *
+         * @param nwarps the number of warps in the executor
+         * @param warp_size the warp size of the executor
+         * @param cuda_strategy  whether the `cuda_strategy` needs to be used.
+         *
+         * @note The warp_size must be the size of a full warp. When using this
+         *       constructor, set_strategy needs to be called with correct
+         *       parameters which is replaced during the conversion.
+         */
+        automatical(int64_t nwarps, int warp_size = 32,
+                    bool cuda_strategy = true)
+            : strategy_type("automatical"),
+              nwarps_(nwarps),
+              warp_size_(warp_size),
+              cuda_strategy_(cuda_strategy),
+              max_length_per_row_(0)
         {}
 
         void process(const Array<index_type> &mtx_row_ptrs,
-                     Array<index_type> *mtx_srow)
+                     Array<index_type> *mtx_srow) override
         {
-            // if the number of stored elements is larger than 1e6 or
+            // if the number of stored elements is larger than <nnz_limit> or
             // the maximum number of stored elements per row is larger than
-            // 64, use load_balance otherwise use classical
+            // <row_len_limit>, use load_balance otherwise use classical
+            index_type nnz_limit = nvidia_nnz_limit;
+            index_type row_len_limit = nvidia_row_len_limit;
+#if GINKGO_HIP_PLATFORM_HCC
+            if (!cuda_strategy_) {
+                nnz_limit = amd_nnz_limit;
+                row_len_limit = amd_row_len_limit;
+            }
+#endif  // GINKGO_HIP_PLATFORM_HCC
             auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
             const bool is_mtx_on_host{host_mtx_exec ==
                                       mtx_row_ptrs.get_executor()};
@@ -278,8 +554,9 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                 row_ptrs = row_ptrs_host.get_const_data();
             }
             const auto num_rows = mtx_row_ptrs.get_num_elems() - 1;
-            if (row_ptrs[num_rows] > index_type(1e6)) {
-                load_balance actual_strategy(nwarps_);
+            if (row_ptrs[num_rows] > nnz_limit) {
+                load_balance actual_strategy(nwarps_, warp_size_,
+                                             cuda_strategy_);
                 if (is_mtx_on_host) {
                     actual_strategy.process(mtx_row_ptrs, mtx_srow);
                 } else {
@@ -291,8 +568,9 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                 for (index_type i = 1; i < num_rows + 1; i++) {
                     maxnum = max(maxnum, row_ptrs[i] - row_ptrs[i - 1]);
                 }
-                if (maxnum > 64) {
-                    load_balance actual_strategy(nwarps_);
+                if (maxnum > row_len_limit) {
+                    load_balance actual_strategy(nwarps_, warp_size_,
+                                                 cuda_strategy_);
                     if (is_mtx_on_host) {
                         actual_strategy.process(mtx_row_ptrs, mtx_srow);
                     } else {
@@ -303,23 +581,75 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                     classical actual_strategy;
                     if (is_mtx_on_host) {
                         actual_strategy.process(mtx_row_ptrs, mtx_srow);
+                        max_length_per_row_ =
+                            actual_strategy.get_max_length_per_row();
                     } else {
                         actual_strategy.process(row_ptrs_host, mtx_srow);
+                        max_length_per_row_ =
+                            actual_strategy.get_max_length_per_row();
                     }
                     this->set_name(actual_strategy.get_name());
                 }
             }
         }
 
-        int64_t clac_size(const int64_t nnz)
+        int64_t clac_size(const int64_t nnz) override
+        {
+            return std::make_shared<load_balance>(nwarps_, warp_size_,
+                                                  cuda_strategy_)
+                ->clac_size(nnz);
+        }
+
+        index_type get_max_length_per_row() const noexcept
         {
-            return std::make_shared<load_balance>(nwarps_)->clac_size(nnz);
+            return max_length_per_row_;
+        }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<automatical>(nwarps_, warp_size_,
+                                                 cuda_strategy_);
         }
 
     private:
         int64_t nwarps_;
+        int warp_size_;
+        bool cuda_strategy_;
+        index_type max_length_per_row_;
     };
 
+    void convert_to(Csr<ValueType, IndexType> *result) const override
+    {
+        bool same_executor = this->get_executor() == result->get_executor();
+        // NOTE: as soon as strategies are improved, this can be reverted
+        result->values_ = this->values_;
+        result->col_idxs_ = this->col_idxs_;
+        result->row_ptrs_ = this->row_ptrs_;
+        result->srow_ = this->srow_;
+        result->set_size(this->get_size());
+        if (!same_executor) {
+            convert_strategy_helper(result);
+        } else {
+            result->set_strategy(this->get_strategy()->copy());
+        }
+        // END NOTE
+    }
+
+    void move_to(Csr<ValueType, IndexType> *result) override
+    {
+        bool same_executor = this->get_executor() == result->get_executor();
+        EnableLinOp<Csr>::move_to(result);
+        if (!same_executor) {
+            detail::strategy_rebuild_helper(result);
+        }
+    }
+    friend class Csr<next_precision<ValueType>, IndexType>;
+
+    void convert_to(
+        Csr<next_precision<ValueType>, IndexType> *result) const override;
+
+    void move_to(Csr<next_precision<ValueType>, IndexType> *result) override;
+
     void convert_to(Dense<ValueType> *other) const override;
 
     void move_to(Dense<ValueType> *other) override;
@@ -352,6 +682,18 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 
     std::unique_ptr<LinOp> conj_transpose() const override;
 
+    std::unique_ptr<LinOp> row_permute(
+        const Array<IndexType> *permutation_indices) const override;
+
+    std::unique_ptr<LinOp> column_permute(
+        const Array<IndexType> *permutation_indices) const override;
+
+    std::unique_ptr<LinOp> inverse_row_permute(
+        const Array<IndexType> *inverse_permutation_indices) const override;
+
+    std::unique_ptr<LinOp> inverse_column_permute(
+        const Array<IndexType> *inverse_permutation_indices) const override;
+
     /**
      * Sorts all (value, col_idx) pairs in each row by column index
      */
@@ -470,6 +812,17 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         return strategy_;
     }
 
+    /**
+     * Stores a copy of the given strategy and then calls make_srow().
+     *
+     * @param strategy  the csr strategy to copy
+     */
+    void set_strategy(std::shared_ptr<strategy_type> strategy)
+    {
+        strategy_ = strategy->copy();
+        this->make_srow();
+    }
+
 protected:
     /**
      * Creates an uninitialized CSR matrix of the specified size.
@@ -492,14 +845,13 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
      */
     Csr(std::shared_ptr<const Executor> exec, const dim<2> &size = dim<2>{},
         size_type num_nonzeros = {},
-        std::shared_ptr<strategy_type> strategy = std::make_shared<cusparse>())
+        std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
         : EnableLinOp<Csr>(exec, size),
           values_(exec, num_nonzeros),
           col_idxs_(exec, num_nonzeros),
-          // avoid allocation for empty matrix
-          row_ptrs_(exec, size[0] + (size[0] > 0)),
+          row_ptrs_(exec, size[0] + 1),
           srow_(exec, strategy->clac_size(num_nonzeros)),
-          strategy_(std::move(strategy))
+          strategy_(strategy->copy())
     {}
 
     /**
@@ -526,13 +878,13 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
               typename RowPtrsArray>
     Csr(std::shared_ptr<const Executor> exec, const dim<2> &size,
         ValuesArray &&values, ColIdxsArray &&col_idxs, RowPtrsArray &&row_ptrs,
-        std::shared_ptr<strategy_type> strategy = std::make_shared<cusparse>())
+        std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
         : EnableLinOp<Csr>(exec, size),
           values_{exec, std::forward<ValuesArray>(values)},
           col_idxs_{exec, std::forward<ColIdxsArray>(col_idxs)},
           row_ptrs_{exec, std::forward<RowPtrsArray>(row_ptrs)},
           srow_(exec),
-          strategy_(std::move(strategy))
+          strategy_(strategy->copy())
     {
         GKO_ASSERT_EQ(values_.get_num_elems(), col_idxs_.get_num_elems());
         GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems());
@@ -544,6 +896,90 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
     void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta,
                     LinOp *x) const override;
 
+    /**
+     * Sets on `result` the strategy of CsrType that corresponds to the
+     * strategy of this matrix.
+     *
+     * The mapping is:
+     * - classical, merge_path, cusparse and sparselib are mapped to the
+     *   default-constructed strategy of the same name;
+     * - load_balance is mapped to load_balance;
+     * - every other strategy is mapped to automatical.
+     *
+     * In the last two cases the new strategy is constructed from the first
+     * CudaExecutor or HipExecutor found among the executor of `result` and
+     * the executor of this matrix (in that order). If neither of them is such
+     * an executor, the new strategy is default-constructed.
+     *
+     * @note Default-constructing load_balance or automatical is flagged
+     *       below with a FIXME as creating a long delay.
+     *
+     * @tparam CsrType  type of the matrix receiving the strategy
+     *
+     * @param result  the matrix whose strategy is set
+     */
+    // TODO clean this up as soon as we improve strategy_type
+    template <typename CsrType>
+    void convert_strategy_helper(CsrType *result) const
+    {
+        using lb_type = typename CsrType::load_balance;
+        using auto_type = typename CsrType::automatical;
+
+        auto strat = this->get_strategy().get();
+        std::shared_ptr<typename CsrType::strategy_type> new_strat;
+        // Strategies that are default-constructed for the target type.
+        if (dynamic_cast<classical *>(strat)) {
+            new_strat = std::make_shared<typename CsrType::classical>();
+        } else if (dynamic_cast<merge_path *>(strat)) {
+            new_strat = std::make_shared<typename CsrType::merge_path>();
+        } else if (dynamic_cast<cusparse *>(strat)) {
+            new_strat = std::make_shared<typename CsrType::cusparse>();
+        } else if (dynamic_cast<sparselib *>(strat)) {
+            new_strat = std::make_shared<typename CsrType::sparselib>();
+        } else {
+            // Anything that is not a load_balance is handled as automatical.
+            const bool use_lb = dynamic_cast<load_balance *>(strat) != nullptr;
+            // Prefer the configuration of the executor of the result.
+            auto rexec = result->get_executor();
+            auto cuda_exec =
+                std::dynamic_pointer_cast<const CudaExecutor>(rexec);
+            auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
+            if (!cuda_exec && !hip_exec) {
+                // The result's executor is neither: try to preserve this
+                // executor's configuration
+                cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(
+                    this->get_executor());
+                hip_exec = std::dynamic_pointer_cast<const HipExecutor>(
+                    this->get_executor());
+            }
+            // Build the strategy from whichever executor was found, if any.
+            if (cuda_exec) {
+                if (use_lb) {
+                    new_strat = std::make_shared<lb_type>(cuda_exec);
+                } else {
+                    new_strat = std::make_shared<auto_type>(cuda_exec);
+                }
+            } else if (hip_exec) {
+                if (use_lb) {
+                    new_strat = std::make_shared<lb_type>(hip_exec);
+                } else {
+                    new_strat = std::make_shared<auto_type>(hip_exec);
+                }
+            } else {
+                // We had a load balance or automatical strategy from a non
+                // HIP or Cuda executor and are moving to a non HIP or Cuda
+                // executor.
+                // FIXME this creates a long delay
+                if (use_lb) {
+                    new_strat = std::make_shared<lb_type>();
+                } else {
+                    new_strat = std::make_shared<auto_type>();
+                }
+            }
+        }
+        result->set_strategy(new_strat);
+    }
+
     /**
      * Computes srow. It should be run after changing any row_ptrs_ value.
      */
@@ -562,6 +998,43 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 };
 
 
+namespace detail {
+
+
+/**
+ * Rebuilds a load_balance or automatical strategy from the matrix's executor
+ * if that is a HipExecutor or CudaExecutor; otherwise nothing is changed.
+ *
+ * @param result  the csr matrix whose strategy is rebuilt in place.
+ */
+template <typename ValueType, typename IndexType>
+void strategy_rebuild_helper(Csr<ValueType, IndexType> *result)
+{
+    using csr_type = Csr<ValueType, IndexType>;
+    using load_balance = typename csr_type::load_balance;
+    using automatical = typename csr_type::automatical;
+    auto current = result->get_strategy();
+    auto owner = result->get_executor();
+    // Cast once up front; a failed cast yields a null pointer.
+    auto hip = std::dynamic_pointer_cast<const HipExecutor>(owner);
+    auto cuda = std::dynamic_pointer_cast<const CudaExecutor>(owner);
+    if (std::dynamic_pointer_cast<load_balance>(current)) {
+        if (hip) {
+            result->set_strategy(std::make_shared<load_balance>(hip));
+        } else if (cuda) {
+            result->set_strategy(std::make_shared<load_balance>(cuda));
+        }
+    } else if (std::dynamic_pointer_cast<automatical>(current)) {
+        if (hip) {
+            result->set_strategy(std::make_shared<automatical>(hip));
+        } else if (cuda) {
+            result->set_strategy(std::make_shared<automatical>(cuda));
+        }
+    }
+}
+
+
+}  // namespace detail
 }  // namespace matrix
 }  // namespace gko
 
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index ab0ae4fc9dd..a98861f75f2 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_MATRIX_DENSE_HPP_
 
 
+#include <initializer_list>
+
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
@@ -43,9 +46,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/utils.hpp>
 
 
-#include <initializer_list>
-
-
 namespace gko {
 namespace matrix {
 
@@ -87,6 +87,7 @@ class SparsityCsr;
 template <typename ValueType = default_precision>
 class Dense : public EnableLinOp<Dense<ValueType>>,
               public EnableCreateMethod<Dense<ValueType>>,
+              public ConvertibleTo<Dense<next_precision<ValueType>>>,
               public ConvertibleTo<Coo<ValueType, int32>>,
               public ConvertibleTo<Coo<ValueType, int64>>,
               public ConvertibleTo<Csr<ValueType, int32>>,
@@ -103,7 +104,9 @@ class Dense : public EnableLinOp<Dense<ValueType>>,
               public ReadableFromMatrixData<ValueType, int64>,
               public WritableToMatrixData<ValueType, int32>,
               public WritableToMatrixData<ValueType, int64>,
-              public Transposable {
+              public Transposable,
+              public Permutable<int32>,
+              public Permutable<int64> {
     friend class EnableCreateMethod<Dense>;
     friend class EnablePolymorphicObject<Dense, LinOp>;
     friend class Coo<ValueType, int32>;
@@ -125,6 +128,7 @@ class Dense : public EnableLinOp<Dense<ValueType>>,
 
     using value_type = ValueType;
     using index_type = int64;
+    using transposed_type = Dense<ValueType>;
     using mat_data = gko::matrix_data<ValueType, int64>;
     using mat_data32 = gko::matrix_data<ValueType, int32>;
 
@@ -141,11 +145,15 @@ class Dense : public EnableLinOp<Dense<ValueType>>,
         // using operator `->`) is currently required to be compatible with
         // CUDA 10.1.
         // Otherwise, it results in a compile error.
-        // TODO Check if the compiler error is fixed and revert to `operator->`.
-        return Dense::create((*other).get_executor(), (*other).get_size(),
-                             (*other).get_stride());
+        return (*other).create_with_same_config();
     }
 
+    friend class Dense<next_precision<ValueType>>;
+
+    void convert_to(Dense<next_precision<ValueType>> *result) const override;
+
+    void move_to(Dense<next_precision<ValueType>> *result) override;
+
     void convert_to(Coo<ValueType, int32> *result) const override;
 
     void move_to(Coo<ValueType, int32> *result) override;
@@ -206,6 +214,31 @@ class Dense : public EnableLinOp<Dense<ValueType>>,
 
     std::unique_ptr<LinOp> conj_transpose() const override;
 
+    std::unique_ptr<LinOp> row_permute(
+        const Array<int32> *permutation_indices) const override;
+
+    std::unique_ptr<LinOp> row_permute(
+        const Array<int64> *permutation_indices) const override;
+
+    std::unique_ptr<LinOp> column_permute(
+        const Array<int32> *permutation_indices) const override;
+
+    std::unique_ptr<LinOp> column_permute(
+        const Array<int64> *permutation_indices) const override;
+
+    std::unique_ptr<LinOp> inverse_row_permute(
+        const Array<int32> *inverse_permutation_indices) const override;
+
+    std::unique_ptr<LinOp> inverse_row_permute(
+        const Array<int64> *inverse_permutation_indices) const override;
+
+    std::unique_ptr<LinOp> inverse_column_permute(
+        const Array<int32> *inverse_permutation_indices) const override;
+
+    std::unique_ptr<LinOp> inverse_column_permute(
+        const Array<int64> *inverse_permutation_indices) const override;
+
+
     /**
      * Returns a pointer to the array of values of the matrix.
      *
@@ -448,6 +481,17 @@ class Dense : public EnableLinOp<Dense<ValueType>>,
                              values_.get_num_elems());
     }
 
+    /**
+     * Creates a Dense matrix with the caller's executor, size and stride.
+     *
+     * @returns a Dense matrix with the same configuration as the caller.
+     */
+    virtual std::unique_ptr<Dense> create_with_same_config() const
+    {
+        const auto stride = this->get_stride();
+        return Dense::create(this->get_executor(), this->get_size(), stride);
+    }
+
     /**
      * @copydoc scale(const LinOp *)
      *
diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp
index d9572ec0759..237ce920799 100644
--- a/include/ginkgo/core/matrix/ell.hpp
+++ b/include/ginkgo/core/matrix/ell.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -70,6 +70,7 @@ class Csr;
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Ell : public EnableLinOp<Ell<ValueType, IndexType>>,
             public EnableCreateMethod<Ell<ValueType, IndexType>>,
+            public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>,
             public ConvertibleTo<Dense<ValueType>>,
             public ConvertibleTo<Csr<ValueType, IndexType>>,
             public ReadableFromMatrixData<ValueType, IndexType>,
@@ -87,6 +88,13 @@ class Ell : public EnableLinOp<Ell<ValueType, IndexType>>,
     using index_type = IndexType;
     using mat_data = matrix_data<ValueType, IndexType>;
 
+    friend class Ell<next_precision<ValueType>, IndexType>;
+
+    void convert_to(
+        Ell<next_precision<ValueType>, IndexType> *result) const override;
+
+    void move_to(Ell<next_precision<ValueType>, IndexType> *result) override;
+
     void convert_to(Dense<ValueType> *other) const override;
 
     void move_to(Dense<ValueType> *other) override;
diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp
index a759e0be7b2..9c3cb7bda33 100644
--- a/include/ginkgo/core/matrix/hybrid.hpp
+++ b/include/ginkgo/core/matrix/hybrid.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -68,12 +68,14 @@ class Csr;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Hybrid : public EnableLinOp<Hybrid<ValueType, IndexType>>,
-               public EnableCreateMethod<Hybrid<ValueType, IndexType>>,
-               public ConvertibleTo<Dense<ValueType>>,
-               public ConvertibleTo<Csr<ValueType, IndexType>>,
-               public ReadableFromMatrixData<ValueType, IndexType>,
-               public WritableToMatrixData<ValueType, IndexType> {
+class Hybrid
+    : public EnableLinOp<Hybrid<ValueType, IndexType>>,
+      public EnableCreateMethod<Hybrid<ValueType, IndexType>>,
+      public ConvertibleTo<Hybrid<next_precision<ValueType>, IndexType>>,
+      public ConvertibleTo<Dense<ValueType>>,
+      public ConvertibleTo<Csr<ValueType, IndexType>>,
+      public ReadableFromMatrixData<ValueType, IndexType>,
+      public WritableToMatrixData<ValueType, IndexType> {
     friend class EnableCreateMethod<Hybrid>;
     friend class EnablePolymorphicObject<Hybrid, LinOp>;
     friend class Dense<ValueType>;
@@ -239,6 +241,9 @@ class Hybrid : public EnableLinOp<Hybrid<ValueType, IndexType>>,
         {
             auto row_nnz_val = row_nnz->get_data();
             auto num_rows = row_nnz->get_num_elems();
+            if (num_rows == 0) {
+                return 0;
+            }
             std::sort(row_nnz_val, row_nnz_val + num_rows);
             if (percent_ < 1) {
                 auto percent_pos = static_cast<size_type>(num_rows * percent_);
@@ -253,7 +258,7 @@ class Hybrid : public EnableLinOp<Hybrid<ValueType, IndexType>>,
     };
 
     /**
-     * imbalance_bounded_limit is a stratgy_type which decides the number of
+     * imbalance_bounded_limit is a strategy_type which decides the number of
      * stored elements per row of the ell part. It uses the imbalance_limit and
      * adds the upper bound of the number of ell's cols by the number of rows.
      */
@@ -283,7 +288,7 @@ class Hybrid : public EnableLinOp<Hybrid<ValueType, IndexType>>,
 
 
     /**
-     * minimal_storage_limit is a stratgy_type which decides the number of
+     * minimal_storage_limit is a strategy_type which decides the number of
      * stored elements per row of the ell part. It is determined by the size of
      * ValueType and IndexType, the storage is the minimum among all partition.
      */
@@ -310,7 +315,7 @@ class Hybrid : public EnableLinOp<Hybrid<ValueType, IndexType>>,
 
 
     /**
-     * automatic is a stratgy_type which decides the number of stored elements
+     * automatic is a strategy_type which decides the number of stored elements
      * per row of the ell part automatically.
      */
     class automatic : public strategy_type {
@@ -330,6 +335,13 @@ class Hybrid : public EnableLinOp<Hybrid<ValueType, IndexType>>,
         imbalance_bounded_limit strategy_;
     };
 
+    friend class Hybrid<next_precision<ValueType>, IndexType>;
+
+    void convert_to(
+        Hybrid<next_precision<ValueType>, IndexType> *result) const override;
+
+    void move_to(Hybrid<next_precision<ValueType>, IndexType> *result) override;
+
     void convert_to(Dense<ValueType> *other) const override;
 
     void move_to(Dense<ValueType> *other) override;
diff --git a/include/ginkgo/core/matrix/identity.hpp b/include/ginkgo/core/matrix/identity.hpp
index c79a2389c7a..4cc9065d92f 100644
--- a/include/ginkgo/core/matrix/identity.hpp
+++ b/include/ginkgo/core/matrix/identity.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -61,7 +61,8 @@ namespace matrix {
  */
 template <typename ValueType = default_precision>
 class Identity : public EnableLinOp<Identity<ValueType>>,
-                 public EnableCreateMethod<Identity<ValueType>> {
+                 public EnableCreateMethod<Identity<ValueType>>,
+                 public Transposable {
     friend class EnablePolymorphicObject<Identity, LinOp>;
     friend class EnableCreateMethod<Identity>;
 
@@ -70,6 +71,12 @@ class Identity : public EnableLinOp<Identity<ValueType>>,
     using EnableLinOp<Identity>::move_to;
 
     using value_type = ValueType;
+    using transposed_type = Identity<ValueType>;
+
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
 
 protected:
     /**
diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp
new file mode 100644
index 00000000000..656cee4ae3d
--- /dev/null
+++ b/include/ginkgo/core/matrix/permutation.hpp
@@ -0,0 +1,249 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_PERMUTATION_HPP_
+#define GKO_CORE_MATRIX_PERMUTATION_HPP_
+
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+
+
+namespace gko {
+namespace matrix {
+
+/** @internal 64-bit mask type holding the permutation flags defined below */
+using mask_type = gko::uint64;
+
+static constexpr mask_type row_permute = mask_type{1};
+static constexpr mask_type column_permute = mask_type{1 << 2};
+static constexpr mask_type inverse_permute = mask_type{1 << 3};
+
+/**
+ * Permutation is a matrix "format" which stores the row and column permutation
+ * arrays which can be used for re-ordering the rows and columns of a matrix.
+ *
+ * @tparam IndexType  precision of permutation array indices.
+ *
+ * @note This format is used mainly to allow for an abstraction of the
+ * permutation/re-ordering and provides the user with an apply method which
+ * calls the respective LinOp's permute operation if the respective LinOp
+ * implements the Permutable interface. As such it only stores an array of the
+ * permutation indices.
+ *
+ * @ingroup permutation
+ * @ingroup mat_formats
+ * @ingroup LinOp
+ */
+template <typename IndexType = int32>
+class Permutation : public EnableLinOp<Permutation<IndexType>>,
+                    public EnableCreateMethod<Permutation<IndexType>> {
+    friend class EnableCreateMethod<Permutation>;
+    friend class EnablePolymorphicObject<Permutation, LinOp>;
+
+public:
+    using index_type = IndexType;
+
+    /**
+     * Returns a pointer to the array of permutation.
+     *
+     * @return the pointer to the row permutation array.
+     */
+    index_type *get_permutation() noexcept { return permutation_.get_data(); }
+
+    /**
+     * @copydoc get_permutation()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const index_type *get_const_permutation() const noexcept
+    {
+        return permutation_.get_const_data();
+    }
+
+    /**
+     * Returns the number of elements explicitly stored in the permutation
+     * array.
+     *
+     * @return the number of elements explicitly stored in the permutation
+     * array.
+     */
+    size_type get_permutation_size() const noexcept
+    {
+        return permutation_.get_num_elems();
+    }
+
+    /**
+     * Get the permute masks
+     *
+     * @return  permute_mask the permute masks
+     */
+    mask_type get_permute_mask() const { return enabled_permute_; }
+
+    /**
+     * Set the permute masks
+     *
+     * @param permute_mask the permute masks
+     */
+    void set_permute_mask(mask_type permute_mask)
+    {
+        enabled_permute_ = permute_mask;
+    }
+
+
+protected:
+    /**
+     * Creates an uninitialized Permutation array on the specified executor.
+     *
+     * @param exec  Executor associated to the LinOp
+     */
+    Permutation(std::shared_ptr<const Executor> exec)
+        : Permutation(std::move(exec), dim<2>{})
+    {}
+
+    /**
+     * Creates uninitialized Permutation arrays of the specified size.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the permutable matrix
+     * @param enabled_permute  mask for the type of permutation to apply.
+     */
+    Permutation(std::shared_ptr<const Executor> exec, const dim<2> &size,
+                const mask_type &enabled_permute = row_permute)
+        : EnableLinOp<Permutation>(exec, size),
+          permutation_(exec, size[0]),
+          row_size_(size[0]),
+          col_size_(size[1]),
+          enabled_permute_(enabled_permute)
+    {}
+
+    /**
+     * Creates a Permutation matrix from an already allocated (and initialized)
+     * permutation array.
+     *
+     * @tparam IndicesArray  type of array of indices
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the permutation array.
+     * @param permutation_indices  array of permutation indices
+     * @param enabled_permute  mask for the type of permutation to apply.
+     *
+     * @note If `permutation_indices` is not an rvalue, not an array of
+     * IndexType, or is on the wrong executor, an internal copy will be created,
+     * and the original array data will not be used in the matrix.
+     */
+    template <typename IndicesArray>
+    Permutation(std::shared_ptr<const Executor> exec, const dim<2> &size,
+                IndicesArray &&permutation_indices,
+                const mask_type &enabled_permute = row_permute)
+        : EnableLinOp<Permutation>(exec, size),
+          permutation_{exec, std::forward<IndicesArray>(permutation_indices)},
+          row_size_(size[0]),
+          col_size_(size[1]),
+          enabled_permute_(enabled_permute)
+    {
+        if (enabled_permute_ & row_permute) {
+            GKO_ASSERT_EQ(size[0], permutation_.get_num_elems());
+        }
+        if (enabled_permute_ & column_permute) {
+            GKO_ASSERT_EQ(size[1], permutation_.get_num_elems());
+        }
+    }
+
+    /**
+     * Permutes `in` according to the permute mask and copies the result to
+     * `out`. When both the row and the column bit are set, the rows are
+     * permuted first and the columns of that intermediate result second.
+     *
+     * @param in  the operator to permute; it is cast to
+     *            Permutable<index_type>
+     * @param out  the operator receiving a copy of the permuted result
+     */
+    void apply_impl(const LinOp *in, LinOp *out) const override
+    {
+        auto perm = as<Permutable<index_type>>(in);
+        const bool inverse = (enabled_permute_ & inverse_permute) != 0;
+        const bool rows = (enabled_permute_ & row_permute) != 0;
+        const bool cols = (enabled_permute_ & column_permute) != 0;
+        std::unique_ptr<gko::LinOp> tmp{};
+        if (rows) {
+            tmp = inverse ? perm->inverse_row_permute(&permutation_)
+                          : perm->row_permute(&permutation_);
+        }
+        if (cols) {
+            // Chain onto the row-permuted intermediate if there is one.
+            if (rows) {
+                perm = as<Permutable<index_type>>(tmp.get());
+            }
+            tmp = inverse ? perm->inverse_column_permute(&permutation_)
+                          : perm->column_permute(&permutation_);
+        }
+        // NOTE(review): tmp is still null here if neither bit is set.
+        out->copy_from(std::move(tmp));
+    }
+
+
+    void apply_impl(const LinOp *, const LinOp *in, const LinOp *,
+                    LinOp *out) const override
+    {
+        // Ignores alpha and beta and just performs a normal permutation as an
+        // advanced apply does not really make sense here.
+        this->apply_impl(in, out);
+    }
+
+
+private:
+    Array<index_type> permutation_;
+    size_type row_size_;
+    size_type col_size_;
+    mask_type enabled_permute_;
+};
+
+
+}  // namespace matrix
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_PERMUTATION_HPP_
diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp
index 59423f33fd4..021b3870885 100644
--- a/include/ginkgo/core/matrix/sellp.hpp
+++ b/include/ginkgo/core/matrix/sellp.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -67,6 +67,7 @@ class Csr;
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
               public EnableCreateMethod<Sellp<ValueType, IndexType>>,
+              public ConvertibleTo<Sellp<next_precision<ValueType>, IndexType>>,
               public ConvertibleTo<Dense<ValueType>>,
               public ConvertibleTo<Csr<ValueType, IndexType>>,
               public ReadableFromMatrixData<ValueType, IndexType>,
@@ -84,6 +85,13 @@ class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
     using index_type = IndexType;
     using mat_data = matrix_data<ValueType, IndexType>;
 
+    friend class Sellp<next_precision<ValueType>, IndexType>;
+
+    void convert_to(
+        Sellp<next_precision<ValueType>, IndexType> *result) const override;
+
+    void move_to(Sellp<next_precision<ValueType>, IndexType> *result) override;
+
     void convert_to(Dense<ValueType> *other) const override;
 
     void move_to(Dense<ValueType> *other) override;
@@ -305,10 +313,8 @@ class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
         : EnableLinOp<Sellp>(exec, size),
           values_(exec, slice_size * total_cols),
           col_idxs_(exec, slice_size * total_cols),
-          slice_lengths_(exec,
-                         (size[0] == 0) ? 0 : ceildiv(size[0], slice_size)),
-          slice_sets_(exec,
-                      (size[0] == 0) ? 0 : ceildiv(size[0], slice_size) + 1),
+          slice_lengths_(exec, ceildiv(size[0], slice_size)),
+          slice_sets_(exec, ceildiv(size[0], slice_size) + 1),
           slice_size_(slice_size),
           stride_factor_(stride_factor),
           total_cols_(total_cols)
diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp
index 5d1b0e580a8..25374a251b8 100644
--- a/include/ginkgo/core/matrix/sparsity_csr.hpp
+++ b/include/ginkgo/core/matrix/sparsity_csr.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -84,7 +84,7 @@ class SparsityCsr
 
     using value_type = ValueType;
     using index_type = IndexType;
-
+    using transposed_type = SparsityCsr<ValueType, IndexType>;
     using mat_data = matrix_data<ValueType, IndexType>;
 
     void read(const mat_data &data) override;
@@ -198,17 +198,9 @@ class SparsityCsr
                 const dim<2> &size = dim<2>{}, size_type num_nonzeros = {})
         : EnableLinOp<SparsityCsr>(exec, size),
           col_idxs_(exec, num_nonzeros),
-          // avoid allocation for empty matrix
-          row_ptrs_(exec, size[0] + (size[0] > 0))
-    {
-        if (size[0] > 0) {
-            auto tmp = Array<value_type>{exec->get_master(), 1};
-            tmp.get_data()[0] = one<ValueType>();
-            value_ = Array<value_type>{exec, std::move(tmp)};
-        } else {
-            value_ = Array<value_type>{exec};
-        }
-    }
+          row_ptrs_(exec, size[0] + 1),
+          value_(exec, {one<ValueType>()})
+    {}
 
     /**
      * Creates a SparsityCsr matrix from already allocated (and initialized) row
@@ -235,11 +227,9 @@ class SparsityCsr
                 value_type value = one<ValueType>())
         : EnableLinOp<SparsityCsr>(exec, size),
           col_idxs_{exec, std::forward<ColIdxsArray>(col_idxs)},
-          row_ptrs_{exec, std::forward<RowPtrsArray>(row_ptrs)}
+          row_ptrs_{exec, std::forward<RowPtrsArray>(row_ptrs)},
+          value_{exec, {value}}
     {
-        auto tmp = Array<value_type>{exec->get_master(), 1};
-        tmp.get_data()[0] = value;
-        value_ = Array<value_type>{exec, std::move(tmp)};
         GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems());
     }
 
diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp
index 6b41cc176da..d3faaab5b4d 100644
--- a/include/ginkgo/core/preconditioner/ilu.hpp
+++ b/include/ginkgo/core/preconditioner/ilu.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/upper_trs.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 
 
 namespace gko {
@@ -109,8 +109,10 @@ namespace preconditioner {
  */
 template <typename LSolverType = solver::LowerTrs<>,
           typename USolverType = solver::UpperTrs<>, bool ReverseApply = false,
-          typename IndexTypeParIlu = int32>
-class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
+          typename IndexType = int32>
+class Ilu : public EnableLinOp<
+                Ilu<LSolverType, USolverType, ReverseApply, IndexType>>,
+            public Transposable {
     friend class EnableLinOp<Ilu>;
     friend class EnablePolymorphicObject<Ilu, LinOp>;
 
@@ -123,7 +125,10 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
     using l_solver_type = LSolverType;
     using u_solver_type = USolverType;
     static constexpr bool performs_reverse_apply = ReverseApply;
-    using index_type_par_ilu = IndexTypeParIlu;
+    using index_type = IndexType;
+    using transposed_type =
+        Ilu<typename USolverType::transposed_type,
+            typename LSolverType::transposed_type, ReverseApply, IndexType>;
 
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
@@ -138,6 +143,12 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
          */
         std::shared_ptr<typename u_solver_type::Factory> GKO_FACTORY_PARAMETER(
             u_solver_factory, nullptr);
+
+        /**
+         * Factory for the factorization
+         */
+        std::shared_ptr<LinOpFactory> GKO_FACTORY_PARAMETER(
+            factorization_factory, nullptr);
     };
 
     GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
@@ -163,15 +174,47 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
         return u_solver_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override
+    {
+        std::unique_ptr<transposed_type> transposed{
+            new transposed_type{this->get_executor()}};
+        transposed->set_size(gko::transpose(this->get_size()));
+        transposed->l_solver_ =
+            share(as<typename u_solver_type::transposed_type>(
+                this->get_u_solver()->transpose()));
+        transposed->u_solver_ =
+            share(as<typename l_solver_type::transposed_type>(
+                this->get_l_solver()->transpose()));
+
+        return std::move(transposed);
+    }
+
+    std::unique_ptr<LinOp> conj_transpose() const override
+    {
+        std::unique_ptr<transposed_type> transposed{
+            new transposed_type{this->get_executor()}};
+        transposed->set_size(gko::transpose(this->get_size()));
+        transposed->l_solver_ =
+            share(as<typename u_solver_type::transposed_type>(
+                this->get_u_solver()->conj_transpose()));
+        transposed->u_solver_ =
+            share(as<typename l_solver_type::transposed_type>(
+                this->get_l_solver()->conj_transpose()));
+
+        return std::move(transposed);
+    }
+
 protected:
     void apply_impl(const LinOp *b, LinOp *x) const override
     {
         set_cache_to(b);
         if (!ReverseApply) {
             l_solver_->apply(b, cache_.intermediate.get());
+            x->copy_from(cache_.intermediate.get());
             u_solver_->apply(cache_.intermediate.get(), x);
         } else {
             u_solver_->apply(b, cache_.intermediate.get());
+            x->copy_from(cache_.intermediate.get());
             l_solver_->apply(cache_.intermediate.get(), x);
         }
     }
@@ -197,24 +240,33 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
         : EnableLinOp<Ilu>(factory->get_executor(), lin_op->get_size()),
           parameters_{factory->get_parameters()}
     {
-        auto comp_cast =
-            dynamic_cast<const Composition<value_type> *>(lin_op.get());
+        auto comp =
+            std::dynamic_pointer_cast<const Composition<value_type>>(lin_op);
         std::shared_ptr<const LinOp> l_factor;
         std::shared_ptr<const LinOp> u_factor;
 
-        if (comp_cast == nullptr) {
+        // build factorization if we weren't passed a composition
+        if (!comp) {
             auto exec = lin_op->get_executor();
-            auto par_ilu =
-                factorization::ParIlu<value_type, index_type_par_ilu>::build()
-                    .on(exec)
-                    ->generate(lin_op);
-            l_factor = par_ilu->get_l_factor();
-            u_factor = par_ilu->get_u_factor();
-        } else if (comp_cast->get_operators().size() == 2) {
-            l_factor = comp_cast->get_operators()[0];
-            u_factor = comp_cast->get_operators()[1];
+            if (!parameters_.factorization_factory) {
+                parameters_.factorization_factory =
+                    factorization::ParIlu<value_type, index_type>::build().on(
+                        exec);
+            }
+            auto fact = std::shared_ptr<const LinOp>(
+                parameters_.factorization_factory->generate(lin_op));
+            // ensure that the result is a composition
+            comp =
+                std::dynamic_pointer_cast<const Composition<value_type>>(fact);
+            if (!comp) {
+                GKO_NOT_SUPPORTED(comp);
+            }
+        }
+        if (comp->get_operators().size() == 2) {
+            l_factor = comp->get_operators()[0];
+            u_factor = comp->get_operators()[1];
         } else {
-            GKO_NOT_SUPPORTED(comp_cast);
+            GKO_NOT_SUPPORTED(comp);
         }
         GKO_ASSERT_EQUAL_DIMENSIONS(l_factor, u_factor);
 
@@ -276,8 +328,7 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
      *
      */
     template <typename SolverType, typename = void>
-    struct has_with_criteria : std::false_type {
-    };
+    struct has_with_criteria : std::false_type {};
 
     /**
      * @copydoc has_with_criteria
@@ -291,8 +342,7 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
         SolverType,
         xstd::void_t<decltype(std::declval<factory_type_t<SolverType>>()
                                   .with_criteria(with_criteria_param_type()))>>
-        : std::true_type {
-    };
+        : std::true_type {};
 
 
     /**
@@ -308,7 +358,7 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
     generate_default_solver(const std::shared_ptr<const Executor> &exec,
                             const std::shared_ptr<const LinOp> &mtx)
     {
-        constexpr value_type default_reduce_residual{1e-4};
+        constexpr gko::remove_complex<value_type> default_reduce_residual{1e-4};
         const unsigned int default_max_iters{
             static_cast<unsigned int>(mtx->get_size()[0])};
 
@@ -316,7 +366,7 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
             .with_criteria(gko::stop::Iteration::build()
                                .with_max_iters(default_max_iters)
                                .on(exec),
-                           gko::stop::ResidualNormReduction<>::build()
+                           gko::stop::ResidualNormReduction<value_type>::build()
                                .with_reduction_factor(default_reduce_residual)
                                .on(exec))
             .on(exec)
diff --git a/include/ginkgo/core/preconditioner/isai.hpp b/include/ginkgo/core/preconditioner/isai.hpp
new file mode 100644
index 00000000000..f7c6d8a701c
--- /dev/null
+++ b/include/ginkgo/core/preconditioner/isai.hpp
@@ -0,0 +1,218 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_PRECONDITIONER_ISAI_HPP_
+#define GKO_CORE_PRECONDITIONER_ISAI_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+namespace gko {
+/**
+ * @brief The Preconditioner namespace.
+ *
+ * @ingroup precond
+ */
+namespace preconditioner {
+
+
+/**
+ * This enum lists the types of the ISAI preconditioner.
+ *
+ * ISAI can either generate a lower triangular matrix, or an upper triangular
+ * matrix.
+ */
+enum struct isai_type { lower, upper };
+
+/**
+ * The Incomplete Sparse Approximate Inverse (ISAI) Preconditioner generates
+ * an approximate inverse matrix for a given lower triangular matrix L or upper
+ * triangular matrix U.
+ *
+ * Using the preconditioner computes $aiU * x$ or $aiL * x$ (depending on the
+ * type of the Isai) for a given vector x (may have multiple right hand sides).
+ * aiU and aiL are the approximate inverses for U and L respectively.
+ *
+ * The sparsity pattern used for the approximate inverse is the same as
+ * the sparsity pattern of the respective triangular matrix.
+ *
+ * For more details on the algorithm, see the paper
+ * <a href="https://doi.org/10.1016/j.parco.2017.10.003">
+ * Incomplete Sparse Approximate Inverses for Parallel Preconditioning</a>,
+ * which is the basis for this work.
+ *
+ * @note GPU implementations can only handle the vector unit width `width`
+ *       (warp size for CUDA) as number of elements per row in the sparse
+ *       matrix. If there are more than `width` elements per row, the remaining
+ *       elements will be ignored.
+ *
+ * @tparam IsaiType  determines if the ISAI is generated for a lower triangular
+ *                   matrix or an upper triangular matrix
+ * @tparam ValueType  precision of matrix elements
+ * @tparam IndexType  precision of matrix indexes
+ *
+ * @ingroup isai
+ * @ingroup precond
+ * @ingroup LinOp
+ */
+template <isai_type IsaiType, typename ValueType, typename IndexType>
+class Isai : public EnableLinOp<Isai<IsaiType, ValueType, IndexType>>,
+             public Transposable {
+    friend class EnableLinOp<Isai>;
+    friend class EnablePolymorphicObject<Isai, LinOp>;
+    friend class Isai<IsaiType == isai_type::lower ? isai_type::upper
+                                                   : isai_type::lower,
+                      ValueType, IndexType>;
+
+public:
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using transposed_type =
+        Isai<IsaiType == isai_type::lower ? isai_type::upper : isai_type::lower,
+             ValueType, IndexType>;
+    using Csr = matrix::Csr<ValueType, IndexType>;
+    static constexpr isai_type type{IsaiType};
+
+    /**
+     * Returns the approximate inverse of the given matrix (either L or U,
+     * depending on the template parameter IsaiType).
+     *
+     * @returns the generated approximate inverse
+     */
+    std::shared_ptr<const Csr> get_approximate_inverse() const
+    {
+        return approximate_inverse_;
+    }
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * @brief Optimization parameter that skips the sorting of the input
+         *        matrix (only skip if it is known that it is already sorted).
+         *
+         * The algorithm to create the approximate inverses requires the
+         * input matrix to be sorted. If it is, this parameter can be set to
+         * `true` to skip the sorting for better performance.
+         */
+        bool GKO_FACTORY_PARAMETER(skip_sorting, false);
+
+        /**
+         * @brief Which power of the input matrix should be used for the
+         *        sparsity pattern.
+         *
+         * The algorithm symbolically computes M^n and uses this sparsity
+         * pattern for the sparse inverse.
+         * Must be at least 1, default value 1.
+         */
+        int GKO_FACTORY_PARAMETER(sparsity_power, 1);
+    };
+
+    GKO_ENABLE_LIN_OP_FACTORY(Isai, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+protected:
+    explicit Isai(std::shared_ptr<const Executor> exec)
+        : EnableLinOp<Isai>(std::move(exec))
+    {}
+
+    /**
+     * Creates an Isai preconditioner from a matrix using an Isai::Factory.
+     *
+     * @param factory  the factory to use to create the preconditioner
+     * @param system_matrix  the triangular matrix (L or U, depending on
+     *                       IsaiType) to generate the approximate inverse for
+     */
+    explicit Isai(const Factory *factory,
+                  std::shared_ptr<const LinOp> system_matrix)
+        : EnableLinOp<Isai>(factory->get_executor(), system_matrix->get_size()),
+          parameters_{factory->get_parameters()}
+    {
+        const auto skip_sorting = parameters_.skip_sorting;
+        const auto power = parameters_.sparsity_power;
+        generate_inverse(system_matrix, skip_sorting, power);
+    }
+
+    void apply_impl(const LinOp *b, LinOp *x) const override
+    {
+        approximate_inverse_->apply(b, x);
+    }
+
+    void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta,
+                    LinOp *x) const override
+    {
+        approximate_inverse_->apply(alpha, b, beta, x);
+    }
+
+private:
+    /**
+     * Generates the approximate inverse for a triangular matrix and
+     * stores the result in `approximate_inverse_`.
+     *
+     * @param to_invert  the source triangular matrix used to generate
+     *                     the approximate inverse
+     *
+     * @param skip_sorting  dictates if the sorting of the input matrix should
+     *                      be skipped.
+     */
+    void generate_inverse(std::shared_ptr<const LinOp> to_invert,
+                          bool skip_sorting, int power);
+
+private:
+    std::shared_ptr<Csr> approximate_inverse_;
+};
+
+
+template <typename ValueType = default_precision, typename IndexType = int32>
+using LowerIsai = Isai<isai_type::lower, ValueType, IndexType>;
+
+template <typename ValueType = default_precision, typename IndexType = int32>
+using UpperIsai = Isai<isai_type::upper, ValueType, IndexType>;
+
+
+}  // namespace preconditioner
+}  // namespace gko
+
+
+#endif  // GKO_CORE_PRECONDITIONER_ISAI_HPP_
diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp
index 1c70ee0c498..fc6116f5d5e 100644
--- a/include/ginkgo/core/preconditioner/jacobi.hpp
+++ b/include/ginkgo/core/preconditioner/jacobi.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -206,7 +206,8 @@ struct block_interleaved_storage_scheme {
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
                public ConvertibleTo<matrix::Dense<ValueType>>,
-               public WritableToMatrixData<ValueType, IndexType> {
+               public WritableToMatrixData<ValueType, IndexType>,
+               public Transposable {
     friend class EnableLinOp<Jacobi>;
     friend class EnablePolymorphicObject<Jacobi, LinOp>;
 
@@ -216,6 +217,7 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
     using value_type = ValueType;
     using index_type = IndexType;
     using mat_data = matrix_data<ValueType, IndexType>;
+    using transposed_type = Jacobi<ValueType, IndexType>;
 
     /**
      * Returns the number of blocks of the operator.
@@ -287,15 +289,30 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
 
     void write(mat_data &data) const override;
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
          * Maximal size of diagonal blocks.
          *
-         * @note This value has to be between 1 and 32.
+         * @note This value has to be between 1 and 32 (NVIDIA)/64 (AMD).
          */
         uint32 GKO_FACTORY_PARAMETER(max_block_size, 32u);
 
+        /**
+         * Stride between two columns of a block (as number of elements).
+         *
+         * Should be a multiple of cache line size for best performance.
+         *
+         * @note If this value is 0, 64 is used for HIP on AMD and 32 for NVIDIA
+         *       or the reference executor. Allowed values: 0 or 64 for AMD, and
+         *       0 or 32 for NVIDIA.
+         */
+        uint32 GKO_FACTORY_PARAMETER(max_block_stride, 0u);
+
         /**
          * Starting (row / column) indexes of individual blocks.
          *
@@ -478,31 +495,22 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
     explicit Jacobi(const Factory *factory,
                     std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Jacobi>(factory->get_executor(),
-                              transpose(system_matrix->get_size())),
+                              gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
-          storage_scheme_{compute_storage_scheme(parameters_.max_block_size)},
+          storage_scheme_{this->compute_storage_scheme(
+              parameters_.max_block_size, parameters_.max_block_stride)},
           num_blocks_{parameters_.block_pointers.get_num_elems() - 1},
           blocks_(factory->get_executor(),
                   storage_scheme_.compute_storage_space(
                       parameters_.block_pointers.get_num_elems() - 1)),
           conditioning_(factory->get_executor())
     {
-        if (parameters_.max_block_size > 32 || parameters_.max_block_size < 1) {
-            GKO_NOT_SUPPORTED(this);
-        }
         parameters_.block_pointers.set_executor(this->get_executor());
         parameters_.storage_optimization.block_wise.set_executor(
             this->get_executor());
         this->generate(lend(system_matrix));
     }
 
-    /**
-     * Stride between two columns of a block (as number of elements).
-     *
-     * Should be a multiple of cache line size for best performance.
-     */
-    static constexpr size_type max_block_stride_ = 32;
-
     /**
      * Computes the storage scheme suitable for storing blocks of a given
      * maximum size.
@@ -511,11 +519,32 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
      *
      * @return a suitable storage scheme
      */
-    static block_interleaved_storage_scheme<index_type> compute_storage_scheme(
-        uint32 max_block_size) noexcept
+    block_interleaved_storage_scheme<index_type> compute_storage_scheme(
+        uint32 max_block_size, uint32 param_max_block_stride)
     {
+        uint32 default_block_stride = 32;
+        // If the executor is hip, the warp size is 32 or 64
+        if (auto hip_exec = std::dynamic_pointer_cast<const gko::HipExecutor>(
+                this->get_executor())) {
+            default_block_stride = hip_exec->get_warp_size();
+        }
+        uint32 max_block_stride = default_block_stride;
+        if (param_max_block_stride != 0) {
+            // if parameter max_block_stride is not zero, set max_block_stride =
+            // param_max_block_stride
+            max_block_stride = param_max_block_stride;
+            if (this->get_executor() != this->get_executor()->get_master() &&
+                max_block_stride != default_block_stride) {
+                // only support the default value on the gpu device
+                GKO_NOT_SUPPORTED(this);
+            }
+        }
+        if (parameters_.max_block_size > max_block_stride ||
+            parameters_.max_block_size < 1) {
+            GKO_NOT_SUPPORTED(this);
+        }
         const auto group_size = static_cast<uint32>(
-            max_block_stride_ / get_superior_power(uint32{2}, max_block_size));
+            max_block_stride / get_superior_power(uint32{2}, max_block_size));
         const auto block_offset = max_block_size;
         const auto block_stride = group_size * block_offset;
         const auto group_offset = max_block_size * block_stride;
diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp
new file mode 100644
index 00000000000..14e3cada1ce
--- /dev/null
+++ b/include/ginkgo/core/solver/bicg.hpp
@@ -0,0 +1,190 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_SOLVER_BICG_HPP_
+#define GKO_CORE_SOLVER_BICG_HPP_
+
+
+#include <vector>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/log/logger.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/criterion.hpp>
+
+
+namespace gko {
+namespace solver {
+
+
+/**
+ * BICG or the Biconjugate gradient method is a Krylov subspace solver.
+ *
+ * Being a generic solver, it is capable of solving general matrices, including
+ * non-s.p.d matrices. Although the memory and computational requirements of
+ * the BiCG solver are higher than those of its s.p.d solver counterpart, it
+ * has the capability to solve generic systems. BiCG is the unstable version of
+ * BiCGSTAB.
+ *
+ * @tparam ValueType  precision of matrix elements
+ *
+ * @ingroup solvers
+ * @ingroup LinOp
+ */
+template <typename ValueType = default_precision>
+class Bicg : public EnableLinOp<Bicg<ValueType>>,
+             public Preconditionable,
+             public Transposable {
+    friend class EnableLinOp<Bicg>;
+    friend class EnablePolymorphicObject<Bicg, LinOp>;
+
+public:
+    using value_type = ValueType;
+    using transposed_type = Bicg<ValueType>;
+
+    /**
+     * Gets the system operator (matrix) of the linear system.
+     *
+     * @return the system operator (matrix)
+     */
+    std::shared_ptr<const LinOp> get_system_matrix() const
+    {
+        return system_matrix_;
+    }
+
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
+
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * Criterion factories.
+         */
+        std::vector<std::shared_ptr<const stop::CriterionFactory>>
+            GKO_FACTORY_PARAMETER(criteria, nullptr);
+
+        /**
+         * Preconditioner factory.
+         */
+        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER(
+            preconditioner, nullptr);
+
+        /**
+         * Already generated preconditioner. If one is provided, the factory
+         * `preconditioner` will be ignored.
+         */
+        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER(
+            generated_preconditioner, nullptr);
+    };
+    GKO_ENABLE_LIN_OP_FACTORY(Bicg, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    void apply_impl(const LinOp *b, LinOp *x) const override;
+
+    void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta,
+                    LinOp *x) const override;
+
+    explicit Bicg(std::shared_ptr<const Executor> exec)
+        : EnableLinOp<Bicg>(std::move(exec))
+    {}
+
+    explicit Bicg(const Factory *factory,
+                  std::shared_ptr<const LinOp> system_matrix)
+        : EnableLinOp<Bicg>(factory->get_executor(),
+                            gko::transpose(system_matrix->get_size())),
+          parameters_{factory->get_parameters()},
+          system_matrix_{std::move(system_matrix)}
+    {
+        if (parameters_.generated_preconditioner) {
+            GKO_ASSERT_EQUAL_DIMENSIONS(parameters_.generated_preconditioner,
+                                        this);
+            set_preconditioner(parameters_.generated_preconditioner);
+        } else if (parameters_.preconditioner) {
+            set_preconditioner(
+                parameters_.preconditioner->generate(system_matrix_));
+        } else {
+            set_preconditioner(matrix::Identity<ValueType>::create(
+                this->get_executor(), this->get_size()[0]));
+        }
+        stop_criterion_factory_ =
+            stop::combine(std::move(parameters_.criteria));
+    }
+
+private:
+    std::shared_ptr<const LinOp> system_matrix_{};
+    std::shared_ptr<const stop::CriterionFactory> stop_criterion_factory_{};
+};
+
+
+}  // namespace solver
+}  // namespace gko
+
+
+#endif  // GKO_CORE_SOLVER_BICG_HPP_
diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp
index 35464a74f09..c7264bb2fbe 100644
--- a/include/ginkgo/core/solver/bicgstab.hpp
+++ b/include/ginkgo/core/solver/bicgstab.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -74,12 +74,14 @@ namespace solver {
  */
 template <typename ValueType = default_precision>
 class Bicgstab : public EnableLinOp<Bicgstab<ValueType>>,
-                 public Preconditionable {
+                 public Preconditionable,
+                 public Transposable {
     friend class EnableLinOp<Bicgstab>;
     friend class EnablePolymorphicObject<Bicgstab, LinOp>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Bicgstab<ValueType>;
 
     /**
      * Gets the system operator (matrix) of the linear system.
@@ -91,6 +93,39 @@ class Bicgstab : public EnableLinOp<Bicgstab<ValueType>>,
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
+
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -128,7 +163,7 @@ class Bicgstab : public EnableLinOp<Bicgstab<ValueType>>,
     explicit Bicgstab(const Factory *factory,
                       std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Bicgstab>(factory->get_executor(),
-                                transpose(system_matrix->get_size())),
+                                gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{std::move(system_matrix)}
     {
@@ -157,4 +192,4 @@ class Bicgstab : public EnableLinOp<Bicgstab<ValueType>>,
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_BICGSTAB_HPP
+#endif  // GKO_CORE_SOLVER_BICGSTAB_HPP_
diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp
index 9d661b3b07d..7e11b4e2d2a 100644
--- a/include/ginkgo/core/solver/cg.hpp
+++ b/include/ginkgo/core/solver/cg.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -69,12 +69,15 @@ namespace solver {
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
-class Cg : public EnableLinOp<Cg<ValueType>>, public Preconditionable {
+class Cg : public EnableLinOp<Cg<ValueType>>,
+           public Preconditionable,
+           public Transposable {
     friend class EnableLinOp<Cg>;
     friend class EnablePolymorphicObject<Cg, LinOp>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Cg<ValueType>;
 
     /**
      * Gets the system operator (matrix) of the linear system.
@@ -86,6 +89,39 @@ class Cg : public EnableLinOp<Cg<ValueType>>, public Preconditionable {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
+
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion factory of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -123,7 +159,7 @@ class Cg : public EnableLinOp<Cg<ValueType>>, public Preconditionable {
     explicit Cg(const Factory *factory,
                 std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Cg>(factory->get_executor(),
-                          transpose(system_matrix->get_size())),
+                          gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{std::move(system_matrix)}
     {
@@ -152,4 +188,4 @@ class Cg : public EnableLinOp<Cg<ValueType>>, public Preconditionable {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_CG_HPP
+#endif  // GKO_CORE_SOLVER_CG_HPP_
diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp
index 65609f43b24..26b9e41e276 100644
--- a/include/ginkgo/core/solver/cgs.hpp
+++ b/include/ginkgo/core/solver/cgs.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -66,12 +66,15 @@ namespace solver {
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
-class Cgs : public EnableLinOp<Cgs<ValueType>>, public Preconditionable {
+class Cgs : public EnableLinOp<Cgs<ValueType>>,
+            public Preconditionable,
+            public Transposable {
     friend class EnableLinOp<Cgs>;
     friend class EnablePolymorphicObject<Cgs, LinOp>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Cgs<ValueType>;
 
     /**
      * Gets the system operator (matrix) of the linear system.
@@ -83,6 +86,39 @@ class Cgs : public EnableLinOp<Cgs<ValueType>>, public Preconditionable {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
+
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion factory of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -120,7 +156,7 @@ class Cgs : public EnableLinOp<Cgs<ValueType>>, public Preconditionable {
     explicit Cgs(const Factory *factory,
                  std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Cgs>(factory->get_executor(),
-                           transpose(system_matrix->get_size())),
+                           gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{std::move(system_matrix)}
     {
@@ -149,4 +185,4 @@ class Cgs : public EnableLinOp<Cgs<ValueType>>, public Preconditionable {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_CGS_HPP
+#endif  // GKO_CORE_SOLVER_CGS_HPP_
diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp
index eaec9587685..ba9d14328ee 100644
--- a/include/ginkgo/core/solver/fcg.hpp
+++ b/include/ginkgo/core/solver/fcg.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -74,12 +74,15 @@ namespace solver {
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
-class Fcg : public EnableLinOp<Fcg<ValueType>>, public Preconditionable {
+class Fcg : public EnableLinOp<Fcg<ValueType>>,
+            public Preconditionable,
+            public Transposable {
     friend class EnableLinOp<Fcg>;
     friend class EnablePolymorphicObject<Fcg, LinOp>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Fcg<ValueType>;
 
     /**
      * Gets the system operator (matrix) of the linear system.
@@ -91,6 +94,39 @@ class Fcg : public EnableLinOp<Fcg<ValueType>>, public Preconditionable {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
+
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion factory of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -128,7 +164,7 @@ class Fcg : public EnableLinOp<Fcg<ValueType>>, public Preconditionable {
     explicit Fcg(const Factory *factory,
                  std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Fcg>(factory->get_executor(),
-                           transpose(system_matrix->get_size())),
+                           gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{std::move(system_matrix)}
     {
@@ -157,4 +193,4 @@ class Fcg : public EnableLinOp<Fcg<ValueType>>, public Preconditionable {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_FCG_HPP
+#endif  // GKO_CORE_SOLVER_FCG_HPP_
diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp
index 06ff1abd629..83396641d41 100644
--- a/include/ginkgo/core/solver/gmres.hpp
+++ b/include/ginkgo/core/solver/gmres.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -69,12 +69,15 @@ constexpr size_type default_krylov_dim = 100u;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
-class Gmres : public EnableLinOp<Gmres<ValueType>>, public Preconditionable {
+class Gmres : public EnableLinOp<Gmres<ValueType>>,
+              public Preconditionable,
+              public Transposable {
     friend class EnableLinOp<Gmres>;
     friend class EnablePolymorphicObject<Gmres, LinOp>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Gmres<ValueType>;
 
     /**
      * Gets the system operator (matrix) of the linear system.
@@ -86,13 +89,53 @@ class Gmres : public EnableLinOp<Gmres<ValueType>>, public Preconditionable {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
+
     /**
-     * Returns the krylov dimension.
+     * Gets the krylov dimension of the solver.
      *
      * @return the krylov dimension
      */
     size_type get_krylov_dim() const { return krylov_dim_; }
 
+    /**
+     * Sets the krylov dimension of the solver.
+     *
+     * @param other  the new krylov dimension
+     */
+    void set_krylov_dim(const size_type &other) { krylov_dim_ = other; }
+
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion factory of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -135,7 +178,7 @@ class Gmres : public EnableLinOp<Gmres<ValueType>>, public Preconditionable {
     explicit Gmres(const Factory *factory,
                    std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Gmres>(factory->get_executor(),
-                             transpose(system_matrix->get_size())),
+                             gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{std::move(system_matrix)}
     {
@@ -170,4 +213,4 @@ class Gmres : public EnableLinOp<Gmres<ValueType>>, public Preconditionable {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_GMRES_HPP
+#endif  // GKO_CORE_SOLVER_GMRES_HPP_
diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp
index 72173c6e49d..c665356ec5c 100644
--- a/include/ginkgo/core/solver/ir.hpp
+++ b/include/ginkgo/core/solver/ir.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
@@ -52,27 +53,34 @@ namespace solver {
 /**
  * Iterative refinement (IR) is an iterative method that uses another coarse
  * method to approximate the error of the current solution via the current
- * residual.
+ * residual. It can also be viewed as a preconditioned Richardson iteration
+ * with a relaxation factor of 1.
  *
  * For any approximation of the solution `solution` to the system `Ax = b`, the
  * residual is defined as: `residual = b - A solution`. The error in
  * `solution`,  `e = x - solution` (with `x` being the exact solution) can be
  * obtained as the solution to the residual equation `Ae = residual`, since `A e
  * = Ax - A solution = b - A solution = residual`. Then, the real solution is
- * computed as `x = solution + e`. Instead of accurately solving the residual
- * equation `Ae = residual`, the solution of the system `e` can be approximated
- * to obtain the approximation `error` using a coarse method `solver`, which is
- * used to update `solution`, and the entire process is repeated with the
- * updated `solution`.  This yields the iterative refinement method:
+ * computed as `x = solution + e`. Instead of accurately
+ * solving the residual equation `Ae = residual`, the solution of the system `e`
+ * can be approximated to obtain the approximation `error` using a coarse method
+ * `solver`, which is used to update `solution`, and the entire process is
+ * repeated with the updated `solution`.  This yields the iterative refinement
+ * method:
  *
  * ```
  * solution = initial_guess
  * while not converged:
  *     residual = b - A solution
  *     error = solver(A, residual)
- *     solution = solution + error
+ *     solution = solution + relaxation_factor * error
  * ```
  *
+ * With `relaxation_factor` equal to 1 (the default), the solver is iterative
+ * refinement; with any other value of `relaxation_factor`, the solver is a
+ * Richardson iteration, with the possibility of additional
+ * preconditioning.
+ *
  * Assuming that `solver` has accuracy `c`, i.e., `| e - error | <= c | e |`,
  * iterative refinement will converge with a convergence rate of `c`. Indeed,
  * from `e - error = x - solution - error = x - solution*` (where `solution*`
@@ -86,7 +94,8 @@ namespace solver {
  * solver. Such a setting results in a relaxation method known as the Richardson
  * iteration with parameter 1, which is guaranteed to converge for matrices
  * whose spectrum is strictly contained within the unit disc around 1 (i.e., all
- * its eigenvalues `lambda` have to satisfy the equation `|lambda - 1| < 1).
+ * its eigenvalues `lambda` have to satisfy the equation `|relaxation_factor *
+ * lambda - 1| < 1`).
  *
  * @tparam ValueType  precision of matrix elements
  *
@@ -94,12 +103,13 @@ namespace solver {
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
-class Ir : public EnableLinOp<Ir<ValueType>> {
+class Ir : public EnableLinOp<Ir<ValueType>>, public Transposable {
     friend class EnableLinOp<Ir>;
     friend class EnablePolymorphicObject<Ir, LinOp>;
 
 public:
     using value_type = ValueType;
+    using transposed_type = Ir<ValueType>;
 
     /**
      * Returns the system operator (matrix) of the linear system.
@@ -111,6 +121,16 @@ class Ir : public EnableLinOp<Ir<ValueType>> {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
+    /**
+     * Return true as iterative solvers use the data in x as an initial guess.
+     *
+     * @return true as iterative solvers use the data in x as an initial guess.
+     */
+    bool apply_uses_initial_guess() const override { return true; }
 
     /**
      * Returns the solver operator used as the inner solver.
@@ -130,6 +150,28 @@ class Ir : public EnableLinOp<Ir<ValueType>> {
         solver_ = new_solver;
     }
 
+    /**
+     * Gets the stopping criterion factory of the solver.
+     *
+     * @return the stopping criterion factory
+     */
+    std::shared_ptr<const stop::CriterionFactory> get_stop_criterion_factory()
+        const
+    {
+        return stop_criterion_factory_;
+    }
+
+    /**
+     * Sets the stopping criterion factory of the solver.
+     *
+     * @param other  the new stopping criterion factory
+     */
+    void set_stop_criterion_factory(
+        std::shared_ptr<const stop::CriterionFactory> other)
+    {
+        stop_criterion_factory_ = std::move(other);
+    }
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -150,6 +192,11 @@ class Ir : public EnableLinOp<Ir<ValueType>> {
          */
         std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER(generated_solver,
                                                            nullptr);
+
+        /**
+         * Relaxation factor for Richardson iteration
+         */
+        ValueType GKO_FACTORY_PARAMETER(relaxation_factor, value_type{1});
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ir, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
@@ -167,7 +214,7 @@ class Ir : public EnableLinOp<Ir<ValueType>> {
     explicit Ir(const Factory *factory,
                 std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<Ir>(factory->get_executor(),
-                          transpose(system_matrix->get_size())),
+                          gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{std::move(system_matrix)}
     {
@@ -180,6 +227,8 @@ class Ir : public EnableLinOp<Ir<ValueType>> {
             solver_ = matrix::Identity<ValueType>::create(this->get_executor(),
                                                           this->get_size()[0]);
         }
+        relaxation_factor_ = gko::initialize<matrix::Dense<ValueType>>(
+            {parameters_.relaxation_factor}, this->get_executor());
         stop_criterion_factory_ =
             stop::combine(std::move(parameters_.criteria));
     }
@@ -188,9 +237,14 @@ class Ir : public EnableLinOp<Ir<ValueType>> {
     std::shared_ptr<const LinOp> system_matrix_{};
     std::shared_ptr<const LinOp> solver_{};
     std::shared_ptr<const stop::CriterionFactory> stop_criterion_factory_{};
+    std::shared_ptr<const matrix::Dense<ValueType>> relaxation_factor_{};
 };
 
 
+template <typename ValueType = default_precision>
+using Richardson = Ir<ValueType>;
+
+
 }  // namespace solver
 }  // namespace gko
 
diff --git a/include/ginkgo/core/solver/lower_trs.hpp b/include/ginkgo/core/solver/lower_trs.hpp
index 481063a4f51..409a2cd1583 100644
--- a/include/ginkgo/core/solver/lower_trs.hpp
+++ b/include/ginkgo/core/solver/lower_trs.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -58,6 +58,10 @@ namespace solver {
 struct SolveStruct;
 
 
+template <typename ValueType, typename IndexType>
+class UpperTrs;
+
+
 /**
  * LowerTrs is the triangular solver which solves the system L x = b, when L is
  * a lower triangular matrix. It works best when passing in a matrix in CSR
@@ -76,13 +80,16 @@ struct SolveStruct;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class LowerTrs : public EnableLinOp<LowerTrs<ValueType, IndexType>> {
+class LowerTrs : public EnableLinOp<LowerTrs<ValueType, IndexType>>,
+                 public Transposable {
     friend class EnableLinOp<LowerTrs>;
     friend class EnablePolymorphicObject<LowerTrs, LinOp>;
+    friend class UpperTrs<ValueType, IndexType>;
 
 public:
     using value_type = ValueType;
     using index_type = IndexType;
+    using transposed_type = UpperTrs<ValueType, IndexType>;
 
     /**
      * Gets the system operator (CSR matrix) of the linear system.
@@ -95,6 +102,10 @@ class LowerTrs : public EnableLinOp<LowerTrs<ValueType, IndexType>> {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -138,7 +149,7 @@ class LowerTrs : public EnableLinOp<LowerTrs<ValueType, IndexType>> {
     explicit LowerTrs(const Factory *factory,
                       std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<LowerTrs>(factory->get_executor(),
-                                transpose(system_matrix->get_size())),
+                                gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{}
     {
@@ -168,4 +179,4 @@ class LowerTrs : public EnableLinOp<LowerTrs<ValueType, IndexType>> {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_LOWER_TRS_HPP
+#endif  // GKO_CORE_SOLVER_LOWER_TRS_HPP_
diff --git a/include/ginkgo/core/solver/upper_trs.hpp b/include/ginkgo/core/solver/upper_trs.hpp
index 6f23cd4d1fd..512467919a4 100644
--- a/include/ginkgo/core/solver/upper_trs.hpp
+++ b/include/ginkgo/core/solver/upper_trs.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -58,6 +58,10 @@ namespace solver {
 struct SolveStruct;
 
 
+template <typename ValueType, typename IndexType>
+class LowerTrs;
+
+
 /**
  * UpperTrs is the triangular solver which solves the system U x = b, when U is
  * an upper triangular matrix. It works best when passing in a matrix in CSR
@@ -76,13 +80,16 @@ struct SolveStruct;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class UpperTrs : public EnableLinOp<UpperTrs<ValueType, IndexType>> {
+class UpperTrs : public EnableLinOp<UpperTrs<ValueType, IndexType>>,
+                 public Transposable {
     friend class EnableLinOp<UpperTrs>;
     friend class EnablePolymorphicObject<UpperTrs, LinOp>;
+    friend class LowerTrs<ValueType, IndexType>;
 
 public:
     using value_type = ValueType;
     using index_type = IndexType;
+    using transposed_type = LowerTrs<ValueType, IndexType>;
 
     /**
      * Gets the system operator (CSR matrix) of the linear system.
@@ -95,6 +102,10 @@ class UpperTrs : public EnableLinOp<UpperTrs<ValueType, IndexType>> {
         return system_matrix_;
     }
 
+    std::unique_ptr<LinOp> transpose() const override;
+
+    std::unique_ptr<LinOp> conj_transpose() const override;
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
@@ -132,7 +143,7 @@ class UpperTrs : public EnableLinOp<UpperTrs<ValueType, IndexType>> {
     explicit UpperTrs(const Factory *factory,
                       std::shared_ptr<const LinOp> system_matrix)
         : EnableLinOp<UpperTrs>(factory->get_executor(),
-                                transpose(system_matrix->get_size())),
+                                gko::transpose(system_matrix->get_size())),
           parameters_{factory->get_parameters()},
           system_matrix_{}
     {
@@ -162,4 +173,4 @@ class UpperTrs : public EnableLinOp<UpperTrs<ValueType, IndexType>> {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SOLVER_UPPER_TRS_HPP
+#endif  // GKO_CORE_SOLVER_UPPER_TRS_HPP_
diff --git a/include/ginkgo/core/stop/combined.hpp b/include/ginkgo/core/stop/combined.hpp
index cc0e88a36be..d5d88a978f6 100644
--- a/include/ginkgo/core/stop/combined.hpp
+++ b/include/ginkgo/core/stop/combined.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_STOP_COMBINED_HPP_
 
 
-#include <ginkgo/core/stop/criterion.hpp>
+#include <vector>
 
 
-#include <vector>
+#include <ginkgo/core/stop/criterion.hpp>
 
 
 namespace gko {
@@ -87,7 +87,14 @@ class Combined : public EnablePolymorphicObject<Combined, Criterion> {
           parameters_{factory->get_parameters()}
     {
         for (const auto &f : parameters_.criteria) {
-            criteria_.push_back(f->generate(args));
+            // Skip nullptr entries in the list
+            if (f != nullptr) {
+                criteria_.push_back(f->generate(args));
+            }
+        }
+        // If the list is empty or all nullptr, throw gko::NotSupported
+        if (criteria_.size() == 0) {
+            GKO_NOT_SUPPORTED(this);
         }
     }
 
@@ -120,12 +127,21 @@ std::shared_ptr<const CriterionFactory> combine(FactoryContainer &&factories)
         GKO_NOT_SUPPORTED(nullptr);
         return nullptr;
     case 1:
+        if (factories[0] == nullptr) {
+            GKO_NOT_SUPPORTED(nullptr);
+        }
         return factories[0];
     default:
-        auto exec = factories[0]->get_executor();
-        return Combined::build()
-            .with_criteria(std::forward<FactoryContainer>(factories))
-            .on(exec);
+        if (factories[0] == nullptr) {
+            // first factory must be valid to capture executor
+            GKO_NOT_SUPPORTED(nullptr);
+            return nullptr;
+        } else {
+            auto exec = factories[0]->get_executor();
+            return Combined::build()
+                .with_criteria(std::forward<FactoryContainer>(factories))
+                .on(exec);
+        }
     }
 }
 
diff --git a/include/ginkgo/core/stop/criterion.hpp b/include/ginkgo/core/stop/criterion.hpp
index f619137387e..35c28aefdd9 100644
--- a/include/ginkgo/core/stop/criterion.hpp
+++ b/include/ginkgo/core/stop/criterion.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
 #include <ginkgo/core/base/utils.hpp>
@@ -275,6 +276,12 @@ using EnableDefaultCriterionFactory =
  *                          `get_<_parameters_name>()`)
  * @param _factory_name  name of the generated factory type
  *
+ * @internal For some obscure reason, nvcc compilation through HIP does not
+ *           properly take into account the `using` declaration to inherit
+ *           constructors. In addition, the default initialization `{}` for
+ *           `_parameters_name##type parameters` also does not work, which
+ *           means the current form is probably the only correct one.
+ *
  * @ingroup stop
  */
 #define GKO_ENABLE_CRITERION_FACTORY(_criterion, _parameters_name,           \
diff --git a/include/ginkgo/core/stop/iteration.hpp b/include/ginkgo/core/stop/iteration.hpp
index 8c17274c9de..e2fa08a60e3 100644
--- a/include/ginkgo/core/stop/iteration.hpp
+++ b/include/ginkgo/core/stop/iteration.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp
new file mode 100644
index 00000000000..03052618cc5
--- /dev/null
+++ b/include/ginkgo/core/stop/residual_norm.hpp
@@ -0,0 +1,278 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_STOP_RESIDUAL_NORM_HPP_
+#define GKO_CORE_STOP_RESIDUAL_NORM_HPP_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/criterion.hpp>
+
+
+namespace gko {
+namespace stop {
+
+
+/**
+ * The ResidualNorm class provides a common framework for stopping criteria
+ * based on the residual norm. The derived criteria differ only in how they
+ * initialize starting_tau_, i.e. in the reference value against which the
+ * residual norm is compared.
+ *
+ * @ingroup stop
+ */
+template <typename ValueType = default_precision>
+class ResidualNorm
+    : public EnablePolymorphicObject<ResidualNorm<ValueType>, Criterion> {
+    friend class EnablePolymorphicObject<ResidualNorm<ValueType>, Criterion>;
+
+public:
+    using NormVector = matrix::Dense<remove_complex<ValueType>>;
+    using Vector = matrix::Dense<ValueType>;
+
+protected:
+    bool check_impl(uint8 stoppingId, bool setFinalized,
+                    Array<stopping_status> *stop_status, bool *one_changed,
+                    const Criterion::Updater &) override;
+
+    explicit ResidualNorm(std::shared_ptr<const gko::Executor> exec)
+        : EnablePolymorphicObject<ResidualNorm, Criterion>(exec),
+          device_storage_{exec, 2}
+    {}
+
+    explicit ResidualNorm(std::shared_ptr<const gko::Executor> exec,
+                          remove_complex<ValueType> tolerance)
+        : EnablePolymorphicObject<ResidualNorm, Criterion>(exec),
+          tolerance_{tolerance},  // listed in member declaration order
+          device_storage_{exec, 2}
+    {}
+
+    std::unique_ptr<NormVector> starting_tau_{};  // set by derived classes
+    std::unique_ptr<NormVector> u_dense_tau_{};   // set by derived classes
+
+private:
+    remove_complex<ValueType> tolerance_{};
+    /* Contains device side: all_converged and one_changed booleans */
+    Array<bool> device_storage_;
+};
+
+
+/**
+ * The ResidualNormReduction class is a stopping criterion which stops the
+ * iteration process when the residual norm is below a certain
+ * threshold relative to the norm of the initial residual, i.e. when
+ * norm(residual) / norm(initial_residual) < threshold.
+ * For better performance, the checks are performed by kernels running on
+ * the executor where the algorithm is executed.
+ *
+ * @note This stopping criterion has some dependencies. The constructor
+ * requires `initial_residual` in order to compute the norm of the initial
+ * residual. The check method depends on either the
+ * `residual_norm` or the `residual` being set. When any of those is not
+ * correctly provided, an exception ::gko::NotSupported() is thrown.
+ *
+ * @ingroup stop
+ */
+template <typename ValueType = default_precision>
+class ResidualNormReduction : public ResidualNorm<ValueType> {
+public:
+    using NormVector = matrix::Dense<remove_complex<ValueType>>;
+    using Vector = matrix::Dense<ValueType>;
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * Factor by which the residual norm must be reduced (default: 1e-15)
+         */
+        remove_complex<ValueType> GKO_FACTORY_PARAMETER(reduction_factor,
+                                                        1e-15);
+    };
+    GKO_ENABLE_CRITERION_FACTORY(ResidualNormReduction<ValueType>, parameters,
+                                 Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    explicit ResidualNormReduction(std::shared_ptr<const gko::Executor> exec)
+        : ResidualNorm<ValueType>(exec)
+    {}
+
+    explicit ResidualNormReduction(const Factory *factory,
+                                   const CriterionArgs &args)
+        : ResidualNorm<ValueType>(factory->get_executor(),
+                                  factory->get_parameters().reduction_factor),
+          parameters_{factory->get_parameters()}
+    {
+        if (args.initial_residual == nullptr) {
+            GKO_NOT_SUPPORTED(nullptr);  // initial residual is mandatory
+        }
+
+        auto exec = factory->get_executor();
+
+        auto dense_r = as<Vector>(args.initial_residual);
+        this->starting_tau_ = NormVector::create(
+            exec, dim<2>{1, args.initial_residual->get_size()[1]});
+        this->u_dense_tau_ =
+            NormVector::create_with_config_of(this->starting_tau_.get());
+        dense_r->compute_norm2(this->starting_tau_.get());
+    }
+};
+
+
+/**
+ * The RelativeResidualNorm class is a stopping criterion which stops the
+ * iteration process when the residual norm is below a certain
+ * threshold relative to the norm of the right-hand side, i.e. when
+ * norm(residual) / norm(right_hand_side) < threshold.
+ * For better performance, the checks are performed by kernels running on
+ * the executor where the algorithm is executed.
+ *
+ * @note This stopping criterion has some dependencies. The
+ * constructor requires `b` in order to compute the norm of the
+ * right-hand side. If this is not correctly provided, an exception
+ * ::gko::NotSupported() is thrown.
+ *
+ * @ingroup stop
+ */
+template <typename ValueType = default_precision>
+class RelativeResidualNorm : public ResidualNorm<ValueType> {
+public:
+    using NormVector = matrix::Dense<remove_complex<ValueType>>;
+    using Vector = matrix::Dense<ValueType>;
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * Relative residual norm goal (default: 1e-15)
+         */
+        remove_complex<ValueType> GKO_FACTORY_PARAMETER(tolerance, 1e-15);
+    };
+    GKO_ENABLE_CRITERION_FACTORY(RelativeResidualNorm<ValueType>, parameters,
+                                 Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    explicit RelativeResidualNorm(std::shared_ptr<const gko::Executor> exec)
+        : ResidualNorm<ValueType>(exec)
+    {}
+
+    explicit RelativeResidualNorm(const Factory *factory,
+                                  const CriterionArgs &args)
+        : ResidualNorm<ValueType>(factory->get_executor(),
+                                  factory->get_parameters().tolerance),
+          parameters_{factory->get_parameters()}
+    {
+        if (args.b == nullptr) {
+            GKO_NOT_SUPPORTED(nullptr);  // right-hand side is mandatory
+        }
+
+        auto exec = factory->get_executor();
+
+        auto dense_rhs = as<Vector>(args.b);
+        this->starting_tau_ =
+            NormVector::create(exec, dim<2>{1, args.b->get_size()[1]});
+        this->u_dense_tau_ =
+            NormVector::create_with_config_of(this->starting_tau_.get());
+        dense_rhs->compute_norm2(this->starting_tau_.get());
+    }
+};
+
+
+/**
+ * The AbsoluteResidualNorm class is a stopping criterion which stops the
+ * iteration process when the residual norm is below a certain
+ * threshold, i.e. when norm(residual) < threshold.
+ * For better performance, the checks are performed by kernels running on
+ * the executor where the algorithm is executed.
+ *
+ * @note This stopping criterion has some dependencies. The
+ * constructor requires `b` in order to get the number of right-hand sides.
+ * If this is not correctly provided, an exception ::gko::NotSupported()
+ * is thrown.
+ *
+ * @ingroup stop
+ */
+template <typename ValueType = default_precision>
+class AbsoluteResidualNorm : public ResidualNorm<ValueType> {
+public:
+    using NormVector = matrix::Dense<remove_complex<ValueType>>;
+    using Vector = matrix::Dense<ValueType>;
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        /**
+         * Absolute residual norm goal (default: 1e-15)
+         */
+        remove_complex<ValueType> GKO_FACTORY_PARAMETER(tolerance, 1e-15);
+    };
+    GKO_ENABLE_CRITERION_FACTORY(AbsoluteResidualNorm<ValueType>, parameters,
+                                 Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+protected:
+    void initialize_starting_tau();  // not defined in this header
+
+    explicit AbsoluteResidualNorm(std::shared_ptr<const gko::Executor> exec)
+        : ResidualNorm<ValueType>(exec)
+    {}
+
+    explicit AbsoluteResidualNorm(const Factory *factory,
+                                  const CriterionArgs &args)
+        : ResidualNorm<ValueType>(factory->get_executor(),
+                                  factory->get_parameters().tolerance),
+          parameters_{factory->get_parameters()}
+    {
+        if (args.b == nullptr) {
+            GKO_NOT_SUPPORTED(nullptr);  // right-hand side is mandatory
+        }
+
+        auto exec = factory->get_executor();
+
+        this->starting_tau_ =
+            NormVector::create(exec, dim<2>{1, args.b->get_size()[1]});
+        this->u_dense_tau_ =
+            NormVector::create_with_config_of(this->starting_tau_.get());
+        initialize_starting_tau();
+    }
+};
+
+
+}  // namespace stop
+}  // namespace gko
+
+
+#endif  // GKO_CORE_STOP_RESIDUAL_NORM_HPP_
diff --git a/include/ginkgo/core/stop/residual_norm_reduction.hpp b/include/ginkgo/core/stop/residual_norm_reduction.hpp
index 4ae3392021b..6872b7be5c2 100644
--- a/include/ginkgo/core/stop/residual_norm_reduction.hpp
+++ b/include/ginkgo/core/stop/residual_norm_reduction.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,94 +34,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_HPP_
 
 
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/utils.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/stop/criterion.hpp>
-
-
-#include <type_traits>
-
-
-namespace gko {
-namespace stop {
-
-/**
- * The ResidualNormReduction class is a stopping criterion which stops the
- * iteration process when the relative residual norm is below a certain
- * threshold. For better performance, the checks are run thanks to kernels on
- * the executor where the algorithm is executed.
- *
- * @note To use this stopping criterion there are some dependencies. The
- * constructor depends on `initial_residual` in order to compute the first
- * relative residual norm. The check method depends on either the
- * `residual_norm` or the `residual` being set. When any of those is not
- * correctly provided, an exception ::gko::NotSupported() is thrown.
- *
- * @ingroup stop
- */
-template <typename ValueType = default_precision>
-class ResidualNormReduction
-    : public EnablePolymorphicObject<ResidualNormReduction<ValueType>,
-                                     Criterion> {
-    friend class EnablePolymorphicObject<ResidualNormReduction<ValueType>,
-                                         Criterion>;
-
-public:
-    using Vector = matrix::Dense<ValueType>;
-
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Relative residual norm goal
-         */
-        remove_complex<ValueType> GKO_FACTORY_PARAMETER(reduction_factor,
-                                                        1e-15);
-    };
-    GKO_ENABLE_CRITERION_FACTORY(ResidualNormReduction<ValueType>, parameters,
-                                 Factory);
-    GKO_ENABLE_BUILD_METHOD(Factory);
-
-protected:
-    bool check_impl(uint8 stoppingId, bool setFinalized,
-                    Array<stopping_status> *stop_status, bool *one_changed,
-                    const Criterion::Updater &) override;
-
-    explicit ResidualNormReduction(std::shared_ptr<const gko::Executor> exec)
-        : EnablePolymorphicObject<ResidualNormReduction, Criterion>(exec),
-          device_storage_{exec, 2}
-    {}
-
-    explicit ResidualNormReduction(const Factory *factory,
-                                   const CriterionArgs &args)
-        : EnablePolymorphicObject<ResidualNormReduction, Criterion>(
-              factory->get_executor()),
-          parameters_{factory->get_parameters()},
-          device_storage_{factory->get_executor(), 2}
-    {
-        if (args.initial_residual == nullptr) {
-            GKO_NOT_SUPPORTED(nullptr);
-        }
-
-        auto exec = factory->get_executor();
-
-        auto dense_r = as<Vector>(args.initial_residual);
-        starting_tau_ = Vector::create(
-            exec, dim<2>{1, args.initial_residual->get_size()[1]});
-        u_dense_tau_ = Vector::create_with_config_of(starting_tau_.get());
-        dense_r->compute_norm2(starting_tau_.get());
-    }
-
-private:
-    std::unique_ptr<Vector> starting_tau_{};
-    std::unique_ptr<Vector> u_dense_tau_{};
-    /* Contains device side: all_converged and one_changed booleans */
-    Array<bool> device_storage_;
-};
-
-
-}  // namespace stop
-}  // namespace gko
+#ifdef __GNUC__
+#pragma message \
+    "This file is deprecated and will be removed in a later major release."
+#elif defined(_MSC_VER)  // MSVC requires the parenthesized form
+#pragma message( \
+    "This file is deprecated and will be removed in a later major release.")
+#endif
+#include <ginkgo/core/stop/residual_norm.hpp>
 
 
 #endif  // GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_HPP_
diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp
index 6a74d0f7d8f..679d78be4de 100644
--- a/include/ginkgo/core/stop/stopping_status.hpp
+++ b/include/ginkgo/core/stop/stopping_status.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/include/ginkgo/core/stop/time.hpp b/include/ginkgo/core/stop/time.hpp
index a63bf576112..ef8f52fe7d8 100644
--- a/include/ginkgo/core/stop/time.hpp
+++ b/include/ginkgo/core/stop/time.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_STOP_TIME_HPP_
 
 
-#include <ginkgo/core/stop/criterion.hpp>
+#include <chrono>
 
 
-#include <chrono>
+#include <ginkgo/core/stop/criterion.hpp>
 
 
 namespace gko {
@@ -58,10 +58,10 @@ class Time : public EnablePolymorphicObject<Time, Criterion> {
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**
-         * Amount of seconds to wait
+         * Amount of seconds to wait (default value: 10 seconds)
          */
-        std::chrono::nanoseconds GKO_FACTORY_PARAMETER(
-            time_limit, std::chrono::seconds(10));
+        std::chrono::nanoseconds GKO_FACTORY_PARAMETER(time_limit,
+                                                       10000000000LL);
     };
     GKO_ENABLE_CRITERION_FACTORY(Time, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
@@ -72,7 +72,9 @@ class Time : public EnablePolymorphicObject<Time, Criterion> {
                     const Updater &) override;
 
     explicit Time(std::shared_ptr<const gko::Executor> exec)
-        : EnablePolymorphicObject<Time, Criterion>(std::move(exec))
+        : EnablePolymorphicObject<Time, Criterion>(std::move(exec)),
+          time_limit_{},
+          start_{}
     {}
 
     explicit Time(const Factory *factory, const CriterionArgs args)
@@ -89,8 +91,8 @@ class Time : public EnablePolymorphicObject<Time, Criterion> {
      * parameters and here properly convert the double to a
      * std::chrono::duration type
      */
-    std::chrono::duration<double> time_limit_{};
-    clock::time_point start_{};
+    std::chrono::duration<double> time_limit_;
+    clock::time_point start_;
 };
 
 
diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp
index 075ddc92806..7fccad5a29d 100644
--- a/include/ginkgo/core/synthesizer/containers.hpp
+++ b/include/ginkgo/core/synthesizer/containers.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_SYNTHESIZER_CONTAINERS_
-#define GKO_CORE_SYNTHESIZER_CONTAINERS_
+#ifndef GKO_CORE_SYNTHESIZER_CONTAINERS_HPP_
+#define GKO_CORE_SYNTHESIZER_CONTAINERS_HPP_
+
+
+#include <ginkgo/core/base/std_extensions.hpp>
 
 
 namespace gko {
@@ -115,4 +118,4 @@ using as_list = typename detail::as_list_impl<T>::type;
 }  // namespace gko
 
 
-#endif  // GKO_CORE_SYNTHESIZER_CONTAINERS_
+#endif  // GKO_CORE_SYNTHESIZER_CONTAINERS_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 95a7dc3c734..1866412706d 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,7 +59,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/base/version.hpp>
 
+#include <ginkgo/core/factorization/ilu.hpp>
+#include <ginkgo/core/factorization/par_ict.hpp>
 #include <ginkgo/core/factorization/par_ilu.hpp>
+#include <ginkgo/core/factorization/par_ilut.hpp>
 
 #include <ginkgo/core/log/convergence.hpp>
 #include <ginkgo/core/log/logger.hpp>
@@ -73,12 +76,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include <ginkgo/core/preconditioner/ilu.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
+#include <ginkgo/core/solver/bicg.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/solver/cgs.hpp>
@@ -91,7 +97,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
diff --git a/include/ginkgo/ginkgo.hpp.in b/include/ginkgo/ginkgo.hpp.in
index 6dbaafef794..d3e83f82ccd 100644
--- a/include/ginkgo/ginkgo.hpp.in
+++ b/include/ginkgo/ginkgo.hpp.in
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/matrices/CMakeLists.txt b/matrices/CMakeLists.txt
index 6c368edfa95..ffe3602f83a 100644
--- a/matrices/CMakeLists.txt
+++ b/matrices/CMakeLists.txt
@@ -3,3 +3,13 @@ configure_file("${Ginkgo_SOURCE_DIR}/matrices/config.hpp.in"
 
 configure_file("test/ani1.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/ani1.mtx")
 configure_file("test/ani4.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/ani4.mtx")
+configure_file("test/isai_l.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l.mtx")
+configure_file("test/isai_l_excess.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_excess.mtx")
+configure_file("test/isai_l_excess_rhs.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_excess_rhs.mtx")
+configure_file("test/isai_l_inv.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_inv.mtx")
+configure_file("test/isai_l_inv_partial.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_inv_partial.mtx")
+configure_file("test/isai_u.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u.mtx")
+configure_file("test/isai_u_excess.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_excess.mtx")
+configure_file("test/isai_u_excess_rhs.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_excess_rhs.mtx")
+configure_file("test/isai_u_inv.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_inv.mtx")
+configure_file("test/isai_u_inv_partial.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_inv_partial.mtx")
diff --git a/matrices/config.hpp.in b/matrices/config.hpp.in
index 3920a6d1788..93cd1adb874 100644
--- a/matrices/config.hpp.in
+++ b/matrices/config.hpp.in
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,12 +38,13 @@ namespace gko {
 namespace matrices {
 
 
-const char *location_ani1_mtx = "@Ginkgo_SOURCE_DIR@/matrices/test/ani1.mtx";
-const char *location_ani4_mtx = "@Ginkgo_SOURCE_DIR@/matrices/test/ani4.mtx";
+const char *location_ani1_mtx = "@Ginkgo_BINARY_DIR@/matrices/test/ani1.mtx";
+const char *location_ani4_mtx = "@Ginkgo_BINARY_DIR@/matrices/test/ani4.mtx";
+const char *location_isai_mtxs = "@Ginkgo_BINARY_DIR@/matrices/test/";
 
 
-} // namespace matrices
-} // namespace gko
+}  // namespace matrices
+}  // namespace gko
 
 
-#endif // GKO_MATRICES_CONFIG_HPP_
+#endif  // GKO_MATRICES_CONFIG_HPP_
diff --git a/matrices/test/isai_l.mtx b/matrices/test/isai_l.mtx
new file mode 100644
index 00000000000..01e7dd3eeab
--- /dev/null
+++ b/matrices/test/isai_l.mtx
@@ -0,0 +1,162 @@
+%%MatrixMarket matrix coordinate real general
+35 35 160
+1 1 1
+2 1 -1
+2 2 1
+3 2 -1
+3 3 2
+4 3 2
+4 4 1
+5 3 2
+5 4 -1
+5 5 1
+6 3 2
+6 5 -1
+6 6 1
+7 3 2
+7 6 -1
+7 7 1
+8 3 2
+8 7 -1
+8 8 1
+9 3 2
+9 8 -1
+9 9 1
+10 3 2
+10 9 -1
+10 10 1
+11 3 2
+11 10 -1
+11 11 1
+12 3 2
+12 11 -1
+12 12 1
+13 3 2
+13 12 -1
+13 13 1
+14 3 2
+14 13 -1
+14 14 1
+15 3 2
+15 14 -1
+15 15 1
+16 3 2
+16 15 -1
+16 16 1
+17 3 2
+17 16 -1
+17 17 1
+18 3 2
+18 17 -1
+18 18 1
+19 3 2
+19 18 -1
+19 19 1
+20 3 2
+20 19 -1
+20 20 1
+21 3 2
+21 20 -1
+21 21 1
+22 3 2
+22 21 -1
+22 22 1
+23 3 2
+23 22 -1
+23 23 1
+24 3 2
+24 23 -1
+24 24 1
+25 3 2
+25 24 -1
+25 25 1
+26 3 2
+26 25 -1
+26 26 1
+27 3 2
+27 26 -1
+27 27 1
+28 3 2
+28 27 -1
+28 28 1
+29 3 2
+29 28 -1
+29 29 1
+30 3 2
+30 29 -1
+30 30 1
+31 3 2
+31 30 -1
+31 31 1
+32 3 2
+32 31 -1
+32 32 1
+33 1 1
+33 2 1
+33 3 3
+33 4 1
+33 5 1
+33 6 1
+33 7 1
+33 8 1
+33 9 1
+33 10 1
+33 11 1
+33 12 1
+33 13 1
+33 14 1
+33 15 1
+33 16 1
+33 17 1
+33 18 1
+33 19 1
+33 20 1
+33 21 1
+33 22 1
+33 23 1
+33 24 1
+33 25 1
+33 26 1
+33 27 1
+33 28 1
+33 29 1
+33 30 1
+33 31 1
+33 32 1
+33 33 1
+34 3 2
+34 33 -1
+34 34 1
+35 2 12345
+35 3 12345
+35 4 12345
+35 5 12345
+35 6 12345
+35 7 12345
+35 8 12345
+35 9 12345
+35 10 12345
+35 11 12345
+35 12 12345
+35 13 12345
+35 14 12345
+35 15 12345
+35 16 12345
+35 17 12345
+35 18 12345
+35 19 12345
+35 20 12345
+35 21 12345
+35 22 12345
+35 23 12345
+35 24 12345
+35 25 12345
+35 26 12345
+35 27 12345
+35 28 12345
+35 29 12345
+35 30 12345
+35 31 12345
+35 32 12345
+35 34 12345
+35 35 1
diff --git a/matrices/test/isai_l_excess.mtx b/matrices/test/isai_l_excess.mtx
new file mode 100644
index 00000000000..da41a6c58e1
--- /dev/null
+++ b/matrices/test/isai_l_excess.mtx
@@ -0,0 +1,250 @@
+%%MatrixMarket matrix coordinate real general
+66 66 248
+1 1 1
+2 1 -1
+2 2 1
+3 2 -1
+3 3 2
+4 3 2
+4 4 1
+5 3 2
+5 4 -1
+5 5 1
+6 3 2
+6 5 -1
+6 6 1
+7 3 2
+7 6 -1
+7 7 1
+8 3 2
+8 7 -1
+8 8 1
+9 3 2
+9 8 -1
+9 9 1
+10 3 2
+10 9 -1
+10 10 1
+11 3 2
+11 10 -1
+11 11 1
+12 3 2
+12 11 -1
+12 12 1
+13 3 2
+13 12 -1
+13 13 1
+14 3 2
+14 13 -1
+14 14 1
+15 3 2
+15 14 -1
+15 15 1
+16 3 2
+16 15 -1
+16 16 1
+17 3 2
+17 16 -1
+17 17 1
+18 3 2
+18 17 -1
+18 18 1
+19 3 2
+19 18 -1
+19 19 1
+20 3 2
+20 19 -1
+20 20 1
+21 3 2
+21 20 -1
+21 21 1
+22 3 2
+22 21 -1
+22 22 1
+23 3 2
+23 22 -1
+23 23 1
+24 3 2
+24 23 -1
+24 24 1
+25 3 2
+25 24 -1
+25 25 1
+26 3 2
+26 25 -1
+26 26 1
+27 3 2
+27 26 -1
+27 27 1
+28 3 2
+28 27 -1
+28 28 1
+29 3 2
+29 28 -1
+29 29 1
+30 3 2
+30 29 -1
+30 30 1
+31 3 2
+31 30 -1
+31 31 1
+32 3 2
+32 31 -1
+32 32 1
+33 1 1
+33 2 1
+33 3 3
+33 4 1
+33 5 1
+33 6 1
+33 7 1
+33 8 1
+33 9 1
+33 10 1
+33 11 1
+33 12 1
+33 13 1
+33 14 1
+33 15 1
+33 16 1
+33 17 1
+33 18 1
+33 19 1
+33 20 1
+33 21 1
+33 22 1
+33 23 1
+33 24 1
+33 25 1
+33 26 1
+33 27 1
+33 28 1
+33 29 1
+33 30 1
+33 31 1
+33 32 1
+33 33 1
+34 34 1
+35 34 -1
+35 35 2
+36 35 2
+36 36 1
+37 35 2
+37 36 -1
+37 37 1
+38 35 2
+38 37 -1
+38 38 1
+39 35 2
+39 38 -1
+39 39 1
+40 35 2
+40 39 -1
+40 40 1
+41 35 2
+41 40 -1
+41 41 1
+42 35 2
+42 41 -1
+42 42 1
+43 35 2
+43 42 -1
+43 43 1
+44 35 2
+44 43 -1
+44 44 1
+45 35 2
+45 44 -1
+45 45 1
+46 35 2
+46 45 -1
+46 46 1
+47 35 2
+47 46 -1
+47 47 1
+48 35 2
+48 47 -1
+48 48 1
+49 35 2
+49 48 -1
+49 49 1
+50 35 2
+50 49 -1
+50 50 1
+51 35 2
+51 50 -1
+51 51 1
+52 35 2
+52 51 -1
+52 52 1
+53 35 2
+53 52 -1
+53 53 1
+54 35 2
+54 53 -1
+54 54 1
+55 35 2
+55 54 -1
+55 55 1
+56 35 2
+56 55 -1
+56 56 1
+57 35 2
+57 56 -1
+57 57 1
+58 35 2
+58 57 -1
+58 58 1
+59 35 2
+59 58 -1
+59 59 1
+60 35 2
+60 59 -1
+60 60 1
+61 35 2
+61 60 -1
+61 61 1
+62 35 2
+62 61 -1
+62 62 1
+63 35 2
+63 62 -1
+63 63 1
+64 35 2
+64 63 -1
+64 64 1
+65 35 2
+65 65 1
+66 34 12345
+66 35 12345
+66 36 12345
+66 37 12345
+66 38 12345
+66 39 12345
+66 40 12345
+66 41 12345
+66 42 12345
+66 43 12345
+66 44 12345
+66 45 12345
+66 46 12345
+66 47 12345
+66 48 12345
+66 49 12345
+66 50 12345
+66 51 12345
+66 52 12345
+66 53 12345
+66 54 12345
+66 55 12345
+66 56 12345
+66 57 12345
+66 58 12345
+66 59 12345
+66 60 12345
+66 61 12345
+66 62 12345
+66 63 12345
+66 64 12345
+66 65 12345
+66 66 1
diff --git a/matrices/test/isai_l_excess_rhs.mtx b/matrices/test/isai_l_excess_rhs.mtx
new file mode 100644
index 00000000000..85972f682ab
--- /dev/null
+++ b/matrices/test/isai_l_excess_rhs.mtx
@@ -0,0 +1,4 @@
+%%MatrixMarket matrix coordinate real general
+66 1 2
+33 1 1
+66 1 1
diff --git a/matrices/test/isai_l_inv.mtx b/matrices/test/isai_l_inv.mtx
new file mode 100644
index 00000000000..188eb3b9daf
--- /dev/null
+++ b/matrices/test/isai_l_inv.mtx
@@ -0,0 +1,162 @@
+%%MatrixMarket matrix coordinate real general
+35 35 160
+1 1 1
+2 1 1
+2 2 1
+3 2 0.5
+3 3 0.5
+4 3 -1
+4 4 1
+5 3 -2
+5 4 1
+5 5 1
+6 3 -2
+6 5 1
+6 6 1
+7 3 -2
+7 6 1
+7 7 1
+8 3 -2
+8 7 1
+8 8 1
+9 3 -2
+9 8 1
+9 9 1
+10 3 -2
+10 9 1
+10 10 1
+11 3 -2
+11 10 1
+11 11 1
+12 3 -2
+12 11 1
+12 12 1
+13 3 -2
+13 12 1
+13 13 1
+14 3 -2
+14 13 1
+14 14 1
+15 3 -2
+15 14 1
+15 15 1
+16 3 -2
+16 15 1
+16 16 1
+17 3 -2
+17 16 1
+17 17 1
+18 3 -2
+18 17 1
+18 18 1
+19 3 -2
+19 18 1
+19 19 1
+20 3 -2
+20 19 1
+20 20 1
+21 3 -2
+21 20 1
+21 21 1
+22 3 -2
+22 21 1
+22 22 1
+23 3 -2
+23 22 1
+23 23 1
+24 3 -2
+24 23 1
+24 24 1
+25 3 -2
+25 24 1
+25 25 1
+26 3 -2
+26 25 1
+26 26 1
+27 3 -2
+27 26 1
+27 27 1
+28 3 -2
+28 27 1
+28 28 1
+29 3 -2
+29 28 1
+29 29 1
+30 3 -2
+30 29 1
+30 30 1
+31 3 -2
+31 30 1
+31 31 1
+32 3 -2
+32 31 1
+32 32 1
+33 1 431.5
+33 2 432.5
+33 3 433.5
+33 4 -29
+33 5 -28
+33 6 -27
+33 7 -26
+33 8 -25
+33 9 -24
+33 10 -23
+33 11 -22
+33 12 -21
+33 13 -20
+33 14 -19
+33 15 -18
+33 16 -17
+33 17 -16
+33 18 -15
+33 19 -14
+33 20 -13
+33 21 -12
+33 22 -11
+33 23 -10
+33 24 -9
+33 25 -8
+33 26 -7
+33 27 -6
+33 28 -5
+33 29 -4
+33 30 -3
+33 31 -2
+33 32 -1
+33 33 1
+34 3 -2.5
+34 33 1
+34 34 1
+35 2 12345
+35 3 12345
+35 4 12345
+35 5 12345
+35 6 12345
+35 7 12345
+35 8 12345
+35 9 12345
+35 10 12345
+35 11 12345
+35 12 12345
+35 13 12345
+35 14 12345
+35 15 12345
+35 16 12345
+35 17 12345
+35 18 12345
+35 19 12345
+35 20 12345
+35 21 12345
+35 22 12345
+35 23 12345
+35 24 12345
+35 25 12345
+35 26 12345
+35 27 12345
+35 28 12345
+35 29 12345
+35 30 12345
+35 31 12345
+35 32 12345
+35 34 12345
+35 35 1
diff --git a/matrices/test/isai_l_inv_partial.mtx b/matrices/test/isai_l_inv_partial.mtx
new file mode 100644
index 00000000000..a9fb591f7c8
--- /dev/null
+++ b/matrices/test/isai_l_inv_partial.mtx
@@ -0,0 +1,162 @@
+%%MatrixMarket matrix coordinate real general
+35 35 160
+1 1 1
+2 1 1
+2 2 1
+3 2 0.5
+3 3 0.5
+4 3 -1
+4 4 1
+5 3 -2
+5 4 1
+5 5 1
+6 3 -2
+6 5 1
+6 6 1
+7 3 -2
+7 6 1
+7 7 1
+8 3 -2
+8 7 1
+8 8 1
+9 3 -2
+9 8 1
+9 9 1
+10 3 -2
+10 9 1
+10 10 1
+11 3 -2
+11 10 1
+11 11 1
+12 3 -2
+12 11 1
+12 12 1
+13 3 -2
+13 12 1
+13 13 1
+14 3 -2
+14 13 1
+14 14 1
+15 3 -2
+15 14 1
+15 15 1
+16 3 -2
+16 15 1
+16 16 1
+17 3 -2
+17 16 1
+17 17 1
+18 3 -2
+18 17 1
+18 18 1
+19 3 -2
+19 18 1
+19 19 1
+20 3 -2
+20 19 1
+20 20 1
+21 3 -2
+21 20 1
+21 21 1
+22 3 -2
+22 21 1
+22 22 1
+23 3 -2
+23 22 1
+23 23 1
+24 3 -2
+24 23 1
+24 24 1
+25 3 -2
+25 24 1
+25 25 1
+26 3 -2
+26 25 1
+26 26 1
+27 3 -2
+27 26 1
+27 27 1
+28 3 -2
+28 27 1
+28 28 1
+29 3 -2
+29 28 1
+29 29 1
+30 3 -2
+30 29 1
+30 30 1
+31 3 -2
+31 30 1
+31 31 1
+32 3 -2
+32 31 1
+32 32 1
+33 1 -1
+33 2 -1
+33 3 -1
+33 4 -1
+33 5 -1
+33 6 -1
+33 7 -1
+33 8 -1
+33 9 -1
+33 10 -1
+33 11 -1
+33 12 -1
+33 13 -1
+33 14 -1
+33 15 -1
+33 16 -1
+33 17 -1
+33 18 -1
+33 19 -1
+33 20 -1
+33 21 -1
+33 22 -1
+33 23 -1
+33 24 -1
+33 25 -1
+33 26 -1
+33 27 -1
+33 28 -1
+33 29 -1
+33 30 -1
+33 31 -1
+33 32 -1
+33 33 -1
+34 3 -2.5
+34 33 1
+34 34 1
+35 2 -1
+35 3 -1
+35 4 -1
+35 5 -1
+35 6 -1
+35 7 -1
+35 8 -1
+35 9 -1
+35 10 -1
+35 11 -1
+35 12 -1
+35 13 -1
+35 14 -1
+35 15 -1
+35 16 -1
+35 17 -1
+35 18 -1
+35 19 -1
+35 20 -1
+35 21 -1
+35 22 -1
+35 23 -1
+35 24 -1
+35 25 -1
+35 26 -1
+35 27 -1
+35 28 -1
+35 29 -1
+35 30 -1
+35 31 -1
+35 32 -1
+35 34 -1
+35 35 -1
diff --git a/matrices/test/isai_u.mtx b/matrices/test/isai_u.mtx
new file mode 100644
index 00000000000..6fb357a718f
--- /dev/null
+++ b/matrices/test/isai_u.mtx
@@ -0,0 +1,162 @@
+%%MatrixMarket matrix coordinate real general
+35 35 160
+1 1 1
+1 2 -1
+1 33 1
+2 2 1
+2 3 -1
+2 33 1
+2 35 12345
+3 3 2
+3 4 2
+3 5 2
+3 6 2
+3 7 2
+3 8 2
+3 9 2
+3 10 2
+3 11 2
+3 12 2
+3 13 2
+3 14 2
+3 15 2
+3 16 2
+3 17 2
+3 18 2
+3 19 2
+3 20 2
+3 21 2
+3 22 2
+3 23 2
+3 24 2
+3 25 2
+3 26 2
+3 27 2
+3 28 2
+3 29 2
+3 30 2
+3 31 2
+3 32 2
+3 33 3
+3 34 2
+3 35 12345
+4 4 1
+4 5 -1
+4 33 1
+4 35 12345
+5 5 1
+5 6 -1
+5 33 1
+5 35 12345
+6 6 1
+6 7 -1
+6 33 1
+6 35 12345
+7 7 1
+7 8 -1
+7 33 1
+7 35 12345
+8 8 1
+8 9 -1
+8 33 1
+8 35 12345
+9 9 1
+9 10 -1
+9 33 1
+9 35 12345
+10 10 1
+10 11 -1
+10 33 1
+10 35 12345
+11 11 1
+11 12 -1
+11 33 1
+11 35 12345
+12 12 1
+12 13 -1
+12 33 1
+12 35 12345
+13 13 1
+13 14 -1
+13 33 1
+13 35 12345
+14 14 1
+14 15 -1
+14 33 1
+14 35 12345
+15 15 1
+15 16 -1
+15 33 1
+15 35 12345
+16 16 1
+16 17 -1
+16 33 1
+16 35 12345
+17 17 1
+17 18 -1
+17 33 1
+17 35 12345
+18 18 1
+18 19 -1
+18 33 1
+18 35 12345
+19 19 1
+19 20 -1
+19 33 1
+19 35 12345
+20 20 1
+20 21 -1
+20 33 1
+20 35 12345
+21 21 1
+21 22 -1
+21 33 1
+21 35 12345
+22 22 1
+22 23 -1
+22 33 1
+22 35 12345
+23 23 1
+23 24 -1
+23 33 1
+23 35 12345
+24 24 1
+24 25 -1
+24 33 1
+24 35 12345
+25 25 1
+25 26 -1
+25 33 1
+25 35 12345
+26 26 1
+26 27 -1
+26 33 1
+26 35 12345
+27 27 1
+27 28 -1
+27 33 1
+27 35 12345
+28 28 1
+28 29 -1
+28 33 1
+28 35 12345
+29 29 1
+29 30 -1
+29 33 1
+29 35 12345
+30 30 1
+30 31 -1
+30 33 1
+30 35 12345
+31 31 1
+31 32 -1
+31 33 1
+31 35 12345
+32 32 1
+32 33 1
+32 35 12345
+33 33 1
+33 34 -1
+34 34 1
+34 35 12345
+35 35 1
diff --git a/matrices/test/isai_u_excess.mtx b/matrices/test/isai_u_excess.mtx
new file mode 100644
index 00000000000..0ef6a921e03
--- /dev/null
+++ b/matrices/test/isai_u_excess.mtx
@@ -0,0 +1,155 @@
+%%MatrixMarket matrix coordinate real general
+33 33 153
+1 1 2
+1 2 2
+1 3 2
+1 4 2
+1 5 2
+1 6 2
+1 7 2
+1 8 2
+1 9 2
+1 10 2
+1 11 2
+1 12 2
+1 13 2
+1 14 2
+1 15 2
+1 16 2
+1 17 2
+1 18 2
+1 19 2
+1 20 2
+1 21 2
+1 22 2
+1 23 2
+1 24 2
+1 25 2
+1 26 2
+1 27 2
+1 28 2
+1 29 2
+1 30 2
+1 31 3
+1 32 2
+1 33 12345
+2 2 1
+2 3 -1
+2 31 1
+2 33 12345
+3 3 1
+3 4 -1
+3 31 1
+3 33 12345
+4 4 1
+4 5 -1
+4 31 1
+4 33 12345
+5 5 1
+5 6 -1
+5 31 1
+5 33 12345
+6 6 1
+6 7 -1
+6 31 1
+6 33 12345
+7 7 1
+7 8 -1
+7 31 1
+7 33 12345
+8 8 1
+8 9 -1
+8 31 1
+8 33 12345
+9 9 1
+9 10 -1
+9 31 1
+9 33 12345
+10 10 1
+10 11 -1
+10 31 1
+10 33 12345
+11 11 1
+11 12 -1
+11 31 1
+11 33 12345
+12 12 1
+12 13 -1
+12 31 1
+12 33 12345
+13 13 1
+13 14 -1
+13 31 1
+13 33 12345
+14 14 1
+14 15 -1
+14 31 1
+14 33 12345
+15 15 1
+15 16 -1
+15 31 1
+15 33 12345
+16 16 1
+16 17 -1
+16 31 1
+16 33 12345
+17 17 1
+17 18 -1
+17 31 1
+17 33 12345
+18 18 1
+18 19 -1
+18 31 1
+18 33 12345
+19 19 1
+19 20 -1
+19 31 1
+19 33 12345
+20 20 1
+20 21 -1
+20 31 1
+20 33 12345
+21 21 1
+21 22 -1
+21 31 1
+21 33 12345
+22 22 1
+22 23 -1
+22 31 1
+22 33 12345
+23 23 1
+23 24 -1
+23 31 1
+23 33 12345
+24 24 1
+24 25 -1
+24 31 1
+24 33 12345
+25 25 1
+25 26 -1
+25 31 1
+25 33 12345
+26 26 1
+26 27 -1
+26 31 1
+26 33 12345
+27 27 1
+27 28 -1
+27 31 1
+27 33 12345
+28 28 1
+28 29 -1
+28 31 1
+28 33 12345
+29 29 1
+29 30 -1
+29 31 1
+29 33 12345
+30 30 1
+30 31 1
+30 33 12345
+31 31 1
+31 32 -1
+32 32 1
+32 33 12345
+33 33 1
diff --git a/matrices/test/isai_u_excess_rhs.mtx b/matrices/test/isai_u_excess_rhs.mtx
new file mode 100644
index 00000000000..2218b9de2e7
--- /dev/null
+++ b/matrices/test/isai_u_excess_rhs.mtx
@@ -0,0 +1,3 @@
+%%MatrixMarket matrix coordinate real general
+33 1 1
+1 1 1
diff --git a/matrices/test/isai_u_inv.mtx b/matrices/test/isai_u_inv.mtx
new file mode 100644
index 00000000000..48aef7daae8
--- /dev/null
+++ b/matrices/test/isai_u_inv.mtx
@@ -0,0 +1,162 @@
+%%MatrixMarket matrix coordinate real general
+35 35 160
+1 1 1
+1 2 1
+1 33 -2
+2 2 1
+2 3 0.5
+2 33 -2.5
+2 35 12345
+3 3 0.5
+3 4 -1
+3 5 -2
+3 6 -3
+3 7 -4
+3 8 -5
+3 9 -6
+3 10 -7
+3 11 -8
+3 12 -9
+3 13 -10
+3 14 -11
+3 15 -12
+3 16 -13
+3 17 -14
+3 18 -15
+3 19 -16
+3 20 -17
+3 21 -18
+3 22 -19
+3 23 -20
+3 24 -21
+3 25 -22
+3 26 -23
+3 27 -24
+3 28 -25
+3 29 -26
+3 30 -27
+3 31 -28
+3 32 -29
+3 33 433.5
+3 34 432.5
+3 35 12345
+4 4 1
+4 5 1
+4 33 -2
+4 35 12345
+5 5 1
+5 6 1
+5 33 -2
+5 35 12345
+6 6 1
+6 7 1
+6 33 -2
+6 35 12345
+7 7 1
+7 8 1
+7 33 -2
+7 35 12345
+8 8 1
+8 9 1
+8 33 -2
+8 35 12345
+9 9 1
+9 10 1
+9 33 -2
+9 35 12345
+10 10 1
+10 11 1
+10 33 -2
+10 35 12345
+11 11 1
+11 12 1
+11 33 -2
+11 35 12345
+12 12 1
+12 13 1
+12 33 -2
+12 35 12345
+13 13 1
+13 14 1
+13 33 -2
+13 35 12345
+14 14 1
+14 15 1
+14 33 -2
+14 35 12345
+15 15 1
+15 16 1
+15 33 -2
+15 35 12345
+16 16 1
+16 17 1
+16 33 -2
+16 35 12345
+17 17 1
+17 18 1
+17 33 -2
+17 35 12345
+18 18 1
+18 19 1
+18 33 -2
+18 35 12345
+19 19 1
+19 20 1
+19 33 -2
+19 35 12345
+20 20 1
+20 21 1
+20 33 -2
+20 35 12345
+21 21 1
+21 22 1
+21 33 -2
+21 35 12345
+22 22 1
+22 23 1
+22 33 -2
+22 35 12345
+23 23 1
+23 24 1
+23 33 -2
+23 35 12345
+24 24 1
+24 25 1
+24 33 -2
+24 35 12345
+25 25 1
+25 26 1
+25 33 -2
+25 35 12345
+26 26 1
+26 27 1
+26 33 -2
+26 35 12345
+27 27 1
+27 28 1
+27 33 -2
+27 35 12345
+28 28 1
+28 29 1
+28 33 -2
+28 35 12345
+29 29 1
+29 30 1
+29 33 -2
+29 35 12345
+30 30 1
+30 31 1
+30 33 -2
+30 35 12345
+31 31 1
+31 32 1
+31 33 -2
+31 35 12345
+32 32 1
+32 33 -1
+32 35 12345
+33 33 1
+33 34 1
+34 34 1
+34 35 12345
+35 35 1
diff --git a/matrices/test/isai_u_inv_partial.mtx b/matrices/test/isai_u_inv_partial.mtx
new file mode 100644
index 00000000000..50ab5203f68
--- /dev/null
+++ b/matrices/test/isai_u_inv_partial.mtx
@@ -0,0 +1,162 @@
+%%MatrixMarket matrix coordinate real general
+35 35 160
+1 1 1
+1 2 1
+1 33 -2
+2 2 1
+2 3 0.5
+2 33 -2.5
+2 35 12345
+3 3 -1
+3 4 -1
+3 5 -1
+3 6 -1
+3 7 -1
+3 8 -1
+3 9 -1
+3 10 -1
+3 11 -1
+3 12 -1
+3 13 -1
+3 14 -1
+3 15 -1
+3 16 -1
+3 17 -1
+3 18 -1
+3 19 -1
+3 20 -1
+3 21 -1
+3 22 -1
+3 23 -1
+3 24 -1
+3 25 -1
+3 26 -1
+3 27 -1
+3 28 -1
+3 29 -1
+3 30 -1
+3 31 -1
+3 32 -1
+3 33 -1
+3 34 -1
+3 35 -1
+4 4 1
+4 5 1
+4 33 -2
+4 35 12345
+5 5 1
+5 6 1
+5 33 -2
+5 35 12345
+6 6 1
+6 7 1
+6 33 -2
+6 35 12345
+7 7 1
+7 8 1
+7 33 -2
+7 35 12345
+8 8 1
+8 9 1
+8 33 -2
+8 35 12345
+9 9 1
+9 10 1
+9 33 -2
+9 35 12345
+10 10 1
+10 11 1
+10 33 -2
+10 35 12345
+11 11 1
+11 12 1
+11 33 -2
+11 35 12345
+12 12 1
+12 13 1
+12 33 -2
+12 35 12345
+13 13 1
+13 14 1
+13 33 -2
+13 35 12345
+14 14 1
+14 15 1
+14 33 -2
+14 35 12345
+15 15 1
+15 16 1
+15 33 -2
+15 35 12345
+16 16 1
+16 17 1
+16 33 -2
+16 35 12345
+17 17 1
+17 18 1
+17 33 -2
+17 35 12345
+18 18 1
+18 19 1
+18 33 -2
+18 35 12345
+19 19 1
+19 20 1
+19 33 -2
+19 35 12345
+20 20 1
+20 21 1
+20 33 -2
+20 35 12345
+21 21 1
+21 22 1
+21 33 -2
+21 35 12345
+22 22 1
+22 23 1
+22 33 -2
+22 35 12345
+23 23 1
+23 24 1
+23 33 -2
+23 35 12345
+24 24 1
+24 25 1
+24 33 -2
+24 35 12345
+25 25 1
+25 26 1
+25 33 -2
+25 35 12345
+26 26 1
+26 27 1
+26 33 -2
+26 35 12345
+27 27 1
+27 28 1
+27 33 -2
+27 35 12345
+28 28 1
+28 29 1
+28 33 -2
+28 35 12345
+29 29 1
+29 30 1
+29 33 -2
+29 35 12345
+30 30 1
+30 31 1
+30 33 -2
+30 35 12345
+31 31 1
+31 32 1
+31 33 -2
+31 35 12345
+32 32 1
+32 33 -1
+32 35 12345
+33 33 1
+33 34 1
+34 34 1
+34 35 12345
+35 35 1
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 3f8705b2d2b..8a26b5931a2 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -3,38 +3,58 @@ find_package(OpenMP REQUIRED)
 add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> "")
 target_sources(ginkgo_omp
     PRIVATE
-        base/version.cpp
-        factorization/par_ilu_kernels.cpp
-        matrix/coo_kernels.cpp
-        matrix/csr_kernels.cpp
-        matrix/dense_kernels.cpp
-        matrix/ell_kernels.cpp
-        matrix/hybrid_kernels.cpp
-        matrix/sellp_kernels.cpp
-        matrix/sparsity_csr_kernels.cpp
-        preconditioner/jacobi_kernels.cpp
-        solver/bicgstab_kernels.cpp
-        solver/cg_kernels.cpp
-        solver/cgs_kernels.cpp
-        solver/fcg_kernels.cpp
-        solver/gmres_kernels.cpp
-        solver/ir_kernels.cpp
-        solver/lower_trs_kernels.cpp
-        solver/upper_trs_kernels.cpp
-        stop/criterion_kernels.cpp
-        stop/residual_norm_reduction_kernels.cpp)
+    base/version.cpp
+    components/fill_array.cpp
+    components/precision_conversion.cpp
+    components/prefix_sum.cpp
+    factorization/ilu_kernels.cpp
+    factorization/factorization_kernels.cpp
+    factorization/par_ict_kernels.cpp
+    factorization/par_ilu_kernels.cpp
+    factorization/par_ilut_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/csr_kernels.cpp
+    matrix/dense_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/hybrid_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    preconditioner/isai_kernels.cpp
+    preconditioner/jacobi_kernels.cpp
+    solver/bicg_kernels.cpp
+    solver/bicgstab_kernels.cpp
+    solver/cg_kernels.cpp
+    solver/cgs_kernels.cpp
+    solver/fcg_kernels.cpp
+    solver/gmres_kernels.cpp
+    solver/ir_kernels.cpp
+    solver/lower_trs_kernels.cpp
+    solver/upper_trs_kernels.cpp
+    stop/criterion_kernels.cpp
+    stop/residual_norm_kernels.cpp)
 
 ginkgo_compile_features(ginkgo_omp)
+
 target_link_libraries(ginkgo_omp PRIVATE "${OpenMP_CXX_LIBRARIES}")
-target_compile_options(ginkgo_omp PRIVATE "${OpenMP_CXX_FLAGS}")
+target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}")
+# OpenMP_CXX_FLAGS may hold several flags in one string: split it into a list first,
+# otherwise target_compile_options passes it on quoted and the compiler rejects it.
+separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}")
+target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}")
 target_compile_options(ginkgo_omp PRIVATE "${GINKGO_COMPILER_FLAGS}")
 
 # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method
 target_link_libraries(ginkgo_omp PUBLIC ginkgo_cuda)
+# Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method
+target_link_libraries(ginkgo_omp PUBLIC ginkgo_hip)
 
 ginkgo_default_includes(ginkgo_omp)
 ginkgo_install_library(ginkgo_omp omp)
 
+if (GINKGO_CHECK_CIRCULAR_DEPS)
+    ginkgo_check_headers(ginkgo_omp)
+endif()
+
 if(GINKGO_BUILD_TESTS)
     add_subdirectory(test)
 endif()
diff --git a/omp/base/version.cpp b/omp/base/version.cpp
index f9c7a68dcf3..d429e2cbcf1 100644
--- a/omp/base/version.cpp
+++ b/omp/base/version.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/components/csr_spgeam.hpp b/omp/components/csr_spgeam.hpp
new file mode 100644
index 00000000000..d88e612a0ae
--- /dev/null
+++ b/omp/components/csr_spgeam.hpp
@@ -0,0 +1,115 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_OMP_COMPONENTS_CSR_SPGEAM_HPP_
+#define GKO_OMP_COMPONENTS_CSR_SPGEAM_HPP_
+
+
+#include <limits>
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/base/utils.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+
+
+/**
+ * Merges two CSR matrices with sorted column indices row by row.
+ *
+ * Per row, the callbacks are invoked in this order:
+ *  - begin_cb(row) returns the row-local data,
+ *  - entry_cb(row, col, a_val, b_val, local_data) runs once per column stored
+ *    in at least one of the matrices; a missing value is passed as zero,
+ *  - end_cb(row, local_data) finalizes the row-local data.
+ *
+ * Rows run in an OpenMP parallel loop, so the callbacks must be thread-safe.
+ */
+template <typename ValueType, typename IndexType, typename BeginCallback,
+          typename EntryCallback, typename EndCallback>
+void abstract_spgeam(const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     BeginCallback begin_cb, EntryCallback entry_cb,
+                     EndCallback end_cb)
+{
+    const auto num_rows = a->get_size()[0];
+    const auto a_row_ptrs = a->get_const_row_ptrs();
+    const auto a_col_idxs = a->get_const_col_idxs();
+    const auto a_vals = a->get_const_values();
+    const auto b_row_ptrs = b->get_const_row_ptrs();
+    const auto b_col_idxs = b->get_const_col_idxs();
+    const auto b_vals = b->get_const_values();
+    constexpr auto no_col = std::numeric_limits<IndexType>::max();
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto a_nz = a_row_ptrs[row];
+        auto a_stop = a_row_ptrs[row + 1];
+        auto b_nz = b_row_ptrs[row];
+        auto b_stop = b_row_ptrs[row + 1];
+        auto merge_len = (a_stop - a_nz) + (b_stop - b_nz);
+        bool skip_next{};  // set when a step consumed an entry of a and of b
+        auto row_state = begin_cb(row);
+        for (IndexType step = 0; step < merge_len; ++step) {
+            if (skip_next) {
+                skip_next = false;
+                continue;
+            }
+            // next column index of each matrix, or the no_col sentinel
+            auto a_col = checked_load(a_col_idxs, a_nz, a_stop, no_col);
+            auto b_col = checked_load(b_col_idxs, b_nz, b_stop, no_col);
+            auto a_val = checked_load(a_vals, a_nz, a_stop, zero<ValueType>());
+            auto b_val = checked_load(b_vals, b_nz, b_stop, zero<ValueType>());
+            auto col = min(a_col, b_col);
+            // report the smaller column; a side not holding it contributes 0
+            entry_cb(row, col, a_col == col ? a_val : zero<ValueType>(),
+                     b_col == col ? b_val : zero<ValueType>(), row_state);
+            // advance every matrix whose current column was just reported
+            a_nz += (a_col <= b_col);
+            b_nz += (b_col <= a_col);
+            skip_next = a_col == b_col;
+        }
+        end_cb(row, row_state);
+    }
+}
+
+
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_OMP_COMPONENTS_CSR_SPGEAM_HPP_
diff --git a/omp/components/fill_array.cpp b/omp/components/fill_array.cpp
new file mode 100644
index 00000000000..60844522cbe
--- /dev/null
+++ b/omp/components/fill_array.cpp
@@ -0,0 +1,60 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/fill_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace components {
+
+
+/** Sets array[0], ..., array[n - 1] to val; exec is not used. */
+template <typename ValueType>
+void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType *array,
+                size_type n, ValueType val)
+{
+    // the iterations write disjoint entries, so they are split among threads
+#pragma omp parallel for
+    for (size_type pos = 0; pos < n; ++pos) array[pos] = val;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/components/format_conversion.hpp b/omp/components/format_conversion.hpp
index 35e98965207..1d2e9da9e46 100644
--- a/omp/components/format_conversion.hpp
+++ b/omp/components/format_conversion.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -37,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <omp.h>
 
 
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
diff --git a/omp/components/matrix_operations.hpp b/omp/components/matrix_operations.hpp
index 3adb28095bb..7fa629811ee 100644
--- a/omp/components/matrix_operations.hpp
+++ b/omp/components/matrix_operations.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/components/precision_conversion.cpp b/omp/components/precision_conversion.cpp
new file mode 100644
index 00000000000..4c4553470a8
--- /dev/null
+++ b/omp/components/precision_conversion.cpp
@@ -0,0 +1,58 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/precision_conversion.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace components {
+
+
+/** Writes in[i], converted to TargetType, to out[i] for all i < size. */
+template <typename SourceType, typename TargetType>
+void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
+                       size_type size, const SourceType *in, TargetType *out)
+{
+    // the assignment performs the conversion; exec is not used
+#pragma omp parallel for
+    for (size_type pos = 0; pos < size; ++pos) out[pos] = in[pos];
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+
+
+}  // namespace components
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/components/prefix_sum.cpp b/omp/components/prefix_sum.cpp
new file mode 100644
index 00000000000..1375c1643dc
--- /dev/null
+++ b/omp/components/prefix_sum.cpp
@@ -0,0 +1,63 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace components {
+
+
+/** Exclusive in-place scan: counts[i] = sum of the old counts[0..i). */
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const OmpExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    IndexType partial_sum{};
+    for (size_type i = 0; i < num_entries; ++i) {  // same type as num_entries
+        const auto nnz = counts[i];
+        counts[i] = partial_sum;
+        partial_sum += nnz;
+    }
+}
+
+// index types, plus size_type, which is used in the Sellp format
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp
new file mode 100644
index 00000000000..f4b1f616444
--- /dev/null
+++ b/omp/factorization/factorization_kernels.cpp
@@ -0,0 +1,388 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/factorization_kernels.hpp"
+
+
+#include <algorithm>
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+namespace kernel {
+namespace detail {
+
+
+template <bool IsSorted>
+struct find_helper {  // primary template: range not known to be sorted
+    template <typename Iterator, typename Value>
+    static inline bool find(Iterator first, Iterator last, Value value)
+    {
+        return !(std::find(first, last, value) == last);  // linear scan
+    }
+};
+
+
+template <>
+struct find_helper<true> {  // sorted range: binary search is applicable
+    template <typename Iterator, typename Value>
+    static inline bool find(Iterator first, Iterator last, Value value)
+    {
+        return std::binary_search(first, last, value);
+    }
+};
+
+
+}  // namespace detail
+
+
+/**
+ * Marks each row of mtx that stores no diagonal entry: writes 1 to
+ * elements_to_add_per_row[row] for such rows and 0 otherwise. Rows with
+ * row >= number of columns always get 0. *changes_required is set to true
+ * iff at least one row was marked. IsSorted selects the search strategy.
+ */
+template <bool IsSorted, typename ValueType, typename IndexType>
+void find_missing_diagonal_elements(
+    const matrix::Csr<ValueType, IndexType> *mtx,
+    IndexType *elements_to_add_per_row, bool *changes_required)
+{
+    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
+    const auto num_cols = static_cast<IndexType>(mtx->get_size()[1]);
+    const auto col_idxs = mtx->get_const_col_idxs();
+    const auto row_ptrs = mtx->get_const_row_ptrs();
+    bool any_missing{false};
+#pragma omp parallel for reduction(|| : any_missing)
+    for (IndexType row = 0; row < num_rows; ++row) {
+        const bool diag_missing =
+            row < num_cols &&
+            !detail::find_helper<IsSorted>::find(
+                col_idxs + row_ptrs[row], col_idxs + row_ptrs[row + 1], row);
+        elements_to_add_per_row[row] = diag_missing ? 1 : 0;
+        any_missing = any_missing || diag_missing;
+    }
+    *changes_required = any_missing;
+}
+
+
+/**
+ * Copies the entries of mtx into new_values / new_col_idxs and inserts an
+ * explicit zero on the diagonal of every row that gains an entry.
+ *
+ * row_ptrs_addition[row] is the shift of the start of row in the new arrays;
+ * a row gains an entry iff its new length differs from its old length. The
+ * zero is put in front of the first stored column larger than row, or at the
+ * end of the row if no such column exists.
+ */
+template <typename ValueType, typename IndexType>
+void add_missing_diagonal_elements(const matrix::Csr<ValueType, IndexType> *mtx,
+                                   ValueType *new_values,
+                                   IndexType *new_col_idxs,
+                                   const IndexType *row_ptrs_addition)
+{
+    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
+    const auto src_values = mtx->get_const_values();
+    const auto src_col_idxs = mtx->get_const_col_idxs();
+    const auto src_row_ptrs = mtx->get_const_row_ptrs();
+#pragma omp parallel for
+    for (IndexType row = 0; row < num_rows; ++row) {
+        const IndexType src_begin{src_row_ptrs[row]};
+        const IndexType src_end{src_row_ptrs[row + 1]};
+        const IndexType dst_begin{src_begin + row_ptrs_addition[row]};
+        const IndexType dst_end{src_end + row_ptrs_addition[row + 1]};
+
+        // rows whose length is unchanged are copied verbatim
+        bool diag_pending = (dst_end - dst_begin) != (src_end - src_begin);
+        IndexType dst = dst_begin;
+        for (IndexType src = src_begin; src < src_end; ++src) {
+            const auto col = src_col_idxs[src];
+            // first column past the diagonal: the zero goes right before it
+            if (diag_pending && row < col) {
+                new_values[dst] = zero<ValueType>();
+                new_col_idxs[dst] = row;
+                ++dst;
+                diag_pending = false;
+            }
+            new_values[dst] = src_values[src];
+            new_col_idxs[dst] = col;
+            ++dst;
+        }
+        if (diag_pending) {
+            // no larger column is stored: the zero becomes the last entry
+            new_values[dst] = zero<ValueType>();
+            new_col_idxs[dst] = row;
+        }
+    }
+}
+
+
+}  // namespace kernel
+
+
+/**
+ * Inserts an explicit zero on the diagonal of each row of mtx that has a
+ * diagonal position (row < number of columns) but stores no entry there;
+ * mtx is not modified if no such row exists. is_sorted tells whether the
+ * column indices of each row are sorted (enables binary search).
+ */
+template <typename ValueType, typename IndexType>
+void add_diagonal_elements(std::shared_ptr<const OmpExecutor> exec,
+                           matrix::Csr<ValueType, IndexType> *mtx,
+                           bool is_sorted)
+{
+    const size_type row_ptrs_size = mtx->get_size()[0] + 1;
+    Array<IndexType> row_ptrs_addition{exec, row_ptrs_size};
+    const auto adds = row_ptrs_addition.get_data();
+    bool needs_change{};
+    if (is_sorted) {
+        kernel::find_missing_diagonal_elements<true>(mtx, adds, &needs_change);
+    } else {
+        kernel::find_missing_diagonal_elements<false>(mtx, adds, &needs_change);
+    }
+    if (!needs_change) {
+        return;
+    }
+
+    // exclusive scan: adds[i] = number of entries inserted before row i
+    adds[row_ptrs_size - 1] = 0;
+    components::prefix_sum(exec, adds, row_ptrs_size);
+    const size_type new_num_elems =
+        mtx->get_num_stored_elements() + adds[row_ptrs_size - 1];
+    Array<ValueType> new_values{exec, new_num_elems};
+    Array<IndexType> new_col_idxs{exec, new_num_elems};
+    kernel::add_missing_diagonal_elements(
+        mtx, new_values.get_data(), new_col_idxs.get_data(), adds);
+    const auto row_ptrs = mtx->get_row_ptrs();
+#pragma omp parallel for
+    for (size_type i = 0; i < row_ptrs_size; ++i) {  // same type as the bound
+        row_ptrs[i] += adds[i];
+    }
+    matrix::CsrBuilder<ValueType, IndexType> mtx_builder{mtx};
+    mtx_builder.get_value_array() = std::move(new_values);
+    mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
+
+
+// Counts the entries below and above the diagonal in each row of system_matrix
+// and turns the counts, plus one diagonal slot per row, into L/U row pointers.
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l_u(
+    std::shared_ptr<const OmpExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto num_rows = system_matrix->get_size()[0];
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        // both counters start at one: the slot reserved for the diagonal
+        size_type lower_count{1};
+        size_type upper_count{1};
+        for (size_type nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+            const size_type col = col_idxs[nz];
+            lower_count += col < row ? 1 : 0;
+            upper_count += col > row ? 1 : 0;
+        }
+        l_row_ptrs[row] = lower_count;
+        u_row_ptrs[row] = upper_count;
+    }
+
+    // the prefix sums turn the per-row counts into row pointers
+    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    components::prefix_sum(exec, u_row_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
+
+
+// Splits system_matrix into a lower factor csr_l with unit diagonal and an
+// upper factor csr_u; the row pointers of both must already be initialized.
+template <typename ValueType, typename IndexType>
+void initialize_l_u(std::shared_ptr<const OmpExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *system_matrix,
+                    matrix::Csr<ValueType, IndexType> *csr_l,
+                    matrix::Csr<ValueType, IndexType> *csr_u)
+{
+    const auto num_rows = system_matrix->get_size()[0];
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
+    auto col_idxs_u = csr_u->get_col_idxs();
+    auto vals_u = csr_u->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        // the diagonal is stored last in a row of L and first in a row of U
+        const size_type l_diag_nz = row_ptrs_l[row + 1] - 1;
+        const size_type u_diag_nz = row_ptrs_u[row];
+        size_type l_nz = row_ptrs_l[row];
+        size_type u_nz = row_ptrs_u[row] + 1;
+        // a missing diagonal entry of the input defaults to one
+        auto diagonal = one<ValueType>();
+        for (size_type nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+            const auto col = col_idxs[nz];
+            const auto val = vals[nz];
+            if (col == row) {
+                diagonal = val;
+            } else if (col < row) {
+                col_idxs_l[l_nz] = col;
+                vals_l[l_nz] = val;
+                ++l_nz;
+            } else {
+                col_idxs_u[u_nz] = col;
+                vals_u[u_nz] = val;
+                ++u_nz;
+            }
+        }
+        col_idxs_l[l_diag_nz] = row;
+        col_idxs_u[u_diag_nz] = row;
+        vals_l[l_diag_nz] = one<ValueType>();
+        vals_u[u_diag_nz] = diagonal;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+
+
+// Counts the entries below the diagonal in each row of system_matrix and turns
+// the counts, plus one diagonal slot per row, into the row pointers of L.
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l(
+    std::shared_ptr<const OmpExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto num_rows = system_matrix->get_size()[0];
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        size_type lower_count{1};  // the slot reserved for the diagonal
+        for (size_type nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+            if (static_cast<size_type>(col_idxs[nz]) < row) {
+                ++lower_count;
+            }
+        }
+        l_row_ptrs[row] = lower_count;
+    }
+
+    // the prefix sum turns the per-row counts into row pointers
+    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
+
+
+// Copies the strictly lower part of system_matrix into csr_l (row pointers
+// must be set up already) and stores the diagonal, or its square root, last.
+template <typename ValueType, typename IndexType>
+void initialize_l(std::shared_ptr<const OmpExecutor> exec,
+                  const matrix::Csr<ValueType, IndexType> *system_matrix,
+                  matrix::Csr<ValueType, IndexType> *csr_l, bool diag_sqrt)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type l_nz = row_ptrs_l[row];
+        // a missing diagonal entry of the input defaults to one
+        auto diagonal = one<ValueType>();
+        for (size_type nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+            const auto col = col_idxs[nz];
+            if (col == row) {
+                diagonal = vals[nz];
+            } else if (col < row) {
+                col_idxs_l[l_nz] = col;
+                vals_l[l_nz] = vals[nz];
+                ++l_nz;
+            }
+        }
+        // a non-finite square root is replaced by one
+        if (diag_sqrt) {
+            diagonal = sqrt(diagonal);
+            if (!is_finite(diagonal)) {
+                diagonal = one<ValueType>();
+            }
+        }
+        // the diagonal is the last entry of each row of L
+        const size_type l_diag_nz = row_ptrs_l[row + 1] - 1;
+        col_idxs_l[l_diag_nz] = row;
+        vals_l[l_diag_nz] = diagonal;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp
new file mode 100644
index 00000000000..77f30a9a753
--- /dev/null
+++ b/omp/factorization/ilu_kernels.cpp
@@ -0,0 +1,58 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/ilu_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace ilu_factorization {
+
+// OpenMP ILU factorization kernel: declared, but marked GKO_NOT_IMPLEMENTED.
+template <typename ValueType, typename IndexType>
+void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
+                matrix::Csr<ValueType, IndexType> *m) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+
+
+}  // namespace ilu_factorization
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp
new file mode 100644
index 00000000000..15d80ab8755
--- /dev/null
+++ b/omp/factorization/par_ict_kernels.cpp
@@ -0,0 +1,207 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <algorithm>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/utils.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "omp/components/csr_spgeam.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+// Recomputes every stored entry of l in place: l(row, col) becomes
+// (a(row, col) - sum_{k < col} l(row, k) * l(col, k)) / l(col, col), with a
+// square root replacing the division on the diagonal. Results that are not
+// finite are discarded, i.e. the previous value is kept.
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Coo<ValueType, IndexType> *)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1];
+             ++l_nz) {
+            auto col = l_col_idxs[l_nz];
+            // look up a(row, col); lower_bound needs sorted column indices
+            auto a_begin = a_row_ptrs[row];
+            auto a_end = a_row_ptrs[row + 1];
+            auto a_nz_it =
+                std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col);
+            auto a_nz = std::distance(a_col_idxs, a_nz_it);
+            auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col;
+            auto a_val = has_a ? a_vals[a_nz] : zero<ValueType>();
+            // accumulate l(row,:) * l(col,:) without the last entry l(col, col)
+            ValueType sum{};
+            auto l_begin = l_row_ptrs[row];
+            auto l_end = l_row_ptrs[row + 1];
+            auto lt_begin = l_row_ptrs[col];
+            auto lt_end = l_row_ptrs[col + 1];
+            while (l_begin < l_end && lt_begin < lt_end) {
+                auto l_col = l_col_idxs[l_begin];
+                auto lt_row = l_col_idxs[lt_begin];
+                if (l_col == lt_row && l_col < col) {
+                    sum += l_vals[l_begin] * l_vals[lt_begin];
+                }
+                l_begin += (l_col <= lt_row);
+                lt_begin += (lt_row <= l_col);
+            }
+            auto new_val = a_val - sum;
+            if (row == col) {
+                new_val = sqrt(new_val);
+            } else {
+                auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+                new_val = new_val / diag;
+            }
+            if (is_finite(new_val)) {
+                l_vals[l_nz] = new_val;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+// Fills l_new with one entry per lower-triangular (col <= row) position that
+// abstract_spgeam visits for a and llt, reusing the values of l where present.
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *llt,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    matrix::Csr<ValueType, IndexType> *l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    constexpr auto sentinel = std::numeric_limits<IndexType>::max();
+    // first pass: count the entries with col <= row in every row
+    abstract_spgeam(
+        a, llt, [](IndexType) { return IndexType{}; },
+        [](IndexType row, IndexType col, ValueType, ValueType, IndexType &nnz) {
+            nnz += col <= row;
+        },
+        [&](IndexType row, IndexType nnz) { l_new_row_ptrs[row] = nnz; });
+
+    components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1);
+    // the last row pointer is the total entry count; size the arrays with it
+    auto l_nnz = l_new_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> l_builder{l_new};
+    l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+    l_builder.get_value_array().resize_and_reset(l_nnz);
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+
+    // second pass: write column indices and values
+    struct row_state {
+        IndexType l_new_nz;
+        IndexType l_old_begin;
+        IndexType l_old_end;
+    };
+    abstract_spgeam(
+        a, llt,
+        [&](IndexType row) {
+            row_state state{};
+            state.l_new_nz = l_new_row_ptrs[row];
+            state.l_old_begin = l_row_ptrs[row];
+            state.l_old_end = l_row_ptrs[row + 1];
+            return state;
+        },
+        [&](IndexType row, IndexType col, ValueType a_val, ValueType llt_val,
+            row_state &state) {
+            auto r_val = a_val - llt_val;
+            // next unconsumed entry of L (fallback presumably past row end)
+            auto l_col = checked_load(l_col_idxs, state.l_old_begin,
+                                      state.l_old_end, sentinel);
+            auto l_val = checked_load(l_vals, state.l_old_begin,
+                                      state.l_old_end, zero<ValueType>());
+            // last entry of row col of L, used as its diagonal
+            auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+            // keep the value L already has at this position, else estimate it
+            auto out_val = l_col == col ? l_val : r_val / diag;
+            // only positions on or below the diagonal are stored
+            if (row >= col) {
+                l_new_col_idxs[state.l_new_nz] = col;
+                l_new_vals[state.l_new_nz] = out_val;
+                state.l_new_nz++;
+            }
+            // advance entry of L if we used it
+            state.l_old_begin += (l_col == col);
+        },
+        [](IndexType, row_state) {});
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index 4c06f2337b0..d658c0da579 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
+#include <memory>
+
+
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -49,102 +52,6 @@ namespace omp {
 namespace par_ilu_factorization {
 
 
-template <typename ValueType, typename IndexType>
-void initialize_row_ptrs_l_u(
-    std::shared_ptr<const OmpExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *system_matrix,
-    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
-{
-    auto row_ptrs = system_matrix->get_const_row_ptrs();
-    auto col_idxs = system_matrix->get_const_col_idxs();
-
-    l_row_ptrs[0] = 0;
-    u_row_ptrs[0] = 0;
-// Calculate the NNZ per row first
-#pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type l_nnz{};
-        size_type u_nnz{};
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            size_type col = col_idxs[el];
-            if (col <= row) {
-                ++l_nnz;
-            }
-            if (col >= row) {
-                ++u_nnz;
-            }
-        }
-        l_row_ptrs[row + 1] = l_nnz;
-        u_row_ptrs[row + 1] = u_nnz;
-    }
-
-    // Now, compute the prefix-sum, to get proper row_ptrs for L and U
-    IndexType l_previous_nnz{};
-    IndexType u_previous_nnz{};
-    for (size_type row = 1; row < system_matrix->get_size()[0] + 1; ++row) {
-        l_previous_nnz += l_row_ptrs[row];
-        u_previous_nnz += u_row_ptrs[row];
-
-        l_row_ptrs[row] = l_previous_nnz;
-        u_row_ptrs[row] = u_previous_nnz;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void initialize_l_u(std::shared_ptr<const OmpExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType> *system_matrix,
-                    matrix::Csr<ValueType, IndexType> *csr_l,
-                    matrix::Csr<ValueType, IndexType> *csr_u)
-{
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
-    auto col_idxs_u = csr_u->get_col_idxs();
-    auto vals_u = csr_u->get_values();
-
-#pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        size_type current_index_u = row_ptrs_u[row];
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // Update both L and U
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = one<ValueType>();
-                ++current_index_l;
-
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            } else {  // col > row
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            }
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
                          size_type iterations,
@@ -198,12 +105,12 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
 
             if (row > col) {  // modify entry in L
                 auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1];
-                if (isfinite(to_write)) {
+                if (is_finite(to_write)) {
                     vals_l[row_l - 1] = to_write;
                 }
             } else {  // modify entry in U
                 auto to_write = sum;
-                if (isfinite(to_write)) {
+                if (is_finite(to_write)) {
                     vals_u[row_u - 1] = to_write;
                 }
             }
diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp
new file mode 100644
index 00000000000..1e7d4988c5c
--- /dev/null
+++ b/omp/factorization/par_ilut_kernels.cpp
@@ -0,0 +1,470 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+
+#include <omp.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/utils.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "omp/components/csr_spgeam.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+// Sets threshold to the magnitude of the stored value of m that has rank
+// `rank` (zero-based, ordered by magnitude); tmp is overwritten with a working
+// copy. The element at position rank of that copy is dereferenced.
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *m,
+                      IndexType rank, Array<ValueType> &tmp,
+                      Array<remove_complex<ValueType>> &,
+                      remove_complex<ValueType> &threshold)
+{
+    const IndexType num_elems = m->get_num_stored_elements();
+    tmp.resize_and_reset(num_elems);
+    const auto first = tmp.get_data();
+    std::copy_n(m->get_const_values(), num_elems, first);
+    const auto nth = first + rank;
+    std::nth_element(first, nth, first + num_elems,
+                     [](ValueType a, ValueType b) { return abs(a) < abs(b); });
+    threshold = abs(*nth);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+/**
+ * Copies the entries of m for which pred(row, nz) is true into m_out, where nz
+ * is the index in values/col_idxs. pred is evaluated twice per entry (once per
+ * sweep). If m_out_coo is non-null, it receives the row indices and views
+ * (Array::view) of the column index and value arrays of m_out.
+ */
+template <typename Predicate, typename ValueType, typename IndexType>
+void abstract_filter(std::shared_ptr<const DefaultExecutor> exec,
+                     const matrix::Csr<ValueType, IndexType> *m,
+                     matrix::Csr<ValueType, IndexType> *m_out,
+                     matrix::Coo<ValueType, IndexType> *m_out_coo,
+                     Predicate pred)
+{
+    auto num_rows = m->get_size()[0];
+    auto row_ptrs = m->get_const_row_ptrs();
+    auto col_idxs = m->get_const_col_idxs();
+    auto vals = m->get_const_values();
+
+    // first sweep: count the entries kept in each row
+    auto new_row_ptrs = m_out->get_row_ptrs();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        IndexType count{};
+        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+            count += pred(row, nz);
+        }
+        new_row_ptrs[row] = count;
+    }
+
+    // build row pointers; the last one is the total number of kept entries
+    components::prefix_sum(exec, new_row_ptrs, num_rows + 1);
+
+    // second sweep: accumulate non-zeros
+    auto new_nnz = new_row_ptrs[num_rows];
+    // resize arrays first, then fetch the data pointers (aliases) from m_out
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    IndexType *new_row_idxs{};
+    if (m_out_coo) {
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        coo_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            Array<ValueType>::view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto new_nz = new_row_ptrs[row];
+        auto begin = row_ptrs[row];
+        auto end = row_ptrs[row + 1];
+        for (auto nz = begin; nz < end; ++nz) {
+            if (pred(row, nz)) {
+                if (new_row_idxs) {
+                    new_row_idxs[new_nz] = row;
+                }
+                new_col_idxs[new_nz] = col_idxs[nz];
+                new_vals[new_nz] = vals[nz];
+                ++new_nz;
+            }
+        }
+    }
+}
+
+// Filters m, keeping diagonal entries and entries with |value| >= threshold.
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *m,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType> *m_out,
+                      matrix::Coo<ValueType, IndexType> *m_out_coo, bool)
+{
+    const auto m_vals = m->get_const_values();
+    const auto m_col_idxs = m->get_const_col_idxs();
+    auto keep_entry = [&](IndexType row, IndexType nz) {
+        return abs(m_vals[nz]) >= threshold || m_col_idxs[nz] == row;
+    };
+    abstract_filter(exec, m, m_out, m_out_coo, keep_entry);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+// bucket count and sample size used by threshold_filter_approx below
+constexpr auto bucket_count = 1 << sampleselect_searchtree_height;
+constexpr auto sample_size = bucket_count * sampleselect_oversampling;
+
+// Picks an approximate threshold for rank via sampled buckets and filters m.
+template <typename ValueType, typename IndexType>
+void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> &tmp,
+                             remove_complex<ValueType> &threshold,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto vals = m->get_const_values();
+    auto col_idxs = m->get_const_col_idxs();
+    auto size = static_cast<IndexType>(m->get_num_stored_elements());
+    using AbsType = remove_complex<ValueType>;
+    auto num_threads = omp_get_max_threads();
+    auto storage_size =
+        ceildiv(sample_size * sizeof(AbsType) +
+                    bucket_count * (num_threads + 1) * sizeof(IndexType),
+                sizeof(ValueType));
+    tmp.resize_and_reset(storage_size);
+    // the sample is stored at the start of tmp: pick it evenly spaced and sort
+    auto sample = reinterpret_cast<AbsType *>(tmp.get_data());
+    // assuming rounding towards zero
+    auto stride = double(size) / sample_size;
+    for (IndexType i = 0; i < sample_size; ++i) {
+        sample[i] = abs(vals[static_cast<IndexType>(i * stride)]);
+    }
+    std::sort(sample, sample + sample_size);
+    // pick splitters: every sampleselect_oversampling-th sample element
+    for (IndexType i = 0; i < bucket_count - 1; ++i) {
+        // shift by one so we get upper bounds for the buckets
+        sample[i] = sample[(i + 1) * sampleselect_oversampling];
+    }
+    // bucket counts: total at sample + bucket_count, then one block per thread
+    auto total_histogram = reinterpret_cast<IndexType *>(sample + bucket_count);
+    for (IndexType bucket = 0; bucket < bucket_count; ++bucket) {
+        total_histogram[bucket] = 0;
+    }
+#pragma omp parallel
+    {
+        auto local_histogram =
+            total_histogram + (omp_get_thread_num() + 1) * bucket_count;
+        for (IndexType bucket = 0; bucket < bucket_count; ++bucket) {
+            local_histogram[bucket] = 0;
+        }
+#pragma omp for
+        for (IndexType nz = 0; nz < size; ++nz) {
+            auto bucket_it = std::upper_bound(sample, sample + bucket_count - 1,
+                                              abs(vals[nz]));
+            auto bucket = std::distance(sample, bucket_it);
+            // smallest bucket s.t. sample[bucket] >= abs(val[nz])
+            local_histogram[bucket]++;
+        }
+        for (IndexType bucket = 0; bucket < bucket_count; ++bucket) {
+#pragma omp atomic
+            total_histogram[bucket] += local_histogram[bucket];
+        }
+    }
+    // determine splitter ranks: prefix sum over bucket counts
+    components::prefix_sum(exec, total_histogram, bucket_count + 1);
+    // determine the bucket containing the threshold rank:
+    // prefix_sum[bucket] <= rank < prefix_sum[bucket + 1]
+    auto it = std::upper_bound(total_histogram,
+                               total_histogram + bucket_count + 1, rank);
+    auto threshold_bucket = std::distance(total_histogram + 1, it);
+    // sample contains upper bounds for the buckets
+    threshold = threshold_bucket > 0 ? sample[threshold_bucket - 1]
+                                     : zero<remove_complex<ValueType>>();
+    // filter elements: keep buckets >= threshold_bucket and the diagonal
+    abstract_filter(
+        exec, m, m_out, m_out_coo, [&](IndexType row, IndexType nz) {
+            auto bucket_it = std::upper_bound(sample, sample + bucket_count - 1,
+                                              abs(vals[nz]));
+            auto bucket = std::distance(sample, bucket_it);
+            return bucket >= threshold_bucket || col_idxs[nz] == row;
+        });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
+
+
+/**
+ * Performs one sweep that recomputes the stored values of the factors `l` and
+ * `u` (and the matching values of `u_csc`) from the entries of `a`.
+ *
+ * For every stored position (row, col), the new value is derived from
+ * a(row, col) minus the partial product of row `row` of `l` and column `col`
+ * of `u_csc`; lower entries are additionally divided by the last stored entry
+ * of column `col` of `u_csc`. Values are only written if they are finite.
+ *
+ * NOTE(review): rows are processed in parallel while reading values that other
+ * iterations may be writing; presumably this is an intentional asynchronous
+ * update - confirm.
+ *
+ * @param exec  executor (not used in this function body)
+ * @param a  the system matrix
+ * @param l  lower factor; its values are updated in place
+ * @param u  upper factor; its values are updated in place
+ * @param u_csc  its row pointers / column indices are used as the column
+ *               pointers / row indices of `u`; its values are updated in
+ *               place
+ *
+ * The two unnamed Coo parameters are ignored by this implementation.
+ */
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *a,
+                         matrix::Csr<ValueType, IndexType> *l,
+                         const matrix::Coo<ValueType, IndexType> *,
+                         matrix::Csr<ValueType, IndexType> *u,
+                         const matrix::Coo<ValueType, IndexType> *,
+                         matrix::Csr<ValueType, IndexType> *u_csc)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_values();
+    auto ut_col_ptrs = u_csc->get_const_row_ptrs();
+    auto ut_row_idxs = u_csc->get_const_col_idxs();
+    auto ut_vals = u_csc->get_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+
+    // returns the pair (a(row, col) - partial dot product, position of the
+    // entry with row index `row` in column `col` of u_csc)
+    auto compute_sum = [&](IndexType row, IndexType col) {
+        // find value from A
+        // binary search; assumes the column indices of each row of a are
+        // sorted - TODO confirm
+        auto a_begin = a_row_ptrs[row];
+        auto a_end = a_row_ptrs[row + 1];
+        auto a_nz_it =
+            std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col);
+        auto a_nz = std::distance(a_col_idxs, a_nz_it);
+        auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col;
+        auto a_val = has_a ? a_vals[a_nz] : zero<ValueType>();
+        // accumulate l(row,:) * u(:,col) without the last entry (row, col)
+        ValueType sum{};
+        // stays 0 if no entry with row index `row` is visited below
+        IndexType ut_nz{};
+        auto l_begin = l_row_ptrs[row];
+        auto l_end = l_row_ptrs[row + 1];
+        auto u_begin = ut_col_ptrs[col];
+        auto u_end = ut_col_ptrs[col + 1];
+        auto last_entry = min(row, col);
+        // merge-style traversal of the row of l and the column of u
+        while (l_begin < l_end && u_begin < u_end) {
+            auto l_col = l_col_idxs[l_begin];
+            auto u_row = ut_row_idxs[u_begin];
+            if (l_col == u_row && l_col < last_entry) {
+                sum += l_vals[l_begin] * ut_vals[u_begin];
+            }
+            if (u_row == row) {
+                ut_nz = u_begin;
+            }
+            // advance whichever side has the smaller index (both on a match)
+            l_begin += (l_col <= u_row);
+            u_begin += (u_row <= l_col);
+        }
+        return std::make_pair(a_val - sum, ut_nz);
+    };
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        // all entries of the row of l except the last one (presumably the
+        // diagonal - confirm)
+        for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1] - 1;
+             ++l_nz) {
+            auto col = l_col_idxs[l_nz];
+            // last stored entry of column col of u (presumably its diagonal
+            // entry - confirm)
+            auto u_diag = ut_vals[ut_col_ptrs[col + 1] - 1];
+            auto new_val = compute_sum(row, col).first / u_diag;
+            if (is_finite(new_val)) {
+                l_vals[l_nz] = new_val;
+            }
+        }
+        for (size_type u_nz = u_row_ptrs[row]; u_nz < u_row_ptrs[row + 1];
+             ++u_nz) {
+            auto col = u_col_idxs[u_nz];
+            auto result = compute_sum(row, col);
+            auto new_val = result.first;
+            auto ut_nz = result.second;
+            if (is_finite(new_val)) {
+                // keep the row-major and column-major copies of u consistent
+                u_vals[u_nz] = new_val;
+                ut_vals[ut_nz] = new_val;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+
+/**
+ * Builds the factors `l_new` and `u_new` from the entries visited by
+ * `abstract_spgeam` on `a` and `lu`, reusing the values already stored in `l`
+ * and `u` where a matching entry exists.
+ *
+ * The function makes two passes with `abstract_spgeam(a, lu, ...)`: the first
+ * counts the entries per row that belong to the lower (col <= row) and upper
+ * (col >= row) part, the second fills column indices and values after the
+ * output arrays have been resized.
+ *
+ * NOTE(review): presumably `abstract_spgeam` visits every position of the
+ * merged sparsity pattern of `a` and `lu` in ascending column order within a
+ * row - confirm against its definition.
+ *
+ * @param exec  executor forwarded to `prefix_sum`
+ * @param lu  first-order product pattern/values, passed to `abstract_spgeam`
+ * @param a  the system matrix
+ * @param l  current lower factor (read-only)
+ * @param u  current upper factor (read-only)
+ * @param l_new  output lower factor; row pointers, column indices and values
+ *               are overwritten
+ * @param u_new  output upper factor; row pointers, column indices and values
+ *               are overwritten
+ */
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *lu,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Csr<ValueType, IndexType> *u,
+                    matrix::Csr<ValueType, IndexType> *l_new,
+                    matrix::Csr<ValueType, IndexType> *u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    // column index used when no further entry of U is available; it never
+    // equals a visited column
+    constexpr auto sentinel = std::numeric_limits<IndexType>::max();
+    // count nnz
+    // per-row state: (number of lower entries, number of upper entries);
+    // diagonal positions are counted for both factors
+    abstract_spgeam(
+        a, lu, [](IndexType) { return std::pair<IndexType, IndexType>{}; },
+        [](IndexType row, IndexType col, ValueType, ValueType,
+           std::pair<IndexType, IndexType> &nnzs) {
+            nnzs.first += col <= row;
+            nnzs.second += col >= row;
+        },
+        [&](IndexType row, std::pair<IndexType, IndexType> nnzs) {
+            l_new_row_ptrs[row] = nnzs.first;
+            u_new_row_ptrs[row] = nnzs.second;
+        });
+
+    // turn the per-row counts into row pointers
+    components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum(exec, u_new_row_ptrs, num_rows + 1);
+
+    // resize arrays
+    auto l_nnz = l_new_row_ptrs[num_rows];
+    auto u_nnz = u_new_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> l_builder{l_new};
+    matrix::CsrBuilder<ValueType, IndexType> u_builder{u_new};
+    l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+    l_builder.get_value_array().resize_and_reset(l_nnz);
+    u_builder.get_col_idx_array().resize_and_reset(u_nnz);
+    u_builder.get_value_array().resize_and_reset(u_nnz);
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    // accumulate non-zeros
+    // per-row state of the second pass
+    struct row_state {
+        // next output positions in l_new / u_new
+        IndexType l_new_nz;
+        IndexType u_new_nz;
+        // not yet consumed range of the current row of l (without its last
+        // entry) and of u
+        IndexType l_old_begin;
+        IndexType l_old_end;
+        IndexType u_old_begin;
+        IndexType u_old_end;
+        // true once all considered entries of the row of l are consumed
+        bool finished_l;
+    };
+    abstract_spgeam(
+        a, lu,
+        [&](IndexType row) {
+            row_state state{};
+            state.l_new_nz = l_new_row_ptrs[row];
+            state.u_new_nz = u_new_row_ptrs[row];
+            state.l_old_begin = l_row_ptrs[row];
+            state.l_old_end = l_row_ptrs[row + 1] - 1;  // skip diagonal
+            state.u_old_begin = u_row_ptrs[row];
+            state.u_old_end = u_row_ptrs[row + 1];
+            state.finished_l = (state.l_old_begin == state.l_old_end);
+            return state;
+        },
+        [&](IndexType row, IndexType col, ValueType a_val, ValueType lu_val,
+            row_state &state) {
+            // residual of the current factorization at (row, col)
+            auto r_val = a_val - lu_val;
+            // load matching entry of L + U
+            // presumably checked_load returns the given fallback when the
+            // index is outside [begin, end) - confirm against its definition
+            auto lpu_col = state.finished_l
+                               ? checked_load(u_col_idxs, state.u_old_begin,
+                                              state.u_old_end, sentinel)
+                               : l_col_idxs[state.l_old_begin];
+            auto lpu_val =
+                state.finished_l
+                    ? checked_load(u_vals, state.u_old_begin, state.u_old_end,
+                                   zero<ValueType>())
+                    : l_vals[state.l_old_begin];
+            // load diagonal entry of U for lower diagonal entries
+            // (the first stored entry of row col of u is used as diagonal)
+            auto diag = col < row ? u_vals[u_row_ptrs[col]] : one<ValueType>();
+            // if there is already an entry present, use that instead.
+            auto out_val = lpu_col == col ? lpu_val : r_val / diag;
+            // store output entries
+            if (row >= col) {
+                l_new_col_idxs[state.l_new_nz] = col;
+                // the diagonal of the lower factor is always set to one
+                l_new_vals[state.l_new_nz] =
+                    row == col ? one<ValueType>() : out_val;
+                state.l_new_nz++;
+            }
+            if (row <= col) {
+                u_new_col_idxs[state.u_new_nz] = col;
+                u_new_vals[state.u_new_nz] = out_val;
+                state.u_new_nz++;
+            }
+            // advance entry of L + U if we used it
+            if (state.finished_l) {
+                state.u_old_begin += (lpu_col == col);
+            } else {
+                state.l_old_begin += (lpu_col == col);
+                state.finished_l = (state.l_old_begin == state.l_old_end);
+            }
+        },
+        [](IndexType, row_state) {});
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index 33254cb0f44..71eb4c93a45 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -156,8 +156,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const OmpExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -176,8 +176,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto coo_val = source->get_const_values();
     auto coo_col = source->get_const_col_idxs();
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 3b541cba94c..406ce327ff1 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
-#include <iostream>
 #include <numeric>
 #include <utility>
 
@@ -42,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <omp.h>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -49,7 +49,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/hybrid.hpp>
 
 
+#include "core/base/allocator.hpp"
 #include "core/base/iterator_factory.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "omp/components/csr_spgeam.hpp"
 #include "omp/components/format_conversion.hpp"
 
 
@@ -126,6 +130,238 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
+/**
+ * Inserts all column indices stored in row `row` of `c` into `cols`.
+ *
+ * @param cols  the set receiving the column indices
+ * @param c  the matrix whose row is read
+ * @param row  the index of the row to read
+ */
+template <typename ValueType, typename IndexType>
+void spgemm_insert_row(unordered_set<IndexType> &cols,
+                       const matrix::Csr<ValueType, IndexType> *c,
+                       size_type row)
+{
+    // the column indices of the row occupy [row_begin, row_end)
+    const auto row_begin = c->get_const_row_ptrs()[row];
+    const auto row_end = c->get_const_row_ptrs()[row + 1];
+    const auto idxs = c->get_const_col_idxs();
+    cols.insert(idxs + row_begin, idxs + row_end);
+}
+
+
+/**
+ * Inserts into `cols` the column indices of all rows of `b` that are selected
+ * by the column indices stored in row `row` of `a`.
+ *
+ * @param cols  the set receiving the column indices
+ * @param a  the left matrix; its row `row` selects rows of `b`
+ * @param b  the right matrix
+ * @param row  the index of the row of `a` to process
+ */
+template <typename ValueType, typename IndexType>
+void spgemm_insert_row2(unordered_set<IndexType> &cols,
+                        const matrix::Csr<ValueType, IndexType> *a,
+                        const matrix::Csr<ValueType, IndexType> *b,
+                        size_type row)
+{
+    const auto a_ptrs = a->get_const_row_ptrs();
+    const auto a_cols = a->get_const_col_idxs();
+    const auto b_ptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    // walk over the column indices of the requested row of a
+    const auto a_row_end = a_cols + a_ptrs[row + 1];
+    for (auto a_it = a_cols + a_ptrs[row]; a_it < a_row_end; ++a_it) {
+        // each of them names a row of b whose indices are merged in
+        const auto b_row = *a_it;
+        const auto b_row_begin = b_ptrs[b_row];
+        const auto b_row_end = b_ptrs[b_row + 1];
+        cols.insert(b_cols + b_row_begin, b_cols + b_row_end);
+    }
+}
+
+
+/**
+ * Adds `scale` times the entries of row `row` of `c` to the accumulator
+ * `cols`, which maps column indices to values.
+ *
+ * @param cols  the accumulator that is updated
+ * @param c  the matrix whose row is read
+ * @param scale  the factor applied to every value of the row
+ * @param row  the index of the row to read
+ */
+template <typename ValueType, typename IndexType>
+void spgemm_accumulate_row(map<IndexType, ValueType> &cols,
+                           const matrix::Csr<ValueType, IndexType> *c,
+                           ValueType scale, size_type row)
+{
+    const auto values = c->get_const_values();
+    const auto idxs = c->get_const_col_idxs();
+    const auto row_begin = static_cast<size_type>(c->get_const_row_ptrs()[row]);
+    const auto row_end =
+        static_cast<size_type>(c->get_const_row_ptrs()[row + 1]);
+    for (auto nz = row_begin; nz < row_end; ++nz) {
+        cols[idxs[nz]] += scale * values[nz];
+    }
+}
+
+
+/**
+ * Adds `scale` times row `row` of the product `a * b` to the accumulator
+ * `cols`, which maps column indices to values.
+ *
+ * @param cols  the accumulator that is updated
+ * @param a  the left matrix
+ * @param b  the right matrix
+ * @param scale  the factor applied to every product
+ * @param row  the index of the row of `a` to process
+ */
+template <typename ValueType, typename IndexType>
+void spgemm_accumulate_row2(map<IndexType, ValueType> &cols,
+                            const matrix::Csr<ValueType, IndexType> *a,
+                            const matrix::Csr<ValueType, IndexType> *b,
+                            ValueType scale, size_type row)
+{
+    const auto a_ptrs = a->get_const_row_ptrs();
+    const auto a_idxs = a->get_const_col_idxs();
+    const auto a_values = a->get_const_values();
+    const auto b_ptrs = b->get_const_row_ptrs();
+    const auto b_idxs = b->get_const_col_idxs();
+    const auto b_values = b->get_const_values();
+
+    const auto a_row_begin = static_cast<size_type>(a_ptrs[row]);
+    const auto a_row_end = static_cast<size_type>(a_ptrs[row + 1]);
+    for (auto a_pos = a_row_begin; a_pos < a_row_end; ++a_pos) {
+        // the column index in a names the row of b to combine with
+        const auto b_row = a_idxs[a_pos];
+        const auto scaled_a = scale * a_values[a_pos];
+        const auto b_row_begin = static_cast<size_type>(b_ptrs[b_row]);
+        const auto b_row_end = static_cast<size_type>(b_ptrs[b_row + 1]);
+        for (auto b_pos = b_row_begin; b_pos < b_row_end; ++b_pos) {
+            cols[b_idxs[b_pos]] += scaled_a * b_values[b_pos];
+        }
+    }
+}
+
+
+/**
+ * Computes the sparse matrix product c = a * b in two passes: the first one
+ * counts the distinct column indices of every output row, the second one
+ * accumulates the values and writes them to the resized arrays of `c`.
+ *
+ * @param exec  executor used for the temporary containers and `prefix_sum`
+ * @param a  the left factor
+ * @param b  the right factor
+ * @param c  the output matrix; row pointers, column indices and values are
+ *           overwritten
+ */
+template <typename ValueType, typename IndexType>
+void spgemm(std::shared_ptr<const OmpExecutor> exec,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    const auto num_rows = a->get_size()[0];
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    // symbolic pass: the number of distinct column indices of an output row
+    // is its number of stored entries
+    unordered_set<IndexType> row_cols(exec);
+#pragma omp parallel for firstprivate(row_cols)
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_cols.clear();
+        spgemm_insert_row2(row_cols, a, b, row);
+        c_row_ptrs[row] = row_cols.size();
+    }
+
+    // convert the per-row counts into row pointers
+    components::prefix_sum(exec, c_row_ptrs, num_rows + 1);
+
+    // make room for the now known number of output entries
+    const auto c_nnz = c_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> builder{c};
+    builder.get_col_idx_array().resize_and_reset(c_nnz);
+    builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = builder.get_col_idx_array().get_data();
+    auto c_vals = builder.get_value_array().get_data();
+
+    // numeric pass: accumulate each output row in a map, then copy it out
+    map<IndexType, ValueType> row_entries(exec);
+#pragma omp parallel for firstprivate(row_entries)
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_entries.clear();
+        spgemm_accumulate_row2(row_entries, a, b, one<ValueType>(), row);
+        auto out_nz = c_row_ptrs[row];
+        for (const auto &entry : row_entries) {
+            c_col_idxs[out_nz] = entry.first;
+            c_vals[out_nz] = entry.second;
+            ++out_nz;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+
+
+/**
+ * Computes c = alpha * a * b + beta * d in two passes: the first one counts
+ * the distinct column indices of every output row, the second one accumulates
+ * the values and writes them to the resized arrays of `c`.
+ *
+ * @param exec  executor used for the temporary containers and `prefix_sum`
+ * @param alpha  scaling factor of the product; only entry (0, 0) is read
+ * @param a  the left factor of the product
+ * @param b  the right factor of the product
+ * @param beta  scaling factor of `d`; only entry (0, 0) is read
+ * @param d  the matrix added to the scaled product
+ * @param c  the output matrix; row pointers, column indices and values are
+ *           overwritten
+ */
+template <typename ValueType, typename IndexType>
+void advanced_spgemm(std::shared_ptr<const OmpExecutor> exec,
+                     const matrix::Dense<ValueType> *alpha,
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     const matrix::Dense<ValueType> *beta,
+                     const matrix::Csr<ValueType, IndexType> *d,
+                     matrix::Csr<ValueType, IndexType> *c)
+{
+    const auto num_rows = a->get_size()[0];
+    const auto product_scale = alpha->at(0, 0);
+    const auto summand_scale = beta->at(0, 0);
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    // symbolic pass: collect the column indices contributed by d and by a * b
+    unordered_set<IndexType> row_cols(exec);
+#pragma omp parallel for firstprivate(row_cols)
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_cols.clear();
+        spgemm_insert_row(row_cols, d, row);
+        spgemm_insert_row2(row_cols, a, b, row);
+        c_row_ptrs[row] = row_cols.size();
+    }
+
+    // convert the per-row counts into row pointers
+    components::prefix_sum(exec, c_row_ptrs, num_rows + 1);
+
+    // make room for the now known number of output entries
+    const auto c_nnz = c_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> builder{c};
+    builder.get_col_idx_array().resize_and_reset(c_nnz);
+    builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = builder.get_col_idx_array().get_data();
+    auto c_vals = builder.get_value_array().get_data();
+
+    // numeric pass: accumulate beta * d first, then alpha * a * b
+    map<IndexType, ValueType> row_entries(exec);
+#pragma omp parallel for firstprivate(row_entries)
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_entries.clear();
+        spgemm_accumulate_row(row_entries, d, summand_scale, row);
+        spgemm_accumulate_row2(row_entries, a, b, product_scale, row);
+        auto out_nz = c_row_ptrs[row];
+        for (const auto &entry : row_entries) {
+            c_col_idxs[out_nz] = entry.first;
+            c_vals[out_nz] = entry.second;
+            ++out_nz;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+
+
+/**
+ * Computes the scaled sparse matrix sum c = alpha * a + beta * b using two
+ * `abstract_spgeam` passes: one counting the entries per output row, one
+ * writing column indices and values to the resized arrays of `c`.
+ *
+ * @param exec  executor forwarded to `prefix_sum`
+ * @param alpha  scaling factor of `a`; only entry (0, 0) is read
+ * @param a  the first summand
+ * @param beta  scaling factor of `b`; only entry (0, 0) is read
+ * @param b  the second summand
+ * @param c  the output matrix; row pointers, column indices and values are
+ *           overwritten
+ */
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const OmpExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    const auto num_rows = a->get_size()[0];
+    const auto a_scale = alpha->at(0, 0);
+    const auto b_scale = beta->at(0, 0);
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    // symbolic pass: every visited position contributes one output entry
+    auto count_begin = [](IndexType) { return IndexType{}; };
+    auto count_entry = [](IndexType, IndexType, ValueType, ValueType,
+                          IndexType &count) { ++count; };
+    auto count_end = [&](IndexType row, IndexType count) {
+        c_row_ptrs[row] = count;
+    };
+    abstract_spgeam(a, b, count_begin, count_entry, count_end);
+
+    // convert the per-row counts into row pointers
+    components::prefix_sum(exec, c_row_ptrs, num_rows + 1);
+
+    // make room for the now known number of output entries
+    const auto c_nnz = c_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> builder{c};
+    builder.get_col_idx_array().resize_and_reset(c_nnz);
+    builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = builder.get_col_idx_array().get_data();
+    auto c_vals = builder.get_value_array().get_data();
+
+    // numeric pass: the per-row state is the next output position
+    auto fill_begin = [&](IndexType row) { return c_row_ptrs[row]; };
+    auto fill_entry = [&](IndexType, IndexType col, ValueType a_val,
+                          ValueType b_val, IndexType &out_nz) {
+        c_vals[out_nz] = a_scale * a_val + b_scale * b_val;
+        c_col_idxs[out_nz] = col;
+        ++out_nz;
+    };
+    auto fill_end = [](IndexType, IndexType) {};
+    abstract_spgeam(a, b, fill_begin, fill_entry, fill_end);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
+
 template <typename IndexType>
 void convert_row_ptrs_to_idxs(std::shared_ptr<const OmpExecutor> exec,
                               const IndexType *ptrs, size_type num_rows,
@@ -137,8 +373,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const OmpExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -154,8 +390,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -163,12 +399,11 @@ void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
     auto col_idxs = source->get_const_col_idxs();
     auto vals = source->get_const_values();
 
-    for (size_type row = 0; row < num_rows; ++row) {
 #pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
         for (size_type col = 0; col < num_cols; ++col) {
             result->at(row, col) = zero<ValueType>();
         }
-#pragma omp parallel for
         for (size_type i = row_ptrs[row];
              i < static_cast<size_type>(row_ptrs[row + 1]); ++i) {
             result->at(row, col_idxs[i]) = vals[i];
@@ -182,8 +417,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -192,15 +427,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
-template <typename IndexType, typename ValueType, typename UnaryOperator>
+template <typename ValueType, typename IndexType, typename UnaryOperator>
 inline void convert_csr_to_csc(size_type num_rows, const IndexType *row_ptrs,
                                const IndexType *col_idxs,
                                const ValueType *csr_vals, IndexType *row_idxs,
@@ -245,8 +480,8 @@ void transpose_and_transform(std::shared_ptr<const OmpExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return x; });
@@ -257,8 +492,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return conj(x); });
@@ -289,8 +524,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -363,6 +598,144 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
+/**
+ * Writes to `row_permuted` the matrix whose row `i` is row
+ * `permutation_indices[i]` of `orig`.
+ *
+ * The row pointers of the output are built sequentially from the lengths of
+ * the permuted input rows; column indices and values are then copied row by
+ * row in parallel.
+ *
+ * @param exec  executor (not used in this function body)
+ * @param permutation_indices  for every output row the index of the input row
+ * @param orig  the input matrix
+ * @param row_permuted  the output matrix; its arrays are overwritten and are
+ *                      expected to provide room for as many entries as `orig`
+ */
+template <typename ValueType, typename IndexType>
+void row_permute_impl(std::shared_ptr<const OmpExecutor> exec,
+                      const Array<IndexType> *permutation_indices,
+                      const matrix::Csr<ValueType, IndexType> *orig,
+                      matrix::Csr<ValueType, IndexType> *row_permuted)
+{
+    auto perm = permutation_indices->get_const_data();
+    auto orig_row_ptrs = orig->get_const_row_ptrs();
+    auto orig_col_idxs = orig->get_const_col_idxs();
+    auto orig_vals = orig->get_const_values();
+    auto rp_row_ptrs = row_permuted->get_row_ptrs();
+    auto rp_col_idxs = row_permuted->get_col_idxs();
+    auto rp_vals = row_permuted->get_values();
+    const size_type num_rows = orig->get_size()[0];
+
+    // output row `row` has the length of input row perm[row]; the running sum
+    // is sequential, the row lengths are read directly from the input row
+    // pointers instead of a temporary buffer
+    rp_row_ptrs[0] = 0;
+    for (size_type row = 0; row < num_rows; ++row) {
+        const auto src_row = perm[row];
+        rp_row_ptrs[row + 1] =
+            rp_row_ptrs[row] +
+            (orig_row_ptrs[src_row + 1] - orig_row_ptrs[src_row]);
+    }
+    rp_row_ptrs[num_rows] = orig_row_ptrs[num_rows];
+    // copy the entries of every permuted row to its new location
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = perm[row];
+        auto src_nz = orig_row_ptrs[src_row];
+        for (size_type k = rp_row_ptrs[row];
+             k < size_type(rp_row_ptrs[row + 1]); ++k) {
+            rp_col_idxs[k] = orig_col_idxs[src_nz];
+            rp_vals[k] = orig_vals[src_nz];
+            src_nz++;
+        }
+    }
+}
+
+
+/**
+ * Row-permutes `orig` into `row_permuted` by forwarding all arguments
+ * unchanged to `row_permute_impl`.
+ *
+ * @param exec  executor forwarded to `row_permute_impl`
+ * @param permutation_indices  the permutation forwarded to `row_permute_impl`
+ * @param orig  the input matrix
+ * @param row_permuted  the output matrix
+ */
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const OmpExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
+{
+    row_permute_impl(exec, permutation_indices, orig, row_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+
+
+/**
+ * Applies the inverse of the given row permutation: the inverse permutation
+ * is built explicitly and then passed to `row_permute_impl`.
+ *
+ * @param exec  executor forwarded to `row_permute_impl`
+ * @param permutation_indices  the permutation to invert
+ * @param orig  the input matrix
+ * @param row_permuted  the output matrix
+ */
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const OmpExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
+{
+    // a copy of the permutation serves as storage for its inverse
+    Array<IndexType> inverse(*permutation_indices);
+    const auto forward = permutation_indices->get_const_data();
+    const auto backward = inverse.get_data();
+    const auto count = inverse.get_num_elems();
+#pragma omp parallel for
+    for (size_type i = 0; i < count; ++i) {
+        // position i maps to forward[i], so forward[i] maps back to i
+        backward[forward[i]] = i;
+    }
+
+    row_permute_impl(exec, &inverse, orig, row_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+/**
+ * Writes to `column_permuted` a copy of `orig` in which every column index
+ * `c` is replaced by `permutation_indices[c]`.
+ *
+ * Row pointers and values are copied unchanged; the entries keep their
+ * position within each row, so the column indices of a row are not re-sorted
+ * here.
+ *
+ * @param permutation_indices  the mapping applied to every column index
+ * @param orig  the input matrix
+ * @param column_permuted  the output matrix; its arrays are overwritten and
+ *                         are expected to provide room for as many entries as
+ *                         `orig`
+ */
+template <typename ValueType, typename IndexType>
+void column_permute_impl(const Array<IndexType> *permutation_indices,
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *column_permuted)
+{
+    auto perm = permutation_indices->get_const_data();
+    auto orig_row_ptrs = orig->get_const_row_ptrs();
+    auto orig_col_idxs = orig->get_const_col_idxs();
+    auto orig_vals = orig->get_const_values();
+    auto cp_row_ptrs = column_permuted->get_row_ptrs();
+    auto cp_col_idxs = column_permuted->get_col_idxs();
+    auto cp_vals = column_permuted->get_values();
+    const size_type num_rows = orig->get_size()[0];
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        cp_row_ptrs[row] = orig_row_ptrs[row];
+        for (size_type k = orig_row_ptrs[row];
+             k < size_type(orig_row_ptrs[row + 1]); ++k) {
+            cp_col_idxs[k] = perm[orig_col_idxs[k]];
+            cp_vals[k] = orig_vals[k];
+        }
+    }
+    // the final row pointer is not covered by the loop over the rows
+    cp_row_ptrs[num_rows] = orig_row_ptrs[num_rows];
+}
+
+
+/**
+ * Column-permutes `orig` into `column_permuted`: the inverse of the given
+ * permutation is built explicitly and then passed to `column_permute_impl`,
+ * which maps every column index through it.
+ *
+ * @param exec  executor (not used in this function body)
+ * @param permutation_indices  the column permutation
+ * @param orig  the input matrix
+ * @param column_permuted  the output matrix
+ */
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const OmpExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
+{
+    // a copy of the permutation serves as storage for its inverse
+    Array<IndexType> inverse(*permutation_indices);
+    const auto forward = permutation_indices->get_const_data();
+    const auto backward = inverse.get_data();
+    const auto count = inverse.get_num_elems();
+#pragma omp parallel for
+    for (size_type i = 0; i < count; ++i) {
+        // position i maps to forward[i], so forward[i] maps back to i
+        backward[forward[i]] = i;
+    }
+    column_permute_impl(&inverse, orig, column_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
+
+
+/**
+ * Applies the inverse column permutation by passing the given permutation
+ * directly to `column_permute_impl`, which maps every column index through
+ * it.
+ *
+ * @param exec  executor (not used in this function body)
+ * @param permutation_indices  the permutation forwarded to
+ *                             `column_permute_impl`
+ * @param orig  the input matrix
+ * @param column_permuted  the output matrix
+ */
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const OmpExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
+{
+    column_permute_impl(permutation_indices, orig, column_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row(std::shared_ptr<const OmpExecutor> exec,
                                 const matrix::Csr<ValueType, IndexType> *source,
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index 5654b40d753..a74bae1bd0f 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <omp.h>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -49,6 +50,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -195,17 +199,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const OmpExecutor> exec,
                    const matrix::Dense<ValueType> *x,
-                   matrix::Dense<ValueType> *result)
+                   matrix::Dense<remove_complex<ValueType>> *result)
 {
-    compute_dot(exec, x, x, result);
-    const size_type dim_0 = result->get_size()[0];
-    const size_type dim_1 = result->get_size()[1];
-#pragma omp parallel for collapse(2)
-    for (size_type i = 0; i < dim_0; ++i) {
-        for (size_type j = 0; j < dim_1; ++j) {
-            result->at(i, j) = sqrt(abs(result->at(i, j)));
+    using norm_type = remove_complex<ValueType>;  // element type of result
+#pragma omp parallel for
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        result->at(0, j) = zero<norm_type>();  // clear column accumulator
+    }
+#pragma omp parallel for
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        for (size_type i = 0; i < x->get_size()[0]; ++i) {
+            result->at(0, j) += squared_norm(x->at(i, j));  // per column j
         }
     }
+#pragma omp parallel for
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        result->at(0, j) = sqrt(result->at(0, j));  // root of accumulated sum
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
@@ -213,8 +223,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -223,20 +233,31 @@ void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
     auto row_idxs = result->get_row_idxs();
     auto col_idxs = result->get_col_idxs();
     auto values = result->get_values();
+    Array<IndexType> row_ptrs_array(exec, num_rows);
+    auto row_ptrs = row_ptrs_array.get_data();
 
-    size_type idxs = 0;
+#pragma omp parallel for
     for (size_type row = 0; row < num_rows; ++row) {
+        IndexType row_count{};
+        for (size_type col = 0; col < num_cols; ++col) {
+            auto val = source->at(row, col);
+            row_count += val != zero<ValueType>();
+        }
+        row_ptrs[row] = row_count;
+    }
+
+    components::prefix_sum(exec, row_ptrs, num_rows);
+
 #pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto idxs = row_ptrs[row];
         for (size_type col = 0; col < num_cols; ++col) {
             auto val = source->at(row, col);
             if (val != zero<ValueType>()) {
-#pragma omp critical
-                {
-                    row_idxs[idxs] = row;
-                    col_idxs[idxs] = col;
-                    values[idxs] = val;
-                    ++idxs;
-                }
+                row_idxs[idxs] = row;
+                col_idxs[idxs] = col;
+                values[idxs] = val;
+                ++idxs;
             }
         }
     }
@@ -248,8 +269,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -259,22 +280,29 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
     auto col_idxs = result->get_col_idxs();
     auto values = result->get_values();
 
-    size_type cur_ptr = 0;
-    row_ptrs[0] = cur_ptr;
+#pragma omp parallel for
     for (size_type row = 0; row < num_rows; ++row) {
+        IndexType row_nnz{};
+        for (size_type col = 0; col < num_cols; ++col) {
+            auto val = source->at(row, col);
+            row_nnz += val != zero<ValueType>();
+        }
+        row_ptrs[row] = row_nnz;
+    }
+
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
+
 #pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto cur_ptr = row_ptrs[row];
         for (size_type col = 0; col < num_cols; ++col) {
             auto val = source->at(row, col);
             if (val != zero<ValueType>()) {
-#pragma omp critical
-                {
-                    col_idxs[cur_ptr] = col;
-                    values[cur_ptr] = val;
-                    ++cur_ptr;
-                }
+                col_idxs[cur_ptr] = col;
+                values[cur_ptr] = val;
+                ++cur_ptr;
             }
         }
-        row_ptrs[row + 1] = cur_ptr;
     }
 }
 
@@ -284,8 +312,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -317,22 +345,24 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
     auto strategy = result->get_strategy();
     auto ell_lim = strategy->get_ell_num_stored_elements_per_row();
-    auto coo_lim = strategy->get_coo_nnz();
     auto coo_val = result->get_coo_values();
     auto coo_col = result->get_coo_col_idxs();
     auto coo_row = result->get_coo_row_idxs();
+    Array<IndexType> coo_row_ptrs_array(exec, num_rows);
+    auto coo_row_ptrs = coo_row_ptrs_array.get_data();
 
-#pragma omp parallel for
-    for (size_type i = 0; i < result->get_ell_num_stored_elements_per_row();
-         i++) {
-        for (size_type j = 0; j < result->get_ell_stride(); j++) {
+    auto ell_nnz_row = result->get_ell_num_stored_elements_per_row();
+    auto ell_stride = result->get_ell_stride();
+#pragma omp parallel for collapse(2)
+    for (size_type i = 0; i < ell_nnz_row; i++) {
+        for (size_type j = 0; j < ell_stride; j++) {
             result->ell_val_at(j, i) = zero<ValueType>();
             result->ell_col_at(j, i) = 0;
         }
@@ -343,39 +373,39 @@ void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
         coo_col[i] = 0;
         coo_row[i] = 0;
     }
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; row++) {
+        size_type total_row_nnz{};
+        for (size_type col = 0; col < num_cols; col++) {
+            auto val = source->at(row, col);
+            total_row_nnz += val != zero<ValueType>();
+        }
+        coo_row_ptrs[row] = std::max(ell_lim, total_row_nnz) - ell_lim;
+    }
+
+    components::prefix_sum(exec, coo_row_ptrs, num_rows);
 
-    size_type coo_idx = 0;
-    // FIXME: This parallelization may cause the COO part to not being sorted by
-    //        row idx
 #pragma omp parallel for
     for (size_type row = 0; row < num_rows; row++) {
-        size_type col_idx = 0;
+        size_type ell_count = 0;
         size_type col = 0;
-        while (col < num_cols && col_idx < ell_lim) {
+        for (; col < num_cols && ell_count < ell_lim; col++) {
             auto val = source->at(row, col);
             if (val != zero<ValueType>()) {
-                result->ell_val_at(row, col_idx) = val;
-                result->ell_col_at(row, col_idx) = col;
-                col_idx++;
+                result->ell_val_at(row, ell_count) = val;
+                result->ell_col_at(row, ell_count) = col;
+                ell_count++;
             }
-            col++;
         }
-        while (col < num_cols) {
+        auto coo_idx = coo_row_ptrs[row];
+        for (; col < num_cols; col++) {
             auto val = source->at(row, col);
             if (val != zero<ValueType>()) {
-                size_type current_coo_idx;
-                // Use the critical section for accessing the coo_idx only, the
-                // rest can be performed in parallel since the index is unique
-#pragma omp critical
-                {
-                    current_coo_idx = coo_idx;
-                    ++coo_idx;
-                }
-                coo_val[current_coo_idx] = val;
-                coo_col[current_coo_idx] = col;
-                coo_row[current_coo_idx] = row;
+                coo_val[coo_idx] = val;
+                coo_col[coo_idx] = col;
+                coo_row[coo_idx] = row;
+                coo_idx++;
             }
-            col++;
         }
     }
 }
@@ -386,8 +416,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -448,8 +478,11 @@ void convert_to_sellp(std::shared_ptr<const OmpExecutor> exec,
             }
         }
     }
-    slice_sets[slice_num] =
-        slice_sets[slice_num - 1] + slice_lengths[slice_num - 1];
+
+    if (slice_num > 0) {
+        slice_sets[slice_num] =
+            slice_sets[slice_num - 1] + slice_lengths[slice_num - 1];
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -458,8 +491,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const OmpExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -469,21 +502,28 @@ void convert_to_sparsity_csr(std::shared_ptr<const OmpExecutor> exec,
     auto value = result->get_value();
     value[0] = one<ValueType>();
 
-    size_type cur_ptr = 0;
-    row_ptrs[0] = cur_ptr;
+#pragma omp parallel for
     for (size_type row = 0; row < num_rows; ++row) {
+        IndexType row_nnz{};
+        for (size_type col = 0; col < num_cols; ++col) {
+            auto val = source->at(row, col);
+            row_nnz += val != zero<ValueType>();
+        }
+        row_ptrs[row] = row_nnz;
+    }
+
+    components::prefix_sum(exec, row_ptrs, num_rows + 1);
+
 #pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto cur_ptr = row_ptrs[row];
         for (size_type col = 0; col < num_cols; ++col) {
             auto val = source->at(row, col);
             if (val != zero<ValueType>()) {
-#pragma omp critical
-                {
-                    col_idxs[cur_ptr] = col;
-                    ++cur_ptr;
-                }
+                col_idxs[cur_ptr] = col;
+                ++cur_ptr;
             }
         }
-        row_ptrs[row + 1] = cur_ptr;
     }
 }
 
@@ -592,8 +632,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
 #pragma omp parallel for
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
@@ -608,8 +648,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
 #pragma omp parallel for
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
@@ -622,6 +662,81 @@ void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const OmpExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted)
+{
+    const auto src = permutation_indices->get_const_data();
+#pragma omp parallel for
+    for (size_type r = 0; r < orig->get_size()[0]; ++r) {
+        for (size_type c = 0; c < orig->get_size()[1]; ++c) {
+            row_permuted->at(r, c) = orig->at(src[r], c);  // gather row src[r]
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const OmpExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
+{
+    const auto src = permutation_indices->get_const_data();
+#pragma omp parallel for
+    for (size_type r = 0; r < orig->get_size()[0]; ++r) {
+        for (size_type c = 0; c < orig->get_size()[1]; ++c) {
+            column_permuted->at(r, c) = orig->at(r, src[c]);  // gather column
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const OmpExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
+{
+    const auto dst = permutation_indices->get_const_data();
+#pragma omp parallel for
+    for (size_type r = 0; r < orig->get_size()[0]; ++r) {
+        for (size_type c = 0; c < orig->get_size()[1]; ++c) {
+            row_permuted->at(dst[r], c) = orig->at(r, c);  // scatter row
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const OmpExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
+{
+    const auto dst = permutation_indices->get_const_data();
+#pragma omp parallel for
+    for (size_type r = 0; r < orig->get_size()[0]; ++r) {
+        for (size_type c = 0; c < orig->get_size()[1]; ++c) {
+            column_permuted->at(r, dst[c]) = orig->at(r, c);  // scatter col
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
 }  // namespace dense
 }  // namespace omp
 }  // namespace kernels
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index a487890be07..57bced52e9d 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -114,8 +114,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -139,8 +139,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -156,8 +156,8 @@ void count_nonzeros(std::shared_ptr<const OmpExecutor> exec,
     const auto max_nnz_per_row = source->get_num_stored_elements_per_row();
     const auto stride = source->get_stride();
 
-    for (size_type row = 0; row < num_rows; row++) {
 #pragma omp parallel for reduction(+ : nonzeros)
+    for (size_type row = 0; row < num_rows; row++) {
         for (size_type i = 0; i < max_nnz_per_row; i++) {
             nonzeros += (source->val_at(row, i) != zero<ValueType>());
         }
diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp
index 5453c94ac57..8282d7c7ab8 100644
--- a/omp/matrix/hybrid_kernels.cpp
+++ b/omp/matrix/hybrid_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -59,8 +59,8 @@ namespace hybrid {
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Hybrid<ValueType, IndexType> *source)
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -99,8 +99,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto csr_val = result->get_values();
     auto csr_col_idxs = result->get_col_idxs();
diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp
index 6212e1ac7b0..023dd7f5249 100644
--- a/omp/matrix/sellp_kernels.cpp
+++ b/omp/matrix/sellp_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -125,8 +125,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -161,8 +161,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp
index 830435fc376..22987b55287 100644
--- a/omp/matrix/sparsity_csr_kernels.cpp
+++ b/omp/matrix/sparsity_csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -148,9 +148,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void remove_diagonal_elements(std::shared_ptr<const OmpExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
                               const IndexType *row_ptrs,
-                              const IndexType *col_idxs)
+                              const IndexType *col_idxs,
+                              matrix::SparsityCsr<ValueType, IndexType> *matrix)
 {
     auto num_rows = matrix->get_size()[0];
     auto adj_ptrs = matrix->get_row_ptrs();
@@ -221,8 +221,8 @@ void transpose_and_transform(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig);
 }
diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp
new file mode 100644
index 00000000000..98acbf68446
--- /dev/null
+++ b/omp/preconditioner/isai_kernels.cpp
@@ -0,0 +1,332 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/isai_kernels.hpp"
+
+
+#include <algorithm>
+#include <memory>
+
+
+#include <omp.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The Isai preconditioner namespace.
+ *
+ * @ingroup isai
+ */
+namespace isai {
+
+
+template <typename IndexType, typename Callback>
+void forall_matching(const IndexType *fst, IndexType fst_size,
+                     const IndexType *snd, IndexType snd_size, Callback cb)
+{
+    // Walks both index ranges in lockstep and invokes cb(value, fst position,
+    // snd position) whenever the two current entries are equal.
+    for (IndexType lhs{}, rhs{}; lhs < fst_size && rhs < snd_size;) {
+        const auto lhs_val = fst[lhs];
+        const auto rhs_val = snd[rhs];
+        if (lhs_val == rhs_val) {
+            cb(lhs_val, lhs, rhs);
+        }
+        // step past the smaller entry, or past both when they are equal
+        lhs += (lhs_val <= rhs_val);
+        rhs += (lhs_val >= rhs_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename Callable>
+void generic_generate(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *mtx,
+                      matrix::Csr<ValueType, IndexType> *inverse_mtx,
+                      IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs,
+                      Callable trs_solve)
+{
+    /*
+    Rows of inverse_mtx with at most row_size_limit entries are solved here;
+    longer rows only record their size and match count in the excess ptrs.
+    Consider: aiM := inverse_mtx; M := mtx; I := Identity matrix
+    e(i) := unit vector i (containing all zeros except for row i, which is one)
+    S := Sparsity pattern of the desired aiM
+    S(i) := Sparsity pattern of row i of aiM (Set of non-zero columns)
+    D(i) := M[S(i), S(i)]
+    aiM := approximate inverse of M
+    Target: Solving (aiM * M = I)_{S} (aiM * M = I for the sparsity pattern S)
+    aiM[i, :] * D(i) = e(i)^T
+    <=> D(i)^T * aiM[i, :]^T = e(i)   =^ Triangular system (Trs)
+    Solve Trs, fill in aiM row by row (coalesced access)
+    */
+    const auto num_rows = mtx->get_size()[0];
+    const auto m_row_ptrs = mtx->get_const_row_ptrs();
+    const auto m_cols = mtx->get_const_col_idxs();
+    const auto m_vals = mtx->get_const_values();
+    const auto i_row_ptrs = inverse_mtx->get_const_row_ptrs();
+    const auto i_cols = inverse_mtx->get_const_col_idxs();
+    auto i_vals = inverse_mtx->get_values();
+
+    auto num_threads = static_cast<size_type>(omp_get_max_threads());
+    // RHS for local trisystem (one slice of row_size_limit entries per thread)
+    gko::Array<ValueType> rhs_array{exec, row_size_limit * num_threads};
+    // memory for dense trisystem (row_size_limit^2 entries per thread)
+    gko::Array<ValueType> trisystem_array{
+        exec, row_size_limit * row_size_limit * num_threads};
+
+#pragma omp parallel
+    {
+        auto thread_num = static_cast<size_type>(omp_get_thread_num());
+
+        auto rhs = rhs_array.get_data() + thread_num * row_size_limit;
+        auto trisystem_ptr = trisystem_array.get_data() +
+                             thread_num * row_size_limit * row_size_limit;
+
+#pragma omp for
+        for (size_type row = 0; row < num_rows; ++row) {
+            const auto i_begin = i_row_ptrs[row];
+            const auto i_size = i_row_ptrs[row + 1] - i_begin;
+
+            if (i_size <= row_size_limit) {
+                // short rows: treat directly as dense system
+                excess_rhs_ptrs[row] = 0;  // no excess entries
+                excess_nz_ptrs[row] = 0;
+                auto trisystem = range<accessor::row_major<ValueType, 2>>(
+                    trisystem_ptr, static_cast<size_type>(i_size),
+                    static_cast<size_type>(i_size),
+                    static_cast<size_type>(i_size));
+                std::fill_n(trisystem_ptr, i_size * i_size, zero<ValueType>());
+
+                for (size_type i = 0; i < i_size; ++i) {
+                    const auto col = i_cols[i_begin + i];  // column in S(row)
+                    const auto m_begin = m_row_ptrs[col];
+                    const auto m_size = m_row_ptrs[col + 1] - m_begin;
+                    forall_matching(
+                        m_cols + m_begin, m_size, i_cols + i_begin, i_size,
+                        [&](IndexType, IndexType m_idx, IndexType i_idx) {
+                            trisystem(i, i_idx) = m_vals[m_idx + m_begin];
+                        });
+                }
+
+                // solve dense triangular system; the result is left in rhs
+                trs_solve(trisystem, rhs);
+
+                // write triangular solution to inverse
+                for (size_type i = 0; i < i_size; ++i) {
+                    const auto new_val = rhs[i];
+                    const auto idx = i_begin + i;
+                    // check for non-finite elements which should not be copied
+                    // over
+                    if (is_finite(new_val)) {
+                        i_vals[idx] = new_val;
+                    } else {
+                        // ensure the preconditioner does not prevent
+                        // convergence: fall back to the identity's entry
+                        i_vals[idx] = i_cols[idx] == row ? one<ValueType>()
+                                                         : zero<ValueType>();
+                    }
+                }
+            } else {
+                // count non-zeros and dimension in the excess system
+                IndexType count{};
+                for (size_type i = 0; i < i_size; ++i) {
+                    const auto col = i_cols[i_begin + i];
+                    const auto m_begin = m_row_ptrs[col];
+                    const auto m_size = m_row_ptrs[col + 1] - m_begin;
+                    forall_matching(
+                        m_cols + m_begin, m_size, i_cols + i_begin, i_size,
+                        [&](IndexType, IndexType, IndexType) { ++count; });
+                }
+                excess_rhs_ptrs[row] = i_size;  // dimension (row length)
+                excess_nz_ptrs[row] = count;  // matched entries
+            }
+        }
+    }
+    components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1);
+    components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1);
+}
+
+
+template <typename ValueType, typename IndexType>
+void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Csr<ValueType, IndexType> *mtx,
+                          matrix::Csr<ValueType, IndexType> *inverse_mtx,
+                          IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs,
+                          bool lower)
+{
+    using dense_system = range<accessor::row_major<ValueType, 2>>;
+    // Solver passed to generic_generate for each dense system: it
+    // overwrites `sol` with the solution of the transposed triangular
+    // system whose right-hand side is a single unit vector.
+    auto solve_transposed = [lower](const dense_system tri, ValueType *sol) {
+        const IndexType dim = tri.length(0);
+        if (dim <= 0) {
+            return;
+        }
+        // the one of the unit vector sits on the last entry for lower
+        // systems and on the first entry otherwise
+        const IndexType unit_pos = lower ? dim - 1 : 0;
+        std::fill_n(sol, dim, zero<ValueType>());
+        sol[unit_pos] = one<ValueType>();
+
+        if (!lower) {
+            // forward substitution, first entry to last
+            for (IndexType pivot = 0; pivot < dim; ++pivot) {
+                const auto scaled = sol[pivot] / tri(pivot, pivot);
+                sol[pivot] = scaled;
+                for (auto below = pivot + 1; below < dim; ++below) {
+                    sol[below] -= scaled * tri(pivot, below);
+                }
+            }
+        } else {
+            // backward substitution, last entry to first
+            for (auto pivot = dim - 1; pivot >= 0; --pivot) {
+                const auto scaled = sol[pivot] / tri(pivot, pivot);
+                sol[pivot] = scaled;
+                for (auto above = pivot - 1; above >= 0; --above) {
+                    sol[above] -= scaled * tri(pivot, above);
+                }
+            }
+        }
+    };
+
+    generic_generate(exec, mtx, inverse_mtx, excess_rhs_ptrs, excess_nz_ptrs,
+                     solve_transposed);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void generate_excess_system(std::shared_ptr<const DefaultExecutor>,
+                            const matrix::Csr<ValueType, IndexType> *input,
+                            const matrix::Csr<ValueType, IndexType> *inverse,
+                            const IndexType *excess_rhs_ptrs,
+                            const IndexType *excess_nz_ptrs,
+                            matrix::Csr<ValueType, IndexType> *excess_system,
+                            matrix::Dense<ValueType> *excess_rhs)
+{
+    const auto num_rows = input->get_size()[0];
+    const auto m_row_ptrs = input->get_const_row_ptrs();
+    const auto m_cols = input->get_const_col_idxs();
+    const auto m_vals = input->get_const_values();
+    const auto i_row_ptrs = inverse->get_const_row_ptrs();
+    const auto i_cols = inverse->get_const_col_idxs();
+    const auto e_dim = excess_rhs->get_size()[0];  // rows of excess_rhs
+    auto e_row_ptrs = excess_system->get_row_ptrs();
+    auto e_cols = excess_system->get_col_idxs();
+    auto e_vals = excess_system->get_values();
+    auto e_rhs = excess_rhs->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        const auto i_begin = i_row_ptrs[row];
+        const auto i_size = i_row_ptrs[row + 1] - i_begin;
+        // first row index of the sparse block in the excess system
+        auto e_begin = excess_rhs_ptrs[row];
+        // first non-zero index in the sparse block
+        auto e_nz = excess_nz_ptrs[row];
+
+        if (i_size > row_size_limit) {
+            // emit one excess-system row per non-zero of this inverse row
+            for (size_type i = 0; i < i_size; ++i) {
+                // current row in the excess system
+                const auto e_row = e_begin + i;
+                const auto col = i_cols[i_begin + i];
+                const auto m_begin = m_row_ptrs[col];
+                const auto m_size = m_row_ptrs[col + 1] - m_begin;
+                // store row pointers: one row per non-zero of inverse row
+                e_row_ptrs[e_row] = e_nz;
+                // build right-hand side: identity row
+                e_rhs[e_row] =
+                    row == col ? one<ValueType>() : zero<ValueType>();
+                // build sparse block
+                forall_matching(
+                    m_cols + m_begin, m_size, i_cols + i_begin, i_size,
+                    [&](IndexType, IndexType m_idx, IndexType i_idx) {
+                        // trisystem(i, i_idx) = m_vals[m_idx + m_begin]
+                        // just in sparse
+                        e_cols[e_nz] = i_idx + e_begin;
+                        e_vals[e_nz] = m_vals[m_idx + m_begin];
+                        ++e_nz;
+                    });
+            }
+        }
+    }
+    e_row_ptrs[e_dim] = excess_nz_ptrs[num_rows];  // terminating row pointer
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void scatter_excess_solution(std::shared_ptr<const DefaultExecutor>,
+                             const IndexType *excess_block_ptrs,
+                             const matrix::Dense<ValueType> *excess_solution,
+                             matrix::Csr<ValueType, IndexType> *inverse)
+{
+    // copy each row's slice of the excess solution to the start of that row
+    const auto row_count = inverse->get_size()[0];
+    const auto src = excess_solution->get_const_values();
+    const auto dst = inverse->get_values();
+    const auto dst_row_ptrs = inverse->get_const_row_ptrs();
+#pragma omp parallel for
+    for (size_type row = 0; row < row_count; ++row) {
+        const auto src_begin = excess_block_ptrs[row];
+        const auto src_end = excess_block_ptrs[row + 1];
+        std::copy(src + src_begin, src + src_end, dst + dst_row_ptrs[row]);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp
index 1d04410a47b..cf3e6fb4ec8 100644
--- a/omp/preconditioner/jacobi_kernels.cpp
+++ b/omp/preconditioner/jacobi_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <numeric>
 #include <vector>
 
@@ -48,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/allocator.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "omp/components/matrix_operations.hpp"
@@ -83,15 +85,9 @@ inline bool has_same_nonzero_pattern(const IndexType *prev_row_ptr,
                                      const IndexType *curr_row_ptr,
                                      const IndexType *next_row_ptr)
 {
-    if (next_row_ptr - curr_row_ptr != curr_row_ptr - prev_row_ptr) {
-        return false;
-    }
-    for (; curr_row_ptr < next_row_ptr; ++prev_row_ptr, ++curr_row_ptr) {
-        if (*curr_row_ptr != *prev_row_ptr) {
-            return false;
-        }
-    }
-    return true;
+    return std::distance(curr_row_ptr, next_row_ptr) ==
+               std::distance(prev_row_ptr, curr_row_ptr) &&
+           std::equal(curr_row_ptr, next_row_ptr, prev_row_ptr);
 }
 
 
@@ -270,6 +266,24 @@ inline void transpose_block(IndexType block_size, const SourceValueType *from,
 }
 
 
+template <typename SourceValueType, typename ResultValueType,
+          typename IndexType,
+          typename ValueConverter =
+              default_converter<SourceValueType, ResultValueType>>
+inline void conj_transpose_block(IndexType block_size,
+                                 const SourceValueType *from,
+                                 size_type from_stride, ResultValueType *to,
+                                 size_type to_stride,
+                                 ValueConverter converter = {}) noexcept
+{
+    for (IndexType r = 0; r < block_size; ++r) {      // strided index in to
+        for (IndexType c = 0; c < block_size; ++c) {  // strided index in from
+            to[r * to_stride + c] = conj(converter(from[c * from_stride + r]));
+        }
+    }
+}
+
+
 template <typename SourceValueType, typename ResultValueType,
           typename IndexType,
           typename ValueConverter =
@@ -312,13 +326,13 @@ inline bool invert_block(IndexType block_size, IndexType *perm,
 
 
 template <typename ReducedType, typename ValueType, typename IndexType>
-inline bool validate_precision_reduction_feasibility(IndexType block_size,
-                                                     const ValueType *block,
-                                                     size_type stride)
+inline bool validate_precision_reduction_feasibility(
+    std::shared_ptr<const OmpExecutor> exec, IndexType block_size,
+    const ValueType *block, size_type stride)
 {
     using gko::detail::float_traits;
-    std::vector<ValueType> tmp(block_size * block_size);
-    std::vector<IndexType> perm(block_size);
+    vector<ValueType> tmp(block_size * block_size, {}, exec);
+    vector<IndexType> perm(block_size, {}, exec);
     std::iota(begin(perm), end(perm), IndexType{0});
     for (IndexType i = 0; i < block_size; ++i) {
         for (IndexType j = 0; j < block_size; ++j) {
@@ -359,9 +373,9 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     const auto cond = conditioning.get_data();
 #pragma omp parallel for
     for (size_type g = 0; g < num_blocks; g += group_size) {
-        std::vector<Array<ValueType>> block(group_size);
-        std::vector<Array<IndexType>> perm(group_size);
-        std::vector<uint32> pr_descriptors(group_size, uint32{} - 1);
+        vector<Array<ValueType>> block(group_size, {}, exec);
+        vector<Array<IndexType>> perm(group_size, {}, exec);
+        vector<uint32> pr_descriptors(group_size, uint32{} - 1, exec);
         // extract group of blocks, invert them, figure out storage precision
         for (size_type b = 0; b < group_size; ++b) {
             if (b + g >= num_blocks) {
@@ -391,16 +405,18 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
                 using preconditioner::detail::get_supported_storage_reductions;
                 pr_descriptors[b] = get_supported_storage_reductions<ValueType>(
                     accuracy, cond[g + b],
-                    [&block_size, &block, &b] {
+                    [&exec, &block_size, &block, &b] {
                         using target = reduce_precision<ValueType>;
                         return validate_precision_reduction_feasibility<target>(
-                            block_size, block[b].get_const_data(), block_size);
+                            exec, block_size, block[b].get_const_data(),
+                            block_size);
                     },
-                    [&block_size, &block, &b] {
+                    [&exec, &block_size, &block, &b] {
                         using target =
                             reduce_precision<reduce_precision<ValueType>>;
                         return validate_precision_reduction_feasibility<target>(
-                            block_size, block[b].get_const_data(), block_size);
+                            exec, block_size, block[b].get_const_data(),
+                            block_size);
                     });
             } else {
                 pr_descriptors[b] = preconditioner::detail::
@@ -553,6 +569,80 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    const auto ptrs = block_pointers.get_const_data();
+    const auto prec = block_precisions.get_const_data();
+    const size_type matrix_size = ptrs[num_blocks];  // NOTE(review): unused?
+
+#pragma omp parallel for
+    for (size_type i = 0; i < num_blocks; ++i) {
+        const auto group_ofs = storage_scheme.get_group_offset(i);
+        const auto block_ofs = storage_scheme.get_block_offset(i);
+        const auto block_stride = storage_scheme.get_stride();
+        const auto group = blocks.get_const_data() + group_ofs;
+        auto out_group = out_blocks.get_data() + group_ofs;  // mirrors group
+        const auto block_size = ptrs[i + 1] - ptrs[i];  // extent of block i
+        const auto p = prec ? prec[i] : precision_reduction();
+        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+            ValueType, p,
+            transpose_block(  // input and output share all offsets
+                block_size,
+                reinterpret_cast<const resolved_precision *>(group) + block_ofs,
+                block_stride,
+                reinterpret_cast<resolved_precision *>(out_group) + block_ofs,
+                block_stride));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void conj_transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    const auto ptrs = block_pointers.get_const_data();
+    const auto prec = block_precisions.get_const_data();
+    const size_type matrix_size = ptrs[num_blocks];  // NOTE(review): unused?
+
+#pragma omp parallel for
+    for (size_type i = 0; i < num_blocks; ++i) {
+        const auto group_ofs = storage_scheme.get_group_offset(i);
+        const auto block_ofs = storage_scheme.get_block_offset(i);
+        const auto block_stride = storage_scheme.get_stride();
+        const auto group = blocks.get_const_data() + group_ofs;
+        auto out_group = out_blocks.get_data() + group_ofs;  // mirrors group
+        const auto block_size = ptrs[i + 1] - ptrs[i];  // extent of block i
+        const auto p = prec ? prec[i] : precision_reduction();
+        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+            ValueType, p,
+            conj_transpose_block(  // input and output share all offsets
+                block_size,
+                reinterpret_cast<const resolved_precision *>(group) + block_ofs,
+                block_stride,
+                reinterpret_cast<resolved_precision *>(out_group) + block_ofs,
+                block_stride));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_dense(
     std::shared_ptr<const OmpExecutor> exec, size_type num_blocks,
diff --git a/omp/solver/bicg_kernels.cpp b/omp/solver/bicg_kernels.cpp
new file mode 100644
index 00000000000..d9e2864eedf
--- /dev/null
+++ b/omp/solver/bicg_kernels.cpp
@@ -0,0 +1,147 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/bicg_kernels.hpp"
+
+
+#include <omp.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The BICG solver namespace.
+ *
+ * @ingroup bicg
+ */
+namespace bicg {
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const OmpExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho, matrix::Dense<ValueType> *r2,
+                matrix::Dense<ValueType> *z2, matrix::Dense<ValueType> *p2,
+                matrix::Dense<ValueType> *q2,
+                Array<stopping_status> *stop_status)
+{
+#pragma omp parallel for
+    for (size_type col = 0; col < b->get_size()[1]; ++col) {
+        stop_status->get_data()[col].reset();
+        rho->at(col) = zero<ValueType>();
+        prev_rho->at(col) = one<ValueType>();
+    }
+#pragma omp parallel for
+    for (size_type row = 0; row < b->get_size()[0]; ++row) {
+        for (size_type col = 0; col < b->get_size()[1]; ++col) {
+            // both residuals start as b, every other vector starts at zero
+            r->at(row, col) = r2->at(row, col) = b->at(row, col);
+            z->at(row, col) = z2->at(row, col) = p->at(row, col) =
+                p2->at(row, col) = q->at(row, col) = q2->at(row, col) =
+                    zero<ValueType>();
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+
+
+template <typename ValueType>
+void step_1(std::shared_ptr<const OmpExecutor> exec,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
+            matrix::Dense<ValueType> *p2, const matrix::Dense<ValueType> *z2,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *prev_rho,
+            const Array<stopping_status> *stop_status)
+{
+#pragma omp parallel for
+    for (size_type row = 0; row < p->get_size()[0]; ++row) {
+        for (size_type col = 0; col < p->get_size()[1]; ++col) {
+            if (stop_status->get_const_data()[col].has_stopped()) {
+                continue;
+            }
+            if (prev_rho->at(col) != zero<ValueType>()) {
+                const auto ratio = rho->at(col) / prev_rho->at(col);
+                p->at(row, col) = z->at(row, col) + ratio * p->at(row, col);
+                p2->at(row, col) = z2->at(row, col) + ratio * p2->at(row, col);
+            } else {
+                p->at(row, col) = z->at(row, col);
+                p2->at(row, col) = z2->at(row, col);
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+
+
+template <typename ValueType>
+void step_2(std::shared_ptr<const OmpExecutor> exec,
+            matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
+            matrix::Dense<ValueType> *r2, const matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *q,
+            const matrix::Dense<ValueType> *q2,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho,
+            const Array<stopping_status> *stop_status)
+{
+#pragma omp parallel for
+    for (size_type row = 0; row < x->get_size()[0]; ++row) {
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            // leave stopped columns and columns with a zero beta untouched
+            if (stop_status->get_const_data()[col].has_stopped() ||
+                beta->at(col) == zero<ValueType>()) {
+                continue;
+            }
+            const auto step = rho->at(col) / beta->at(col);
+            x->at(row, col) += step * p->at(row, col);
+            r->at(row, col) -= step * q->at(row, col);
+            r2->at(row, col) -= step * q2->at(row, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+
+
+}  // namespace bicg
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/solver/bicgstab_kernels.cpp b/omp/solver/bicgstab_kernels.cpp
index 8f4149a73f4..d761fc044cf 100644
--- a/omp/solver/bicgstab_kernels.cpp
+++ b/omp/solver/bicgstab_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/solver/bicgstab_kernels.hpp"
 
 
-#include <omp.h>
+#include <algorithm>
 
 
-#include <algorithm>
+#include <omp.h>
 
 
 #include <ginkgo/core/base/array.hpp>
@@ -135,6 +135,17 @@ void step_2(std::shared_ptr<const OmpExecutor> exec,
             const matrix::Dense<ValueType> *beta,
             const Array<stopping_status> *stop_status)
 {
+#pragma omp parallel for
+    for (size_type j = 0; j < s->get_size()[1]; ++j) {
+        if (stop_status->get_const_data()[j].has_stopped()) {
+            continue;
+        }
+        if (beta->at(j) != zero<ValueType>()) {
+            alpha->at(j) = rho->at(j) / beta->at(j);
+        } else {
+            alpha->at(j) = zero<ValueType>();
+        }
+    }
 #pragma omp parallel for
     for (size_type i = 0; i < s->get_size()[0]; ++i) {
         for (size_type j = 0; j < s->get_size()[1]; ++j) {
@@ -142,10 +153,8 @@ void step_2(std::shared_ptr<const OmpExecutor> exec,
                 continue;
             }
             if (beta->at(j) != zero<ValueType>()) {
-                alpha->at(j) = rho->at(j) / beta->at(j);
                 s->at(i, j) = r->at(i, j) - alpha->at(j) * v->at(i, j);
             } else {
-                alpha->at(j) = zero<ValueType>();
                 s->at(i, j) = r->at(i, j);
             }
         }
diff --git a/omp/solver/cg_kernels.cpp b/omp/solver/cg_kernels.cpp
index 07590c0b6f0..b9a88f25761 100644
--- a/omp/solver/cg_kernels.cpp
+++ b/omp/solver/cg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/solver/cgs_kernels.cpp b/omp/solver/cgs_kernels.cpp
index 5e9995faae1..a0678788565 100644
--- a/omp/solver/cgs_kernels.cpp
+++ b/omp/solver/cgs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -86,7 +86,6 @@ void initialize(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
 
 
diff --git a/omp/solver/fcg_kernels.cpp b/omp/solver/fcg_kernels.cpp
index e9076d113c4..b8b69ee6d91 100644
--- a/omp/solver/fcg_kernels.cpp
+++ b/omp/solver/fcg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -77,7 +77,6 @@ void initialize(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
 
 
diff --git a/omp/solver/gmres_kernels.cpp b/omp/solver/gmres_kernels.cpp
index 066ffaf2178..6c44263bc75 100644
--- a/omp/solver/gmres_kernels.cpp
+++ b/omp/solver/gmres_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -58,14 +58,14 @@ namespace {
 
 
 template <typename ValueType>
-void finish_arnoldi(matrix::Dense<ValueType> *next_krylov_basis,
-                    matrix::Dense<ValueType> *krylov_bases,
+void finish_arnoldi(size_type num_rows, matrix::Dense<ValueType> *krylov_bases,
                     matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
                     const stopping_status *stop_status)
 {
-#pragma omp declare reduction(add : ValueType : omp_out = omp_out + omp_in)
-
-    for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) {
+    const auto krylov_bases_rowoffset = num_rows;
+    const auto next_krylov_rowoffset = (iter + 1) * krylov_bases_rowoffset;
+#pragma omp declare reduction(add:ValueType : omp_out = omp_out + omp_in)
+    for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) {
         if (stop_status[i].has_stopped()) {
             continue;
         }
@@ -73,20 +73,17 @@ void finish_arnoldi(matrix::Dense<ValueType> *next_krylov_basis,
             ValueType hessenberg_iter_entry = zero<ValueType>();
 
 #pragma omp parallel for reduction(add : hessenberg_iter_entry)
-            for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
+            for (size_type j = 0; j < num_rows; ++j) {
                 hessenberg_iter_entry +=
-                    next_krylov_basis->at(j, i) *
-                    krylov_bases->at(j,
-                                     next_krylov_basis->get_size()[1] * k + i);
+                    krylov_bases->at(j + next_krylov_rowoffset, i) *
+                    krylov_bases->at(j + k * krylov_bases_rowoffset, i);
             }
             hessenberg_iter->at(k, i) = hessenberg_iter_entry;
-
 #pragma omp parallel for
-            for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
-                next_krylov_basis->at(j, i) -=
+            for (size_type j = 0; j < num_rows; ++j) {
+                krylov_bases->at(j + next_krylov_rowoffset, i) -=
                     hessenberg_iter->at(k, i) *
-                    krylov_bases->at(j,
-                                     next_krylov_basis->get_size()[1] * k + i);
+                    krylov_bases->at(j + k * krylov_bases_rowoffset, i);
             }
         }
         // for i in 1:iter
@@ -97,20 +94,19 @@ void finish_arnoldi(matrix::Dense<ValueType> *next_krylov_basis,
         ValueType hessenberg_iter_entry = zero<ValueType>();
 
 #pragma omp parallel for reduction(add : hessenberg_iter_entry)
-        for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
+        for (size_type j = 0; j < num_rows; ++j) {
             hessenberg_iter_entry +=
-                next_krylov_basis->at(j, i) * next_krylov_basis->at(j, i);
+                krylov_bases->at(j + next_krylov_rowoffset, i) *
+                krylov_bases->at(j + next_krylov_rowoffset, i);
         }
         hessenberg_iter->at(iter + 1, i) = sqrt(hessenberg_iter_entry);
-        // hessenberg(iter, iter + 1) = norm(next_krylov_basis)
+// hessenberg(iter + 1, iter) = norm(next Krylov vector in krylov_bases)
 #pragma omp parallel for
-        for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
-            next_krylov_basis->at(j, i) /= hessenberg_iter->at(iter + 1, i);
-            krylov_bases->at(j, next_krylov_basis->get_size()[1] * (iter + 1) +
-                                    i) = next_krylov_basis->at(j, i);
+        for (size_type j = 0; j < num_rows; ++j) {
+            krylov_bases->at(j + next_krylov_rowoffset, i) /=
+                hessenberg_iter->at(iter + 1, i);
         }
         // next_krylov_basis /= hessenberg(iter, iter + 1)
-        // krylov_bases(:, iter + 1) = next_krylov_basis
         // End of arnoldi
     }
 }
@@ -126,28 +122,26 @@ void calculate_sin_and_cos(matrix::Dense<ValueType> *givens_sin,
         givens_cos->at(iter, rhs) = zero<ValueType>();
         givens_sin->at(iter, rhs) = one<ValueType>();
     } else {
-        auto hypotenuse = sqrt(hessenberg_iter->at(iter, rhs) *
-                                   hessenberg_iter->at(iter, rhs) +
-                               hessenberg_iter->at(iter + 1, rhs) *
-                                   hessenberg_iter->at(iter + 1, rhs));
-        givens_cos->at(iter, rhs) =
-            abs(hessenberg_iter->at(iter, rhs)) / hypotenuse;
-        givens_sin->at(iter, rhs) = givens_cos->at(iter, rhs) *
-                                    hessenberg_iter->at(iter + 1, rhs) /
-                                    hessenberg_iter->at(iter, rhs);
+        auto this_hess = hessenberg_iter->at(iter, rhs);
+        auto next_hess = hessenberg_iter->at(iter + 1, rhs);
+        const auto scale = abs(this_hess) + abs(next_hess);
+        const auto hypotenuse =
+            scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) +
+                         abs(next_hess / scale) * abs(next_hess / scale));
+        givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse;
+        givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse;
     }
 }
 
 
 template <typename ValueType>
-void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
-                     matrix::Dense<ValueType> *givens_sin,
+void givens_rotation(matrix::Dense<ValueType> *givens_sin,
                      matrix::Dense<ValueType> *givens_cos,
                      matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
                      const stopping_status *stop_status)
 {
 #pragma omp parallel for
-    for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) {
+    for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) {
         if (stop_status[i].has_stopped()) {
             continue;
         }
@@ -155,13 +149,13 @@ void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
             auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) +
                         givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i);
             hessenberg_iter->at(j + 1, i) =
-                -givens_sin->at(j, i) * hessenberg_iter->at(j, i) +
-                givens_cos->at(j, i) * hessenberg_iter->at(j + 1, i);
+                -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) +
+                conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i);
             hessenberg_iter->at(j, i) = temp;
             // temp             =  cos(j)*hessenberg(j) +
             //                     sin(j)*hessenberg(j+1)
-            // hessenberg(j+1)  = -sin(j)*hessenberg(j) +
-            //                     cos(j)*hessenberg(j+1)
+            // hessenberg(j+1)  = -conj(sin(j))*hessenberg(j) +
+            //                     conj(cos(j))*hessenberg(j+1)
             // hessenberg(j)    =  temp;
         }
 
@@ -172,7 +166,7 @@ void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
             givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i);
         hessenberg_iter->at(iter + 1, i) = zero<ValueType>();
         // hessenberg(iter)   = cos(iter)*hessenberg(iter) +
-        //                      sin(iter)*hessenberg(iter)
+        //                      sin(iter)*hessenberg(iter + 1)
         // hessenberg(iter+1) = 0
     }
 }
@@ -181,9 +175,8 @@ void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
 template <typename ValueType>
 void calculate_next_residual_norm(
     matrix::Dense<ValueType> *givens_sin, matrix::Dense<ValueType> *givens_cos,
-    matrix::Dense<ValueType> *residual_norm,
-    matrix::Dense<ValueType> *residual_norm_collection,
-    const matrix::Dense<ValueType> *b_norm, size_type iter,
+    matrix::Dense<remove_complex<ValueType>> *residual_norm,
+    matrix::Dense<ValueType> *residual_norm_collection, size_type iter,
     const stopping_status *stop_status)
 {
 #pragma omp parallel for
@@ -192,11 +185,14 @@ void calculate_next_residual_norm(
             continue;
         }
+        // Apply the latest Givens rotation to the residual vector; the new
+        // entry iter + 1 uses the conjugated sine (matters for complex types).
         residual_norm_collection->at(iter + 1, i) =
-            -givens_sin->at(iter, i) * residual_norm_collection->at(iter, i);
+            -conj(givens_sin->at(iter, i)) *
+            residual_norm_collection->at(iter, i);
         residual_norm_collection->at(iter, i) =
             givens_cos->at(iter, i) * residual_norm_collection->at(iter, i);
         residual_norm->at(0, i) =
-            abs(residual_norm_collection->at(iter + 1, i)) / b_norm->at(0, i);
+            abs(residual_norm_collection->at(iter + 1, i));
     }
 }
 
@@ -231,14 +225,14 @@ void calculate_qy(const matrix::Dense<ValueType> *krylov_bases,
                   matrix::Dense<ValueType> *before_preconditioner,
                   const size_type *final_iter_nums)
 {
+    const auto krylov_bases_rowoffset = before_preconditioner->get_size()[0];
 #pragma omp parallel for
     for (size_type i = 0; i < before_preconditioner->get_size()[0]; ++i) {
         for (size_type k = 0; k < before_preconditioner->get_size()[1]; ++k) {
             before_preconditioner->at(i, k) = zero<ValueType>();
             for (size_type j = 0; j < final_iter_nums[k]; ++j) {
                 before_preconditioner->at(i, k) +=
-                    krylov_bases->at(
-                        i, j * before_preconditioner->get_size()[1] + k) *
+                    krylov_bases->at(i + j * krylov_bases_rowoffset, k) *
                     y->at(j, k);
             }
         }
@@ -252,24 +246,13 @@ void calculate_qy(const matrix::Dense<ValueType> *krylov_bases,
 template <typename ValueType>
 void initialize_1(std::shared_ptr<const OmpExecutor> exec,
                   const matrix::Dense<ValueType> *b,
-                  matrix::Dense<ValueType> *b_norm,
                   matrix::Dense<ValueType> *residual,
                   matrix::Dense<ValueType> *givens_sin,
                   matrix::Dense<ValueType> *givens_cos,
                   Array<stopping_status> *stop_status, size_type krylov_dim)
 {
+    using norm_type = remove_complex<ValueType>;
     for (size_type j = 0; j < b->get_size()[1]; ++j) {
-        // Calculate b norm
-        ValueType norm = zero<ValueType>();
-
-#pragma omp declare reduction(add : ValueType : omp_out = omp_out + omp_in)
-
-#pragma omp parallel for reduction(add : norm)
-        for (size_type i = 0; i < b->get_size()[0]; ++i) {
-            norm += b->at(i, j) * b->at(i, j);
-        }
-        b_norm->at(0, j) = sqrt(norm);
-
 #pragma omp parallel for
         for (size_type i = 0; i < b->get_size()[0]; ++i) {
             residual->at(i, j) = b->at(i, j);
@@ -290,32 +273,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL);
 template <typename ValueType>
 void initialize_2(std::shared_ptr<const OmpExecutor> exec,
                   const matrix::Dense<ValueType> *residual,
-                  matrix::Dense<ValueType> *residual_norm,
+                  matrix::Dense<remove_complex<ValueType>> *residual_norm,
                   matrix::Dense<ValueType> *residual_norm_collection,
                   matrix::Dense<ValueType> *krylov_bases,
                   Array<size_type> *final_iter_nums, size_type krylov_dim)
 {
+    using norm_type = remove_complex<ValueType>;
     for (size_type j = 0; j < residual->get_size()[1]; ++j) {
         // Calculate residual norm
-        ValueType res_norm = zero<ValueType>();
-
-#pragma omp declare reduction(add : ValueType : omp_out = omp_out + omp_in)
+        norm_type res_norm = zero<norm_type>();
+#pragma omp declare reduction(add:norm_type : omp_out = omp_out + omp_in)
 
 #pragma omp parallel for reduction(add : res_norm)
         for (size_type i = 0; i < residual->get_size()[0]; ++i) {
-            res_norm += residual->at(i, j) * residual->at(i, j);
+            res_norm += squared_norm(residual->at(i, j));
         }
         residual_norm->at(0, j) = sqrt(res_norm);
-
-#pragma omp parallel for
-        for (size_type i = 0; i < krylov_dim + 1; ++i) {
-            if (i == 0) {
-                residual_norm_collection->at(i, j) = residual_norm->at(0, j);
-            } else {
-                residual_norm_collection->at(i, j) = zero<ValueType>();
-            }
-        }
-
+        residual_norm_collection->at(0, j) = residual_norm->at(0, j);
 #pragma omp parallel for
         for (size_type i = 0; i < residual->get_size()[0]; ++i) {
             krylov_bases->at(i, j) =
@@ -323,29 +297,19 @@ void initialize_2(std::shared_ptr<const OmpExecutor> exec,
         }
         final_iter_nums->get_data()[j] = 0;
     }
-
-#pragma omp parallel for
-    for (size_type i = 0; i < krylov_bases->get_size()[0]; ++i) {
-        for (size_type j = residual->get_size()[1];
-             j < krylov_bases->get_size()[1]; ++j) {
-            krylov_bases->at(i, j) = zero<ValueType>();
-        }
-    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL);
 
 
 template <typename ValueType>
-void step_1(std::shared_ptr<const OmpExecutor> exec,
-            matrix::Dense<ValueType> *next_krylov_basis,
+void step_1(std::shared_ptr<const OmpExecutor> exec, size_type num_rows,
             matrix::Dense<ValueType> *givens_sin,
             matrix::Dense<ValueType> *givens_cos,
-            matrix::Dense<ValueType> *residual_norm,
+            matrix::Dense<remove_complex<ValueType>> *residual_norm,
             matrix::Dense<ValueType> *residual_norm_collection,
             matrix::Dense<ValueType> *krylov_bases,
-            matrix::Dense<ValueType> *hessenberg_iter,
-            const matrix::Dense<ValueType> *b_norm, size_type iter,
+            matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
             Array<size_type> *final_iter_nums,
             const Array<stopping_status> *stop_status)
 {
@@ -355,12 +319,12 @@ void step_1(std::shared_ptr<const OmpExecutor> exec,
             (1 - stop_status->get_const_data()[i].has_stopped());
     }
 
-    finish_arnoldi(next_krylov_basis, krylov_bases, hessenberg_iter, iter,
+    finish_arnoldi(num_rows, krylov_bases, hessenberg_iter, iter,
                    stop_status->get_const_data());
-    givens_rotation(next_krylov_basis, givens_sin, givens_cos, hessenberg_iter,
-                    iter, stop_status->get_const_data());
+    givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter,
+                    stop_status->get_const_data());
     calculate_next_residual_norm(givens_sin, givens_cos, residual_norm,
-                                 residual_norm_collection, b_norm, iter,
+                                 residual_norm_collection, iter,
                                  stop_status->get_const_data());
 }
 
diff --git a/omp/solver/ir_kernels.cpp b/omp/solver/ir_kernels.cpp
index df7f6ff87e0..ba68c407e95 100644
--- a/omp/solver/ir_kernels.cpp
+++ b/omp/solver/ir_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp
index bdfd73e94b1..af6a0670ea0 100644
--- a/omp/solver/lower_trs_kernels.cpp
+++ b/omp/solver/lower_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp
index ed1fdea3799..2fa0b6a3db9 100644
--- a/omp/solver/upper_trs_kernels.cpp
+++ b/omp/solver/upper_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/stop/criterion_kernels.cpp b/omp/stop/criterion_kernels.cpp
index b0f9517b980..ef8ff9f1221 100644
--- a/omp/stop/criterion_kernels.cpp
+++ b/omp/stop/criterion_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/stop/residual_norm_reduction_kernels.cpp b/omp/stop/residual_norm_kernels.cpp
similarity index 62%
rename from omp/stop/residual_norm_reduction_kernels.cpp
rename to omp/stop/residual_norm_kernels.cpp
index 06e2485f6c0..1fd3a14cf85 100644
--- a/omp/stop/residual_norm_reduction_kernels.cpp
+++ b/omp/stop/residual_norm_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,59 +30,64 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/stop/residual_norm_reduction_kernels.hpp"
+#include "core/stop/residual_norm_kernels.hpp"
 
 
 #include <omp.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
 /**
- * @brief The Residual norm reduction stopping criterion namespace.
+ * @brief The Residual norm stopping criterion namespace.
  * @ref resnorm
  * @ingroup resnorm
  */
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 
 template <typename ValueType>
-void residual_norm_reduction(std::shared_ptr<const OmpExecutor> exec,
-                             const matrix::Dense<ValueType> *tau,
-                             const matrix::Dense<ValueType> *orig_tau,
-                             remove_complex<ValueType> rel_residual_goal,
-                             uint8 stoppingId, bool setFinalized,
-                             Array<stopping_status> *stop_status,
-                             Array<bool> *device_storage, bool *all_converged,
-                             bool *one_changed)
+void residual_norm(std::shared_ptr<const OmpExecutor> exec,
+                   const matrix::Dense<ValueType> *tau,
+                   const matrix::Dense<ValueType> *orig_tau,
+                   ValueType rel_residual_goal, uint8 stoppingId,
+                   bool setFinalized, Array<stopping_status> *stop_status,
+                   Array<bool> *device_storage, bool *all_converged,
+                   bool *one_changed)
 {
-    *all_converged = true;
-    *one_changed = false;
-#pragma omp parallel for
+    static_assert(is_complex_s<ValueType>::value == false,
+                  "ValueType must not be complex in this function!");
+    bool local_one_changed = false;
+#pragma omp parallel for reduction(|| : local_one_changed)
     for (size_type i = 0; i < tau->get_size()[1]; ++i) {
-        if (abs(tau->at(i)) < rel_residual_goal * abs(orig_tau->at(i))) {
+        if (tau->at(i) < rel_residual_goal * orig_tau->at(i)) {
             stop_status->get_data()[i].converge(stoppingId, setFinalized);
-            *one_changed = true;
+            local_one_changed = true;
         }
     }
-    // No early stopping here because one cannot use break with omp parallel
-    // for But it's parallel so does it matter?
-#pragma omp parallel for
+    *one_changed = local_one_changed;
+    // No early exit here: `break` is not allowed inside an omp parallel for.
+    // The loop runs in parallel anyway, so it likely does not matter.
+    bool local_all_converged = true;
+#pragma omp parallel for reduction(&& : local_all_converged)
     for (size_type i = 0; i < stop_status->get_num_elems(); ++i) {
         if (!stop_status->get_const_data()[i].has_stopped()) {
-            *all_converged = false;
+            local_all_converged = false;
         }
     }
+    *all_converged = local_all_converged;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+    GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace omp
 }  // namespace kernels
 }  // namespace gko
diff --git a/omp/test/CMakeLists.txt b/omp/test/CMakeLists.txt
index cd7e0fdba99..d746413f53f 100644
--- a/omp/test/CMakeLists.txt
+++ b/omp/test/CMakeLists.txt
@@ -1,3 +1,6 @@
+include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)
+
+add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(matrix)
 add_subdirectory(preconditioner)
diff --git a/omp/test/components/CMakeLists.txt b/omp/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..9c1dca5bcfa
--- /dev/null
+++ b/omp/test/components/CMakeLists.txt
@@ -0,0 +1,3 @@
+ginkgo_create_test(fill_array)
+ginkgo_create_test(precision_conversion)
+ginkgo_create_test(prefix_sum)
diff --git a/omp/test/components/fill_array.cpp b/omp/test/components/fill_array.cpp
new file mode 100644
index 00000000000..ad657e7e6e2
--- /dev/null
+++ b/omp/test/components/fill_array.cpp
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/fill_array.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class FillArray : public ::testing::Test {
+protected:
+    using value_type = T;
+    FillArray()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::OmpExecutor::create()),
+          total_size(63531),
+          vals(ref, total_size),
+          dvals(exec, total_size)
+    {
+        std::fill_n(vals.get_data(), total_size, T(1523));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> exec;
+    gko::size_type total_size;
+    gko::Array<value_type> vals;
+    gko::Array<value_type> dvals;
+};
+
+TYPED_TEST_CASE(FillArray, gko::test::ValueAndIndexTypes);
+
+
+TYPED_TEST(FillArray, EqualsReference)
+{
+    using T = typename TestFixture::value_type;
+    gko::kernels::omp::components::fill_array(
+        this->exec, this->dvals.get_data(), this->total_size, T(1523));
+    GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals);
+}
+
+
+}  // namespace
diff --git a/omp/test/components/precision_conversion.cpp b/omp/test/components/precision_conversion.cpp
new file mode 100644
index 00000000000..ffd9c25df3e
--- /dev/null
+++ b/omp/test/components/precision_conversion.cpp
@@ -0,0 +1,173 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+class PrecisionConversion : public ::testing::Test {
+protected:
+    PrecisionConversion()
+        : ref(gko::ReferenceExecutor::create()),  // in declaration order
+          exec(gko::OmpExecutor::create()),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec),
+          vals2(ref, 1),
+          dvals2(exec),
+          expected_float(ref, 1),
+          expected_double(ref, 1),
+          cvals(ref, total_size),
+          dcvals(exec)
+    {
+        auto maxval = 1e10f;
+        std::uniform_real_distribution<float> dist(-maxval, maxval);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+            cvals.get_data()[i] = {dist(rand), dist(rand)};
+        }
+        dvals = vals;
+        dcvals = cvals;
+        gko::uint64 rawdouble{0x4218888000889111ULL};
+        gko::uint32 rawfloat{0x50c44400UL};
+        gko::uint64 rawrounded{0x4218888000000000ULL};
+        std::memcpy(vals2.get_data(), &rawdouble, sizeof(double));
+        std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float));
+        std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double));
+        dvals2 = vals2;
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<float> vals;
+    gko::Array<float> dvals;
+    gko::Array<double> vals2;
+    gko::Array<double> dvals2;
+    gko::Array<float> expected_float;
+    gko::Array<double> expected_double;
+    gko::Array<std::complex<float>> cvals;
+    gko::Array<std::complex<float>> dcvals;
+};
+
+
+TEST_F(PrecisionConversion, ConvertsReal)
+{
+    gko::Array<double> dtmp;
+    gko::Array<float> dout;
+
+    dtmp = dvals;
+    dout = dtmp;
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealViaRef)
+{
+    gko::Array<double> tmp{ref};
+    gko::Array<float> dout;
+
+    tmp = dvals;
+    dout = tmp;
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplex)
+{
+    gko::Array<std::complex<double>> dtmp;
+    gko::Array<std::complex<float>> dout;
+
+    dtmp = dcvals;
+    dout = dtmp;
+
+    GKO_ASSERT_ARRAY_EQ(dcvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConversionRounds)
+{
+    gko::Array<float> dtmp;
+    gko::Array<double> dout;
+
+    dtmp = dvals2;
+    dout = dtmp;
+
+    GKO_ASSERT_ARRAY_EQ(dtmp, expected_float);
+    GKO_ASSERT_ARRAY_EQ(dout, expected_double);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealFromRef)
+{
+    gko::Array<double> dtmp;
+    gko::Array<float> dout;
+
+    dtmp = vals;
+    dout = dtmp;
+
+    GKO_ASSERT_ARRAY_EQ(dvals, dout);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplexFromRef)
+{
+    gko::Array<std::complex<double>> dtmp;
+    gko::Array<std::complex<float>> dout;
+
+    dtmp = cvals;
+    dout = dtmp;
+
+    GKO_ASSERT_ARRAY_EQ(dcvals, dout);
+}
+
+
+}  // namespace
diff --git a/omp/test/components/prefix_sum.cpp b/omp/test/components/prefix_sum.cpp
new file mode 100644
index 00000000000..277667b7801
--- /dev/null
+++ b/omp/test/components/prefix_sum.cpp
@@ -0,0 +1,98 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = T;
+    PrefixSum()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::OmpExecutor::create()),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec)
+    {
+        std::uniform_int_distribution<index_type> dist(0, 1000);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+        }
+        dvals = vals;
+    }
+
+    void test(gko::size_type size)
+    {
+        gko::kernels::reference::components::prefix_sum(ref, vals.get_data(),
+                                                        size);
+        gko::kernels::omp::components::prefix_sum(exec, dvals.get_data(), size);
+
+        GKO_ASSERT_ARRAY_EQ(vals, dvals);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<index_type> vals;
+    gko::Array<index_type> dvals;
+};
+
+TYPED_TEST_CASE(PrefixSum, gko::test::IndexTypes);
+
+
+TYPED_TEST(PrefixSum, SmallEqualsReference) { this->test(100); }
+
+
+TYPED_TEST(PrefixSum, BigEqualsReference) { this->test(this->total_size); }
+
+
+}  // namespace
diff --git a/omp/test/factorization/CMakeLists.txt b/omp/test/factorization/CMakeLists.txt
index 36c21b93eea..b52c2d938d7 100644
--- a/omp/test/factorization/CMakeLists.txt
+++ b/omp/test/factorization/CMakeLists.txt
@@ -1 +1,3 @@
+ginkgo_create_test(par_ict_kernels)
 ginkgo_create_test(par_ilu_kernels)
+ginkgo_create_test(par_ilut_kernels)
diff --git a/omp/test/factorization/par_ict_kernels.cpp b/omp/test/factorization/par_ict_kernels.cpp
new file mode 100644
index 00000000000..95ab07b4030
--- /dev/null
+++ b/omp/test/factorization/par_ict_kernels.cpp
@@ -0,0 +1,193 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class ParIct : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    ParIct()
+        : ref(gko::ReferenceExecutor::create()),  // in declaration order
+          omp(gko::OmpExecutor::create()),
+          mtx_size(532, 532),
+          rand_engine(567321)
+    {
+        mtx = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<index_type>(10, mtx_size[1]),
+            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                      1.0),
+            rand_engine, ref);
+        mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<index_type>(10, mtx_size[0]),
+            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                      1.0),
+            rand_engine, ref);
+
+        dmtx_ani = Csr::create(omp);
+        dmtx_l_ani = Csr::create(omp);
+        dmtx = Csr::create(omp);
+        dmtx->copy_from(lend(mtx));
+        dmtx_l = Csr::create(omp);
+        dmtx_l->copy_from(lend(mtx_l));
+    }
+
+    void SetUp() override
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        mtx_ani = gko::read<Csr>(input_file, ref);
+        mtx_ani->sort_by_column_index();
+
+        {
+            mtx_l_ani = Csr::create(ref, mtx_ani->get_size());
+            gko::matrix::CsrBuilder<value_type, index_type> l_builder(
+                lend(mtx_l_ani));
+            gko::kernels::reference::factorization::initialize_row_ptrs_l(
+                ref, lend(mtx_ani), mtx_l_ani->get_row_ptrs());
+            auto l_nnz =
+                mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]];
+            l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+            l_builder.get_value_array().resize_and_reset(l_nnz);
+            gko::kernels::reference::factorization::initialize_l(
+                ref, lend(mtx_ani), lend(mtx_l_ani), true);
+        }
+        dmtx_ani->copy_from(lend(mtx_ani));
+        dmtx_l_ani->copy_from(lend(mtx_l_ani));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> omp;
+
+    const gko::dim<2> mtx_size;
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx_ani;
+    std::unique_ptr<Csr> mtx_l_ani;
+    std::unique_ptr<Csr> mtx;
+    std::unique_ptr<Csr> mtx_l;
+
+    std::unique_ptr<Csr> dmtx_ani;
+    std::unique_ptr<Csr> dmtx_l_ani;
+    std::unique_ptr<Csr> dmtx;
+    std::unique_ptr<Csr> dmtx_l;
+};
+
+TYPED_TEST_CASE(ParIct, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    auto mtx_llt = Csr::create(this->ref, this->mtx_size);
+    this->mtx_l->apply(lend(this->mtx_l->transpose()), lend(mtx_llt));
+    auto dmtx_llt = Csr::create(this->omp, this->mtx_size);
+    dmtx_llt->copy_from(lend(mtx_llt));
+    auto res_mtx_l = Csr::create(this->ref, this->mtx_size);
+    auto dres_mtx_l = Csr::create(this->omp, this->mtx_size);
+
+    gko::kernels::reference::par_ict_factorization::add_candidates(
+        this->ref, lend(mtx_llt), lend(this->mtx), lend(this->mtx_l),
+        lend(res_mtx_l));
+    gko::kernels::omp::par_ict_factorization::add_candidates(
+        this->omp, lend(dmtx_llt), lend(this->dmtx), lend(this->dmtx_l),
+        lend(dres_mtx_l));
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, r<value_type>::value);
+}
+
+
+TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    auto square_size = this->mtx_ani->get_size();
+    auto mtx_l_coo = Coo::create(this->ref, square_size);
+    this->mtx_l_ani->convert_to(lend(mtx_l_coo));
+    auto dmtx_l_coo = Coo::create(this->omp, square_size);
+    dmtx_l_coo->copy_from(lend(mtx_l_coo));
+
+    gko::kernels::reference::par_ict_factorization::compute_factor(
+        this->ref, lend(this->mtx_ani), lend(this->mtx_l_ani), lend(mtx_l_coo));
+    for (int i = 0; i < 20; ++i) {
+        gko::kernels::omp::par_ict_factorization::compute_factor(
+            this->omp, lend(this->dmtx_ani), lend(this->dmtx_l_ani),
+            lend(dmtx_l_coo));
+    }
+
+    GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2);
+}
+
+
+}  // namespace
diff --git a/omp/test/factorization/par_ilu_kernels.cpp b/omp/test/factorization/par_ilu_kernels.cpp
index 46f8a3e22fb..41ff692b702 100644
--- a/omp/test/factorization/par_ilu_kernels.cpp
+++ b/omp/test/factorization/par_ilu_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 #include <fstream>
 #include <memory>
+#include <random>
 #include <string>
 
 
@@ -44,11 +45,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/factorization/factorization_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
 
@@ -56,16 +59,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
+template <typename ValueIndexType>
 class ParIlu : public ::testing::Test {
 protected:
-    using value_type = gko::default_precision;
-    using index_type = gko::int32;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using Dense = gko::matrix::Dense<value_type>;
     using Coo = gko::matrix::Coo<value_type, index_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
 
+    std::ranlux48 rand_engine;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> omp;
+    std::shared_ptr<const Csr> csr_ref;
+    std::shared_ptr<const Csr> csr_omp;
+
     ParIlu()
-        : ref(gko::ReferenceExecutor::create()),
+        : rand_engine(17),
+          ref(gko::ReferenceExecutor::create()),
           omp(gko::OmpExecutor::create()),
           csr_ref(nullptr),
           csr_omp(nullptr)
@@ -79,25 +92,62 @@ class ParIlu : public ::testing::Test {
             FAIL() << "Could not find the file \"" << file_name
                    << "\", which is required for this test.\n";
         }
-        csr_ref = gko::read<Csr>(input_file, ref);
+        auto csr_ref_temp = gko::read<Csr>(input_file, ref);
         auto csr_omp_temp = Csr::create(omp);
-        csr_omp_temp->copy_from(gko::lend(csr_ref));
+        csr_omp_temp->copy_from(gko::lend(csr_ref_temp));
+        // Make sure there are diagonal elements present
+        gko::kernels::reference::factorization::add_diagonal_elements(
+            ref, gko::lend(csr_ref_temp), false);
+        gko::kernels::omp::factorization::add_diagonal_elements(
+            omp, gko::lend(csr_omp_temp), false);
+        csr_ref = gko::give(csr_ref_temp);
         csr_omp = gko::give(csr_omp_temp);
     }
 
-    std::shared_ptr<gko::ReferenceExecutor> ref;
-    std::shared_ptr<gko::OmpExecutor> omp;
-    std::shared_ptr<const Csr> csr_ref;
-    std::shared_ptr<const Csr> csr_omp;
+    // Generates a random num_rows x num_cols matrix on the reference executor.
+    template <typename Mtx>
+    std::unique_ptr<Mtx> gen_mtx(index_type num_rows, index_type num_cols)
+    {
+        std::uniform_int_distribution<index_type> ints(0, num_cols - 1);
+        std::normal_distribution<gko::remove_complex<value_type>> reals(0., 1.);
+        return gko::test::generate_random_matrix<Mtx>(num_rows, num_cols, ints,
+                                                      reals, rand_engine, ref);
+    }
+
+    // Returns a random CSR matrix with entries randomly swapped within rows.
+    std::unique_ptr<Csr> gen_unsorted_mtx(index_type num_rows,
+                                          index_type num_cols)
+    {
+        auto mtx = gen_mtx<Csr>(num_rows, num_cols);
+        auto values = mtx->get_values();
+        auto col_idxs = mtx->get_col_idxs();
+        const auto row_ptrs = mtx->get_const_row_ptrs();
+        for (index_type row = 0; row < num_rows; ++row) {
+            const auto row_start = row_ptrs[row];
+            const auto row_size = row_ptrs[row + 1] - row_start;
+            // uniform_int_distribution(a, b) requires a <= b: clamp empty rows
+            std::uniform_int_distribution<index_type> idx_dist(
+                row_start, row_start + std::max<index_type>(row_size, 1) - 1);
+            for (index_type i = 0; i < row_size / 2; ++i) {
+                const auto idx1 = idx_dist(rand_engine);
+                const auto idx2 = idx_dist(rand_engine);
+                if (idx1 != idx2) {
+                    std::swap(values[idx1], values[idx2]);
+                    std::swap(col_idxs[idx1], col_idxs[idx2]);
+                }
+            }
+        }
+        return mtx;
+    }
 
     void initialize_row_ptrs(index_type *l_row_ptrs_ref,
                              index_type *u_row_ptrs_ref,
                              index_type *l_row_ptrs_omp,
                              index_type *u_row_ptrs_omp)
     {
-        gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u(
+        gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
             ref, gko::lend(csr_ref), l_row_ptrs_ref, u_row_ptrs_ref);
-        gko::kernels::omp::par_ilu_factorization::initialize_row_ptrs_l_u(
+        gko::kernels::omp::factorization::initialize_row_ptrs_l_u(
             omp, gko::lend(csr_omp), l_row_ptrs_omp, u_row_ptrs_omp);
     }
 
@@ -123,18 +173,18 @@ class ParIlu : public ::testing::Test {
         *l_omp = Csr::create(omp, csr_omp->get_size(), l_nnz);
         *u_omp = Csr::create(omp, csr_omp->get_size(), u_nnz);
         // Copy the already initialized `row_ptrs` to the new matrices
-        ref->copy_from(gko::lend(ref), num_row_ptrs, l_row_ptrs_ref.get_data(),
-                       (*l_ref)->get_row_ptrs());
-        ref->copy_from(gko::lend(ref), num_row_ptrs, u_row_ptrs_ref.get_data(),
-                       (*u_ref)->get_row_ptrs());
-        omp->copy_from(gko::lend(omp), num_row_ptrs, l_row_ptrs_omp.get_data(),
-                       (*l_omp)->get_row_ptrs());
-        omp->copy_from(gko::lend(omp), num_row_ptrs, u_row_ptrs_omp.get_data(),
-                       (*u_omp)->get_row_ptrs());
-
-        gko::kernels::reference::par_ilu_factorization::initialize_l_u(
+        ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(),
+                  (*l_ref)->get_row_ptrs());
+        ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(),
+                  (*u_ref)->get_row_ptrs());
+        omp->copy(num_row_ptrs, l_row_ptrs_omp.get_data(),
+                  (*l_omp)->get_row_ptrs());
+        omp->copy(num_row_ptrs, u_row_ptrs_omp.get_data(),
+                  (*u_omp)->get_row_ptrs());
+
+        gko::kernels::reference::factorization::initialize_l_u(
             ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref));
-        gko::kernels::omp::par_ilu_factorization::initialize_l_u(
+        gko::kernels::omp::factorization::initialize_l_u(
             omp, gko::lend(csr_omp), gko::lend(*l_omp), gko::lend(*u_omp));
     }
 
@@ -174,21 +224,87 @@ class ParIlu : public ::testing::Test {
     }
 };
 
+TYPED_TEST_CASE(ParIlu, gko::test::ValueIndexTypes);
+
+
+// Calls add_diagonal_elements with `true` on a random 200 x 200 matrix. The
+// reference result must be sorted by column index and the OpenMP result must
+// match it with tolerance 0 in values and in the sparsity pattern.
+TYPED_TEST(ParIlu, OmpKernelAddDiagonalElementsSortedEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    auto mtx_ref = this->template gen_mtx<Csr>(200, 200);
+    auto mtx_omp = Csr::create(this->omp);
+    mtx_omp->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(mtx_ref), true);
+    gko::kernels::omp::factorization::add_diagonal_elements(
+        this->omp, gko::lend(mtx_omp), true);
+
+    ASSERT_TRUE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_omp, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_omp);
+}
+
+
+// Calls add_diagonal_elements with `false` on a random 200 x 200 matrix from
+// gen_unsorted_mtx. The reference result must not be sorted by column index
+// and the OpenMP result must match it with tolerance 0 and equal sparsity.
+TYPED_TEST(ParIlu, OmpKernelAddDiagonalElementsUnsortedEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    auto mtx_ref = this->gen_unsorted_mtx(200, 200);
+    auto mtx_omp = Csr::create(this->omp);
+    mtx_omp->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(mtx_ref), false);
+    gko::kernels::omp::factorization::add_diagonal_elements(
+        this->omp, gko::lend(mtx_omp), false);
+
+    ASSERT_FALSE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_omp, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_omp);
+}
+
+
+// Calls add_diagonal_elements with `true` on a random non-square 200 x 100
+// matrix. The reference result must be sorted by column index and the OpenMP
+// result must match it with tolerance 0 in values and sparsity pattern.
+TYPED_TEST(ParIlu, OmpKernelAddDiagonalElementsNonSquareEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    auto mtx_ref = this->template gen_mtx<Csr>(200, 100);
+    auto mtx_omp = Csr::create(this->omp);
+    mtx_omp->copy_from(gko::lend(mtx_ref));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(mtx_ref), true);
+    gko::kernels::omp::factorization::add_diagonal_elements(
+        this->omp, gko::lend(mtx_omp), true);
+
+    ASSERT_TRUE(mtx_ref->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_omp, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_omp);
+}
+
 
-TEST_F(ParIlu, OmpKernelInitializeRowPtrsLUEquivalentToRef)
+TYPED_TEST(ParIlu, OmpKernelInitializeRowPtrsLUEquivalentToRef)
 {
-    auto num_row_ptrs = csr_ref->get_size()[0] + 1;
-    gko::Array<index_type> l_row_ptrs_array_ref(ref, num_row_ptrs);
-    gko::Array<index_type> u_row_ptrs_array_ref(ref, num_row_ptrs);
-    gko::Array<index_type> l_row_ptrs_array_omp(omp, num_row_ptrs);
-    gko::Array<index_type> u_row_ptrs_array_omp(omp, num_row_ptrs);
+    using index_type = typename TestFixture::index_type;
+    auto num_row_ptrs = this->csr_ref->get_size()[0] + 1;
+    gko::Array<index_type> l_row_ptrs_array_ref(this->ref, num_row_ptrs);
+    gko::Array<index_type> u_row_ptrs_array_ref(this->ref, num_row_ptrs);
+    gko::Array<index_type> l_row_ptrs_array_omp(this->omp, num_row_ptrs);
+    gko::Array<index_type> u_row_ptrs_array_omp(this->omp, num_row_ptrs);
     auto l_row_ptrs_ref = l_row_ptrs_array_ref.get_data();
     auto u_row_ptrs_ref = u_row_ptrs_array_ref.get_data();
     auto l_row_ptrs_omp = l_row_ptrs_array_omp.get_data();
     auto u_row_ptrs_omp = u_row_ptrs_array_omp.get_data();
 
-    initialize_row_ptrs(l_row_ptrs_ref, u_row_ptrs_ref, l_row_ptrs_omp,
-                        u_row_ptrs_omp);
+    this->initialize_row_ptrs(l_row_ptrs_ref, u_row_ptrs_ref, l_row_ptrs_omp,
+                              u_row_ptrs_omp);
 
     ASSERT_TRUE(std::equal(l_row_ptrs_ref, l_row_ptrs_ref + num_row_ptrs,
                            l_row_ptrs_omp));
@@ -197,46 +313,57 @@ TEST_F(ParIlu, OmpKernelInitializeRowPtrsLUEquivalentToRef)
 }
 
 
-TEST_F(ParIlu, KernelInitializeParILUIsEquivalentToRef)
+TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef)
 {
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
     std::unique_ptr<Csr> l_ref{};
     std::unique_ptr<Csr> u_ref{};
     std::unique_ptr<Csr> l_omp{};
     std::unique_ptr<Csr> u_omp{};
 
-    initialize_lu(&l_ref, &u_ref, &l_omp, &u_omp);
+    this->initialize_lu(&l_ref, &u_ref, &l_omp, &u_omp);
 
-    GKO_ASSERT_MTX_NEAR(l_ref, l_omp, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_ref, u_omp, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_ref, l_omp, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_omp, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_omp);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_omp);
 }
 
 
-TEST_F(ParIlu, KernelComputeParILUIsEquivalentToRef)
+TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef)
 {
+    using Csr = typename TestFixture::Csr;
     std::unique_ptr<Csr> l_ref{};
     std::unique_ptr<Csr> u_ref{};
     std::unique_ptr<Csr> l_omp{};
     std::unique_ptr<Csr> u_omp{};
 
-    compute_lu(&l_ref, &u_ref, &l_omp, &u_omp);
+    this->compute_lu(&l_ref, &u_ref, &l_omp, &u_omp);
 
     GKO_ASSERT_MTX_NEAR(l_ref, l_omp, 5e-2);
     GKO_ASSERT_MTX_NEAR(u_ref, u_omp, 5e-2);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_omp);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_omp);
 }
 
 
-TEST_F(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef)
+TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef)
 {
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
     std::unique_ptr<Csr> l_ref{};
     std::unique_ptr<Csr> u_ref{};
     std::unique_ptr<Csr> l_omp{};
     std::unique_ptr<Csr> u_omp{};
-    gko::size_type iterations{20};
+    gko::size_type iterations{30};
 
-    compute_lu(&l_ref, &u_ref, &l_omp, &u_omp, iterations);
+    this->compute_lu(&l_ref, &u_ref, &l_omp, &u_omp, iterations);
 
-    GKO_ASSERT_MTX_NEAR(l_ref, l_omp, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_ref, u_omp, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_ref, l_omp, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_omp, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_omp);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_omp);
 }
 
 
diff --git a/omp/test/factorization/par_ilut_kernels.cpp b/omp/test/factorization/par_ilut_kernels.cpp
new file mode 100644
index 00000000000..7af808a258b
--- /dev/null
+++ b/omp/test/factorization/par_ilut_kernels.cpp
@@ -0,0 +1,467 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+/**
+ * Fixture comparing the OpenMP ParILUT kernels with the reference kernels on
+ * random matrices (constructor) and on the ani4 matrix (SetUp).
+ */
+template <typename ValueIndexType>
+class ParIlut : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using real_type = gko::remove_complex<value_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    // The initializers are listed in member declaration order (ref, omp,
+    // mtx_size, rand_engine), which is the order in which they actually run.
+    ParIlut()
+        : ref(gko::ReferenceExecutor::create()),
+          omp(gko::OmpExecutor::create()),
+          mtx_size(532, 423),
+          rand_engine(1337)
+    {
+        mtx1 = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<index_type>(10, mtx_size[1]),
+            std::normal_distribution<real_type>(-1.0, 1.0), rand_engine, ref);
+        mtx2 = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[1],
+            std::uniform_int_distribution<index_type>(0, mtx_size[1]),
+            std::normal_distribution<real_type>(-1.0, 1.0), rand_engine, ref);
+        mtx_square = gko::test::generate_random_matrix<Csr>(
+            mtx_size[0], mtx_size[0],
+            std::uniform_int_distribution<index_type>(1, mtx_size[0]),
+            std::normal_distribution<real_type>(-1.0, 1.0), rand_engine, ref);
+        mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<index_type>(10, mtx_size[0]),
+            std::normal_distribution<real_type>(-1.0, 1.0), rand_engine, ref);
+        mtx_l2 = gko::test::generate_random_lower_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], true,
+            std::uniform_int_distribution<index_type>(1, mtx_size[0]),
+            std::normal_distribution<real_type>(-1.0, 1.0), rand_engine, ref);
+        mtx_u = gko::test::generate_random_upper_triangular_matrix<Csr>(
+            mtx_size[0], mtx_size[0], false,
+            std::uniform_int_distribution<index_type>(10, mtx_size[0]),
+            std::normal_distribution<real_type>(-1.0, 1.0), rand_engine, ref);
+
+        // Copy the random matrices to `omp`; SetUp() fills the *_ani ones.
+        dmtx1 = Csr::create(omp);
+        dmtx1->copy_from(mtx1.get());
+        dmtx2 = Csr::create(omp);
+        dmtx2->copy_from(mtx2.get());
+        dmtx_square = Csr::create(omp);
+        dmtx_square->copy_from(mtx_square.get());
+        dmtx_ani = Csr::create(omp);
+        dmtx_l_ani = Csr::create(omp);
+        dmtx_u_ani = Csr::create(omp);
+        dmtx_ut_ani = Csr::create(omp);
+        dmtx_l = Csr::create(omp);
+        dmtx_l->copy_from(mtx_l.get());
+        dmtx_l2 = Csr::create(omp);
+        dmtx_l2->copy_from(mtx_l2.get());
+        dmtx_u = Csr::create(omp);
+        dmtx_u->copy_from(mtx_u.get());
+    }
+
+    // Reads the ani4 matrix, initializes L, U and U^T from it with the
+    // reference kernels and copies all four matrices to `omp`.
+    void SetUp() override
+    {
+        std::string file_name(gko::matrices::location_ani4_mtx);
+        auto input_file = std::ifstream(file_name, std::ios::in);
+        if (!input_file) {
+            FAIL() << "Could not find the file \"" << file_name
+                   << "\", which is required for this test.\n";
+        }
+        mtx_ani = gko::read<Csr>(input_file, ref);
+        mtx_ani->sort_by_column_index();
+
+        {
+            // NOTE(review): the scope ends the builders' lifetime before the
+            // copies below - presumably required by CsrBuilder; confirm.
+            using Builder = gko::matrix::CsrBuilder<value_type, index_type>;
+            const auto num_rows = mtx_ani->get_size()[0];
+            mtx_l_ani = Csr::create(ref, mtx_ani->get_size());
+            mtx_u_ani = Csr::create(ref, mtx_ani->get_size());
+            Builder l_builder(mtx_l_ani.get());
+            Builder u_builder(mtx_u_ani.get());
+            gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
+                ref, mtx_ani.get(), mtx_l_ani->get_row_ptrs(),
+                mtx_u_ani->get_row_ptrs());
+            // The last row pointer of each factor is its number of nonzeros.
+            const auto l_nnz = mtx_l_ani->get_const_row_ptrs()[num_rows];
+            const auto u_nnz = mtx_u_ani->get_const_row_ptrs()[num_rows];
+            l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+            l_builder.get_value_array().resize_and_reset(l_nnz);
+            u_builder.get_col_idx_array().resize_and_reset(u_nnz);
+            u_builder.get_value_array().resize_and_reset(u_nnz);
+            gko::kernels::reference::factorization::initialize_l_u(
+                ref, mtx_ani.get(), mtx_l_ani.get(), mtx_u_ani.get());
+            mtx_ut_ani = Csr::create(ref, mtx_ani->get_size(),
+                                     mtx_u_ani->get_num_stored_elements());
+            gko::kernels::reference::csr::transpose(ref, mtx_u_ani.get(),
+                                                    mtx_ut_ani.get());
+        }
+        dmtx_ani->copy_from(mtx_ani.get());
+        dmtx_l_ani->copy_from(mtx_l_ani.get());
+        dmtx_u_ani->copy_from(mtx_u_ani.get());
+        dmtx_ut_ani->copy_from(mtx_ut_ani.get());
+    }
+
+    // Checks that threshold_select yields the same value on both executors.
+    // NOTE(review): `tolerance` is unused - the comparison is always exact.
+    void test_select(const std::unique_ptr<Csr> &mtx,
+                     const std::unique_ptr<Csr> &dmtx, index_type rank,
+                     real_type tolerance = 0.0)
+    {
+        real_type res{};
+        real_type dres{};
+        gko::Array<value_type> tmp(ref);
+        gko::Array<real_type> tmp2(ref);
+        gko::Array<value_type> dtmp(omp);
+        gko::Array<real_type> dtmp2(omp);
+
+        gko::kernels::reference::par_ilut_factorization::threshold_select(
+            ref, mtx.get(), rank, tmp, tmp2, res);
+        gko::kernels::omp::par_ilut_factorization::threshold_select(
+            omp, dmtx.get(), rank, dtmp, dtmp2, dres);
+
+        ASSERT_EQ(res, dres);
+    }
+
+    // Runs threshold_filter on both executors (on the transposed input unless
+    // `lower` is set); the CSR results must equal each other and their COOs.
+    void test_filter(const std::unique_ptr<Csr> &mtx,
+                     const std::unique_ptr<Csr> &dmtx, real_type threshold,
+                     bool lower)
+    {
+        auto res = Csr::create(ref, mtx_size);
+        auto dres = Csr::create(omp, mtx_size);
+        auto res_coo = Coo::create(ref, mtx_size);
+        auto dres_coo = Coo::create(omp, mtx_size);
+        auto src = gko::as<Csr>(lower ? mtx->clone() : mtx->transpose());
+        auto dsrc = gko::as<Csr>(lower ? dmtx->clone() : dmtx->transpose());
+
+        gko::kernels::reference::par_ilut_factorization::threshold_filter(
+            ref, src.get(), threshold, res.get(), res_coo.get(), lower);
+        gko::kernels::omp::par_ilut_factorization::threshold_filter(
+            omp, dsrc.get(), threshold, dres.get(), dres_coo.get(), lower);
+
+        GKO_ASSERT_MTX_NEAR(res, dres, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+        GKO_ASSERT_MTX_NEAR(res, res_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo);
+        GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo);
+    }
+
+    // Same comparison for threshold_filter_approx, including the thresholds.
+    void test_filter_approx(const std::unique_ptr<Csr> &mtx,
+                            const std::unique_ptr<Csr> &dmtx, index_type rank)
+    {
+        auto res = Csr::create(ref, mtx_size);
+        auto dres = Csr::create(omp, mtx_size);
+        auto res_coo = Coo::create(ref, mtx_size);
+        auto dres_coo = Coo::create(omp, mtx_size);
+
+        gko::Array<value_type> tmp(ref);
+        gko::Array<value_type> dtmp(omp);
+        real_type threshold{};
+        real_type dthreshold{};
+
+        gko::kernels::reference::par_ilut_factorization::
+            threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold,
+                                    res.get(), res_coo.get());
+        gko::kernels::omp::par_ilut_factorization::threshold_filter_approx(
+            omp, dmtx.get(), rank, dtmp, dthreshold, dres.get(),
+            dres_coo.get());
+
+        GKO_ASSERT_MTX_NEAR(res, dres, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+        GKO_ASSERT_MTX_NEAR(res, res_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo);
+        GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo);
+        ASSERT_EQ(threshold, dthreshold);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> omp;
+
+    const gko::dim<2> mtx_size;
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx1;
+    std::unique_ptr<Csr> mtx2;
+    std::unique_ptr<Csr> mtx_square;
+    std::unique_ptr<Csr> mtx_ani;
+    std::unique_ptr<Csr> mtx_l_ani;
+    std::unique_ptr<Csr> mtx_u_ani;
+    std::unique_ptr<Csr> mtx_ut_ani;
+    std::unique_ptr<Csr> mtx_l;
+    std::unique_ptr<Csr> mtx_l2;
+    std::unique_ptr<Csr> mtx_u;
+
+    std::unique_ptr<Csr> dmtx1;
+    std::unique_ptr<Csr> dmtx2;
+    std::unique_ptr<Csr> dmtx_square;
+    std::unique_ptr<Csr> dmtx_ani;
+    std::unique_ptr<Csr> dmtx_l_ani;
+    std::unique_ptr<Csr> dmtx_u_ani;
+    std::unique_ptr<Csr> dmtx_ut_ani;
+    std::unique_ptr<Csr> dmtx_l;
+    std::unique_ptr<Csr> dmtx_l2;
+    std::unique_ptr<Csr> dmtx_u;
+};
+
+TYPED_TEST_CASE(ParIlut, gko::test::ValueIndexTypes);
+
+// Compares threshold_select for a rank of one third of the stored entries.
+TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef)
+{
+    const auto rank = this->mtx_l->get_num_stored_elements() / 3;
+    this->test_select(this->mtx_l, this->dmtx_l, rank);
+}
+
+// Compares threshold_select on mtx_l and dmtx_l for the lowest rank, 0.
+TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef)
+{
+    this->test_select(this->mtx_l, this->dmtx_l, 0);
+}
+
+// Compares threshold_select for rank = number of stored entries - 1.
+TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef)
+{
+    const auto rank = this->mtx_l->get_num_stored_elements() - 1;
+    this->test_select(this->mtx_l, this->dmtx_l, rank);
+}
+
+// Threshold filter (0.5, lower) without a COO output must match the reference.
+TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef)
+{
+    using Coo = typename TestFixture::Coo;
+    using Csr = typename TestFixture::Csr;
+    Coo *no_coo = nullptr;
+    auto res = Csr::create(this->ref, this->mtx_size);
+    auto dres = Csr::create(this->omp, this->mtx_size);
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter(
+        this->ref, this->mtx_l.get(), 0.5, res.get(), no_coo, true);
+    gko::kernels::omp::par_ilut_factorization::threshold_filter(
+        this->omp, this->dmtx_l.get(), 0.5, dres.get(), no_coo, true);
+
+    GKO_ASSERT_MTX_NEAR(res, dres, 0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+}
+
+// Filters mtx_l with threshold 0.5, passing lower = true to the kernels.
+TYPED_TEST(ParIlut, KernelThresholdFilterLowerIsEquivalentToRef)
+{
+    this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true);
+}
+
+// Threshold 0.5 with lower = false: test_filter transposes mtx_l beforehand.
+TYPED_TEST(ParIlut, KernelThresholdFilterUpperIsEquivalentToRef)
+{
+    this->test_filter(this->mtx_l, this->dmtx_l, 0.5, false);
+}
+
+// Threshold 0 with lower = true (per the test name, no entry is dropped).
+TYPED_TEST(ParIlut, KernelThresholdFilterNoneLowerIsEquivalentToRef)
+{
+    this->test_filter(this->mtx_l, this->dmtx_l, 0, true);
+}
+
+// Threshold 0 with lower = false: test_filter transposes mtx_l beforehand.
+TYPED_TEST(ParIlut, KernelThresholdFilterNoneUpperIsEquivalentToRef)
+{
+    this->test_filter(this->mtx_l, this->dmtx_l, 0, false);
+}
+
+// Threshold 1e6 lies far above the magnitude of the N(-1, 1) random values.
+TYPED_TEST(ParIlut, KernelThresholdFilterAllLowerIsEquivalentToRef)
+{
+    this->test_filter(this->mtx_l, this->dmtx_l, 1e6, true);
+}
+
+// Threshold 1e6 with lower = false: test_filter transposes mtx_l beforehand.
+TYPED_TEST(ParIlut, KernelThresholdFilterAllUpperIsEquivalentToRef)
+{
+    this->test_filter(this->mtx_l, this->dmtx_l, 1e6, false);
+}
+
+
+// The approximate filter with rank 0 and no COO output (null pointer) must
+// agree with the reference in the result matrix and the computed threshold.
+TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    auto res = Csr::create(this->ref, this->mtx_size);
+    auto dres = Csr::create(this->omp, this->mtx_size);
+    Coo *null_coo = nullptr;
+    gko::Array<value_type> tmp(this->ref);
+    gko::Array<value_type> dtmp(this->omp);
+    gko::remove_complex<value_type> threshold{};
+    gko::remove_complex<value_type> dthreshold{};
+    typename TestFixture::index_type rank{};
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter_approx(
+        this->ref, this->mtx_l.get(), rank, tmp, threshold, res.get(),
+        null_coo);
+    gko::kernels::omp::par_ilut_factorization::threshold_filter_approx(
+        this->omp, this->dmtx_l.get(), rank, dtmp, dthreshold, dres.get(),
+        null_coo);
+
+    GKO_ASSERT_MTX_NEAR(res, dres, 0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
+    ASSERT_EQ(threshold, dthreshold);
+}
+
+// Approximate filter comparison for a rank of half the stored entries.
+TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
+{
+    const auto rank = this->mtx_l->get_num_stored_elements() / 2;
+    this->test_filter_approx(this->mtx_l, this->dmtx_l, rank);
+}
+
+// Approximate filter comparison on mtx_l and dmtx_l for rank 0.
+TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef)
+{
+    this->test_filter_approx(this->mtx_l, this->dmtx_l, 0);
+}
+
+// Approximate filter comparison for rank = number of stored entries - 1.
+TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef)
+{
+    const auto rank = this->mtx_l->get_num_stored_elements() - 1;
+    this->test_filter_approx(this->mtx_l, this->dmtx_l, rank);
+}
+
+// add_candidates on the product mtx_l2 * mtx_u must match the reference.
+TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    const auto size = this->mtx_square->get_size();
+    auto lu = Csr::create(this->ref, size);
+    auto dlu = Csr::create(this->omp, size);
+    auto res_mtx_l = Csr::create(this->ref, size);
+    auto res_mtx_u = Csr::create(this->ref, size);
+    auto dres_mtx_l = Csr::create(this->omp, size);
+    auto dres_mtx_u = Csr::create(this->omp, size);
+    this->mtx_l2->apply(this->mtx_u.get(), lu.get());
+    dlu->copy_from(lu.get());
+
+    gko::kernels::reference::par_ilut_factorization::add_candidates(
+        this->ref, lu.get(), this->mtx_square.get(), this->mtx_l2.get(),
+        this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get());
+    gko::kernels::omp::par_ilut_factorization::add_candidates(
+        this->omp, dlu.get(), this->dmtx_square.get(), this->dmtx_l2.get(),
+        this->dmtx_u.get(), dres_mtx_l.get(), dres_mtx_u.get());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, dres_mtx_u);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(res_mtx_u, dres_mtx_u, r<value_type>::value);
+}
+
+// One reference compute_l_u_factors call is compared with 20 OpenMP calls.
+TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef)
+{
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    const auto size = this->mtx_ani->get_size();
+    auto l_coo = Coo::create(this->ref, size);
+    auto u_coo = Coo::create(this->ref, size);
+    auto dl_coo = Coo::create(this->omp, size);
+    auto du_coo = Coo::create(this->omp, size);
+    this->mtx_l_ani->convert_to(l_coo.get());
+    this->mtx_u_ani->convert_to(u_coo.get());
+    dl_coo->copy_from(l_coo.get());
+    du_coo->copy_from(u_coo.get());
+
+    gko::kernels::reference::par_ilut_factorization::compute_l_u_factors(
+        this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), l_coo.get(),
+        this->mtx_u_ani.get(), u_coo.get(), this->mtx_ut_ani.get());
+    for (int sweep = 0; sweep < 20; ++sweep) {
+        gko::kernels::omp::par_ilut_factorization::compute_l_u_factors(
+            this->omp, this->dmtx_ani.get(), this->dmtx_l_ani.get(),
+            dl_coo.get(), this->dmtx_u_ani.get(), du_coo.get(),
+            this->dmtx_ut_ani.get());
+    }
+    auto dmtx_utt_ani = gko::as<Csr>(this->dmtx_ut_ani->transpose());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2);
+    GKO_ASSERT_MTX_NEAR(this->mtx_u_ani, this->dmtx_u_ani, 1e-2);
+    GKO_ASSERT_MTX_NEAR(this->dmtx_u_ani, dmtx_utt_ani, 0);
+}
+
+
+}  // namespace
diff --git a/omp/test/matrix/coo_kernels.cpp b/omp/test/matrix/coo_kernels.cpp
index 355f31fcfa4..2bdf0361faa 100644
--- a/omp/test/matrix/coo_kernels.cpp
+++ b/omp/test/matrix/coo_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/coo_kernels.hpp"
+#include <ginkgo/core/matrix/coo.hpp>
 
 
 #include <random>
@@ -42,11 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/matrix/coo_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp
index 72fa988ff5e..bb607efd615 100644
--- a/omp/test/matrix/csr_kernels.cpp
+++ b/omp/test/matrix/csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,13 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/csr_kernels.hpp"
-
+#include <ginkgo/core/matrix/csr.hpp>
 
-#include <iostream>
 
+#include <algorithm>
+#include <numeric>
 #include <random>
-#include <utility>
+#include <vector>
 
 
 #include <gtest/gtest.h>
@@ -45,11 +45,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
@@ -58,6 +59,7 @@ namespace {
 
 class Csr : public ::testing::Test {
 protected:
+    using Arr = gko::Array<int>;
     using Mtx = gko::matrix::Csr<>;
     using Vec = gko::matrix::Dense<>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
@@ -95,6 +97,8 @@ class Csr : public ::testing::Test {
         complex_mtx = ComplexMtx::create(ref);
         complex_mtx->copy_from(
             gen_mtx<ComplexVec>(mtx_size[0], mtx_size[1], 1));
+        square_mtx = Mtx::create(ref);
+        square_mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[0], 1));
         expected = gen_mtx<Vec>(mtx_size[0], num_vectors, 1);
         y = gen_mtx<Vec>(mtx_size[1], num_vectors, 1);
         alpha = gko::initialize<Vec>({2.0}, ref);
@@ -103,6 +107,8 @@ class Csr : public ::testing::Test {
         dmtx->copy_from(mtx.get());
         complex_dmtx = ComplexMtx::create(omp);
         complex_dmtx->copy_from(complex_mtx.get());
+        square_dmtx = Mtx::create(omp);
+        square_dmtx->copy_from(square_mtx.get());
         dresult = Vec::create(omp);
         dresult->copy_from(expected.get());
         dy = Vec::create(omp);
@@ -111,6 +117,22 @@ class Csr : public ::testing::Test {
         dalpha->copy_from(alpha.get());
         dbeta = Vec::create(omp);
         dbeta->copy_from(beta.get());
+
+        std::vector<int> tmp(mtx->get_size()[0], 0);
+        auto rng = std::default_random_engine{};
+        std::iota(tmp.begin(), tmp.end(), 0);
+        std::shuffle(tmp.begin(), tmp.end(), rng);
+        std::vector<int> tmp2(mtx->get_size()[1], 0);
+        std::iota(tmp2.begin(), tmp2.end(), 0);
+        std::shuffle(tmp2.begin(), tmp2.end(), rng);
+        rpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp.begin(), tmp.end()});
+        drpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{omp, tmp.begin(), tmp.end()});
+        cpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp2.begin(), tmp2.end()});
+        dcpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{omp, tmp2.begin(), tmp2.end()});
     }
 
     struct matrix_pair {
@@ -120,7 +142,7 @@ class Csr : public ::testing::Test {
 
     matrix_pair gen_unsorted_mtx()
     {
-        constexpr int min_nnz_per_row = 2;  // Must be larger/equal than 2
+        constexpr int min_nnz_per_row = 2;  // Must be at least 2
         auto local_mtx_ref =
             gen_mtx<Mtx>(mtx_size[0], mtx_size[1], min_nnz_per_row);
         for (size_t row = 0; row < mtx_size[0]; ++row) {
@@ -153,6 +175,7 @@ class Csr : public ::testing::Test {
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<ComplexMtx> complex_mtx;
+    std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
     std::unique_ptr<Vec> y;
     std::unique_ptr<Vec> alpha;
@@ -160,10 +183,15 @@ class Csr : public ::testing::Test {
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<ComplexMtx> complex_dmtx;
+    std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
     std::unique_ptr<Vec> dy;
     std::unique_ptr<Vec> dalpha;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Arr> rpermute_idxs;
+    std::unique_ptr<Arr> drpermute_idxs;
+    std::unique_ptr<Arr> cpermute_idxs;
+    std::unique_ptr<Arr> dcpermute_idxs;
 };
 
 
@@ -200,6 +228,57 @@ TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto trans = mtx->transpose();
+    auto d_trans = dmtx->transpose();
+    // Advanced (alpha/beta) apply of mtx to its own transpose, Csr output.
+    mtx->apply(alpha.get(), trans.get(), beta.get(), square_mtx.get());
+    dmtx->apply(dalpha.get(), d_trans.get(), dbeta.get(), square_dmtx.get());
+    // OMP result must match ref in values and pattern, and be column-sorted.
+    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
+    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+}
+
+
+TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto trans = mtx->transpose();
+    auto d_trans = dmtx->transpose();
+    // Simple (two-argument) apply of mtx to its own transpose, Csr output.
+    mtx->apply(trans.get(), square_mtx.get());
+    dmtx->apply(d_trans.get(), square_dmtx.get());
+    // OMP result must match ref in values and pattern, and be column-sorted.
+    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
+    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+}
+
+
+TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto da = Mtx::create(omp);
+    auto db = Mtx::create(omp);
+    da->copy_from(a.get());
+    db->copy_from(b.get());
+    auto id = gko::matrix::Identity<Mtx::value_type>::create(ref, mtx_size[1]);
+    auto did = gko::matrix::Identity<Mtx::value_type>::create(omp, mtx_size[1]);
+    // Advanced (alpha/beta) apply of a Csr matrix to an Identity operand.
+    a->apply(alpha.get(), id.get(), beta.get(), b.get());
+    da->apply(dalpha.get(), did.get(), dbeta.get(), db.get());
+    // OMP result must match ref in values and pattern, and be column-sorted.
+    GKO_ASSERT_MTX_NEAR(b, db, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(b, db);
+    ASSERT_TRUE(db->is_sorted_by_column_index());
+}
+
+
 TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(3);
@@ -324,7 +403,7 @@ TEST_F(Csr, CalculatesNonzerosPerRow)
     gko::kernels::omp::csr::calculate_nonzeros_per_row(omp, dmtx.get(),
                                                        &drow_nnz);
 
-    GKO_ASSERT_ARRAY_EQ(&row_nnz, &drow_nnz);
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
 }
 
 
@@ -360,6 +439,51 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, IsRowPermutable)
+{
+    set_up_apply_data();
+    auto r_permute = mtx->row_permute(rpermute_idxs.get());
+    auto dr_permute = dmtx->row_permute(drpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
+                        static_cast<Mtx *>(dr_permute.get()), 0);
+}
+
+
+TEST_F(Csr, IsColPermutable)
+{
+    set_up_apply_data();
+    auto c_permute = mtx->column_permute(cpermute_idxs.get());
+    auto dc_permute = dmtx->column_permute(dcpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
+                        static_cast<Mtx *>(dc_permute.get()), 0);
+}
+
+
+TEST_F(Csr, IsInverseRowPermutable)
+{
+    set_up_apply_data();
+    auto inverse_r_permute = mtx->inverse_row_permute(rpermute_idxs.get());
+    auto d_inverse_r_permute = dmtx->inverse_row_permute(drpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
+                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
+}
+
+
+TEST_F(Csr, IsInverseColPermutable)
+{
+    set_up_apply_data();
+    auto inverse_c_permute = mtx->inverse_column_permute(cpermute_idxs.get());
+    auto d_inverse_c_permute =
+        dmtx->inverse_column_permute(dcpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
+                        static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
+}
+
+
 TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
 {
     set_up_apply_data();
diff --git a/omp/test/matrix/dense_kernels.cpp b/omp/test/matrix/dense_kernels.cpp
index b1d601291f1..dd0aa4fb8d6 100644
--- a/omp/test/matrix/dense_kernels.cpp
+++ b/omp/test/matrix/dense_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,26 +30,29 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/dense_kernels.hpp"
+#include <ginkgo/core/matrix/dense.hpp>
 
 
-#include <iostream>
+#include <algorithm>
+#include <numeric>
 #include <random>
+#include <vector>
 
 
 #include <gtest/gtest.h>
 
 
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/matrix/dense_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
@@ -59,6 +62,8 @@ namespace {
 class Dense : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Dense<>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<Mtx::value_type>>;
+    using Arr = gko::Array<int>;
     using ComplexMtx = gko::matrix::Dense<std::complex<double>>;
 
     Dense() : rand_engine(15) {}
@@ -135,6 +140,22 @@ class Dense : public ::testing::Test {
         dalpha->copy_from(alpha.get());
         dbeta = Mtx::create(omp);
         dbeta->copy_from(beta.get());
+
+        std::vector<int> tmp(x->get_size()[0], 0);
+        auto rng = std::default_random_engine{};
+        std::iota(tmp.begin(), tmp.end(), 0);
+        std::shuffle(tmp.begin(), tmp.end(), rng);
+        std::vector<int> tmp2(x->get_size()[1], 0);
+        std::iota(tmp2.begin(), tmp2.end(), 0);
+        std::shuffle(tmp2.begin(), tmp2.end(), rng);
+        rpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp.begin(), tmp.end()});
+        drpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{omp, tmp.begin(), tmp.end()});
+        cpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{ref, tmp2.begin(), tmp2.end()});
+        dcpermute_idxs =
+            std::unique_ptr<Arr>(new Arr{omp, tmp2.begin(), tmp2.end()});
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
@@ -154,6 +175,10 @@ class Dense : public ::testing::Test {
     std::unique_ptr<Mtx> dy;
     std::unique_ptr<Mtx> dalpha;
     std::unique_ptr<Mtx> dbeta;
+    std::unique_ptr<Arr> rpermute_idxs;
+    std::unique_ptr<Arr> drpermute_idxs;
+    std::unique_ptr<Arr> cpermute_idxs;
+    std::unique_ptr<Arr> dcpermute_idxs;
 };
 
 
@@ -250,11 +275,14 @@ TEST_F(Dense, MultipleVectorOmpComputeDotIsEquivalentToRef)
 TEST_F(Dense, ComputesNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->omp, norm_size);
 
-    x->compute_norm2(expected.get());
-    dx->compute_norm2(dresult.get());
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14);
 }
 
 
@@ -524,6 +552,18 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ConvertsEmptyToSellp)
+{
+    auto dempty_mtx = Mtx::create(omp);
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(omp);
+    // Convert a Dense matrix created without any size into Sellp.
+    dempty_mtx->convert_to(dsellp_mtx.get());
+    // First slice_sets entry must be 0 and the resulting size must be falsy.
+    ASSERT_EQ(*dsellp_mtx->get_const_slice_sets(), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());
+}
+
+
 TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef)
 {
     std::size_t ref_max_nnz_per_row = 0;
@@ -582,4 +622,48 @@ TEST_F(Dense, IsConjugateTransposable)
 }
 
 
+TEST_F(Dense, IsRowPermutable)
+{
+    set_up_apply_data();
+    auto r_permute = x->row_permute(rpermute_idxs.get());
+    auto dr_permute = dx->row_permute(drpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
+                        static_cast<Mtx *>(dr_permute.get()), 0);
+}
+
+
+TEST_F(Dense, IsColPermutable)
+{
+    set_up_apply_data();
+    auto c_permute = x->column_permute(cpermute_idxs.get());
+    auto dc_permute = dx->column_permute(dcpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
+                        static_cast<Mtx *>(dc_permute.get()), 0);
+}
+
+
+TEST_F(Dense, IsInverseRowPermutable)
+{
+    set_up_apply_data();
+    auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get());
+    auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
+                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
+}
+
+
+TEST_F(Dense, IsInverseColPermutable)
+{
+    set_up_apply_data();
+    auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get());
+    auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get());
+    // Both calls use the same shuffled indices; results must match exactly.
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
+                        static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
+}
+
+
 }  // namespace
diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp
index 98e45f9a7ae..0fbc9173b30 100644
--- a/omp/test/matrix/ell_kernels.cpp
+++ b/omp/test/matrix/ell_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/ell_kernels.hpp"
+#include <ginkgo/core/matrix/ell.hpp>
 
 
 #include <random>
@@ -39,12 +39,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/matrix/ell_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp
index 13bc0cf3ff0..47e809fc5bd 100644
--- a/omp/test/matrix/hybrid_kernels.cpp
+++ b/omp/test/matrix/hybrid_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/hybrid_kernels.hpp"
+#include <ginkgo/core/matrix/hybrid.hpp>
 
 
 #include <random>
@@ -39,12 +39,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
+
+
+#include "core/matrix/hybrid_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
diff --git a/omp/test/matrix/sellp_kernels.cpp b/omp/test/matrix/sellp_kernels.cpp
index a9a77452978..217ce430f8c 100644
--- a/omp/test/matrix/sellp_kernels.cpp
+++ b/omp/test/matrix/sellp_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,13 +39,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
diff --git a/omp/test/matrix/sparsity_csr_kernels.cpp b/omp/test/matrix/sparsity_csr_kernels.cpp
index 91852dbcb53..dea4844885a 100644
--- a/omp/test/matrix/sparsity_csr_kernels.cpp
+++ b/omp/test/matrix/sparsity_csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/sparsity_csr_kernels.hpp"
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
 #include <memory>
@@ -44,9 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/matrix/sparsity_csr_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
@@ -117,7 +117,7 @@ class SparsityCsr : public ::testing::Test {
 
     matrix_pair gen_unsorted_mtx()
     {
-        constexpr int min_nnz_per_row = 2;  // Must be larger/equal than 2
+        constexpr int min_nnz_per_row = 2;  // Must be at least 2
         auto local_mtx_ref =
             gen_mtx<Mtx>(mtx_size[0], mtx_size[1], min_nnz_per_row);
         for (size_t row = 0; row < mtx_size[0]; ++row) {
@@ -245,10 +245,10 @@ TEST_F(SparsityCsr, RemovesDiagElementsKernelIsEquivalentToRef)
                              dmtx->get_num_nonzeros() - num_diags);
 
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
-        ref, tmp.get(), mtx->get_const_row_ptrs(), mtx->get_const_col_idxs());
+        ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), tmp.get());
     gko::kernels::omp::sparsity_csr::remove_diagonal_elements(
-        omp, d_tmp.get(), dmtx->get_const_row_ptrs(),
-        dmtx->get_const_col_idxs());
+        omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(),
+        d_tmp.get());
 
     GKO_ASSERT_MTX_NEAR(tmp.get(), d_tmp.get(), 0.0);
 }
diff --git a/omp/test/preconditioner/CMakeLists.txt b/omp/test/preconditioner/CMakeLists.txt
index a0ca5a2e38a..575384a4c84 100644
--- a/omp/test/preconditioner/CMakeLists.txt
+++ b/omp/test/preconditioner/CMakeLists.txt
@@ -1 +1,2 @@
 ginkgo_create_test(jacobi_kernels)
+ginkgo_create_test(isai_kernels)
diff --git a/omp/test/preconditioner/isai_kernels.cpp b/omp/test/preconditioner/isai_kernels.cpp
new file mode 100644
index 00000000000..ea3c52755a1
--- /dev/null
+++ b/omp/test/preconditioner/isai_kernels.cpp
@@ -0,0 +1,324 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/preconditioner/isai_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+enum struct matrix_type { lower, upper };  // shape of the test matrix
+class Isai : public ::testing::Test {
+protected:
+    using value_type = double;
+    using index_type = gko::int32;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    Isai() : rand_engine(42) {}
+    // Creates the reference and OpenMP executors used by every test.
+    void SetUp()
+    {
+        ref = gko::ReferenceExecutor::create();
+        omp = gko::OmpExecutor::create();
+    }
+    // Clones a ref-resident matrix and sets every stored value to -1.
+    std::unique_ptr<Csr> clone_allocations(const Csr *csr_mtx)
+    {
+        if (csr_mtx->get_executor() != ref) {
+            return {nullptr};  // only ref-resident input is handled
+        }
+        const auto num_elems = csr_mtx->get_num_stored_elements();
+        auto sparsity = csr_mtx->clone();
+
+        // values are now filled with invalid data to catch potential errors
+        auto begin_values = sparsity->get_values();
+        auto end_values = begin_values + num_elems;
+        std::fill(begin_values, end_values, -gko::one<value_type>());
+        return sparsity;
+    }
+    // Builds an n x n triangular test matrix plus ref/omp working copies.
+    void initialize_data(matrix_type type, gko::size_type n,
+                         gko::size_type row_limit)
+    {
+        const bool for_lower_tm = type == matrix_type::lower;
+        auto nz_dist = std::uniform_int_distribution<index_type>(1, row_limit);
+        auto val_dist = std::uniform_real_distribution<value_type>(-1., 1.);
+        // the generator returns the matrix; no prior allocation needed
+        mtx = gko::test::generate_random_triangular_matrix<Csr>(
+            n, n, true, for_lower_tm, nz_dist, val_dist, rand_engine, ref,
+            gko::dim<2>{n, n});
+        inverse = clone_allocations(mtx.get());
+        // OpenMP-side copies of the matrix and of the inverse placeholder
+        d_mtx = Csr::create(omp);
+        d_mtx->copy_from(mtx.get());
+        d_inverse = Csr::create(omp);
+        d_inverse->copy_from(inverse.get());
+    }
+
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::OmpExecutor> omp;
+
+    std::default_random_engine rand_engine;
+
+    std::unique_ptr<Csr> mtx;
+    std::unique_ptr<Csr> inverse;
+
+    std::unique_ptr<Csr> d_mtx;
+    std::unique_ptr<Csr> d_inverse;
+};
+
+
+TEST_F(Isai, OmpIsaiGenerateLinverseShortIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 536, 31);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::Array<index_type> da1(omp, num_rows + 1);
+    auto da2 = da1;
+    // a1/a2 and da1/da2 are handed to the kernels and compared afterwards.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::kernels::omp::isai::generate_tri_inverse(
+        omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        true);
+    // Results must agree; with row_limit 31 the last a1 entry must be 0.
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_EQ(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, OmpIsaiGenerateUinverseShortIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 615, 31);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::Array<index_type> da1(omp, num_rows + 1);
+    auto da2 = da1;
+    // a1/a2 and da1/da2 are handed to the kernels and compared afterwards.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::kernels::omp::isai::generate_tri_inverse(
+        omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        false);
+    // Results must agree; with row_limit 31 the last a1 entry must be 0.
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_EQ(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, OmpIsaiGenerateLinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 554, 64);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::Array<index_type> da1(omp, num_rows + 1);
+    auto da2 = da1;
+    // a1/a2 and da1/da2 are handed to the kernels and compared afterwards.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::kernels::omp::isai::generate_tri_inverse(
+        omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        true);
+    // Results must agree; with row_limit 64 the last a1 entry must be > 0.
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_GT(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, OmpIsaiGenerateUinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 695, 64);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::Array<index_type> da1(omp, num_rows + 1);
+    auto da2 = da1;
+    // a1/a2 and da1/da2 are handed to the kernels and compared afterwards.
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::kernels::omp::isai::generate_tri_inverse(
+        omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
+        false);
+    // Results must agree; with row_limit 64 the last a1 entry must be > 0.
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(a1, da1);
+    GKO_ASSERT_ARRAY_EQ(a2, da2);
+    ASSERT_GT(a1.get_const_data()[num_rows], 0);
+}
+
+
+TEST_F(Isai, OmpIsaiGenerateExcessLinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 518, 40);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::Array<index_type> da1(omp, a1);
+    gko::Array<index_type> da2(omp, a2);
+    auto e_dim = a1.get_data()[num_rows];  // excess system dimension
+    auto e_nnz = a2.get_data()[num_rows];  // excess system nonzeros
+    auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    auto dexcess = Csr::create(omp, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto de_rhs = Dense::create(omp, gko::dim<2>(e_dim, 1));
+    // Both kernels start from the same reference-computed a1/a2 arrays.
+    gko::kernels::reference::isai::generate_excess_system(
+        ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
+        excess.get(), e_rhs.get());
+    gko::kernels::omp::isai::generate_excess_system(
+        omp, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
+        da2.get_const_data(), dexcess.get(), de_rhs.get());
+    // Excess matrix and right-hand side must match exactly and be non-empty.
+    GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess);
+    GKO_ASSERT_MTX_NEAR(excess, dexcess, 0);
+    GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, OmpIsaiGenerateExcessUinverseLongIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 673, 51);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::Array<index_type> da1(omp, a1);
+    gko::Array<index_type> da2(omp, a2);
+    auto e_dim = a1.get_data()[num_rows];  // excess system dimension
+    auto e_nnz = a2.get_data()[num_rows];  // excess system nonzeros
+    auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    auto dexcess = Csr::create(omp, gko::dim<2>(e_dim, e_dim), e_nnz);
+    auto de_rhs = Dense::create(omp, gko::dim<2>(e_dim, 1));
+    // The reference pass above uses `false`, consistent with the upper input.
+    gko::kernels::reference::isai::generate_excess_system(
+        ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
+        excess.get(), e_rhs.get());
+    gko::kernels::omp::isai::generate_excess_system(
+        omp, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
+        da2.get_const_data(), dexcess.get(), de_rhs.get());
+    // Excess matrix and right-hand side must match exactly and be non-empty.
+    GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess);
+    GKO_ASSERT_MTX_NEAR(excess, dexcess, 0);
+    GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, OmpIsaiScatterExcessSolutionLIsEquivalentToRef)
+{
+    initialize_data(matrix_type::lower, 572, 52);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
+    gko::Array<index_type> da1(omp, a1);
+    auto e_dim = a1.get_data()[num_rows];
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    std::fill_n(e_rhs->get_values(), e_dim, 123456);
+    auto de_rhs = Dense::create(omp);
+    de_rhs->copy_from(lend(e_rhs));
+    d_inverse->copy_from(lend(inverse));  // overwrite -1 values with inverse
+    // e_rhs holds a constant marker value that both kernels scatter.
+    gko::kernels::reference::isai::scatter_excess_solution(
+        ref, a1.get_const_data(), e_rhs.get(), inverse.get());
+    gko::kernels::omp::isai::scatter_excess_solution(
+        omp, da1.get_const_data(), de_rhs.get(), d_inverse.get());
+    // Both inverses must be identical afterwards; e_dim must be positive.
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+TEST_F(Isai, OmpIsaiScatterExcessSolutionUIsEquivalentToRef)
+{
+    initialize_data(matrix_type::upper, 702, 45);
+    const auto num_rows = mtx->get_size()[0];
+    gko::Array<index_type> a1(ref, num_rows + 1);
+    auto a2 = a1;
+    gko::kernels::reference::isai::generate_tri_inverse(
+        ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
+    gko::Array<index_type> da1(omp, a1);
+    auto e_dim = a1.get_data()[num_rows];
+    auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1));
+    std::fill_n(e_rhs->get_values(), e_dim, 123456);
+    auto de_rhs = Dense::create(omp);
+    de_rhs->copy_from(lend(e_rhs));
+    // overwrite -1 values with inverse
+    d_inverse->copy_from(lend(inverse));
+    // e_rhs holds a constant marker value that both kernels scatter.
+    gko::kernels::reference::isai::scatter_excess_solution(
+        ref, a1.get_const_data(), e_rhs.get(), inverse.get());
+    gko::kernels::omp::isai::scatter_excess_solution(
+        omp, da1.get_const_data(), de_rhs.get(), d_inverse.get());
+    // Both inverses must be identical afterwards; e_dim must be positive.
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
+    ASSERT_GT(e_dim, 0);
+}
+
+
+}  // namespace
diff --git a/omp/test/preconditioner/jacobi_kernels.cpp b/omp/test/preconditioner/jacobi_kernels.cpp
index 67454f8c6b1..0753d22f008 100644
--- a/omp/test/preconditioner/jacobi_kernels.cpp
+++ b/omp/test/preconditioner/jacobi_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,17 +33,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -322,6 +324,34 @@ TEST_F(Jacobi, OmpPreconditionerEquivalentToRefWithMPW)
 }
 
 
+TEST_F(Jacobi, OmpTransposedPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    // generate on both executors, then load the reference data into OpenMP
+    auto ref_precond = bj_factory->generate(mtx);
+    auto omp_precond = d_bj_factory->generate(mtx);
+    omp_precond->copy_from(ref_precond.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(omp_precond->transpose()),
+                        gko::as<Bj>(ref_precond->transpose()), 1e-14);
+}
+
+
+TEST_F(Jacobi, OmpConjTransposedPreconditionerEquivalentToRefWithMPW)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
+                    97, 99);
+    // generate on both executors, then load the reference data into OpenMP
+    auto ref_precond = bj_factory->generate(mtx);
+    auto omp_precond = d_bj_factory->generate(mtx);
+    omp_precond->copy_from(ref_precond.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(omp_precond->conj_transpose()),
+                        gko::as<Bj>(ref_precond->conj_transpose()), 1e-14);
+}
+
+
 TEST_F(Jacobi, OmpApplyEquivalentToRefWithBlockSize32)
 {
     initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111);
@@ -560,6 +590,37 @@ TEST_F(Jacobi, OmpPreconditionerEquivalentToRefWithAdaptivePrecision)
 }
 
 
+TEST_F(Jacobi, OmpTransposedPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97,
+                    99);
+    // generate on both executors, then load the reference data into OpenMP
+    auto ref_precond = bj_factory->generate(mtx);
+    auto omp_precond = d_bj_factory->generate(mtx);
+    omp_precond->copy_from(ref_precond.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(omp_precond->transpose()),
+                        gko::as<Bj>(ref_precond->transpose()), 1e-14);
+}
+
+
+TEST_F(Jacobi,
+       OmpConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision)
+{
+    initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
+                    {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97,
+                    99);
+    // generate on both executors, then load the reference data into OpenMP
+    auto ref_precond = bj_factory->generate(mtx);
+    auto omp_precond = d_bj_factory->generate(mtx);
+    omp_precond->copy_from(ref_precond.get());
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Bj>(omp_precond->conj_transpose()),
+                        gko::as<Bj>(ref_precond->conj_transpose()), 1e-14);
+}
+
+
 TEST_F(Jacobi, OmpApplyEquivalentToRefWithFullPrecision)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100},
diff --git a/omp/test/solver/CMakeLists.txt b/omp/test/solver/CMakeLists.txt
index e2a017962a5..44d37b6240d 100644
--- a/omp/test/solver/CMakeLists.txt
+++ b/omp/test/solver/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_test(bicg_kernels)
 ginkgo_create_test(bicgstab_kernels)
 ginkgo_create_test(cg_kernels)
 ginkgo_create_test(cgs_kernels)
diff --git a/omp/test/solver/bicg_kernels.cpp b/omp/test/solver/bicg_kernels.cpp
new file mode 100644
index 00000000000..2766f1eb910
--- /dev/null
+++ b/omp/test/solver/bicg_kernels.cpp
@@ -0,0 +1,340 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicg.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/bicg_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+// Fixture comparing the OpenMP BiCG kernels and solver against the reference
+// implementation on identical random data generated with a fixed seed.
+class Bicg : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    Bicg() : rand_engine(30) {}
+
+    void SetUp() override
+    {
+        ref = gko::ReferenceExecutor::create();
+        omp = gko::OmpExecutor::create();
+    }
+
+    void TearDown() override
+    {
+        if (omp != nullptr) {
+            ASSERT_NO_THROW(omp->synchronize());
+        }
+    }
+
+    // Generates a num_rows x num_cols random matrix on the reference executor
+    // using rand_engine; presumably num_cols entries per row (dense) - verify.
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    // Fills all reference operands with random data and clones each of them
+    // to the OpenMP executor (d_* members); stopping statuses start reset.
+    void initialize_data()
+    {
+        const int m = 597;
+        const int n = 43;
+        b = gen_mtx(m, n);
+        r = gen_mtx(m, n);
+        z = gen_mtx(m, n);
+        p = gen_mtx(m, n);
+        q = gen_mtx(m, n);
+        r2 = gen_mtx(m, n);
+        z2 = gen_mtx(m, n);
+        p2 = gen_mtx(m, n);
+        q2 = gen_mtx(m, n);
+        x = gen_mtx(m, n);
+        beta = gen_mtx(1, n);
+        prev_rho = gen_mtx(1, n);
+        rho = gen_mtx(1, n);
+        stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
+            new gko::Array<gko::stopping_status>(ref, n));
+        for (size_t i = 0; i < stop_status->get_num_elems(); ++i) {
+            stop_status->get_data()[i].reset();
+        }
+
+        d_b = gko::clone(omp, b);
+        d_r = gko::clone(omp, r);
+        d_z = gko::clone(omp, z);
+        d_p = gko::clone(omp, p);
+        d_q = gko::clone(omp, q);
+        d_r2 = gko::clone(omp, r2);
+        d_z2 = gko::clone(omp, z2);
+        d_p2 = gko::clone(omp, p2);
+        d_q2 = gko::clone(omp, q2);
+        d_x = gko::clone(omp, x);
+        d_beta = gko::clone(omp, beta);
+        d_prev_rho = gko::clone(omp, prev_rho);
+        d_rho = gko::clone(omp, rho);
+        d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
+            new gko::Array<gko::stopping_status>(omp, n));
+        *d_stop_status = *stop_status;
+    }
+
+    // Mirrors the strictly lower triangle into the upper triangle.
+    void make_symmetric(Mtx *mtx)
+    {
+        for (gko::size_type i = 0; i < mtx->get_size()[0]; ++i) {
+            for (gko::size_type j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+
+    // Replaces each diagonal entry by the sum of absolute values of its row
+    // (old diagonal included), so the diagonal dominates the off-diagonals.
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (gko::size_type i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (gko::size_type j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+
+    // Symmetrizes first, then fixes the diagonal (which keeps the symmetry).
+    void make_spd(Mtx *mtx)
+    {
+        make_symmetric(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::OmpExecutor> omp;
+
+    // Seeded in the constructor so every run sees the same random data.
+    std::ranlux48 rand_engine;
+
+    // Operands on the reference executor.
+    std::unique_ptr<Mtx> b;
+    std::unique_ptr<Mtx> r;
+    std::unique_ptr<Mtx> z;
+    std::unique_ptr<Mtx> p;
+    std::unique_ptr<Mtx> q;
+    std::unique_ptr<Mtx> r2;
+    std::unique_ptr<Mtx> z2;
+    std::unique_ptr<Mtx> p2;
+    std::unique_ptr<Mtx> q2;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> prev_rho;
+    std::unique_ptr<Mtx> rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> stop_status;
+
+    // Copies of the operands above on the OpenMP executor.
+    std::unique_ptr<Mtx> d_b;
+    std::unique_ptr<Mtx> d_r;
+    std::unique_ptr<Mtx> d_z;
+    std::unique_ptr<Mtx> d_p;
+    std::unique_ptr<Mtx> d_q;
+    std::unique_ptr<Mtx> d_r2;
+    std::unique_ptr<Mtx> d_z2;
+    std::unique_ptr<Mtx> d_p2;
+    std::unique_ptr<Mtx> d_q2;
+    std::unique_ptr<Mtx> d_x;
+    std::unique_ptr<Mtx> d_beta;
+    std::unique_ptr<Mtx> d_prev_rho;
+    std::unique_ptr<Mtx> d_rho;
+    std::unique_ptr<gko::Array<gko::stopping_status>> d_stop_status;
+};
+
+
+TEST_F(Bicg, OmpBicgInitializeIsEquivalentToRef)
+{
+    initialize_data();
+    // run the same kernel on both executors, starting from identical data
+    gko::kernels::reference::bicg::initialize(
+        ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(),
+        rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get());
+    gko::kernels::omp::bicg::initialize(
+        omp, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
+        d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(),
+        d_q2.get(), d_stop_status.get());
+    // all vectors handed to the kernel (except b) and the statuses must agree
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
+TEST_F(Bicg, OmpBicgStep1IsEquivalentToRef)
+{
+    initialize_data();
+    // run the same kernel on both executors, starting from identical data
+    gko::kernels::reference::bicg::step_1(ref, p.get(), z.get(), p2.get(),
+                                          z2.get(), rho.get(), prev_rho.get(),
+                                          stop_status.get());
+    gko::kernels::omp::bicg::step_1(omp, d_p.get(), d_z.get(), d_p2.get(),
+                                    d_z2.get(), d_rho.get(), d_prev_rho.get(),
+                                    d_stop_status.get());
+    // only p, z, p2 and z2 are compared afterwards
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14);
+}
+
+
+TEST_F(Bicg, OmpBicgStep2IsEquivalentToRef)
+{
+    initialize_data();
+    // run the same kernel on both executors, starting from identical data
+    gko::kernels::reference::bicg::step_2(
+        ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(),
+        rho.get(), stop_status.get());
+    gko::kernels::omp::bicg::step_2(
+        omp, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(), d_q2.get(),
+        d_beta.get(), d_rho.get(), d_stop_status.get());
+    // x, r, r2, p, q and q2 are compared; beta and rho are not
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
+    GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14);
+}
+
+
+TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
+{
+    // 50x50 system passed through make_spd, with three right-hand sides
+    auto ref_mtx = gen_mtx(50, 50);
+    make_spd(ref_mtx.get());
+    auto ref_x = gen_mtx(50, 3);
+    auto ref_b = gen_mtx(50, 3);
+    auto omp_mtx = gko::clone(omp, ref_mtx);
+    auto omp_x = gko::clone(omp, ref_x);
+    auto omp_b = gko::clone(omp, ref_b);
+    // same criteria on both executors: an iteration limit of 50 and a
+    // residual norm reduction factor of 1e-14
+    auto ref_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto omp_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(omp),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(omp))
+            .on(omp);
+    auto ref_solver = ref_factory->generate(std::move(ref_mtx));
+    auto omp_solver = omp_factory->generate(std::move(omp_mtx));
+
+    ref_solver->apply(ref_b.get(), ref_x.get());
+    omp_solver->apply(omp_b.get(), omp_x.get());
+
+    GKO_ASSERT_MTX_NEAR(omp_x, ref_x, 1e-14);
+}
+
+
+TEST_F(Bicg, ApplyWithRandomMatrixIsEquivalentToRef)
+{
+    // 50x50 random system used as generated, with three right-hand sides
+    auto ref_mtx = gen_mtx(50, 50);
+    auto ref_x = gen_mtx(50, 3);
+    auto ref_b = gen_mtx(50, 3);
+    auto omp_mtx = gko::clone(omp, ref_mtx);
+    auto omp_x = gko::clone(omp, ref_x);
+    auto omp_b = gko::clone(omp, ref_b);
+    // same criteria on both executors: an iteration limit of 50 and a
+    // residual norm reduction factor of 1e-14
+    auto ref_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(ref))
+            .on(ref);
+    auto omp_factory =
+        gko::solver::Bicg<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(50u).on(omp),
+                gko::stop::ResidualNormReduction<>::build()
+                    .with_reduction_factor(1e-14)
+                    .on(omp))
+            .on(omp);
+    auto ref_solver = ref_factory->generate(std::move(ref_mtx));
+    auto omp_solver = omp_factory->generate(std::move(omp_mtx));
+
+    ref_solver->apply(ref_b.get(), ref_x.get());
+    omp_solver->apply(omp_b.get(), omp_x.get());
+
+    GKO_ASSERT_MTX_NEAR(omp_x, ref_x, 1e-14);
+}
+
+
+}  // namespace
diff --git a/omp/test/solver/bicgstab_kernels.cpp b/omp/test/solver/bicgstab_kernels.cpp
index 7b5d96bccb4..5a81532f787 100644
--- a/omp/test/solver/bicgstab_kernels.cpp
+++ b/omp/test/solver/bicgstab_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,21 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/bicgstab_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/bicgstab_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
@@ -256,7 +258,7 @@ TEST_F(Bicgstab, OmpBicgstabInitializeIsEquivalentToRef)
     GKO_EXPECT_MTX_NEAR(d_beta, beta, 1e-14);
     GKO_EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14);
     GKO_EXPECT_MTX_NEAR(d_omega, omega, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/omp/test/solver/cg_kernels.cpp b/omp/test/solver/cg_kernels.cpp
index db3fd60798e..695789f0205 100644
--- a/omp/test/solver/cg_kernels.cpp
+++ b/omp/test/solver/cg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/cg.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/cg_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/cg_kernels.hpp"
+#include "core/test/utils.hpp"
+
 
 namespace {
 
@@ -192,7 +195,7 @@ TEST_F(Cg, OmpCgInitializeIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/omp/test/solver/cgs_kernels.cpp b/omp/test/solver/cgs_kernels.cpp
index 51b45f0ab5b..7fabfe22e93 100644
--- a/omp/test/solver/cgs_kernels.cpp
+++ b/omp/test/solver/cgs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/cgs.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/cgs_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/cgs_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
@@ -248,7 +250,7 @@ TEST_F(Cgs, OmpCgsInitializeIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_gamma, gamma, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/omp/test/solver/fcg_kernels.cpp b/omp/test/solver/fcg_kernels.cpp
index 5935d1233ea..af7fe606413 100644
--- a/omp/test/solver/fcg_kernels.cpp
+++ b/omp/test/solver/fcg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/fcg.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/fcg_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/solver/fcg_kernels.hpp"
+#include "core/test/utils.hpp"
+
 
 namespace {
 
@@ -206,7 +209,7 @@ TEST_F(Fcg, OmpFcgInitializeIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_rho_t, rho_t, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
diff --git a/omp/test/solver/gmres_kernels.cpp b/omp/test/solver/gmres_kernels.cpp
index 229d6aa1c1b..aa845b21ff0 100644
--- a/omp/test/solver/gmres_kernels.cpp
+++ b/omp/test/solver/gmres_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/solver/gmres_kernels.hpp"
+#include <ginkgo/core/solver/gmres.hpp>
 
 
 #include <random>
@@ -41,13 +41,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 
 
+#include "core/solver/gmres_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
@@ -56,7 +57,14 @@ namespace {
 
 class Gmres : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using norm_type = gko::remove_complex<value_type>;
+    using NormVector = gko::matrix::Dense<norm_type>;
+    template <typename T>
+    using Dense = typename gko::matrix::Dense<T>;
+
     Gmres() : rand_engine(30) {}
 
     void SetUp()
@@ -72,12 +80,13 @@ class Gmres : public ::testing::Test {
         }
     }
 
-    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    template <typename ValueType = value_type, typename IndexType = index_type>
+    std::unique_ptr<Dense<ValueType>> gen_mtx(int num_rows, int num_cols)
     {
-        return gko::test::generate_random_matrix<Mtx>(
+        return gko::test::generate_random_matrix<Dense<ValueType>>(
             num_rows, num_cols,
-            std::uniform_int_distribution<>(num_cols, num_cols),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::uniform_int_distribution<IndexType>(num_cols, num_cols),
+            std::normal_distribution<ValueType>(-1.0, 1.0), rand_engine, ref);
     }
 
     void initialize_data()
@@ -88,14 +97,12 @@ class Gmres : public ::testing::Test {
         y = gen_mtx(gko::solver::default_krylov_dim, n);
         before_preconditioner = Mtx::create_with_config_of(x.get());
         b = gen_mtx(m, n);
-        b_norm = gen_mtx(1, n);
-        krylov_bases = gen_mtx(m, (gko::solver::default_krylov_dim + 1) * n);
-        next_krylov_basis = gen_mtx(m, n);
+        krylov_bases = gen_mtx(m * (gko::solver::default_krylov_dim + 1), n);
         hessenberg = gen_mtx(gko::solver::default_krylov_dim + 1,
                              gko::solver::default_krylov_dim * n);
         hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, n);
         residual = gen_mtx(m, n);
-        residual_norm = gen_mtx(1, n);
+        residual_norm = gen_mtx<norm_type>(1, n);
         residual_norm_collection =
             gen_mtx(gko::solver::default_krylov_dim + 1, n);
         givens_sin = gen_mtx(gko::solver::default_krylov_dim, n);
@@ -118,19 +125,15 @@ class Gmres : public ::testing::Test {
         d_y->copy_from(y.get());
         d_b = Mtx::create(omp);
         d_b->copy_from(b.get());
-        d_b_norm = Mtx::create(omp);
-        d_b_norm->copy_from(b_norm.get());
         d_krylov_bases = Mtx::create(omp);
         d_krylov_bases->copy_from(krylov_bases.get());
-        d_next_krylov_basis = Mtx::create(omp);
-        d_next_krylov_basis->copy_from(next_krylov_basis.get());
         d_hessenberg = Mtx::create(omp);
         d_hessenberg->copy_from(hessenberg.get());
         d_hessenberg_iter = Mtx::create(omp);
         d_hessenberg_iter->copy_from(hessenberg_iter.get());
         d_residual = Mtx::create(omp);
         d_residual->copy_from(residual.get());
-        d_residual_norm = Mtx::create(omp);
+        d_residual_norm = NormVector::create(omp);
         d_residual_norm->copy_from(residual_norm.get());
         d_residual_norm_collection = Mtx::create(omp);
         d_residual_norm_collection->copy_from(residual_norm_collection.get());
@@ -155,13 +158,11 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<Mtx> x;
     std::unique_ptr<Mtx> y;
     std::unique_ptr<Mtx> b;
-    std::unique_ptr<Mtx> b_norm;
     std::unique_ptr<Mtx> krylov_bases;
-    std::unique_ptr<Mtx> next_krylov_basis;
     std::unique_ptr<Mtx> hessenberg;
     std::unique_ptr<Mtx> hessenberg_iter;
     std::unique_ptr<Mtx> residual;
-    std::unique_ptr<Mtx> residual_norm;
+    std::unique_ptr<NormVector> residual_norm;
     std::unique_ptr<Mtx> residual_norm_collection;
     std::unique_ptr<Mtx> givens_sin;
     std::unique_ptr<Mtx> givens_cos;
@@ -172,13 +173,11 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<Mtx> d_before_preconditioner;
     std::unique_ptr<Mtx> d_y;
     std::unique_ptr<Mtx> d_b;
-    std::unique_ptr<Mtx> d_b_norm;
     std::unique_ptr<Mtx> d_krylov_bases;
-    std::unique_ptr<Mtx> d_next_krylov_basis;
     std::unique_ptr<Mtx> d_hessenberg;
     std::unique_ptr<Mtx> d_hessenberg_iter;
     std::unique_ptr<Mtx> d_residual;
-    std::unique_ptr<Mtx> d_residual_norm;
+    std::unique_ptr<NormVector> d_residual_norm;
     std::unique_ptr<Mtx> d_residual_norm_collection;
     std::unique_ptr<Mtx> d_givens_sin;
     std::unique_ptr<Mtx> d_givens_cos;
@@ -192,18 +191,17 @@ TEST_F(Gmres, OmpGmresInitialize1IsEquivalentToRef)
     initialize_data();
 
     gko::kernels::reference::gmres::initialize_1(
-        ref, b.get(), b_norm.get(), residual.get(), givens_sin.get(),
-        givens_cos.get(), stop_status.get(), gko::solver::default_krylov_dim);
+        ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(),
+        stop_status.get(), gko::solver::default_krylov_dim);
     gko::kernels::omp::gmres::initialize_1(
-        omp, d_b.get(), d_b_norm.get(), d_residual.get(), d_givens_sin.get(),
+        omp, d_b.get(), d_residual.get(), d_givens_sin.get(),
         d_givens_cos.get(), d_stop_status.get(),
         gko::solver::default_krylov_dim);
 
-    GKO_ASSERT_MTX_NEAR(d_b_norm, b_norm, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_residual, residual, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
 }
 
 
@@ -224,7 +222,7 @@ TEST_F(Gmres, OmpGmresInitialize2IsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection,
                         1e-14);
     GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
 }
 
 
@@ -234,17 +232,15 @@ TEST_F(Gmres, OmpGmresStep1IsEquivalentToRef)
     int iter = 5;
 
     gko::kernels::reference::gmres::step_1(
-        ref, next_krylov_basis.get(), givens_sin.get(), givens_cos.get(),
+        ref, x->get_size()[0], givens_sin.get(), givens_cos.get(),
         residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(),
-        hessenberg_iter.get(), b_norm.get(), iter, final_iter_nums.get(),
-        stop_status.get());
+        hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get());
     gko::kernels::omp::gmres::step_1(
-        omp, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(),
+        omp, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(),
         d_residual_norm.get(), d_residual_norm_collection.get(),
-        d_krylov_bases.get(), d_hessenberg_iter.get(), d_b_norm.get(), iter,
+        d_krylov_bases.get(), d_hessenberg_iter.get(), iter,
         d_final_iter_nums.get(), d_stop_status.get());
 
-    GKO_ASSERT_MTX_NEAR(d_next_krylov_basis, next_krylov_basis, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14);
@@ -252,7 +248,7 @@ TEST_F(Gmres, OmpGmresStep1IsEquivalentToRef)
                         1e-14);
     GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14);
     GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14);
-    GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums);
+    GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums);
 }
 
 
diff --git a/omp/test/solver/ir_kernels.cpp b/omp/test/solver/ir_kernels.cpp
index 247467af2ca..a1b7b55f448 100644
--- a/omp/test/solver/ir_kernels.cpp
+++ b/omp/test/solver/ir_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,21 +33,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/ir.hpp>
 
 
-#include <gtest/gtest.h>
+#include <random>
 
 
-#include <random>
+#include <gtest/gtest.h>
 
 
-#include <core/solver/ir_kernels.hpp>
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/solver/ir_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -126,4 +129,124 @@ TEST_F(Ir, ApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef)
+{
+    auto ref_mtx = gen_mtx(50, 50);
+    auto ref_x = gen_mtx(50, 3);
+    auto ref_b = gen_mtx(50, 3);
+    auto omp_mtx = clone(omp, ref_mtx);
+    auto omp_x = clone(omp, ref_x);
+    auto omp_b = clone(omp, ref_b);
+    // IR (2 iterations) wrapping a one-iteration GMRES, built per executor.
+    auto ref_inner =
+        gko::solver::Gmres<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(ref))
+            .on(ref);
+    auto ref_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(std::move(ref_inner))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .on(ref);
+    auto omp_inner =
+        gko::solver::Gmres<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(omp))
+            .on(omp);
+    auto omp_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(std::move(omp_inner))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(omp))
+            .on(omp);
+    auto ref_solver = ref_factory->generate(std::move(ref_mtx));
+    auto omp_solver = omp_factory->generate(std::move(omp_mtx));
+
+    ref_solver->apply(lend(ref_b), lend(ref_x));
+    omp_solver->apply(lend(omp_b), lend(omp_x));
+
+    // Tolerance is 1e-12 rather than 1e-14: differences in the inner GMRES
+    // iteration get amplified by the outer IR.
+    GKO_ASSERT_MTX_NEAR(omp_x, ref_x, 1e-12);
+}
+
+
+TEST_F(Ir, RichardsonApplyIsEquivalentToRef)
+{
+    auto ref_mtx = gen_mtx(50, 50);
+    auto ref_x = gen_mtx(50, 3);
+    auto ref_b = gen_mtx(50, 3);
+    auto omp_mtx = clone(omp, ref_mtx);
+    auto omp_x = clone(omp, ref_x);
+    auto omp_b = clone(omp, ref_b);
+    // Accuracy is irrelevant here: Richardson will not converge for a random
+    // matrix, so only check that two iterations produce the same result on
+    // both executors.
+    auto ref_stop = gko::stop::Iteration::build().with_max_iters(2u).on(ref);
+    auto ref_factory =
+        gko::solver::Ir<>::build()
+            .with_criteria(std::move(ref_stop))
+            .with_relaxation_factor(0.9)
+            .on(ref);
+    auto omp_stop = gko::stop::Iteration::build().with_max_iters(2u).on(omp);
+    auto omp_factory =
+        gko::solver::Ir<>::build()
+            .with_criteria(std::move(omp_stop))
+            .with_relaxation_factor(0.9)
+            .on(omp);
+    auto ref_solver = ref_factory->generate(std::move(ref_mtx));
+    auto omp_solver = omp_factory->generate(std::move(omp_mtx));
+
+    ref_solver->apply(lend(ref_b), lend(ref_x));
+    omp_solver->apply(lend(omp_b), lend(omp_x));
+
+    GKO_ASSERT_MTX_NEAR(omp_x, ref_x, 1e-14);
+}
+
+
+TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef)
+{
+    auto ref_mtx = gen_mtx(50, 50);
+    auto ref_x = gen_mtx(50, 3);
+    auto ref_b = gen_mtx(50, 3);
+    auto omp_mtx = clone(omp, ref_mtx);
+    auto omp_x = clone(omp, ref_x);
+    auto omp_b = clone(omp, ref_b);
+    auto ref_inner =
+        gko::solver::Gmres<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(ref))
+            .on(ref);
+    auto ref_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(std::move(ref_inner))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_relaxation_factor(0.9)
+            .on(ref);
+    auto omp_inner =
+        gko::solver::Gmres<>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(omp))
+            .on(omp);
+    auto omp_factory =
+        gko::solver::Ir<>::build()
+            .with_solver(std::move(omp_inner))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(2u).on(omp))
+            .with_relaxation_factor(0.9)
+            .on(omp);
+    auto ref_solver = ref_factory->generate(std::move(ref_mtx));
+    auto omp_solver = omp_factory->generate(std::move(omp_mtx));
+
+    ref_solver->apply(lend(ref_b), lend(ref_x));
+    omp_solver->apply(lend(omp_b), lend(omp_x));
+
+    // Tolerance is 1e-12 rather than 1e-14: differences in the inner GMRES
+    // iteration get amplified by the outer IR.
+    GKO_ASSERT_MTX_NEAR(omp_x, ref_x, 1e-12);
+}
+
+
 }  // namespace
diff --git a/omp/test/solver/lower_trs_kernels.cpp b/omp/test/solver/lower_trs_kernels.cpp
index 0d86e5e65cb..52c21bfa356 100644
--- a/omp/test/solver/lower_trs_kernels.cpp
+++ b/omp/test/solver/lower_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/test/solver/upper_trs_kernels.cpp b/omp/test/solver/upper_trs_kernels.cpp
index 3d322fe9c3d..db6097f6623 100644
--- a/omp/test/solver/upper_trs_kernels.cpp
+++ b/omp/test/solver/upper_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/test/stop/CMakeLists.txt b/omp/test/stop/CMakeLists.txt
index 5e686b8fbb4..0ba0781e077 100644
--- a/omp/test/stop/CMakeLists.txt
+++ b/omp/test/stop/CMakeLists.txt
@@ -1,2 +1,2 @@
 ginkgo_create_test(criterion_kernels)
-ginkgo_create_test(residual_norm_reduction_kernels)
+ginkgo_create_test(residual_norm_kernels)
diff --git a/omp/test/stop/criterion_kernels.cpp b/omp/test/stop/criterion_kernels.cpp
index df98120cffd..8ab87f2b6f2 100644
--- a/omp/test/stop/criterion_kernels.cpp
+++ b/omp/test/stop/criterion_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,12 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 #include <ginkgo/core/stop/criterion.hpp>
-#include <ginkgo/core/stop/iteration.hpp>
 
 
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/stop/iteration.hpp>
+
+
 namespace {
 
 
diff --git a/omp/test/stop/residual_norm_kernels.cpp b/omp/test/stop/residual_norm_kernels.cpp
new file mode 100644
index 00000000000..3d33fb59628
--- /dev/null
+++ b/omp/test/stop/residual_norm_kernels.cpp
@@ -0,0 +1,348 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class ResidualNormReduction : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<T>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    // Factory under test: OpenMP executor, reduction factor r<T>::value.
+    ResidualNormReduction()
+    {
+        omp_ = gko::OmpExecutor::create();
+        factory_ = gko::stop::ResidualNormReduction<T>::build()
+                       .with_reduction_factor(r<T>::value)
+                       .on(omp_);
+    }
+    // State shared with the tests through this->factory_ / this->omp_.
+    std::unique_ptr<typename gko::stop::ResidualNormReduction<T>::Factory>
+        factory_;
+    std::shared_ptr<const gko::OmpExecutor> omp_;
+};
+
+TYPED_TEST_CASE(ResidualNormReduction, gko::test::ValueTypes);
+
+
+TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoal)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    constexpr gko::uint8 stopping_id{1};
+    auto init_residual = gko::initialize<Mtx>({100.0}, this->omp_);
+    std::shared_ptr<gko::LinOp> b = gko::initialize<Mtx>({10.0}, this->omp_);
+    auto norm = gko::initialize<NormVector>({100.0}, this->omp_);
+    auto crit =
+        this->factory_->generate(nullptr, b, nullptr, init_residual.get());
+    gko::Array<gko::stopping_status> status(this->omp_, 1);
+    status.get_data()[0].reset();
+    bool changed{};
+    // With the residual norm still at 100 the criterion must not stop.
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    // 10% above the threshold this test expects: still no convergence.
+    norm->at(0) = r<TypeParam>::value * 1.1e+2;
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_FALSE(status.get_data()[0].has_converged());
+    ASSERT_FALSE(changed);
+    // 10% below that threshold: convergence must be reported.
+    norm->at(0) = r<TypeParam>::value * 0.9e+2;
+    ASSERT_TRUE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = TypeParam;
+    using T_nc = gko::remove_complex<TypeParam>;
+    constexpr gko::uint8 stopping_id{1};
+    auto init_res = gko::initialize<Mtx>({I<T>{100.0, 100.0}}, this->omp_);
+    auto norms =
+        gko::initialize<NormVector>({I<T_nc>{100.0, 100.0}}, this->omp_);
+    std::shared_ptr<gko::LinOp> b =
+        gko::initialize<Mtx>({I<T>{10.0, 10.0}}, this->omp_);
+    auto crit = this->factory_->generate(nullptr, b, nullptr, init_res.get());
+    gko::Array<gko::stopping_status> status(this->omp_, 2);
+    status.get_data()[0].reset();
+    status.get_data()[1].reset();
+    bool changed{};
+    // Both residual norms are still at 100: the criterion must not stop.
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    // Only column 0 drops low enough: it converges, the check stays false.
+    norms->at(0, 0) = r<TypeParam>::value * 0.9e+2;
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+    // Column 1 follows: now the check as a whole must succeed.
+    norms->at(0, 1) = r<TypeParam>::value * 0.9e+2;
+    ASSERT_TRUE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[1].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+template <typename T>
+class RelativeResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<T>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    // Factory under test: OpenMP executor, tolerance r<T>::value.
+    RelativeResidualNorm()
+    {
+        omp_ = gko::OmpExecutor::create();
+        factory_ = gko::stop::RelativeResidualNorm<T>::build()
+                       .with_tolerance(r<T>::value)
+                       .on(omp_);
+    }
+    // State shared with the tests through this->factory_ / this->omp_.
+    std::unique_ptr<typename gko::stop::RelativeResidualNorm<T>::Factory>
+        factory_;
+    std::shared_ptr<const gko::OmpExecutor> omp_;
+};
+
+TYPED_TEST_CASE(RelativeResidualNorm, gko::test::ValueTypes);
+
+
+TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoal)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    constexpr gko::uint8 stopping_id{1};
+    auto init_residual = gko::initialize<Mtx>({100.0}, this->omp_);
+    std::shared_ptr<gko::LinOp> b = gko::initialize<Mtx>({10.0}, this->omp_);
+    auto norm = gko::initialize<NormVector>({100.0}, this->omp_);
+    auto crit =
+        this->factory_->generate(nullptr, b, nullptr, init_residual.get());
+    gko::Array<gko::stopping_status> status(this->omp_, 1);
+    status.get_data()[0].reset();
+    bool changed{};
+    // With the residual norm still at 100 the criterion must not stop.
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    // 10% above the threshold this test expects: still no convergence.
+    norm->at(0) = r<TypeParam>::value * 1.1e+1;
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_FALSE(status.get_data()[0].has_converged());
+    ASSERT_FALSE(changed);
+    // 10% below that threshold: convergence must be reported.
+    norm->at(0) = r<TypeParam>::value * 0.9e+1;
+    ASSERT_TRUE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = TypeParam;
+    using T_nc = gko::remove_complex<TypeParam>;
+    constexpr gko::uint8 stopping_id{1};
+    auto init_res = gko::initialize<Mtx>({I<T>{100.0, 100.0}}, this->omp_);
+    auto norms =
+        gko::initialize<NormVector>({I<T_nc>{100.0, 100.0}}, this->omp_);
+    std::shared_ptr<gko::LinOp> b =
+        gko::initialize<Mtx>({I<T>{10.0, 10.0}}, this->omp_);
+    auto crit = this->factory_->generate(nullptr, b, nullptr, init_res.get());
+    gko::Array<gko::stopping_status> status(this->omp_, 2);
+    status.get_data()[0].reset();
+    status.get_data()[1].reset();
+    bool changed{};
+    // Both residual norms are still at 100: the criterion must not stop.
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    // Only column 0 drops low enough: it converges, the check stays false.
+    norms->at(0, 0) = r<TypeParam>::value * 0.9e+1;
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+    // Column 1 follows: now the check as a whole must succeed.
+    norms->at(0, 1) = r<TypeParam>::value * 0.9e+1;
+    ASSERT_TRUE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[1].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+template <typename T>
+class AbsoluteResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<T>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    // Factory under test: OpenMP executor, tolerance r<T>::value.
+    AbsoluteResidualNorm()
+    {
+        omp_ = gko::OmpExecutor::create();
+        factory_ = gko::stop::AbsoluteResidualNorm<T>::build()
+                       .with_tolerance(r<T>::value)
+                       .on(omp_);
+    }
+    // State shared with the tests through this->factory_ / this->omp_.
+    std::unique_ptr<typename gko::stop::AbsoluteResidualNorm<T>::Factory>
+        factory_;
+    std::shared_ptr<const gko::OmpExecutor> omp_;
+};
+
+TYPED_TEST_CASE(AbsoluteResidualNorm, gko::test::ValueTypes);
+
+
+TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoal)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    constexpr gko::uint8 stopping_id{1};
+    auto init_residual = gko::initialize<Mtx>({100.0}, this->omp_);
+    std::shared_ptr<gko::LinOp> b = gko::initialize<Mtx>({10.0}, this->omp_);
+    auto norm = gko::initialize<NormVector>({100.0}, this->omp_);
+    auto crit =
+        this->factory_->generate(nullptr, b, nullptr, init_residual.get());
+    gko::Array<gko::stopping_status> status(this->omp_, 1);
+    status.get_data()[0].reset();
+    bool changed{};
+    // With the residual norm still at 100 the criterion must not stop.
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    // 10% above the tolerance r<TypeParam>::value: still no convergence.
+    norm->at(0) = r<TypeParam>::value * 1.1;
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_FALSE(status.get_data()[0].has_converged());
+    ASSERT_FALSE(changed);
+    // 10% below the tolerance: convergence must be reported.
+    norm->at(0) = r<TypeParam>::value * 0.9;
+    ASSERT_TRUE(
+        crit->update()
+            .residual_norm(norm.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = TypeParam;
+    using T_nc = gko::remove_complex<TypeParam>;
+    constexpr gko::uint8 stopping_id{1};
+    auto init_res = gko::initialize<Mtx>({I<T>{100.0, 100.0}}, this->omp_);
+    auto norms =
+        gko::initialize<NormVector>({I<T_nc>{100.0, 100.0}}, this->omp_);
+    std::shared_ptr<gko::LinOp> b =
+        gko::initialize<Mtx>({I<T>{10.0, 10.0}}, this->omp_);
+    auto crit = this->factory_->generate(nullptr, b, nullptr, init_res.get());
+    gko::Array<gko::stopping_status> status(this->omp_, 2);
+    status.get_data()[0].reset();
+    status.get_data()[1].reset();
+    bool changed{};
+    // Both residual norms are still at 100: the criterion must not stop.
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    // Only column 0 drops low enough: it converges, the check stays false.
+    norms->at(0, 0) = r<TypeParam>::value * 0.9;
+    ASSERT_FALSE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[0].has_converged());
+    ASSERT_TRUE(changed);
+    // Column 1 follows: now the check as a whole must succeed.
+    norms->at(0, 1) = r<TypeParam>::value * 0.9;
+    ASSERT_TRUE(
+        crit->update()
+            .residual_norm(norms.get())
+            .check(stopping_id, true, &status, &changed));
+    ASSERT_TRUE(status.get_data()[1].has_converged());
+    ASSERT_TRUE(changed);
+}
+
+
+}  // namespace
diff --git a/omp/test/stop/residual_norm_reduction_kernels.cpp b/omp/test/stop/residual_norm_reduction_kernels.cpp
deleted file mode 100644
index e528811a9bb..00000000000
--- a/omp/test/stop/residual_norm_reduction_kernels.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
-
-
-#include <gtest/gtest.h>
-
-
-namespace {
-
-
-constexpr double reduction_factor = 1.0e-14;
-
-
-class ResidualNormReduction : public ::testing::Test {
-protected:
-    using Mtx = gko::matrix::Dense<>;
-
-    ResidualNormReduction()
-    {
-        omp_ = gko::OmpExecutor::create();
-        factory_ = gko::stop::ResidualNormReduction<>::build()
-                       .with_reduction_factor(reduction_factor)
-                       .on(omp_);
-    }
-
-    std::unique_ptr<gko::stop::ResidualNormReduction<>::Factory> factory_;
-    std::shared_ptr<const gko::OmpExecutor> omp_;
-};
-
-
-TEST_F(ResidualNormReduction, WaitsTillResidualGoal)
-{
-    auto scalar = gko::initialize<Mtx>({1.0}, omp_);
-    auto criterion =
-        factory_->generate(nullptr, nullptr, nullptr, scalar.get());
-    bool one_changed{};
-    constexpr gko::uint8 RelativeStoppingId{1};
-    gko::Array<gko::stopping_status> stop_status(omp_, 1);
-    stop_status.get_data()[0].reset();
-
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-
-    scalar->at(0) = reduction_factor * 1.0e+2;
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
-    ASSERT_EQ(one_changed, false);
-
-    scalar->at(0) = reduction_factor * 1.0e-2;
-    ASSERT_TRUE(
-        criterion->update()
-            .residual_norm(scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-}
-
-
-TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
-{
-    auto mtx = gko::initialize<Mtx>({{1.0, 1.0}}, omp_);
-    auto criterion = factory_->generate(nullptr, nullptr, nullptr, mtx.get());
-    bool one_changed{};
-    constexpr gko::uint8 RelativeStoppingId{1};
-    gko::Array<gko::stopping_status> stop_status(omp_, 2);
-    stop_status.get_data()[0].reset();
-    stop_status.get_data()[1].reset();
-
-    ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check(
-        RelativeStoppingId, true, &stop_status, &one_changed));
-
-    mtx->at(0, 0) = reduction_factor * 1.0e-2;
-    ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check(
-        RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-
-    mtx->at(0, 1) = reduction_factor * 1.0e-2;
-    ASSERT_TRUE(criterion->update().residual_norm(mtx.get()).check(
-        RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[1].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-}
-
-
-}  // namespace
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 7516fc8641f..9cb2256bf13 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -1,32 +1,45 @@
 add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> "")
 target_sources(ginkgo_reference
     PRIVATE
-        base/version.cpp
-        factorization/par_ilu_kernels.cpp
-        matrix/coo_kernels.cpp
-        matrix/csr_kernels.cpp
-        matrix/dense_kernels.cpp
-        matrix/ell_kernels.cpp
-        matrix/hybrid_kernels.cpp
-        matrix/sellp_kernels.cpp
-        matrix/sparsity_csr_kernels.cpp
-        preconditioner/jacobi_kernels.cpp
-        solver/bicgstab_kernels.cpp
-        solver/cg_kernels.cpp
-        solver/cgs_kernels.cpp
-        solver/fcg_kernels.cpp
-        solver/gmres_kernels.cpp
-        solver/ir_kernels.cpp
-        solver/lower_trs_kernels.cpp
-        solver/upper_trs_kernels.cpp
-        stop/criterion_kernels.cpp
-        stop/residual_norm_reduction_kernels.cpp)
+    base/version.cpp
+    components/fill_array.cpp
+    components/precision_conversion.cpp
+    components/prefix_sum.cpp
+    factorization/factorization_kernels.cpp
+    factorization/ilu_kernels.cpp
+    factorization/par_ict_kernels.cpp
+    factorization/par_ilu_kernels.cpp
+    factorization/par_ilut_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/csr_kernels.cpp
+    matrix/dense_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/hybrid_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    preconditioner/isai_kernels.cpp
+    preconditioner/jacobi_kernels.cpp
+    solver/bicg_kernels.cpp
+    solver/bicgstab_kernels.cpp
+    solver/cg_kernels.cpp
+    solver/cgs_kernels.cpp
+    solver/fcg_kernels.cpp
+    solver/gmres_kernels.cpp
+    solver/ir_kernels.cpp
+    solver/lower_trs_kernels.cpp
+    solver/upper_trs_kernels.cpp
+    stop/criterion_kernels.cpp
+    stop/residual_norm_kernels.cpp)
 
 ginkgo_compile_features(ginkgo_reference)
 ginkgo_default_includes(ginkgo_reference)
 ginkgo_install_library(ginkgo_reference reference)
 target_compile_options(ginkgo_reference PRIVATE "${GINKGO_COMPILER_FLAGS}")
 
+if(GINKGO_CHECK_CIRCULAR_DEPS)
+    ginkgo_check_headers(ginkgo_reference)  # project helper, defined elsewhere
+endif()
+
 if(GINKGO_BUILD_TESTS)
     add_subdirectory(test)
 endif()
diff --git a/reference/base/version.cpp b/reference/base/version.cpp
index 5f6b82582b2..aac3a23180e 100644
--- a/reference/base/version.cpp
+++ b/reference/base/version.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/components/convert_ptrs.hpp b/reference/components/convert_ptrs.hpp
index ee007a96edd..bc89f9f2df0 100644
--- a/reference/components/convert_ptrs.hpp
+++ b/reference/components/convert_ptrs.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <numeric>
 
 
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 namespace kernels {
 namespace reference {
diff --git a/reference/components/csr_spgeam.hpp b/reference/components/csr_spgeam.hpp
new file mode 100644
index 00000000000..f09b34d5926
--- /dev/null
+++ b/reference/components/csr_spgeam.hpp
@@ -0,0 +1,112 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_REFERENCE_COMPONENTS_CSR_SPGEAM_HPP_
+#define GKO_REFERENCE_COMPONENTS_CSR_SPGEAM_HPP_
+
+
+#include <limits>
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/base/utils.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+
+
+/**
+ * Merges two CSR matrices with sorted rows, one row at a time.
+ *
+ * begin_cb(row) runs first and returns the row-local data; then
+ * entry_cb(row, col, a_val, b_val, local_data) runs once per merged column
+ * (a missing entry is passed as zero); end_cb(row, local_data) runs last.
+ */
+template <typename ValueType, typename IndexType, typename BeginCallback,
+          typename EntryCallback, typename EndCallback>
+void abstract_spgeam(const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     BeginCallback begin_cb, EntryCallback entry_cb,
+                     EndCallback end_cb)
+{
+    auto num_rows = a->get_size()[0];
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto b_row_ptrs = b->get_const_row_ptrs();
+    auto b_col_idxs = b->get_const_col_idxs();
+    auto b_vals = b->get_const_values();
+    constexpr auto sentinel = std::numeric_limits<IndexType>::max();
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto a_begin = a_row_ptrs[row];
+        auto a_end = a_row_ptrs[row + 1];
+        auto b_begin = b_row_ptrs[row];
+        auto b_end = b_row_ptrs[row + 1];
+        auto total_size = (a_end - a_begin) + (b_end - b_begin);
+        bool skip{};
+        auto local_data = begin_cb(row);
+        for (IndexType i = 0; i < total_size; ++i) {
+            if (skip) {
+                skip = false;
+                continue;
+            }
+            // load column indices or sentinel, and values or zero
+            auto a_col = checked_load(a_col_idxs, a_begin, a_end, sentinel);
+            auto b_col = checked_load(b_col_idxs, b_begin, b_end, sentinel);
+            auto a_val =
+                checked_load(a_vals, a_begin, a_end, zero<ValueType>());
+            auto b_val =
+                checked_load(b_vals, b_begin, b_end, zero<ValueType>());
+            auto col = min(a_col, b_col);
+            // emit the entry; a matrix lacking this column contributes zero
+            entry_cb(row, col, a_col == col ? a_val : zero<ValueType>(),
+                     b_col == col ? b_val : zero<ValueType>(), local_data);
+            // advance the input(s) that held col; a shared col takes two steps
+            a_begin += (a_col <= b_col);
+            b_begin += (b_col <= a_col);
+            skip = a_col == b_col;
+        }
+        end_cb(row, local_data);
+    }
+}
+
+
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_COMPONENTS_CSR_SPGEAM_HPP_
diff --git a/reference/components/fill_array.cpp b/reference/components/fill_array.cpp
new file mode 100644
index 00000000000..23499dabd3b
--- /dev/null
+++ b/reference/components/fill_array.cpp
@@ -0,0 +1,57 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/fill_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace components {
+
+// Sets each of the n entries array[0], ..., array[n - 1] to val.
+template <typename ValueType>
+void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType *array,
+                size_type n, ValueType val)
+{
+    std::fill(array, array + n, val);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/components/format_conversion.hpp b/reference/components/format_conversion.hpp
index 95e17374b88..38520dd8b66 100644
--- a/reference/components/format_conversion.hpp
+++ b/reference/components/format_conversion.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <numeric>
 
 
+#include <ginkgo/core/base/types.hpp>
+
+
 namespace gko {
 namespace kernels {
 namespace reference {
diff --git a/reference/components/matrix_operations.hpp b/reference/components/matrix_operations.hpp
index 2214e4cb06d..b62a3a84ec8 100644
--- a/reference/components/matrix_operations.hpp
+++ b/reference/components/matrix_operations.hpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/components/precision_conversion.cpp b/reference/components/precision_conversion.cpp
new file mode 100644
index 00000000000..6bc37efe940
--- /dev/null
+++ b/reference/components/precision_conversion.cpp
@@ -0,0 +1,58 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/precision_conversion.hpp"
+
+
+#include <algorithm>
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace components {
+
+// Copies size entries from in to out, converting SourceType to TargetType.
+template <typename SourceType, typename TargetType>
+void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
+                       size_type size, const SourceType *in, TargetType *out)
+{
+    std::copy(in, in + size, out);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+
+
+}  // namespace components
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/components/prefix_sum.cpp b/reference/components/prefix_sum.cpp
new file mode 100644
index 00000000000..2530e960a68
--- /dev/null
+++ b/reference/components/prefix_sum.cpp
@@ -0,0 +1,63 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace components {
+
+// In-place exclusive prefix sum: counts[i] becomes the sum of counts[0..i).
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const ReferenceExecutor> exec,
+                IndexType *counts, size_type num_entries)
+{
+    IndexType partial_sum{};
+    for (size_type i = 0; i < num_entries; ++i) {  // index type matches bound
+        const auto nnz = counts[i];
+        counts[i] = partial_sum;
+        partial_sum += nnz;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// instantiate for size_type as well, as this is used in the Sellp format
+template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type);
+
+
+}  // namespace components
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp
new file mode 100644
index 00000000000..feb64a56b40
--- /dev/null
+++ b/reference/factorization/factorization_kernels.cpp
@@ -0,0 +1,327 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/factorization_kernels.hpp"
+
+
+#include <algorithm>
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+// Counts the rows below min(num_rows, num_cols) without a diagonal entry.
+template <typename IndexType>
+size_type count_missing_elements(IndexType num_rows, IndexType num_cols,
+                                 const IndexType *col_idxs,
+                                 const IndexType *row_ptrs)
+{
+    // rows at or beyond num_cols have no diagonal position, so only the
+    // leading min(num_rows, num_cols) rows can lack a diagonal entry
+    const auto num_diagonals = std::min(num_rows, num_cols);
+    size_type num_missing{};
+    for (IndexType row = 0; row < num_diagonals; ++row) {
+        const auto row_begin = row_ptrs[row];
+        const auto row_end = row_ptrs[row + 1];
+        auto nz = row_begin;
+        // advance to the first stored entry on the diagonal, if any
+        while (nz < row_end && col_idxs[nz] != row) {
+            ++nz;
+        }
+        // the scan ran off the end of the row: no diagonal entry is stored
+        num_missing += (nz >= row_end);
+    }
+    return num_missing;
+}
+
+// Inserts an explicit zero diagonal entry into each row of mtx that lacks one.
+template <typename ValueType, typename IndexType>
+void add_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
+                           matrix::Csr<ValueType, IndexType> *mtx,
+                           bool /*is_sorted*/)  // unused: rows treated as unsorted
+{
+    const auto values = mtx->get_const_values();
+    const auto col_idxs = mtx->get_const_col_idxs();
+    auto row_ptrs = mtx->get_row_ptrs();
+    auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
+    auto num_cols = static_cast<IndexType>(mtx->get_size()[1]);
+    // rows with index >= num_cols have no diagonal and are not counted
+    auto missing_elements =
+        count_missing_elements(num_rows, num_cols, col_idxs, row_ptrs);
+
+    if (missing_elements == 0) {  // nothing to insert, mtx stays untouched
+        return;
+    }
+    // new value/column arrays sized for the existing plus inserted entries
+    const auto old_nnz = mtx->get_num_stored_elements();
+    const size_type new_nnz = old_nnz + missing_elements;
+    Array<ValueType> new_values_array{exec, new_nnz};
+    Array<IndexType> new_col_idxs_array{exec, new_nnz};
+    auto new_values = new_values_array.get_data();
+    auto new_col_idxs = new_col_idxs_array.get_data();
+    IndexType added_elements{};  // diagonal entries inserted so far
+    // row_ptrs is updated in place while the old row bounds are still needed
+
+    for (IndexType row = 0; row < num_rows; ++row) {
+        bool diagonal_handled{false};
+        const IndexType old_row_ptrs_start{row_ptrs[row]};
+        const IndexType old_row_ptrs_end{row_ptrs[row + 1]};
+        const IndexType new_row_ptrs_start =
+            old_row_ptrs_start + added_elements;
+        // entries of this row are shifted by the insertions made so far
+        row_ptrs[row] = new_row_ptrs_start;
+        for (IndexType old_idx = old_row_ptrs_start; old_idx < old_row_ptrs_end;
+             ++old_idx) {
+            auto new_idx = old_idx + added_elements;
+            const auto col_idx = col_idxs[old_idx];
+            if (!diagonal_handled && col_idx > row) {
+                const auto start_cols = col_idxs + old_idx;
+                const auto end_cols = col_idxs + old_row_ptrs_end;
+                // rows may be unsorted: look for the diagonal in the rest
+                if (std::find(start_cols, end_cols, row) != end_cols) {
+                    // no need to add diagonal since diagonal is already present
+                    diagonal_handled = true;
+                }
+                // diagonal not found: insert it before the current entry
+                if (!diagonal_handled) {
+                    new_values[new_idx] = zero<ValueType>();
+                    new_col_idxs[new_idx] = row;
+                    ++added_elements;
+                    new_idx = old_idx + added_elements;
+                    diagonal_handled = true;
+                }
+            }
+            if (row >= num_cols || col_idx == row) {  // no insertion needed
+                diagonal_handled = true;
+            }
+            new_values[new_idx] = values[old_idx];
+            new_col_idxs[new_idx] = col_idx;
+        }
+        if (row < num_cols && !diagonal_handled) {  // append at the row end
+            const auto new_idx = old_row_ptrs_end + added_elements;
+            new_values[new_idx] = zero<ValueType>();
+            new_col_idxs[new_idx] = row;
+            diagonal_handled = true;
+            ++added_elements;
+        }
+    }
+    row_ptrs[num_rows] = new_nnz;
+    // replace the arrays via CsrBuilder (presumably exposes mtx's storage)
+    matrix::CsrBuilder<ValueType, IndexType> mtx_builder{mtx};
+    mtx_builder.get_value_array() = std::move(new_values_array);
+    mtx_builder.get_col_idx_array() = std::move(new_col_idxs_array);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
+
+// Computes the row pointers of L and U, each with one diagonal entry per row.
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l_u(
+    std::shared_ptr<const ReferenceExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
+{
+    const auto a_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto a_col_idxs = system_matrix->get_const_col_idxs();
+    const auto num_rows = system_matrix->get_size()[0];
+    size_type l_count{};
+    size_type u_count{};
+    l_row_ptrs[0] = 0;
+    u_row_ptrs[0] = 0;
+    for (size_type row = 0; row < num_rows; ++row) {
+        const size_type row_end = a_row_ptrs[row + 1];
+        // one diagonal slot per row in each factor, whether stored or not
+        ++l_count;
+        ++u_count;
+        for (size_type nz = a_row_ptrs[row]; nz < row_end; ++nz) {
+            const size_type col = a_col_idxs[nz];
+            l_count += col < row;
+            u_count += col > row;
+        }
+        l_row_ptrs[row + 1] = l_count;
+        u_row_ptrs[row + 1] = u_count;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
+
+// Splits system_matrix into a unit-diagonal L and a U holding the diagonal.
+template <typename ValueType, typename IndexType>
+void initialize_l_u(std::shared_ptr<const ReferenceExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *system_matrix,
+                    matrix::Csr<ValueType, IndexType> *csr_l,
+                    matrix::Csr<ValueType, IndexType> *csr_u)
+{
+    const auto num_rows = system_matrix->get_size()[0];
+    const auto a_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto a_col_idxs = system_matrix->get_const_col_idxs();
+    const auto a_vals = system_matrix->get_const_values();
+    // the row pointers of L and U are only read, their entries are written
+    const auto l_row_ptrs = csr_l->get_const_row_ptrs();
+    const auto l_col_idxs = csr_l->get_col_idxs();
+    const auto l_vals = csr_l->get_values();
+    const auto u_row_ptrs = csr_u->get_const_row_ptrs();
+    const auto u_col_idxs = csr_u->get_col_idxs();
+    const auto u_vals = csr_u->get_values();
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        // L stores its diagonal last in the row, U stores its diagonal first
+        const auto l_diag_nz = l_row_ptrs[row + 1] - 1;
+        const auto u_diag_nz = u_row_ptrs[row];
+        size_type l_nz = l_row_ptrs[row];
+        size_type u_nz = u_row_ptrs[row] + 1;
+        // rows without a stored diagonal get a 1 on the diagonal of U
+        auto u_diag = one<ValueType>();
+        const size_type row_end = a_row_ptrs[row + 1];
+        for (size_type nz = a_row_ptrs[row]; nz < row_end; ++nz) {
+            const auto col = a_col_idxs[nz];
+            const auto val = a_vals[nz];
+            if (col == row) {
+                // remember the diagonal value, it is written after the loop
+                u_diag = val;
+            } else if (col < row) {
+                l_col_idxs[l_nz] = col;
+                l_vals[l_nz] = val;
+                ++l_nz;
+            } else {
+                u_col_idxs[u_nz] = col;
+                u_vals[u_nz] = val;
+                ++u_nz;
+            }
+        }
+        l_col_idxs[l_diag_nz] = row;
+        l_vals[l_diag_nz] = one<ValueType>();
+        u_col_idxs[u_diag_nz] = row;
+        u_vals[u_diag_nz] = u_diag;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+
+// Computes the row pointers of L: strictly lower entries plus one diagonal.
+template <typename ValueType, typename IndexType>
+void initialize_row_ptrs_l(
+    std::shared_ptr<const ReferenceExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *system_matrix,
+    IndexType *l_row_ptrs)
+{
+    const auto a_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto a_col_idxs = system_matrix->get_const_col_idxs();
+    const auto num_rows = system_matrix->get_size()[0];
+    size_type l_count{};
+    l_row_ptrs[0] = 0;
+    for (size_type row = 0; row < num_rows; ++row) {
+        const size_type row_end = a_row_ptrs[row + 1];
+        // one diagonal slot per row, whether stored in the input or not
+        ++l_count;
+        for (size_type nz = a_row_ptrs[row]; nz < row_end; ++nz) {
+            const size_type col = a_col_idxs[nz];
+            l_count += col < row;
+        }
+        l_row_ptrs[row + 1] = l_count;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
+
+// Fills csr_l with the lower triangle (including diagonal) of system_matrix.
+template <typename ValueType, typename IndexType>
+void initialize_l(std::shared_ptr<const ReferenceExecutor> exec,
+                  const matrix::Csr<ValueType, IndexType> *system_matrix,
+                  matrix::Csr<ValueType, IndexType> *csr_l, bool diag_sqrt)
+{
+    const auto num_rows = system_matrix->get_size()[0];
+    const auto a_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto a_col_idxs = system_matrix->get_const_col_idxs();
+    const auto a_vals = system_matrix->get_const_values();
+    // the row pointers of L are only read here, its entries are written
+    const auto l_row_ptrs = csr_l->get_const_row_ptrs();
+    const auto l_col_idxs = csr_l->get_col_idxs();
+    const auto l_vals = csr_l->get_values();
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        // the diagonal occupies the last slot of each row of L
+        const auto l_diag_nz = l_row_ptrs[row + 1] - 1;
+        size_type l_nz = l_row_ptrs[row];
+        // rows without a stored diagonal use 1 as their diagonal value
+        auto l_diag = one<ValueType>();
+        const size_type row_end = a_row_ptrs[row + 1];
+        for (size_type nz = a_row_ptrs[row]; nz < row_end; ++nz) {
+            const auto col = a_col_idxs[nz];
+            const auto val = a_vals[nz];
+            // entries above the diagonal are dropped
+            if (col == row) {
+                l_diag = val;
+            } else if (col < row) {
+                l_col_idxs[l_nz] = col;
+                l_vals[l_nz] = val;
+                ++l_nz;
+            }
+        }
+        if (diag_sqrt) {
+            // store sqrt(diagonal), falling back to 1 if it is not finite
+            const ValueType root = sqrt(l_diag);
+            l_diag = is_finite(root) ? root : one<ValueType>();
+        }
+        l_col_idxs[l_diag_nz] = row;
+        l_vals[l_diag_nz] = l_diag;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp
new file mode 100644
index 00000000000..279bfda5c18
--- /dev/null
+++ b/reference/factorization/ilu_kernels.cpp
@@ -0,0 +1,58 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/ilu_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace ilu_factorization {
+
+// Reference ILU kernel stub; GKO_NOT_IMPLEMENTED presumably supplies the body.
+template <typename ValueType, typename IndexType>
+void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
+                matrix::Csr<ValueType, IndexType> *m) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+
+
+}  // namespace ilu_factorization
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp
new file mode 100644
index 00000000000..8114d22b493
--- /dev/null
+++ b/reference/factorization/par_ict_kernels.cpp
@@ -0,0 +1,209 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <algorithm>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/base/utils.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "reference/components/csr_spgeam.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The parallel ict factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+// One sweep updating every stored entry of L from A and the current L.
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Coo<ValueType, IndexType> *)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        // the extent of this row in A and L is the same for all its entries
+        const auto a_begin = a_row_ptrs[row];
+        const auto a_end = a_row_ptrs[row + 1];
+        const auto l_row_begin = l_row_ptrs[row];
+        const auto l_row_end = l_row_ptrs[row + 1];
+        for (size_type l_nz = l_row_begin; l_nz < l_row_end; ++l_nz) {
+            auto col = l_col_idxs[l_nz];
+            // find value from A; lower_bound needs the row of A to be sorted
+            auto a_nz_it =
+                std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col);
+            auto a_nz = std::distance(a_col_idxs, a_nz_it);
+            auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col;
+            auto a_val = has_a ? a_vals[a_nz] : zero<ValueType>();
+            // accumulate l(row,:) * l(col,:) without the last entry l(col, col)
+            ValueType sum{};
+            auto l_begin = l_row_begin;
+            auto lt_begin = l_row_ptrs[col];
+            auto lt_end = l_row_ptrs[col + 1];
+            while (l_begin < l_row_end && lt_begin < lt_end) {
+                auto l_col = l_col_idxs[l_begin];
+                auto lt_row = l_col_idxs[lt_begin];
+                if (l_col == lt_row && l_col < col) {
+                    sum += l_vals[l_begin] * l_vals[lt_begin];
+                }
+                // merge step: advance whichever side has the smaller column
+                l_begin += (l_col <= lt_row);
+                lt_begin += (lt_row <= l_col);
+            }
+            auto new_val = a_val - sum;
+            if (row == col) {
+                new_val = sqrt(new_val);
+            } else {
+                // off-diagonal entry: divide by the last entry of row col
+                auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+                new_val = new_val / diag;
+            }
+            // keep the previous value if the update is not finite
+            if (is_finite(new_val)) {
+                l_vals[l_nz] = new_val;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+// Fills l_new with the lower-triangle entries of the merged a/llt pattern.
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *llt,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    matrix::Csr<ValueType, IndexType> *l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    constexpr auto sentinel = std::numeric_limits<IndexType>::max();
+    // pass 1: count entries with col <= row to set the row pointers of l_new
+    IndexType l_nnz{};
+    abstract_spgeam(
+        a, llt,
+        [&](IndexType row) {
+            l_new_row_ptrs[row] = l_nnz;
+            return 0;  // no per-row state in this pass
+        },
+        [&](IndexType row, IndexType col, ValueType, ValueType, int) {
+            l_nnz += col <= row;
+        },
+        [](IndexType, int) {});
+    l_new_row_ptrs[num_rows] = l_nnz;
+
+    // resize the column index and value arrays of l_new to the counted size
+    matrix::CsrBuilder<ValueType, IndexType> l_builder{l_new};
+    l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+    l_builder.get_value_array().resize_and_reset(l_nnz);
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+
+    // pass 2: write the entries; row_state carries the per-row positions
+    struct row_state {
+        IndexType l_new_nz;     // next write position in l_new
+        IndexType l_old_begin;  // next unread entry of this row of L
+        IndexType l_old_end;    // end of this row of L
+    };
+    abstract_spgeam(
+        a, llt,
+        [&](IndexType row) {
+            row_state state{};
+            state.l_new_nz = l_new_row_ptrs[row];
+            state.l_old_begin = l_row_ptrs[row];
+            state.l_old_end = l_row_ptrs[row + 1];
+            return state;
+        },
+        [&](IndexType row, IndexType col, ValueType a_val, ValueType llt_val,
+            row_state &state) {
+            auto r_val = a_val - llt_val;
+            // next unread entry of this row of L (sentinel/zero if exhausted)
+            auto l_col = checked_load(l_col_idxs, state.l_old_begin,
+                                      state.l_old_end, sentinel);
+            auto l_val = checked_load(l_vals, state.l_old_begin,
+                                      state.l_old_end, zero<ValueType>());
+            // last stored entry of row col of L, taken as its diagonal
+            auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+            // keep an entry already present in L, otherwise use the
+            // candidate value r_val / diag
+            auto out_val = l_col == col ? l_val : r_val / diag;
+            // only entries on or below the diagonal are stored
+            if (row >= col) {
+                l_new_col_idxs[state.l_new_nz] = col;
+                l_new_vals[state.l_new_nz] = out_val;
+                state.l_new_nz++;
+            }
+            // advance entry of L if we used it
+            state.l_old_begin += (l_col == col);
+        },
+        [](IndexType, row_state) {});
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp
index d7df460e2d5..e2234d52fdf 100644
--- a/reference/factorization/par_ilu_kernels.cpp
+++ b/reference/factorization/par_ilu_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
+#include <memory>
+
+
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -49,88 +52,6 @@ namespace reference {
 namespace par_ilu_factorization {
 
 
-template <typename ValueType, typename IndexType>
-void initialize_row_ptrs_l_u(
-    std::shared_ptr<const ReferenceExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *system_matrix,
-    IndexType *l_row_ptrs, IndexType *u_row_ptrs)
-{
-    auto row_ptrs = system_matrix->get_const_row_ptrs();
-    auto col_idxs = system_matrix->get_const_col_idxs();
-    size_type l_nnz{};
-    size_type u_nnz{};
-
-    l_row_ptrs[0] = 0;
-    u_row_ptrs[0] = 0;
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            size_type col = col_idxs[el];
-            if (col <= row) {
-                ++l_nnz;
-            }
-            if (col >= row) {
-                ++u_nnz;
-            }
-        }
-        l_row_ptrs[row + 1] = l_nnz;
-        u_row_ptrs[row + 1] = u_nnz;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void initialize_l_u(std::shared_ptr<const ReferenceExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType> *system_matrix,
-                    matrix::Csr<ValueType, IndexType> *csr_l,
-                    matrix::Csr<ValueType, IndexType> *csr_u)
-{
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
-    auto col_idxs_u = csr_u->get_col_idxs();
-    auto vals_u = csr_u->get_values();
-
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        size_type current_index_u = row_ptrs_u[row];
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // Update both L and U
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = one<ValueType>();
-                ++current_index_l;
-
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            } else {  // col > row
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            }
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void compute_l_u_factors(std::shared_ptr<const ReferenceExecutor> exec,
                          size_type iterations,
@@ -182,12 +103,12 @@ void compute_l_u_factors(std::shared_ptr<const ReferenceExecutor> exec,
 
             if (row > col) {  // modify entry in L
                 auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1];
-                if (::gko::isfinite(to_write)) {
+                if (is_finite(to_write)) {
                     vals_l[row_l - 1] = to_write;
                 }
             } else {  // modify entry in U
                 auto to_write = sum;
-                if (::gko::isfinite(to_write)) {
+                if (is_finite(to_write)) {
                     vals_u[row_u - 1] = to_write;
                 }
             }
diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp
new file mode 100644
index 00000000000..63df1c5634e
--- /dev/null
+++ b/reference/factorization/par_ilut_kernels.cpp
@@ -0,0 +1,473 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/utils.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "reference/components/csr_spgeam.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The parallel ilut factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+/**
+ * @internal
+ *
+ * Stores in `threshold` the magnitude of the entry of `m` that has 0-based
+ * rank `rank` when ordering by magnitude. `tmp` serves as scratch storage.
+ */
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *m,
+                      IndexType rank, Array<ValueType> &tmp,
+                      Array<remove_complex<ValueType>> &,
+                      remove_complex<ValueType> &threshold)
+{
+    IndexType num_vals = m->get_num_stored_elements();
+    // operate on a copy of the values: nth_element reorders its input range
+    tmp.resize_and_reset(num_vals);
+    auto first = tmp.get_data();
+    std::copy_n(m->get_const_values(), num_vals, first);
+    auto nth = first + rank;
+    auto by_magnitude = [](ValueType lhs, ValueType rhs) {
+        return abs(lhs) < abs(rhs);
+    };
+    std::nth_element(first, nth, first + num_vals, by_magnitude);
+    threshold = abs(*nth);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+/**
+ * Copies the entries of m for which pred(row, nz) is true into m_out, and
+ * (if m_out_coo is non-null) sets up m_out_coo with the same entries. Here,
+ * nz is the index of the entry in values/col_idxs. pred is evaluated twice
+ * per entry: once while counting and once while copying.
+ */
+template <typename Predicate, typename ValueType, typename IndexType>
+void abstract_filter(std::shared_ptr<const DefaultExecutor> exec,
+                     const matrix::Csr<ValueType, IndexType> *m,
+                     matrix::Csr<ValueType, IndexType> *m_out,
+                     matrix::Coo<ValueType, IndexType> *m_out_coo,
+                     Predicate pred)
+{
+    auto num_rows = m->get_size()[0];
+    auto row_ptrs = m->get_const_row_ptrs();
+    auto col_idxs = m->get_const_col_idxs();
+    auto vals = m->get_const_values();
+
+    // first sweep: count the entries kept in each row
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    for (size_type row = 0; row < num_rows; ++row) {
+        IndexType count{};
+        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
+            count += pred(row, nz);
+        }
+        new_row_ptrs[row] = count;
+    }
+
+    // build row pointers from the per-row counts
+    components::prefix_sum(exec, new_row_ptrs, num_rows + 1);
+
+    // second sweep: copy the kept entries
+    auto new_nnz = new_row_ptrs[num_rows];
+    // resize the output arrays and re-fetch their data pointers
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    IndexType *new_row_idxs{};
+    if (m_out_coo) {  // the COO output views the col_idxs/values of m_out
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        coo_builder.get_col_idx_array() =
+            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            Array<ValueType>::view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto new_nz = new_row_ptrs[row];
+        auto begin = row_ptrs[row];
+        auto end = row_ptrs[row + 1];
+        for (auto nz = begin; nz < end; ++nz) {
+            if (pred(row, nz)) {
+                if (new_row_idxs) {
+                    new_row_idxs[new_nz] = row;
+                }
+                new_col_idxs[new_nz] = col_idxs[nz];
+                new_vals[new_nz] = vals[nz];
+                ++new_nz;
+            }
+        }
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Filters `m` into `m_out`: keeps the diagonal and magnitudes >= `threshold`.
+ */
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *m,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType> *m_out,
+                      matrix::Coo<ValueType, IndexType> *m_out_coo, bool)
+{
+    const auto values = m->get_const_values();
+    const auto cols = m->get_const_col_idxs();
+    auto keep = [&](IndexType row, IndexType nz) {
+        return cols[nz] == row || abs(values[nz]) >= threshold;
+    };
+    abstract_filter(exec, m, m_out, m_out_coo, keep);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+
+constexpr auto bucket_count = 1 << sampleselect_searchtree_height;
+constexpr auto sample_size = bucket_count * sampleselect_oversampling;
+
+
+/**
+ * @internal
+ *
+ * Approximately selects the `rank`th smallest element (by magnitude) as a
+ * threshold and filters out all off-diagonal elements below this threshold.
+ */
+template <typename ValueType, typename IndexType>
+void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> &tmp,
+                             remove_complex<ValueType> &threshold,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto vals = m->get_const_values();
+    auto col_idxs = m->get_const_col_idxs();
+    auto size = static_cast<IndexType>(m->get_num_stored_elements());
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto storage_size = ceildiv(
+        sample_size * sizeof(AbsType) + bucket_count * sizeof(IndexType),
+        sizeof(ValueType));
+    tmp.resize_and_reset(storage_size);
+    // pick and sort sample; tmp only serves as raw storage for it
+    auto sample = reinterpret_cast<AbsType *>(tmp.get_data());
+    // assuming rounding towards zero
+    auto stride = double(size) / sample_size;
+    for (IndexType i = 0; i < sample_size; ++i) {
+        sample[i] = abs(vals[static_cast<IndexType>(i * stride)]);
+    }
+    std::sort(sample, sample + sample_size);
+    // pick splitters
+    for (IndexType i = 0; i < bucket_count - 1; ++i) {
+        // shift by one so we get upper bounds for the buckets
+        sample[i] = sample[(i + 1) * sampleselect_oversampling];
+    }
+    // count elements per bucket; the histogram is stored behind the splitters
+    auto histogram = reinterpret_cast<IndexType *>(sample + bucket_count);
+    for (IndexType bucket = 0; bucket < bucket_count; ++bucket) {
+        histogram[bucket] = 0;
+    }
+    for (IndexType nz = 0; nz < size; ++nz) {
+        auto bucket_it =
+            std::upper_bound(sample, sample + bucket_count - 1, abs(vals[nz]));
+        auto bucket = std::distance(sample, bucket_it);
+        // first bucket with sample[bucket] > abs(vals[nz]), else the last one
+        histogram[bucket]++;
+    }
+    // determine splitter ranks: prefix sum over bucket counts
+    components::prefix_sum(exec, histogram, bucket_count + 1);
+    // determine the bucket containing the threshold rank:
+    // prefix_sum[bucket] <= rank < prefix_sum[bucket + 1]
+    auto it = std::upper_bound(histogram, histogram + bucket_count + 1, rank);
+    auto threshold_bucket = std::distance(histogram + 1, it);
+    // sample contains upper bounds for the buckets
+    threshold = threshold_bucket > 0 ? sample[threshold_bucket - 1]
+                                     : zero<remove_complex<ValueType>>();
+    // filter elements, always keeping the diagonal
+    abstract_filter(
+        exec, m, m_out, m_out_coo, [&](IndexType row, IndexType nz) {
+            return abs(vals[nz]) >= threshold || col_idxs[nz] == row;
+        });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
+
+
+/**
+ * @internal
+ *
+ * Computes a ParILUT sweep: updates the values of l, u and u_csc in place.
+ */
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *a,
+                         matrix::Csr<ValueType, IndexType> *l,
+                         const matrix::Coo<ValueType, IndexType> *,
+                         matrix::Csr<ValueType, IndexType> *u,
+                         const matrix::Coo<ValueType, IndexType> *,
+                         matrix::Csr<ValueType, IndexType> *u_csc)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_values();
+    auto ut_col_ptrs = u_csc->get_const_row_ptrs();
+    auto ut_row_idxs = u_csc->get_const_col_idxs();
+    auto ut_vals = u_csc->get_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+
+    auto compute_sum = [&](IndexType row, IndexType col) {
+        // find the value of a(row, col), or zero if it is not stored
+        auto a_begin = a_row_ptrs[row];
+        auto a_end = a_row_ptrs[row + 1];
+        auto a_nz_it =
+            std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col);
+        auto a_nz = std::distance(a_col_idxs, a_nz_it);
+        auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col;
+        auto a_val = has_a ? a_vals[a_nz] : zero<ValueType>();
+        // accumulate l(row,:) * u(:,col) without the last entry (row, col)
+        ValueType sum{};
+        IndexType ut_nz{};
+        auto l_begin = l_row_ptrs[row];
+        auto l_end = l_row_ptrs[row + 1];
+        auto u_begin = ut_col_ptrs[col];
+        auto u_end = ut_col_ptrs[col + 1];
+        auto last_entry = min(row, col);
+        while (l_begin < l_end && u_begin < u_end) {
+            auto l_col = l_col_idxs[l_begin];
+            auto u_row = ut_row_idxs[u_begin];
+            if (l_col == u_row && l_col < last_entry) {
+                sum += l_vals[l_begin] * ut_vals[u_begin];
+            }
+            if (u_row == row) {  // remember where (row, col) is in u_csc
+                ut_nz = u_begin;
+            }
+            l_begin += (l_col <= u_row);
+            u_begin += (u_row <= l_col);
+        }
+        return std::make_pair(a_val - sum, ut_nz);
+    };
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1] - 1;
+             ++l_nz) {  // skips the last entry of the row
+            auto col = l_col_idxs[l_nz];
+            auto u_diag = ut_vals[ut_col_ptrs[col + 1] - 1];  // last in column
+            auto new_val = compute_sum(row, col).first / u_diag;
+            if (is_finite(new_val)) {  // keep the old value otherwise
+                l_vals[l_nz] = new_val;
+            }
+        }
+        for (size_type u_nz = u_row_ptrs[row]; u_nz < u_row_ptrs[row + 1];
+             ++u_nz) {
+            auto col = u_col_idxs[u_nz];
+            auto result = compute_sum(row, col);
+            auto new_val = result.first;
+            auto ut_nz = result.second;
+            if (is_finite(new_val)) {  // update u and u_csc consistently
+                u_vals[u_nz] = new_val;
+                ut_vals[ut_nz] = new_val;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+
+/**
+ * @internal
+ *
+ * Stores in l_new and u_new the factors l and u, extended by new entries from
+ * the sparsity pattern of a - lu. A new entry gets the residual value, which
+ * is divided by u's diagonal entry of its column if it is below the diagonal.
+ */
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *lu,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Csr<ValueType, IndexType> *l,
+                    const matrix::Csr<ValueType, IndexType> *u,
+                    matrix::Csr<ValueType, IndexType> *l_new,
+                    matrix::Csr<ValueType, IndexType> *u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    constexpr auto sentinel = std::numeric_limits<IndexType>::max();
+    // count nnz in the lower and upper triangle, both including the diagonal
+    IndexType l_nnz{};
+    IndexType u_nnz{};
+    abstract_spgeam(
+        a, lu,
+        [&](IndexType row) {
+            l_new_row_ptrs[row] = l_nnz;
+            u_new_row_ptrs[row] = u_nnz;
+            return 0;
+        },
+        [&](IndexType row, IndexType col, ValueType, ValueType, int) {
+            l_nnz += col <= row;
+            u_nnz += col >= row;
+        },
+        [](IndexType, int) {});
+    l_new_row_ptrs[num_rows] = l_nnz;
+    u_new_row_ptrs[num_rows] = u_nnz;
+
+    // resize the output arrays and fetch their data pointers
+    matrix::CsrBuilder<ValueType, IndexType> l_builder{l_new};
+    matrix::CsrBuilder<ValueType, IndexType> u_builder{u_new};
+    l_builder.get_col_idx_array().resize_and_reset(l_nnz);
+    l_builder.get_value_array().resize_and_reset(l_nnz);
+    u_builder.get_col_idx_array().resize_and_reset(u_nnz);
+    u_builder.get_value_array().resize_and_reset(u_nnz);
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    // accumulate non-zeros; row_state holds the read/write cursors of a row
+    struct row_state {
+        IndexType l_new_nz;
+        IndexType u_new_nz;
+        IndexType l_old_begin;
+        IndexType l_old_end;
+        IndexType u_old_begin;
+        IndexType u_old_end;
+        bool finished_l;
+    };
+    abstract_spgeam(
+        a, lu,
+        [&](IndexType row) {
+            row_state state{};
+            state.l_new_nz = l_new_row_ptrs[row];
+            state.u_new_nz = u_new_row_ptrs[row];
+            state.l_old_begin = l_row_ptrs[row];
+            state.l_old_end = l_row_ptrs[row + 1] - 1;  // skip diagonal
+            state.u_old_begin = u_row_ptrs[row];
+            state.u_old_end = u_row_ptrs[row + 1];
+            state.finished_l = (state.l_old_begin == state.l_old_end);
+            return state;
+        },
+        [&](IndexType row, IndexType col, ValueType a_val, ValueType lu_val,
+            row_state &state) {
+            auto r_val = a_val - lu_val;
+            // load next old entry of L + U: first from l, afterwards from u
+            auto lpu_col = state.finished_l
+                               ? checked_load(u_col_idxs, state.u_old_begin,
+                                              state.u_old_end, sentinel)
+                               : l_col_idxs[state.l_old_begin];
+            auto lpu_val =
+                state.finished_l
+                    ? checked_load(u_vals, state.u_old_begin, state.u_old_end,
+                                   zero<ValueType>())
+                    : l_vals[state.l_old_begin];
+            // lower entries: load U's diagonal, read as first entry of row col
+            auto diag = col < row ? u_vals[u_row_ptrs[col]] : one<ValueType>();
+            // if there is already an entry present, use that instead.
+            auto out_val = lpu_col == col ? lpu_val : r_val / diag;
+            // store output entries; the diagonal of l_new is always set to one
+            if (row >= col) {
+                l_new_col_idxs[state.l_new_nz] = col;
+                l_new_vals[state.l_new_nz] =
+                    row == col ? one<ValueType>() : out_val;
+                state.l_new_nz++;
+            }
+            if (row <= col) {
+                u_new_col_idxs[state.u_new_nz] = col;
+                u_new_vals[state.u_new_nz] = out_val;
+                state.u_new_nz++;
+            }
+            // advance entry of L + U if we used it
+            if (state.finished_l) {
+                state.u_old_begin += (lpu_col == col);
+            } else {
+                state.l_old_begin += (lpu_col == col);
+                state.finished_l = (state.l_old_begin == state.l_old_end);
+            }
+        },
+        [](IndexType, row_state) {});
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp
index 76f15e45766..74a0355be68 100644
--- a/reference/matrix/coo_kernels.cpp
+++ b/reference/matrix/coo_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -145,8 +145,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const ReferenceExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -165,8 +165,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto coo_val = source->get_const_values();
     auto coo_col = source->get_const_col_idxs();
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 2d2adcae1d5..18d39412c95 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,11 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <utility>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -48,7 +49,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
+#include "core/base/allocator.hpp"
 #include "core/base/iterator_factory.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "reference/components/csr_spgeam.hpp"
 #include "reference/components/format_conversion.hpp"
 
 
@@ -123,6 +128,234 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void spgemm_insert_row(unordered_set<IndexType> &cols,
+                       const matrix::Csr<ValueType, IndexType> *c,
+                       size_type row)
+{
+    const auto idxs = c->get_const_col_idxs();  // columns of all rows of c
+    cols.insert(idxs + c->get_const_row_ptrs()[row],
+                idxs + c->get_const_row_ptrs()[row + 1]);
+}
+
+
+template <typename ValueType, typename IndexType>
+void spgemm_insert_row2(unordered_set<IndexType> &cols,
+                        const matrix::Csr<ValueType, IndexType> *a,
+                        const matrix::Csr<ValueType, IndexType> *b,
+                        size_type row)
+{
+    const auto a_ptrs = a->get_const_row_ptrs();
+    const auto a_cols = a->get_const_col_idxs();
+    const auto b_ptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    // every entry (row, k) of a contributes the sparsity pattern of row k of b
+    const auto a_end = size_type(a_ptrs[row + 1]);
+    for (size_type a_nz = a_ptrs[row]; a_nz < a_end; ++a_nz) {
+        const auto b_row = a_cols[a_nz];
+        const auto b_first = b_cols + b_ptrs[b_row];
+        cols.insert(b_first, b_cols + b_ptrs[b_row + 1]);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void spgemm_accumulate_row(map<IndexType, ValueType> &cols,
+                           const matrix::Csr<ValueType, IndexType> *c,
+                           ValueType scale, size_type row)
+{
+    const auto c_ptrs = c->get_const_row_ptrs();
+    const auto c_cols = c->get_const_col_idxs();
+    const auto c_vals = c->get_const_values();
+    // cols[col] += scale * c(row, col) for every stored entry of the row
+    const auto c_end = size_type(c_ptrs[row + 1]);
+    for (size_type nz = c_ptrs[row]; nz < c_end; ++nz) {
+        const auto col = c_cols[nz];
+        cols[col] += scale * c_vals[nz];
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void spgemm_accumulate_row2(map<IndexType, ValueType> &cols,
+                            const matrix::Csr<ValueType, IndexType> *a,
+                            const matrix::Csr<ValueType, IndexType> *b,
+                            ValueType scale, size_type row)
+{
+    const auto a_ptrs = a->get_const_row_ptrs();
+    const auto a_cols = a->get_const_col_idxs();
+    const auto a_values = a->get_const_values();
+    const auto b_ptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    const auto b_values = b->get_const_values();
+    // cols[j] += scale * a(row, k) * b(k, j) over all stored pairs of entries,
+    // i.e. the given row of scale * a * b is added to the accumulator
+    const auto a_end = size_type(a_ptrs[row + 1]);
+    for (size_type a_nz = a_ptrs[row]; a_nz < a_end; ++a_nz) {
+        const auto b_row = a_cols[a_nz];
+        const auto a_val = a_values[a_nz];
+        const auto b_end = size_type(b_ptrs[b_row + 1]);
+        for (size_type b_nz = b_ptrs[b_row]; b_nz < b_end; ++b_nz) {
+            const auto b_col = b_cols[b_nz];
+            cols[b_col] += scale * a_val * b_values[b_nz];
+        }
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void spgemm(std::shared_ptr<const ReferenceExecutor> exec,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    // computes c = a * b in two passes over the rows of a
+    const auto num_rows = a->get_size()[0];
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    // pass 1: the nnz count of a row is the number of distinct columns in it
+    unordered_set<IndexType> row_pattern(exec);
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_pattern.clear();
+        spgemm_insert_row2(row_pattern, a, b, row);
+        c_row_ptrs[row] = row_pattern.size();
+    }
+
+    // turn the per-row counts into row pointers
+    components::prefix_sum(exec, c_row_ptrs, num_rows + 1);
+
+    // pass 2: allocate the output arrays and fill them row by row
+    const auto c_nnz = c_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto &col_idx_array = c_builder.get_col_idx_array();
+    auto &value_array = c_builder.get_value_array();
+    col_idx_array.resize_and_reset(c_nnz);
+    value_array.resize_and_reset(c_nnz);
+    const auto c_col_idxs = col_idx_array.get_data();
+    const auto c_vals = value_array.get_data();
+
+    map<IndexType, ValueType> row_entries(exec);
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_entries.clear();
+        spgemm_accumulate_row2(row_entries, a, b, one<ValueType>(), row);
+        // write out the accumulated entries in the map's iteration order
+        auto out_nz = c_row_ptrs[row];
+        for (const auto &entry : row_entries) {
+            c_col_idxs[out_nz] = entry.first;
+            c_vals[out_nz] = entry.second;
+            ++out_nz;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_spgemm(std::shared_ptr<const ReferenceExecutor> exec,
+                     const matrix::Dense<ValueType> *alpha,
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     const matrix::Dense<ValueType> *beta,
+                     const matrix::Csr<ValueType, IndexType> *d,
+                     matrix::Csr<ValueType, IndexType> *c)
+{
+    // computes c = alpha * a * b + beta * d in two passes over the rows
+    const auto num_rows = a->get_size()[0];
+    const auto alpha_val = alpha->at(0, 0);
+    const auto beta_val = beta->at(0, 0);
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    // pass 1: count the distinct columns of d and a * b in each row
+    unordered_set<IndexType> row_pattern(exec);
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_pattern.clear();
+        spgemm_insert_row(row_pattern, d, row);
+        spgemm_insert_row2(row_pattern, a, b, row);
+        c_row_ptrs[row] = row_pattern.size();
+    }
+
+    // turn the per-row counts into row pointers
+    components::prefix_sum(exec, c_row_ptrs, num_rows + 1);
+
+    // pass 2: allocate the output arrays and fill them row by row
+    const auto c_nnz = c_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto &col_idx_array = c_builder.get_col_idx_array();
+    auto &value_array = c_builder.get_value_array();
+    col_idx_array.resize_and_reset(c_nnz);
+    value_array.resize_and_reset(c_nnz);
+    const auto c_col_idxs = col_idx_array.get_data();
+    const auto c_vals = value_array.get_data();
+
+    map<IndexType, ValueType> row_entries(exec);
+    for (size_type row = 0; row < num_rows; ++row) {
+        row_entries.clear();
+        spgemm_accumulate_row(row_entries, d, beta_val, row);
+        spgemm_accumulate_row2(row_entries, a, b, alpha_val, row);
+        // write out the accumulated entries in the map's iteration order
+        auto out_nz = c_row_ptrs[row];
+        for (const auto &entry : row_entries) {
+            c_col_idxs[out_nz] = entry.first;
+            c_vals[out_nz] = entry.second;
+            ++out_nz;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const ReferenceExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    // computes c = alpha * a + beta * b in two passes of abstract_spgeam
+    const auto num_rows = a->get_size()[0];
+    const auto alpha_val = alpha->at(0, 0);
+    const auto beta_val = beta->at(0, 0);
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    // pass 1: count the entries abstract_spgeam visits in every row
+    abstract_spgeam(
+        a, b, [](IndexType) { return IndexType{}; },
+        [](IndexType, IndexType, ValueType, ValueType, IndexType &count) {
+            ++count;
+        },
+        [&](IndexType row, IndexType count) { c_row_ptrs[row] = count; });
+
+    // turn the per-row counts into row pointers
+    components::prefix_sum(exec, c_row_ptrs, num_rows + 1);
+
+    // pass 2: allocate the output arrays and fill them row by row
+    const auto c_nnz = c_row_ptrs[num_rows];
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto &col_idx_array = c_builder.get_col_idx_array();
+    auto &value_array = c_builder.get_value_array();
+    col_idx_array.resize_and_reset(c_nnz);
+    value_array.resize_and_reset(c_nnz);
+    const auto c_col_idxs = col_idx_array.get_data();
+    const auto c_vals = value_array.get_data();
+
+    abstract_spgeam(
+        a, b, [&](IndexType row) { return c_row_ptrs[row]; },
+        [&](IndexType, IndexType col, ValueType a_val, ValueType b_val,
+            IndexType &out_nz) {
+            c_col_idxs[out_nz] = col;
+            c_vals[out_nz] = alpha_val * a_val + beta_val * b_val;
+            ++out_nz;
+        },
+        [](IndexType, IndexType) {});
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
+
 template <typename IndexType>
 void convert_row_ptrs_to_idxs(std::shared_ptr<const ReferenceExecutor> exec,
                               const IndexType *ptrs, size_type num_rows,
@@ -134,8 +367,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const ReferenceExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -150,8 +383,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -176,8 +409,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -239,8 +472,10 @@ void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
             }
         }
     }
-    slice_sets[slice_num] =
-        slice_sets[slice_num - 1] + slice_lengths[slice_num - 1];
+    if (slice_num > 0) {
+        slice_sets[slice_num] =
+            slice_sets[slice_num - 1] + slice_lengths[slice_num - 1];
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -281,8 +516,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto num_cols = source->get_size()[1];
@@ -310,7 +545,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
-template <typename IndexType, typename ValueType, typename UnaryOperator>
+template <typename ValueType, typename IndexType, typename UnaryOperator>
 inline void convert_csr_to_csc(size_type num_rows, const IndexType *row_ptrs,
                                const IndexType *col_idxs,
                                const ValueType *csr_vals, IndexType *row_idxs,
@@ -355,8 +590,8 @@ void transpose_and_transform(std::shared_ptr<const ReferenceExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return x; });
@@ -367,8 +602,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return conj(x); });
@@ -400,8 +635,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -454,6 +689,139 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
+// Writes into row_permuted the matrix whose row i is row perm[i] of orig; the
+// entries of a row keep their stored order. Nothing is resized here, so
+// row_permuted must already provide storage matching orig.
+template <typename ValueType, typename IndexType>
+void row_permute_impl(std::shared_ptr<const ReferenceExecutor> exec,
+                      const Array<IndexType> *permutation_indices,
+                      const matrix::Csr<ValueType, IndexType> *orig,
+                      matrix::Csr<ValueType, IndexType> *row_permuted)
+{
+    auto perm = permutation_indices->get_const_data();
+    auto orig_row_ptrs = orig->get_const_row_ptrs();
+    auto orig_col_idxs = orig->get_const_col_idxs();
+    auto orig_vals = orig->get_const_values();
+    auto rp_row_ptrs = row_permuted->get_row_ptrs();
+    auto rp_col_idxs = row_permuted->get_col_idxs();
+    auto rp_vals = row_permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+    // row pointers: running sum of the source row lengths in permuted order
+    rp_row_ptrs[0] = 0;
+    for (size_type row = 0; row < num_rows; ++row) {
+        const auto src = perm[row];
+        const auto len = orig_row_ptrs[src + 1] - orig_row_ptrs[src];
+        rp_row_ptrs[row + 1] = rp_row_ptrs[row] + len;
+    }
+    // equals the running sum for a valid permutation; set explicitly anyway
+    rp_row_ptrs[num_rows] = orig_row_ptrs[num_rows];
+    // copy the column indices and values of each source row to its new place
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_k = orig_row_ptrs[perm[row]];
+        for (size_type k = rp_row_ptrs[row];
+             k < size_type(rp_row_ptrs[row + 1]); ++k) {
+            rp_col_idxs[k] = orig_col_idxs[src_k];
+            rp_vals[k] = orig_vals[src_k];
+            ++src_k;
+        }
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
+{  // kernel entry point: applies the permutation exactly as given
+    row_permute_impl(exec, permutation_indices, orig, row_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+
+
+// Row-permutes orig by the inverse permutation, inverse[perm[i]] = i.
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
+{
+    const auto forward = permutation_indices->get_const_data();
+    // a copy of the input provides an array of the right length to overwrite
+    Array<IndexType> inverse(*permutation_indices);
+    for (size_type i = 0; i < inverse.get_num_elems(); ++i) {
+        inverse.get_data()[forward[i]] = i;
+    }
+    row_permute_impl(exec, &inverse, orig, row_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+// Copies orig into column_permuted, mapping each column index c to perm[c].
+// Entries are not re-sorted, so indices within a row may end up unordered.
+template <typename ValueType, typename IndexType>
+void column_permute_impl(const Array<IndexType> *permutation_indices,
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *column_permuted)
+{
+    auto perm = permutation_indices->get_const_data();
+    auto orig_row_ptrs = orig->get_const_row_ptrs();
+    auto orig_col_idxs = orig->get_const_col_idxs();
+    auto orig_vals = orig->get_const_values();
+    auto cp_row_ptrs = column_permuted->get_row_ptrs();
+    auto cp_col_idxs = column_permuted->get_col_idxs();
+    auto cp_vals = column_permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        cp_row_ptrs[row] = orig_row_ptrs[row];
+        for (size_type k = orig_row_ptrs[row];
+             k < size_type(orig_row_ptrs[row + 1]); ++k) {
+            cp_col_idxs[k] = perm[orig_col_idxs[k]];
+            cp_vals[k] = orig_vals[k];
+        }
+    }
+    cp_row_ptrs[num_rows] = orig_row_ptrs[num_rows];
+}
+
+
+// Inverts the permutation (inverse[perm[i]] = i), then relabels the columns.
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
+{
+    const auto forward = permutation_indices->get_const_data();
+    Array<IndexType> inverse(*permutation_indices);
+    for (size_type i = 0; i < inverse.get_num_elems(); ++i) {
+        inverse.get_data()[forward[i]] = i;
+    }
+    column_permute_impl(&inverse, orig, column_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
+{  // hands the permutation to column_permute_impl without inverting it
+    column_permute_impl(permutation_indices, orig, column_permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row(std::shared_ptr<const ReferenceExecutor> exec,
                                 const matrix::Csr<ValueType, IndexType> *source,
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 82a20a8b1a4..a55a8b1d24f 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/dense_kernels.hpp"
 
 
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -43,9 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include <algorithm>
-
-
 namespace gko {
 namespace kernels {
 namespace reference {
@@ -181,14 +182,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const ReferenceExecutor> exec,
                    const matrix::Dense<ValueType> *x,
-                   matrix::Dense<ValueType> *result)
+                   matrix::Dense<remove_complex<ValueType>> *result)
 {
-    compute_dot(exec, x, x, result);
-    for (size_type i = 0; i < result->get_size()[0]; ++i) {
-        for (size_type j = 0; j < result->get_size()[1]; ++j) {
-            result->at(i, j) = sqrt(abs(result->at(i, j)));
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        result->at(0, j) = zero<remove_complex<ValueType>>();
+    }
+    for (size_type i = 0; i < x->get_size()[0]; ++i) {
+        for (size_type j = 0; j < x->get_size()[1]; ++j) {
+            result->at(0, j) += squared_norm(x->at(i, j));
         }
     }
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        result->at(0, j) = sqrt(result->at(0, j));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
@@ -196,8 +202,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -227,8 +233,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -259,8 +265,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -291,8 +297,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -346,8 +352,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -407,8 +413,11 @@ void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
             }
         }
     }
-    slice_sets[slice_num] =
-        slice_sets[slice_num - 1] + slice_lengths[slice_num - 1];
+
+    if (slice_num > 0) {
+        slice_sets[slice_num] =
+            slice_sets[slice_num - 1] + slice_lengths[slice_num - 1];
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -417,8 +426,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -546,8 +555,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
@@ -561,8 +570,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
@@ -574,6 +583,77 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                 const Array<IndexType> *permutation_indices,
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted)
+{  // gather: row r of row_permuted is a copy of row perm[r] of orig
+    const auto perm = permutation_indices->get_const_data();
+    for (size_type row = 0; row < orig->get_size()[0]; ++row) {
+        for (size_type col = 0; col < orig->get_size()[1]; ++col) {
+            row_permuted->at(row, col) = orig->at(perm[row], col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void column_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                    const Array<IndexType> *permutation_indices,
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
+{  // gather: column c of column_permuted is a copy of column perm[c] of orig
+    const auto perm = permutation_indices->get_const_data();
+    for (size_type col = 0; col < orig->get_size()[1]; ++col) {
+        for (size_type row = 0; row < orig->get_size()[0]; ++row) {
+            column_permuted->at(row, col) = orig->at(row, perm[col]);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                         const Array<IndexType> *permutation_indices,
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
+{  // scatter: row r of orig is copied to row perm[r] of row_permuted
+    const auto perm = permutation_indices->get_const_data();
+    for (size_type row = 0; row < orig->get_size()[0]; ++row) {
+        for (size_type col = 0; col < orig->get_size()[1]; ++col) {
+            row_permuted->at(perm[row], col) = orig->at(row, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                            const Array<IndexType> *permutation_indices,
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
+{  // scatter: column c of orig lands in column perm[c] of column_permuted
+    const auto perm = permutation_indices->get_const_data();
+    for (size_type col = 0; col < orig->get_size()[1]; ++col) {
+        for (size_type row = 0; row < orig->get_size()[0]; ++row) {
+            column_permuted->at(row, perm[col]) = orig->at(row, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
 }  // namespace dense
 }  // namespace reference
 }  // namespace kernels
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 318969ea258..0f21a6c2f3a 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -106,8 +106,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -130,8 +130,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto max_nnz_per_row = source->get_num_stored_elements_per_row();
diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp
index 7e0acc4bc50..74e126334e2 100644
--- a/reference/matrix/hybrid_kernels.cpp
+++ b/reference/matrix/hybrid_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -58,8 +58,8 @@ namespace hybrid {
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Hybrid<ValueType, IndexType> *source)
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -93,8 +93,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto csr_val = result->get_values();
     auto csr_col_idxs = result->get_col_idxs();
diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp
index 85d2a705982..43e01b51fb1 100644
--- a/reference/matrix/sellp_kernels.cpp
+++ b/reference/matrix/sellp_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -125,8 +125,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -161,8 +161,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto slice_size = source->get_slice_size();
diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp
index 42b4edd88a2..70ab3b15aff 100644
--- a/reference/matrix/sparsity_csr_kernels.cpp
+++ b/reference/matrix/sparsity_csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -143,9 +143,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void remove_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
                               const IndexType *row_ptrs,
-                              const IndexType *col_idxs)
+                              const IndexType *col_idxs,
+                              matrix::SparsityCsr<ValueType, IndexType> *matrix)
 {
     auto num_rows = matrix->get_size()[0];
     auto adj_ptrs = matrix->get_row_ptrs();
@@ -193,8 +193,8 @@ inline void convert_sparsity_to_csc(size_type num_rows,
 template <typename ValueType, typename IndexType>
 void transpose_and_transform(
     std::shared_ptr<const ReferenceExecutor> exec,
-    matrix::SparsityCsr<ValueType, IndexType> *trans,
-    const matrix::SparsityCsr<ValueType, IndexType> *orig)
+    const matrix::SparsityCsr<ValueType, IndexType> *orig,
+    matrix::SparsityCsr<ValueType, IndexType> *trans)
 {
     auto trans_row_ptrs = trans->get_row_ptrs();
     auto orig_row_ptrs = orig->get_const_row_ptrs();
@@ -216,10 +216,10 @@ void transpose_and_transform(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
 {
-    transpose_and_transform(exec, trans, orig);
+    transpose_and_transform(exec, orig, trans);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp
new file mode 100644
index 00000000000..6a3a682e395
--- /dev/null
+++ b/reference/preconditioner/isai_kernels.cpp
@@ -0,0 +1,316 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/preconditioner/isai_kernels.hpp"
+
+
+#include <algorithm>
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The Isai preconditioner namespace.
+ *
+ * @ingroup isai
+ */
+namespace isai {
+
+
+// Two-pointer walk over both index ranges: whenever the current entries are
+// equal, calls cb(value, position in fst, position in snd).
+template <typename IndexType, typename Callback>
+void forall_matching(const IndexType *fst, IndexType fst_size,
+                     const IndexType *snd, IndexType snd_size, Callback cb)
+{
+    for (IndexType lhs{}, rhs{}; lhs < fst_size && rhs < snd_size;) {
+        const auto lhs_val = fst[lhs];
+        const auto rhs_val = snd[rhs];
+        if (lhs_val == rhs_val) {
+            cb(lhs_val, lhs, rhs);
+        }
+        // step past the smaller value; on a match both sides move on
+        lhs += (lhs_val <= rhs_val);
+        rhs += (lhs_val >= rhs_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename Callable>
+void generic_generate(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *mtx,
+                      matrix::Csr<ValueType, IndexType> *inverse_mtx,
+                      IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs,
+                      Callable trs_solve)
+{
+    /*
+    Fills every row of inverse_mtx (aiM) that stores at most row_size_limit
+    entries; longer rows are only counted for the excess system (see below).
+    Notation: M := mtx, I := identity matrix, e(i) := i-th unit vector,
+    S(i) := columns stored in row i of aiM, D(i) := M[S(i), S(i)]
+
+    Target: Solving (aiM * M = I)_{S} (aiM * M = I for the sparsity pattern S)
+    aiM[i, :] * D(i) = e(i)^T
+    <=> D(i)^T * aiM[i, :]^T = e(i)   =^ Triangular system (Trs)
+    Per short row: assemble D(i) densely, call trs_solve(D(i), rhs) and copy
+    rhs into row i of aiM (non-finite values become the identity entry).
+    excess_rhs_ptrs / excess_nz_ptrs (num_rows + 1 entries each) receive the
+    running count of entries / matching non-zeros of all preceding long rows.
+    */
+    const auto num_rows = mtx->get_size()[0];
+    const auto m_row_ptrs = mtx->get_const_row_ptrs();
+    const auto m_cols = mtx->get_const_col_idxs();
+    const auto m_vals = mtx->get_const_values();
+    const auto i_row_ptrs = inverse_mtx->get_const_row_ptrs();
+    const auto i_cols = inverse_mtx->get_const_col_idxs();
+    auto i_vals = inverse_mtx->get_values();
+    // RHS for local trisystem; trs_solve leaves the solution in here
+    gko::Array<ValueType> rhs_array{exec, row_size_limit};
+    auto rhs = rhs_array.get_data();
+    // row-major storage of the dense trisystem (row_size_limit^2 entries)
+    gko::Array<ValueType> trisystem_array{exec,
+                                          row_size_limit * row_size_limit};
+    auto trisystem_ptr = trisystem_array.get_data();
+    // stores the next free index in the excess rhs/solution
+    IndexType excess_rhs_begin{};
+    // stores the next free non-zero index in the excess system
+    IndexType excess_nz_begin{};
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        const auto i_begin = i_row_ptrs[row];
+        const auto i_size = i_row_ptrs[row + 1] - i_begin;
+        excess_rhs_ptrs[row] = excess_rhs_begin;
+        excess_nz_ptrs[row] = excess_nz_begin;
+
+        if (i_size <= row_size_limit) {
+            // short rows: treat directly as dense system
+            // the explicit size_type casts are a workaround that gets rid of
+            // a few warnings and compilation issues
+            auto trisystem = range<accessor::row_major<ValueType, 2>>(
+                trisystem_ptr, static_cast<size_type>(i_size),
+                static_cast<size_type>(i_size), static_cast<size_type>(i_size));
+            std::fill_n(trisystem_ptr, i_size * i_size, zero<ValueType>());
+
+            for (size_type i = 0; i < i_size; ++i) {
+                const auto col = i_cols[i_begin + i];
+                const auto m_begin = m_row_ptrs[col];
+                const auto m_size = m_row_ptrs[col + 1] - m_begin;
+                forall_matching(
+                    m_cols + m_begin, m_size, i_cols + i_begin, i_size,
+                    [&](IndexType, IndexType m_idx, IndexType i_idx) {
+                        trisystem(i, i_idx) = m_vals[m_idx + m_begin];
+                    });
+            }
+
+            // solve dense triangular system
+            trs_solve(trisystem, rhs);
+
+            // write triangular solution to inverse
+            for (size_type i = 0; i < i_size; ++i) {
+                const auto new_val = rhs[i];
+                const auto idx = i_begin + i;
+                // check for non-finite elements which should not be copied over
+                if (is_finite(new_val)) {
+                    i_vals[idx] = new_val;
+                } else {
+                    // ensure the preconditioner does not prevent convergence
+                    i_vals[idx] = i_cols[idx] == row ? one<ValueType>()
+                                                     : zero<ValueType>();
+                }
+            }
+        } else {
+            // long rows: count non-zeros and dimension of the excess system
+            for (size_type i = 0; i < i_size; ++i) {
+                const auto col = i_cols[i_begin + i];
+                const auto m_begin = m_row_ptrs[col];
+                const auto m_size = m_row_ptrs[col + 1] - m_begin;
+                forall_matching(m_cols + m_begin, m_size, i_cols + i_begin,
+                                i_size, [&](IndexType, IndexType, IndexType) {
+                                    ++excess_nz_begin;
+                                });
+                ++excess_rhs_begin;
+            }
+        }
+    }
+    excess_rhs_ptrs[num_rows] = excess_rhs_begin;
+    excess_nz_ptrs[num_rows] = excess_nz_begin;
+}
+
+// Runs generic_generate with a dense substitution solver chosen by `lower`.
+template <typename ValueType, typename IndexType>
+void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Csr<ValueType, IndexType> *mtx,
+                          matrix::Csr<ValueType, IndexType> *inverse_mtx,
+                          IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs,
+                          bool lower)
+{
+    auto trs_solve =
+        [lower](const range<accessor::row_major<ValueType, 2>> trisystem,
+                ValueType *rhs) {
+            const IndexType size = trisystem.length(0);
+            if (size <= 0) {
+                return;
+            }
+            // RHS is a unit vector: zero everywhere except for the diagonal
+            // entry, which is the last one if lower is set, else the first
+            std::fill_n(rhs, size, zero<ValueType>());
+            rhs[lower ? size - 1 : 0] = one<ValueType>();
+
+            // solve the transposed triangular system in place in rhs
+            if (lower) {
+                for (auto col = size - 1; col >= 0; --col) {
+                    const auto diag = trisystem(col, col);
+                    const auto bot = rhs[col] / diag;
+                    rhs[col] = bot;
+                    // do a backwards substitution on the entries before col
+                    for (auto row = col - 1; row >= 0; --row) {
+                        rhs[row] -= bot * trisystem(col, row);
+                    }
+                }
+            } else {
+                for (IndexType col = 0; col < size; ++col) {
+                    const auto diag = trisystem(col, col);
+                    const auto top = rhs[col] / diag;
+                    rhs[col] = top;
+                    // do a forward substitution on the entries after col
+                    for (auto row = col + 1; row < size; ++row) {
+                        rhs[row] -= top * trisystem(col, row);
+                    }
+                }
+            }
+        };
+
+    generic_generate(exec, mtx, inverse_mtx, excess_rhs_ptrs, excess_nz_ptrs,
+                     trs_solve);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
+
+// Builds excess system + RHS from the inverse rows longer than row_size_limit.
+template <typename ValueType, typename IndexType>
+void generate_excess_system(std::shared_ptr<const DefaultExecutor>,
+                            const matrix::Csr<ValueType, IndexType> *input,
+                            const matrix::Csr<ValueType, IndexType> *inverse,
+                            const IndexType *, const IndexType *,
+                            matrix::Csr<ValueType, IndexType> *excess_system,
+                            matrix::Dense<ValueType> *excess_rhs)
+{
+    const auto num_rows = input->get_size()[0];
+    const auto m_row_ptrs = input->get_const_row_ptrs();
+    const auto m_cols = input->get_const_col_idxs();
+    const auto m_vals = input->get_const_values();
+    const auto i_row_ptrs = inverse->get_const_row_ptrs();
+    const auto i_cols = inverse->get_const_col_idxs();
+    const auto e_dim = excess_rhs->get_size()[0];
+    auto e_row_ptrs = excess_system->get_row_ptrs();
+    auto e_cols = excess_system->get_col_idxs();
+    auto e_vals = excess_system->get_values();
+    auto e_rhs = excess_rhs->get_values();
+    IndexType e_block_begin{};
+    IndexType e_nz{};
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        const auto i_begin = i_row_ptrs[row];
+        const auto i_size = i_row_ptrs[row + 1] - i_begin;
+
+        if (i_size > row_size_limit) {
+            // long row: append its block of rows to the excess system
+            for (size_type i = 0; i < i_size; ++i) {
+                // current row in the excess system
+                const auto e_row = e_block_begin + i;
+                const auto col = i_cols[i_begin + i];
+                const auto m_begin = m_row_ptrs[col];
+                const auto m_size = m_row_ptrs[col + 1] - m_begin;
+                // store row pointers: one row per non-zero of inverse row
+                e_row_ptrs[e_row] = e_nz;
+                // build right-hand side: identity row
+                e_rhs[e_row] =
+                    row == col ? one<ValueType>() : zero<ValueType>();
+                // build sparse block
+                forall_matching(
+                    m_cols + m_begin, m_size, i_cols + i_begin, i_size,
+                    [&](IndexType, IndexType m_idx, IndexType i_idx) {
+                        // same entry as trisystem(i, i_idx) in generic_generate,
+                        // stored sparsely; columns are shifted to this block
+                        e_cols[e_nz] = i_idx + e_block_begin;
+                        e_vals[e_nz] = m_vals[m_idx + m_begin];
+                        ++e_nz;
+                    });
+            }
+            e_block_begin += i_size;
+        }
+    }
+    e_row_ptrs[e_dim] = e_nz;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+
+
+// Copies each row's slice of excess_solution (delimited by excess_block_ptrs)
+// over the leading values of the same row in inverse.
+template <typename ValueType, typename IndexType>
+void scatter_excess_solution(std::shared_ptr<const DefaultExecutor>,
+                             const IndexType *excess_block_ptrs,
+                             const matrix::Dense<ValueType> *excess_solution,
+                             matrix::Csr<ValueType, IndexType> *inverse)
+{
+    const auto src = excess_solution->get_const_values();
+    const auto dst = inverse->get_values();
+    const auto dst_offsets = inverse->get_const_row_ptrs();
+    const auto row_count = inverse->get_size()[0];
+    for (size_type row = 0; row < row_count; ++row) {
+        const auto len = excess_block_ptrs[row + 1] - excess_block_ptrs[row];
+        std::copy_n(src + excess_block_ptrs[row], len, dst + dst_offsets[row]);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp
index f2972965273..d72065452e0 100644
--- a/reference/preconditioner/jacobi_kernels.cpp
+++ b/reference/preconditioner/jacobi_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,9 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
+#include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <numeric>
-#include <vector>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -44,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/allocator.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "reference/components/matrix_operations.hpp"
@@ -66,15 +68,9 @@ inline bool has_same_nonzero_pattern(const IndexType *prev_row_ptr,
                                      const IndexType *curr_row_ptr,
                                      const IndexType *next_row_ptr)
 {
-    if (next_row_ptr - curr_row_ptr != curr_row_ptr - prev_row_ptr) {
-        return false;
-    }
-    for (; curr_row_ptr < next_row_ptr; ++prev_row_ptr, ++curr_row_ptr) {
-        if (*curr_row_ptr != *prev_row_ptr) {
-            return false;
-        }
-    }
-    return true;
+    return std::distance(curr_row_ptr, next_row_ptr) ==
+               std::distance(prev_row_ptr, curr_row_ptr) &&
+           std::equal(curr_row_ptr, next_row_ptr, prev_row_ptr);
 }
 
 
@@ -253,6 +249,24 @@ inline void transpose_block(IndexType block_size, const SourceValueType *from,
 }
 
 
+// Stores the conjugate transpose of the from block in to, converting values.
+template <typename SourceValueType, typename ResultValueType,
+          typename IndexType,
+          typename ValueConverter =
+              default_converter<SourceValueType, ResultValueType>>
+inline void conj_transpose_block(IndexType block_size,
+                                 const SourceValueType *from,
+                                 size_type from_stride, ResultValueType *to,
+                                 size_type to_stride,
+                                 ValueConverter converter = {}) noexcept
+{
+    for (IndexType r = 0; r < block_size; ++r) {
+        for (IndexType c = 0; c < block_size; ++c) {
+            to[r * to_stride + c] = conj(converter(from[r + c * from_stride]));
+        }
+    }
+}
+
 template <typename SourceValueType, typename ResultValueType,
           typename IndexType,
           typename ValueConverter =
@@ -295,13 +309,13 @@ inline bool invert_block(IndexType block_size, IndexType *perm,
 
 
 template <typename ReducedType, typename ValueType, typename IndexType>
-inline bool validate_precision_reduction_feasibility(IndexType block_size,
-                                                     const ValueType *block,
-                                                     size_type stride)
+inline bool validate_precision_reduction_feasibility(
+    std::shared_ptr<const ReferenceExecutor> exec, IndexType block_size,
+    const ValueType *block, size_type stride)
 {
     using gko::detail::float_traits;
-    std::vector<ValueType> tmp(block_size * block_size);
-    std::vector<IndexType> perm(block_size);
+    vector<ValueType> tmp(block_size * block_size, {}, exec);
+    vector<IndexType> perm(block_size, {}, exec);
     std::iota(begin(perm), end(perm), IndexType{0});
     for (IndexType i = 0; i < block_size; ++i) {
         for (IndexType j = 0; j < block_size; ++j) {
@@ -341,9 +355,9 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
     const auto group_size = storage_scheme.get_group_size();
     const auto cond = conditioning.get_data();
     for (size_type g = 0; g < num_blocks; g += group_size) {
-        std::vector<Array<ValueType>> block(group_size);
-        std::vector<Array<IndexType>> perm(group_size);
-        std::vector<uint32> pr_descriptors(group_size, uint32{} - 1);
+        vector<Array<ValueType>> block(group_size, {}, exec);
+        vector<Array<IndexType>> perm(group_size, {}, exec);
+        vector<uint32> pr_descriptors(group_size, uint32{} - 1, exec);
         // extract group of blocks, invert them, figure out storage precision
         for (size_type b = 0; b < group_size; ++b) {
             if (b + g >= num_blocks) {
@@ -373,16 +387,18 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
                 using preconditioner::detail::get_supported_storage_reductions;
                 pr_descriptors[b] = get_supported_storage_reductions<ValueType>(
                     accuracy, cond[g + b],
-                    [&block_size, &block, &b] {
+                    [&exec, &block_size, &block, &b] {
                         using target = reduce_precision<ValueType>;
                         return validate_precision_reduction_feasibility<target>(
-                            block_size, block[b].get_const_data(), block_size);
+                            exec, block_size, block[b].get_const_data(),
+                            block_size);
                     },
-                    [&block_size, &block, &b] {
+                    [&exec, &block_size, &block, &b] {
                         using target =
                             reduce_precision<reduce_precision<ValueType>>;
                         return validate_precision_reduction_feasibility<target>(
-                            block_size, block[b].get_const_data(), block_size);
+                            exec, block_size, block[b].get_const_data(),
+                            block_size);
                     });
             } else {
                 pr_descriptors[b] = preconditioner::detail::
@@ -544,6 +560,78 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
+// Writes the transpose of each block in `blocks` to the same storage location
+// in `out_blocks`, using the block's entry of block_precisions (if any).
+template <typename ValueType, typename IndexType>
+void transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    const auto ptrs = block_pointers.get_const_data();
+    const auto prec = block_precisions.get_const_data();
+    const auto block_stride = storage_scheme.get_stride();
+    for (size_type i = 0; i < num_blocks; ++i) {
+        const auto group_ofs = storage_scheme.get_group_offset(i);
+        const auto block_ofs = storage_scheme.get_block_offset(i);
+        const auto group = blocks.get_const_data() + group_ofs;
+        auto out_group = out_blocks.get_data() + group_ofs;
+        const auto block_size = ptrs[i + 1] - ptrs[i];
+        const auto p = prec ? prec[i] : precision_reduction();
+        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+            ValueType, p,
+            transpose_block(
+                block_size,
+                reinterpret_cast<const resolved_precision *>(group) + block_ofs,
+                block_stride,
+                reinterpret_cast<resolved_precision *>(out_group) + block_ofs,
+                block_stride));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+
+
+// Writes the conjugate transpose of each block in `blocks` to the same storage
+// location in `out_blocks`, using its entry of block_precisions (if any).
+template <typename ValueType, typename IndexType>
+void conj_transpose_jacobi(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
+    uint32 max_block_size, const Array<precision_reduction> &block_precisions,
+    const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
+    const preconditioner::block_interleaved_storage_scheme<IndexType>
+        &storage_scheme,
+    Array<ValueType> &out_blocks)
+{
+    const auto ptrs = block_pointers.get_const_data();
+    const auto prec = block_precisions.get_const_data();
+    const auto block_stride = storage_scheme.get_stride();
+    for (size_type i = 0; i < num_blocks; ++i) {
+        const auto group_ofs = storage_scheme.get_group_offset(i);
+        const auto block_ofs = storage_scheme.get_block_offset(i);
+        const auto group = blocks.get_const_data() + group_ofs;
+        auto out_group = out_blocks.get_data() + group_ofs;
+        const auto block_size = ptrs[i + 1] - ptrs[i];
+        const auto p = prec ? prec[i] : precision_reduction();
+        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+            ValueType, p,
+            conj_transpose_block(
+                block_size,
+                reinterpret_cast<const resolved_precision *>(group) + block_ofs,
+                block_stride,
+                reinterpret_cast<resolved_precision *>(out_group) + block_ofs,
+                block_stride));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_dense(
     std::shared_ptr<const ReferenceExecutor> exec, size_type num_blocks,
diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp
new file mode 100644
index 00000000000..5142b9461fd
--- /dev/null
+++ b/reference/solver/bicg_kernels.cpp
@@ -0,0 +1,140 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/bicg_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The BICG solver namespace.
+ *
+ * @ingroup bicg
+ */
+namespace bicg {
+
+// BiCG setup: r = r2 = b, prev_rho = 1, rho and z/p/q/z2/p2/q2 = 0, flags reset
+template <typename ValueType>
+void initialize(std::shared_ptr<const ReferenceExecutor> exec,
+                const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *r,
+                matrix::Dense<ValueType> *z, matrix::Dense<ValueType> *p,
+                matrix::Dense<ValueType> *q, matrix::Dense<ValueType> *prev_rho,
+                matrix::Dense<ValueType> *rho, matrix::Dense<ValueType> *r2,
+                matrix::Dense<ValueType> *z2, matrix::Dense<ValueType> *p2,
+                matrix::Dense<ValueType> *q2,
+                Array<stopping_status> *stop_status)
+{
+    const auto nil = zero<ValueType>();
+    for (size_type col = 0; col < b->get_size()[1]; ++col) {
+        rho->at(col) = nil;
+        prev_rho->at(col) = one<ValueType>();
+        stop_status->get_data()[col].reset();
+    }
+    for (size_type row = 0; row < b->get_size()[0]; ++row) {
+        for (size_type col = 0; col < b->get_size()[1]; ++col) {
+            r->at(row, col) = r2->at(row, col) = b->at(row, col);
+            z->at(row, col) = p->at(row, col) = q->at(row, col) = nil;
+            z2->at(row, col) = p2->at(row, col) = q2->at(row, col) = nil;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+
+// p := z + (rho / prev_rho) * p (same for p2/z2); p := z where prev_rho is 0.
+template <typename ValueType>
+void step_1(std::shared_ptr<const ReferenceExecutor> exec,
+            matrix::Dense<ValueType> *p, const matrix::Dense<ValueType> *z,
+            matrix::Dense<ValueType> *p2, const matrix::Dense<ValueType> *z2,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *prev_rho,
+            const Array<stopping_status> *stop_status)
+{
+    for (size_type row = 0; row < p->get_size()[0]; ++row) {
+        for (size_type col = 0; col < p->get_size()[1]; ++col) {
+            if (stop_status->get_const_data()[col].has_stopped()) {
+                continue;
+            }
+            if (prev_rho->at(col) != zero<ValueType>()) {
+                const auto ratio = rho->at(col) / prev_rho->at(col);
+                p->at(row, col) = z->at(row, col) + ratio * p->at(row, col);
+                p2->at(row, col) = z2->at(row, col) + ratio * p2->at(row, col);
+            } else {
+                p->at(row, col) = z->at(row, col);
+                p2->at(row, col) = z2->at(row, col);
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+
+// x += alpha * p, r -= alpha * q, r2 -= alpha * q2 with alpha = rho / beta.
+template <typename ValueType>
+void step_2(std::shared_ptr<const ReferenceExecutor> exec,
+            matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *r,
+            matrix::Dense<ValueType> *r2, const matrix::Dense<ValueType> *p,
+            const matrix::Dense<ValueType> *q,
+            const matrix::Dense<ValueType> *q2,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho,
+            const Array<stopping_status> *stop_status)
+{
+    for (size_type row = 0; row < x->get_size()[0]; ++row) {
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            // stopped columns and columns with beta == 0 stay untouched
+            if (stop_status->get_const_data()[col].has_stopped() ||
+                beta->at(col) == zero<ValueType>()) {
+                continue;
+            }
+            const auto alpha = rho->at(col) / beta->at(col);
+            x->at(row, col) += alpha * p->at(row, col);
+            r->at(row, col) -= alpha * q->at(row, col);
+            r2->at(row, col) -= alpha * q2->at(row, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+
+
+}  // namespace bicg
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp
index 415f0ee427a..29927d18953 100644
--- a/reference/solver/bicgstab_kernels.cpp
+++ b/reference/solver/bicgstab_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -32,12 +32,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/solver/bicgstab_kernels.hpp"
 
+
+#include <algorithm>
+
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-#include <algorithm>
-
 
 namespace gko {
 namespace kernels {
diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp
index 616d95bcd7b..bf4625de9de 100644
--- a/reference/solver/cg_kernels.cpp
+++ b/reference/solver/cg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp
index b134d2d8513..f393bb4ecd5 100644
--- a/reference/solver/cgs_kernels.cpp
+++ b/reference/solver/cgs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp
index 252dada8123..24c758d1140 100644
--- a/reference/solver/fcg_kernels.cpp
+++ b/reference/solver/fcg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp
index c5b7a2636d7..fd79e7d8574 100644
--- a/reference/solver/gmres_kernels.cpp
+++ b/reference/solver/gmres_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -55,28 +55,27 @@ namespace {
 
 
 template <typename ValueType>
-void finish_arnoldi(matrix::Dense<ValueType> *next_krylov_basis,
-                    matrix::Dense<ValueType> *krylov_bases,
+void finish_arnoldi(size_type num_rows, matrix::Dense<ValueType> *krylov_bases,
                     matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
                     const stopping_status *stop_status)
 {
-    for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) {
+    const auto krylov_bases_rowoffset = num_rows;
+    const auto next_krylov_rowoffset = (iter + 1) * krylov_bases_rowoffset;
+    for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) {
         if (stop_status[i].has_stopped()) {
             continue;
         }
         for (size_type k = 0; k < iter + 1; ++k) {
             hessenberg_iter->at(k, i) = 0;
-            for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
+            for (size_type j = 0; j < num_rows; ++j) {
                 hessenberg_iter->at(k, i) +=
-                    next_krylov_basis->at(j, i) *
-                    krylov_bases->at(j,
-                                     next_krylov_basis->get_size()[1] * k + i);
+                    krylov_bases->at(j + next_krylov_rowoffset, i) *
+                    conj(krylov_bases->at(j + k * krylov_bases_rowoffset, i));
             }
-            for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
-                next_krylov_basis->at(j, i) -=
+            for (size_type j = 0; j < num_rows; ++j) {
+                krylov_bases->at(j + next_krylov_rowoffset, i) -=
                     hessenberg_iter->at(k, i) *
-                    krylov_bases->at(j,
-                                     next_krylov_basis->get_size()[1] * k + i);
+                    krylov_bases->at(j + k * krylov_bases_rowoffset, i);
             }
         }
         // for i in 1:iter
@@ -85,20 +84,19 @@ void finish_arnoldi(matrix::Dense<ValueType> *next_krylov_basis,
         // end
 
         hessenberg_iter->at(iter + 1, i) = 0;
-        for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
+        for (size_type j = 0; j < num_rows; ++j) {
             hessenberg_iter->at(iter + 1, i) +=
-                next_krylov_basis->at(j, i) * next_krylov_basis->at(j, i);
+                // |z|^2 rather than z * z, so complex entries give a real norm
+                squared_norm(krylov_bases->at(j + next_krylov_rowoffset, i));
         }
         hessenberg_iter->at(iter + 1, i) =
             sqrt(hessenberg_iter->at(iter + 1, i));
-        // hessenberg(iter, iter + 1) = norm(next_krylov_basis)
-        for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) {
-            next_krylov_basis->at(j, i) /= hessenberg_iter->at(iter + 1, i);
-            krylov_bases->at(j, next_krylov_basis->get_size()[1] * (iter + 1) +
-                                    i) = next_krylov_basis->at(j, i);
+        // hessenberg(iter + 1, iter) = norm(next_krylov_basis)
+        for (size_type j = 0; j < num_rows; ++j) {
+            krylov_bases->at(j + next_krylov_rowoffset, i) /=
+                hessenberg_iter->at(iter + 1, i);
         }
         // next_krylov_basis /= hessenberg(iter, iter + 1)
-        // krylov_bases(:, iter + 1) = next_krylov_basis
         // End of arnoldi
     }
 }
@@ -114,27 +112,25 @@ void calculate_sin_and_cos(matrix::Dense<ValueType> *givens_sin,
         givens_cos->at(iter, rhs) = zero<ValueType>();
         givens_sin->at(iter, rhs) = one<ValueType>();
     } else {
-        auto hypotenuse = sqrt(hessenberg_iter->at(iter, rhs) *
-                                   hessenberg_iter->at(iter, rhs) +
-                               hessenberg_iter->at(iter + 1, rhs) *
-                                   hessenberg_iter->at(iter + 1, rhs));
-        givens_cos->at(iter, rhs) =
-            abs(hessenberg_iter->at(iter, rhs)) / hypotenuse;
-        givens_sin->at(iter, rhs) = givens_cos->at(iter, rhs) *
-                                    hessenberg_iter->at(iter + 1, rhs) /
-                                    hessenberg_iter->at(iter, rhs);
+        auto this_hess = hessenberg_iter->at(iter, rhs);
+        auto next_hess = hessenberg_iter->at(iter + 1, rhs);
+        const auto scale = abs(this_hess) + abs(next_hess);
+        const auto hypotenuse =
+            scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) +
+                         abs(next_hess / scale) * abs(next_hess / scale));
+        givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse;
+        givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse;
     }
 }
 
 
 template <typename ValueType>
-void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
-                     matrix::Dense<ValueType> *givens_sin,
+void givens_rotation(matrix::Dense<ValueType> *givens_sin,
                      matrix::Dense<ValueType> *givens_cos,
                      matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
                      const stopping_status *stop_status)
 {
-    for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) {
+    for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) {
         if (stop_status[i].has_stopped()) {
             continue;
         }
@@ -142,13 +138,13 @@ void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
             auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) +
                         givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i);
             hessenberg_iter->at(j + 1, i) =
-                -givens_sin->at(j, i) * hessenberg_iter->at(j, i) +
-                givens_cos->at(j, i) * hessenberg_iter->at(j + 1, i);
+                -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) +
+                conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i);
             hessenberg_iter->at(j, i) = temp;
             // temp             =  cos(j)*hessenberg(j) +
             //                     sin(j)*hessenberg(j+1)
-            // hessenberg(j+1)  = -sin(j)*hessenberg(j) +
-            //                     cos(j)*hessenberg(j+1)
+            // hessenberg(j+1)  = -conj(sin(j))*hessenberg(j) +
+            //                     conj(cos(j))*hessenberg(j+1)
             // hessenberg(j)    =  temp;
         }
 
@@ -159,7 +155,7 @@ void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
             givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i);
         hessenberg_iter->at(iter + 1, i) = zero<ValueType>();
         // hessenberg(iter)   = cos(iter)*hessenberg(iter) +
-        //                      sin(iter)*hessenberg(iter)
+        //                      sin(iter)*hessenberg(iter + 1)
         // hessenberg(iter+1) = 0
     }
 }
@@ -168,9 +164,8 @@ void givens_rotation(matrix::Dense<ValueType> *next_krylov_basis,
 template <typename ValueType>
 void calculate_next_residual_norm(
     matrix::Dense<ValueType> *givens_sin, matrix::Dense<ValueType> *givens_cos,
-    matrix::Dense<ValueType> *residual_norm,
-    matrix::Dense<ValueType> *residual_norm_collection,
-    const matrix::Dense<ValueType> *b_norm, size_type iter,
+    matrix::Dense<remove_complex<ValueType>> *residual_norm,
+    matrix::Dense<ValueType> *residual_norm_collection, size_type iter,
     const stopping_status *stop_status)
 {
     for (size_type i = 0; i < residual_norm->get_size()[1]; ++i) {
@@ -178,11 +173,12 @@ void calculate_next_residual_norm(
             continue;
         }
         residual_norm_collection->at(iter + 1, i) =
-            -givens_sin->at(iter, i) * residual_norm_collection->at(iter, i);
+            -conj(givens_sin->at(iter, i)) *
+            residual_norm_collection->at(iter, i);
         residual_norm_collection->at(iter, i) =
             givens_cos->at(iter, i) * residual_norm_collection->at(iter, i);
         residual_norm->at(0, i) =
-            abs(residual_norm_collection->at(iter + 1, i)) / b_norm->at(0, i);
+            abs(residual_norm_collection->at(iter + 1, i));
     }
 }
 
@@ -216,13 +212,13 @@ void calculate_qy(const matrix::Dense<ValueType> *krylov_bases,
                   matrix::Dense<ValueType> *before_preconditioner,
                   const size_type *final_iter_nums)
 {
+    const auto krylov_bases_rowoffset = before_preconditioner->get_size()[0];
     for (size_type k = 0; k < before_preconditioner->get_size()[1]; ++k) {
         for (size_type i = 0; i < before_preconditioner->get_size()[0]; ++i) {
             before_preconditioner->at(i, k) = zero<ValueType>();
             for (size_type j = 0; j < final_iter_nums[k]; ++j) {
                 before_preconditioner->at(i, k) +=
-                    krylov_bases->at(
-                        i, j * before_preconditioner->get_size()[1] + k) *
+                    krylov_bases->at(i + j * krylov_bases_rowoffset, k) *
                     y->at(j, k);
             }
         }
@@ -236,20 +232,13 @@ void calculate_qy(const matrix::Dense<ValueType> *krylov_bases,
 template <typename ValueType>
 void initialize_1(std::shared_ptr<const ReferenceExecutor> exec,
                   const matrix::Dense<ValueType> *b,
-                  matrix::Dense<ValueType> *b_norm,
                   matrix::Dense<ValueType> *residual,
                   matrix::Dense<ValueType> *givens_sin,
                   matrix::Dense<ValueType> *givens_cos,
                   Array<stopping_status> *stop_status, size_type krylov_dim)
 {
+    using NormValueType = remove_complex<ValueType>;
     for (size_type j = 0; j < b->get_size()[1]; ++j) {
-        // Calculate b norm
-        b_norm->at(0, j) = zero<ValueType>();
-        for (size_type i = 0; i < b->get_size()[0]; ++i) {
-            b_norm->at(0, j) += b->at(i, j) * b->at(i, j);
-        }
-        b_norm->at(0, j) = sqrt(b_norm->at(0, j));
-
         for (size_type i = 0; i < b->get_size()[0]; ++i) {
             residual->at(i, j) = b->at(i, j);
         }
@@ -267,7 +256,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL);
 template <typename ValueType>
 void initialize_2(std::shared_ptr<const ReferenceExecutor> exec,
                   const matrix::Dense<ValueType> *residual,
-                  matrix::Dense<ValueType> *residual_norm,
+                  matrix::Dense<remove_complex<ValueType>> *residual_norm,
                   matrix::Dense<ValueType> *residual_norm_collection,
                   matrix::Dense<ValueType> *krylov_bases,
                   Array<size_type> *final_iter_nums, size_type krylov_dim)
@@ -276,45 +265,29 @@ void initialize_2(std::shared_ptr<const ReferenceExecutor> exec,
         // Calculate residual norm
         residual_norm->at(0, j) = 0;
         for (size_type i = 0; i < residual->get_size()[0]; ++i) {
-            residual_norm->at(0, j) += residual->at(i, j) * residual->at(i, j);
+            residual_norm->at(0, j) += squared_norm(residual->at(i, j));
         }
         residual_norm->at(0, j) = sqrt(residual_norm->at(0, j));
-
-        for (size_type i = 0; i < krylov_dim + 1; ++i) {
-            if (i == 0) {
-                residual_norm_collection->at(i, j) = residual_norm->at(0, j);
-            } else {
-                residual_norm_collection->at(i, j) = zero<ValueType>();
-            }
-        }
+        residual_norm_collection->at(0, j) = residual_norm->at(0, j);
         for (size_type i = 0; i < residual->get_size()[0]; ++i) {
             krylov_bases->at(i, j) =
                 residual->at(i, j) / residual_norm->at(0, j);
         }
         final_iter_nums->get_data()[j] = 0;
     }
-
-    for (size_type j = residual->get_size()[1]; j < krylov_bases->get_size()[1];
-         ++j) {
-        for (size_type i = 0; i < krylov_bases->get_size()[0]; ++i) {
-            krylov_bases->at(i, j) = zero<ValueType>();
-        }
-    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL);
 
 
 template <typename ValueType>
-void step_1(std::shared_ptr<const ReferenceExecutor> exec,
-            matrix::Dense<ValueType> *next_krylov_basis,
+void step_1(std::shared_ptr<const ReferenceExecutor> exec, size_type num_rows,
             matrix::Dense<ValueType> *givens_sin,
             matrix::Dense<ValueType> *givens_cos,
-            matrix::Dense<ValueType> *residual_norm,
+            matrix::Dense<remove_complex<ValueType>> *residual_norm,
             matrix::Dense<ValueType> *residual_norm_collection,
             matrix::Dense<ValueType> *krylov_bases,
-            matrix::Dense<ValueType> *hessenberg_iter,
-            const matrix::Dense<ValueType> *b_norm, size_type iter,
+            matrix::Dense<ValueType> *hessenberg_iter, size_type iter,
             Array<size_type> *final_iter_nums,
             const Array<stopping_status> *stop_status)
 {
@@ -323,12 +296,12 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
             (1 - stop_status->get_const_data()[i].has_stopped());
     }
 
-    finish_arnoldi(next_krylov_basis, krylov_bases, hessenberg_iter, iter,
+    finish_arnoldi(num_rows, krylov_bases, hessenberg_iter, iter,
                    stop_status->get_const_data());
-    givens_rotation(next_krylov_basis, givens_sin, givens_cos, hessenberg_iter,
-                    iter, stop_status->get_const_data());
+    givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter,
+                    stop_status->get_const_data());
     calculate_next_residual_norm(givens_sin, givens_cos, residual_norm,
-                                 residual_norm_collection, b_norm, iter,
+                                 residual_norm_collection, iter,
                                  stop_status->get_const_data());
 }
 
diff --git a/reference/solver/ir_kernels.cpp b/reference/solver/ir_kernels.cpp
index 48d6a9c219a..1febced33ad 100644
--- a/reference/solver/ir_kernels.cpp
+++ b/reference/solver/ir_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp
index c8f698ed711..b1678d06f83 100644
--- a/reference/solver/lower_trs_kernels.cpp
+++ b/reference/solver/lower_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp
index 56626e367bd..02f3666ecf4 100644
--- a/reference/solver/upper_trs_kernels.cpp
+++ b/reference/solver/upper_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/stop/criterion_kernels.cpp b/reference/stop/criterion_kernels.cpp
index e730aaa301b..050ebab01af 100644
--- a/reference/stop/criterion_kernels.cpp
+++ b/reference/stop/criterion_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/stop/residual_norm_reduction_kernels.cpp b/reference/stop/residual_norm_kernels.cpp
similarity index 71%
rename from reference/stop/residual_norm_reduction_kernels.cpp
rename to reference/stop/residual_norm_kernels.cpp
index 1f3cb5b3fdf..fb968e0eae3 100644
--- a/reference/stop/residual_norm_reduction_kernels.cpp
+++ b/reference/stop/residual_norm_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,42 +30,44 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/stop/residual_norm_reduction_kernels.hpp"
+#include "core/stop/residual_norm_kernels.hpp"
+
+
+#include <algorithm>
 
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
-
-
-#include <algorithm>
+#include <ginkgo/core/base/types.hpp>
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
 /**
- * @brief The Residual norm reduction stopping criterion.
+ * @brief The Residual norm stopping criterion.
  * @ref resnorm
  * @ingroup resnorm
  */
-namespace residual_norm_reduction {
+namespace residual_norm {
 
 
 template <typename ValueType>
-void residual_norm_reduction(std::shared_ptr<const ReferenceExecutor> exec,
-                             const matrix::Dense<ValueType> *tau,
-                             const matrix::Dense<ValueType> *orig_tau,
-                             remove_complex<ValueType> rel_residual_goal,
-                             uint8 stoppingId, bool setFinalized,
-                             Array<stopping_status> *stop_status,
-                             Array<bool> *device_storage, bool *all_converged,
-                             bool *one_changed)
+void residual_norm(std::shared_ptr<const ReferenceExecutor> exec,
+                   const matrix::Dense<ValueType> *tau,
+                   const matrix::Dense<ValueType> *orig_tau,
+                   ValueType rel_residual_goal, uint8 stoppingId,
+                   bool setFinalized, Array<stopping_status> *stop_status,
+                   Array<bool> *device_storage, bool *all_converged,
+                   bool *one_changed)
 {
+    static_assert(is_complex_s<ValueType>::value == false,
+                  "ValueType must not be complex in this function!");
     *all_converged = true;
     *one_changed = false;
     for (size_type i = 0; i < tau->get_size()[1]; ++i) {
-        if (abs(tau->at(i)) < rel_residual_goal * abs(orig_tau->at(i))) {
+        if (tau->at(i) < rel_residual_goal * orig_tau->at(i)) {
             stop_status->get_data()[i].converge(stoppingId, setFinalized);
             *one_changed = true;
         }
@@ -78,10 +80,11 @@ void residual_norm_reduction(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+    GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
-}  // namespace residual_norm_reduction
+}  // namespace residual_norm
 }  // namespace reference
 }  // namespace kernels
 }  // namespace gko
diff --git a/reference/test/CMakeLists.txt b/reference/test/CMakeLists.txt
index 322bf38e6a7..b359d8146ed 100644
--- a/reference/test/CMakeLists.txt
+++ b/reference/test/CMakeLists.txt
@@ -1,4 +1,8 @@
+# PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps this valid as a subproject.
+include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)
+
 add_subdirectory(base)
+add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(log)
 add_subdirectory(matrix)
diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp
index fcc89c69bcf..830b031e83e 100644
--- a/reference/test/base/combination.cpp
+++ b/reference/test/base/combination.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,23 +39,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Combination : public ::testing::Test {
 protected:
-    using mtx = gko::matrix::Dense<>;
+    using Mtx = gko::matrix::Dense<T>;
 
     Combination()
         : exec{gko::ReferenceExecutor::create()},
-          coefficients{gko::initialize<mtx>({1}, exec),
-                       gko::initialize<mtx>({2}, exec)},
-          operators{gko::initialize<mtx>({{2.0, 3.0}, {1.0, 4.0}}, exec),
-                    gko::initialize<mtx>({{3.0, 2.0}, {2.0, 0.0}}, exec)}
+          coefficients{gko::initialize<Mtx>({1}, exec),
+                       gko::initialize<Mtx>({2}, exec)},
+          operators{
+              gko::initialize<Mtx>({I<T>({2.0, 3.0}), I<T>({1.0, 4.0})}, exec),
+              gko::initialize<Mtx>({I<T>({3.0, 2.0}), I<T>({2.0, 0.0})}, exec)}
     {}
 
     std::shared_ptr<const gko::Executor> exec;
@@ -63,40 +67,46 @@ class Combination : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> operators;
 };
 
+TYPED_TEST_CASE(Combination, gko::test::ValueTypes);
+
 
-TEST_F(Combination, AppliesToVector)
+TYPED_TEST(Combination, AppliesToVector)
 {
     /*
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    auto cmb = gko::Combination<>::create(coefficients[0], operators[0],
-                                          coefficients[1], operators[1]);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto cmb = gko::Combination<TypeParam>::create(
+        this->coefficients[0], this->operators[0], this->coefficients[1],
+        this->operators[1]);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
     auto res = clone(x);
 
     cmb->apply(lend(x), lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({22.0, 13.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({22.0, 13.0}), r<TypeParam>::value);
 }
 
 
-TEST_F(Combination, AppliesLinearCombinationToVector)
+TYPED_TEST(Combination, AppliesLinearCombinationToVector)
 {
     /*
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    auto cmb = gko::Combination<>::create(coefficients[0], operators[0],
-                                          coefficients[1], operators[1]);
-    auto alpha = gko::initialize<mtx>({3.0}, exec);
-    auto beta = gko::initialize<mtx>({-1.0}, exec);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto cmb = gko::Combination<TypeParam>::create(
+        this->coefficients[0], this->operators[0], this->coefficients[1],
+        this->operators[1]);
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
     auto res = clone(x);
 
     cmb->apply(lend(alpha), lend(x), lend(beta), lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({65.0, 37.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({65.0, 37.0}), r<TypeParam>::value);
 }
 
 
diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp
index d9f00665432..76d71734d12 100644
--- a/reference/test/base/composition.cpp
+++ b/reference/test/base/composition.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,60 +39,371 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueType>
+class DummyLinOp : public gko::EnableLinOp<DummyLinOp<ValueType>>,
+                   public gko::EnableCreateMethod<DummyLinOp<ValueType>> {
+    friend class gko::EnablePolymorphicObject<DummyLinOp, gko::LinOp>;
+    friend class gko::EnableCreateMethod<DummyLinOp>;
+
+public:
+    using value_type = ValueType;
+
+    bool apply_uses_initial_guess() const override { return true; }
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override {}
+
+    void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b,
+                    const gko::LinOp *beta, gko::LinOp *x) const override
+    {}
+
+    explicit DummyLinOp(std::shared_ptr<const gko::Executor> exec)
+        : gko::EnableLinOp<DummyLinOp>(exec)
+    {}
+
+    explicit DummyLinOp(std::shared_ptr<const gko::Executor> exec,
+                        gko::dim<2> size)
+        : gko::EnableLinOp<DummyLinOp>(exec, size)
+    {}
+};
+
+
+template <typename T>
 class Composition : public ::testing::Test {
 protected:
-    using mtx = gko::matrix::Dense<>;
+    using Mtx = gko::matrix::Dense<T>;
+    using value_type = T;
 
     Composition()
         : exec{gko::ReferenceExecutor::create()},
-          operators{gko::initialize<mtx>({2.0, 1.0}, exec),
-                    gko::initialize<mtx>({{3.0, 2.0}}, exec)}
+          operators{
+              gko::initialize<Mtx>(I<T>({2.0, 1.0}), exec),
+              gko::initialize<Mtx>({I<T>({3.0, 2.0})}, exec),
+              gko::initialize<Mtx>(
+                  {I<T>({-1.0, 1.0, 2.0}), I<T>({5.0, -3.0, 0.0})}, exec),
+              gko::initialize<Mtx>(
+                  {I<T>({9.0, 4.0}), I<T>({6.0, -2.0}), I<T>({-3.0, 2.0})},
+                  exec),
+              gko::initialize<Mtx>({I<T>({1.0, 0.0}), I<T>({0.0, 1.0})}, exec),
+              gko::initialize<Mtx>({I<T>({1.0, 0.0}), I<T>({0.0, 1.0})}, exec)},
+          identity{
+              gko::initialize<Mtx>({I<T>({1.0, 0.0}), I<T>({0.0, 1.0})}, exec)},
+          product{gko::initialize<Mtx>({I<T>({-9.0, -2.0}), I<T>({27.0, 26.0})},
+                                       exec)}
     {}
 
     std::shared_ptr<const gko::Executor> exec;
     std::vector<std::shared_ptr<gko::LinOp>> coefficients;
     std::vector<std::shared_ptr<gko::LinOp>> operators;
+    std::shared_ptr<Mtx> identity;
+    std::shared_ptr<Mtx> product;
 };
 
+TYPED_TEST_CASE(Composition, gko::test::ValueTypes);
 
-TEST_F(Composition, AppliesToVector)
+
+TYPED_TEST(Composition, AppliesSingleToVector)
+{
+    /*
+        cmp = [ -9 -2 ]
+              [ 27 26 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->product);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({-13.0, 79.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesSingleLinearCombinationToVector)
+{
+    /*
+        cmp = [ -9 -2 ]
+              [ 27 26 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->product);
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);
+
+    cmp->apply(lend(alpha), lend(x), lend(beta), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({-40.0, 235.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesToVector)
 {
     /*
         cmp = [ 2 ] * [ 3 2 ]
               [ 1 ]
     */
-    auto cmp = gko::Composition<>::create(operators[0], operators[1]);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->operators[0],
+                                                   this->operators[1]);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
     auto res = clone(x);
 
     cmp->apply(lend(x), lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({14.0, 7.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({14.0, 7.0}), r<TypeParam>::value);
 }
 
 
-TEST_F(Composition, AppliesLinearCombinationToVector)
+TYPED_TEST(Composition, AppliesLinearCombinationToVector)
 {
     /*
         cmp = [ 2 ] * [ 3 2 ]
               [ 1 ]
     */
-    auto cmp = gko::Composition<>::create(operators[0], operators[1]);
-    auto alpha = gko::initialize<mtx>({3.0}, exec);
-    auto beta = gko::initialize<mtx>({-1.0}, exec);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->operators[0],
+                                                   this->operators[1]);
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);
+
+    cmp->apply(lend(alpha), lend(x), lend(beta), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({41.0, 19.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesLongerToVector)
+{
+    /*
+        cmp = [ 2 ] * [ 3 2 ] * [ -9  -2 ]
+              [ 1 ]             [ 27  26 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(
+        this->operators[0], this->operators[1], this->product);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({238.0, 119.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesLongerLinearCombinationToVector)
+{
+    /*
+        cmp = [ 2 ] * [ 3 2 ] * [ -9  -2 ]
+              [ 1 ]             [ 27  26 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(
+        this->operators[0], this->operators[1], this->product);
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);
+
+    cmp->apply(lend(alpha), lend(x), lend(beta), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({713.0, 355.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesLongestToVector)
+{
+    /*
+        cmp = [ 2 ] * [ 3 2 ] * [ -1  1  2 ] * [  9  4 ] * [ 1 0 ]^2
+              [ 1 ]             [  5 -3  0 ]   [  6 -2 ]   [ 0 1 ]
+                                               [ -3  2 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->operators.begin(),
+                                                   this->operators.end());
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({238.0, 119.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesLongestLinearCombinationToVector)
+{
+    /*
+        cmp = [ 2 ] * [ 3 2 ] * [ -1  1  2 ] * [  9  4 ] * [ 1 0 ]^2
+              [ 1 ]             [  5 -3  0 ]   [  6 -2 ]   [ 0 1 ]
+                                               [ -3  2 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->operators.begin(),
+                                                   this->operators.end());
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
     auto res = clone(x);
 
     cmp->apply(lend(alpha), lend(x), lend(beta), lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({41.0, 19.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({713.0, 355.0}), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesLongestToVectorMultipleRhs)
+{
+    /* Same chain as above, applied to a two-column right-hand side:
+        cmp = [ 2 ] * [ 3 2 ] * [ -1  1  2 ] * [  9  4 ] * [ 1 0 ]^2
+              [ 1 ]             [  5 -3  0 ]   [  6 -2 ]   [ 0 1 ]
+                                               [ -3  2 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->operators.begin(),
+                                                   this->operators.end());
+    auto x = clone(this->identity);  // x = I, so res should equal cmp
+    auto res = clone(x);
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({{54.0, 92.0}, {27.0, 46.0}}),
+                        r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesLongestLinearCombinationToVectorMultipleRhs)
+{
+    /* Scaled apply (alpha, beta) with a two-column right-hand side:
+        cmp = [ 2 ] * [ 3 2 ] * [ -1  1  2 ] * [  9  4 ] * [ 1 0 ]^2
+              [ 1 ]             [  5 -3  0 ]   [  6 -2 ]   [ 0 1 ]
+                                               [ -3  2 ]
+    */
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Composition<TypeParam>::create(this->operators.begin(),
+                                                   this->operators.end());
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = clone(this->identity);
+    auto res = clone(x);  // initial res = I enters the result via beta
+
+    cmp->apply(lend(alpha), lend(x), lend(beta), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({{161.0, 276.0}, {81.0, 137.0}}),  // 3*cmp - I
+                        r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Composition, AppliesToVectorWithInitialGuess)
+{
+    /* res enters apply holding a copy of x and must come back unchanged:
+        cmp = I * DummyLinOp * I
+    */
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto cmp = gko::Composition<TypeParam>::create(
+        this->identity,
+        DummyLinOp<value_type>::create(this->exec, this->identity->get_size()),
+        this->identity);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);  // initial guess handed to apply
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({1.0, 2.0}), 0);  // exact match required
+}
+
+
+TYPED_TEST(Composition, AppliesToVectorWithInitialGuess2)
+{
+    /* res enters apply holding a copy of x and must come back all zero:
+        cmp = I * DummyLinOp(2x3) * DummyLinOp(3x2) * I
+    */
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto size1 = gko::dim<2>(3, 2);  // right DummyLinOp in the chain
+    auto size2 = gko::dim<2>(2, 3);  // left DummyLinOp in the chain
+    auto cmp = gko::Composition<TypeParam>::create(
+        this->identity, DummyLinOp<value_type>::create(this->exec, size2),
+        DummyLinOp<value_type>::create(this->exec, size1), this->identity);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);  // initial guess handed to apply
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({0.0, 0.0}), 0);  // exact match required
+}
+
+
+TYPED_TEST(Composition, AppliesToVectorWithInitialGuess3)
+{
+    /* res enters apply holding a copy of x and must come back unchanged:
+        cmp = DummyLinOp * I
+    */
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto cmp = gko::Composition<TypeParam>::create(
+        DummyLinOp<value_type>::create(this->exec, this->identity->get_size()),
+        this->identity);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);  // initial guess handed to apply
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({1.0, 2.0}), 0);  // exact match required
+}
+
+
+TYPED_TEST(Composition, AppliesToVectorWithInitialGuess4)
+{
+    /* res enters apply holding a copy of x and must come back all zero:
+        cmp = I * DummyLinOp(2x3) * DummyLinOp(3x2)
+    */
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto size1 = gko::dim<2>(3, 2);  // right DummyLinOp in the chain
+    auto size2 = gko::dim<2>(2, 3);  // left DummyLinOp in the chain
+    auto cmp = gko::Composition<TypeParam>::create(
+        this->identity, DummyLinOp<value_type>::create(this->exec, size2),
+        DummyLinOp<value_type>::create(this->exec, size1));
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);  // initial guess handed to apply
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({0.0, 0.0}), 0);  // exact match required
+}
+
+
+TYPED_TEST(Composition, AppliesToVectorWithInitialGuess5)
+{
+    /* res enters apply holding a copy of x and must come back unchanged:
+        cmp = DummyLinOp(2x3) * DummyLinOp(3x2) * I
+    */
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto size1 = gko::dim<2>(3, 2);  // right DummyLinOp in the chain
+    auto size2 = gko::dim<2>(2, 3);  // left DummyLinOp in the chain
+    auto cmp = gko::Composition<TypeParam>::create(
+        DummyLinOp<value_type>::create(this->exec, size2),
+        DummyLinOp<value_type>::create(this->exec, size1), this->identity);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = clone(x);  // initial guess handed to apply
+
+    cmp->apply(lend(x), lend(res));
+
+    GKO_ASSERT_MTX_NEAR(res, l({1.0, 2.0}), 0);  // exact match required
 }
 
 
diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp
index f265776d935..fe1cc8692fe 100644
--- a/reference/test/base/perturbation.cpp
+++ b/reference/test/base/perturbation.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,21 +42,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Perturbation : public ::testing::Test {
 protected:
-    using mtx = gko::matrix::Dense<>;
+    using Mtx = gko::matrix::Dense<T>;
 
     Perturbation()
         : exec{gko::ReferenceExecutor::create()},
-          basis{gko::initialize<mtx>({2.0, 1.0}, exec)},
-          projector{gko::initialize<mtx>({{3.0, 2.0}}, exec)},
-          scalar{gko::initialize<mtx>({2.0}, exec)}
+          basis{gko::initialize<Mtx>({2.0, 1.0}, exec)},
+          projector{gko::initialize<Mtx>({I<T>({3.0, 2.0})}, exec)},
+          scalar{gko::initialize<Mtx>({2.0}, exec)}
     {}
 
     std::shared_ptr<const gko::Executor> exec;
@@ -65,72 +66,80 @@ class Perturbation : public ::testing::Test {
     std::shared_ptr<gko::LinOp> scalar;
 };
 
+TYPED_TEST_CASE(Perturbation, gko::test::ValueTypes);
 
-TEST_F(Perturbation, AppliesToVector)
+
+TYPED_TEST(Perturbation, AppliesToVector)
 {
     /*
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    auto cmp = gko::Perturbation<>::create(scalar, basis, projector);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
-    auto res = mtx::create_with_config_of(gko::lend(x));
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
+                                                    this->projector);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = Mtx::create_with_config_of(gko::lend(x));
 
     cmp->apply(gko::lend(x), gko::lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({29.0, 16.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({29.0, 16.0}), r<TypeParam>::value);
 }
 
 
-TEST_F(Perturbation, AppliesLinearCombinationToVector)
+TYPED_TEST(Perturbation, AppliesLinearCombinationToVector)
 {
     /*
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    auto cmp = gko::Perturbation<>::create(scalar, basis, projector);
-    auto alpha = gko::initialize<mtx>({3.0}, exec);
-    auto beta = gko::initialize<mtx>({-1.0}, exec);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
+                                                    this->projector);
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
     auto res = gko::clone(x);
 
     cmp->apply(gko::lend(alpha), gko::lend(x), gko::lend(beta), gko::lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({86.0, 46.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({86.0, 46.0}), r<TypeParam>::value);
 }
 
 
-TEST_F(Perturbation, ConstructionByBasisAppliesToVector)
+TYPED_TEST(Perturbation, ConstructionByBasisAppliesToVector)
 {
     /*
         cmp = I + 2 * [ 2 ] * [ 2 1 ]
                       [ 1 ]
     */
-    auto cmp = gko::Perturbation<>::create(scalar, basis);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
-    auto res = mtx::create_with_config_of(gko::lend(x));
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
+    auto res = Mtx::create_with_config_of(gko::lend(x));
 
     cmp->apply(gko::lend(x), gko::lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({17.0, 10.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({17.0, 10.0}), r<TypeParam>::value);
 }
 
 
-TEST_F(Perturbation, ConstructionByBasisAppliesLinearCombinationToVector)
+TYPED_TEST(Perturbation, ConstructionByBasisAppliesLinearCombinationToVector)
 {
     /*
         cmp = I + 2 * [ 2 ] * [ 2 1 ]
                       [ 1 ]
     */
-    auto cmp = gko::Perturbation<>::create(scalar, basis);
-    auto alpha = gko::initialize<mtx>({3.0}, exec);
-    auto beta = gko::initialize<mtx>({-1.0}, exec);
-    auto x = gko::initialize<mtx>({1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis);
+    auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
     auto res = gko::clone(x);
 
     cmp->apply(gko::lend(alpha), gko::lend(x), gko::lend(beta), gko::lend(res));
 
-    GKO_ASSERT_MTX_NEAR(res, l({50.0, 28.0}), 1e-15);
+    GKO_ASSERT_MTX_NEAR(res, l({50.0, 28.0}), r<TypeParam>::value);
 }
 
 
diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..9c1dca5bcfa
--- /dev/null
+++ b/reference/test/components/CMakeLists.txt
@@ -0,0 +1,3 @@
+ginkgo_create_test(fill_array)
+ginkgo_create_test(precision_conversion)
+ginkgo_create_test(prefix_sum)
diff --git a/reference/test/components/fill_array.cpp b/reference/test/components/fill_array.cpp
new file mode 100644
index 00000000000..51ec5d8dd09
--- /dev/null
+++ b/reference/test/components/fill_array.cpp
@@ -0,0 +1,84 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/fill_array.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>  // T: element type the fill kernel is tested with
+class FillArray : public ::testing::Test {
+protected:
+    using value_type = T;
+    FillArray()
+        : ref(gko::ReferenceExecutor::create()),
+          total_size(6344),
+          expected(ref, total_size),
+          vals(ref, total_size)
+    {
+        std::fill_n(expected.get_data(), total_size, T(6453));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    gko::size_type total_size;        // element count of both arrays
+    gko::Array<value_type> expected;  // every element set to T(6453)
+    gko::Array<value_type> vals;      // kernel output; not set here
+};
+
+TYPED_TEST_CASE(FillArray, gko::test::ValueAndIndexTypes);
+
+
+TYPED_TEST(FillArray, EqualsReference)
+{
+    using T = typename TestFixture::value_type;
+    gko::kernels::reference::components::fill_array(
+        this->ref, this->vals.get_data(), this->total_size, T(6453));
+    GKO_ASSERT_ARRAY_EQ(this->vals, this->expected);  // same fill value
+}
+
+
+}  // namespace
diff --git a/reference/test/components/precision_conversion.cpp b/reference/test/components/precision_conversion.cpp
new file mode 100644
index 00000000000..10c96e82f23
--- /dev/null
+++ b/reference/test/components/precision_conversion.cpp
@@ -0,0 +1,150 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+class PrecisionConversion : public ::testing::Test {  // conversion fixture
+protected:
+    PrecisionConversion()
+        : ref(gko::ReferenceExecutor::create()),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          cvals(ref, total_size),
+          vals2(ref, 1),
+          expected_float(ref, 1),
+          expected_double(ref, 1)
+    {
+        auto maxval = 1e10f;
+        std::uniform_real_distribution<float> dist(-maxval, maxval);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+            cvals.get_data()[i] = {dist(rand), dist(rand)};
+        }
+        gko::uint64 rawdouble{0x4218888000889111ULL};   // input bit pattern
+        gko::uint32 rawfloat{0x50c44400UL};             // after double->float
+        gko::uint64 rawrounded{0x4218888000000000ULL};  // float->double again
+        std::memcpy(vals2.get_data(), &rawdouble, sizeof(double));
+        std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float));
+        std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::default_random_engine rand;  // fixed seed 293
+    gko::size_type total_size;        // length of vals and cvals
+    gko::Array<float> vals;           // random values in [-1e10, 1e10)
+    gko::Array<std::complex<float>> cvals;  // declared in initializer order
+    gko::Array<double> vals2;               // one double, see rawdouble
+    gko::Array<float> expected_float;       // one float, see rawfloat
+    gko::Array<double> expected_double;     // one double, see rawrounded
+};
+
+
+TEST_F(PrecisionConversion, ConvertsReal)
+{
+    gko::Array<double> tmp;  // default-constructed, no executor passed
+    gko::Array<float> out;
+
+    tmp = vals;  // float -> double
+    out = tmp;   // double -> float
+
+    GKO_ASSERT_ARRAY_EQ(vals, out);  // round trip keeps the float values
+}
+
+
+TEST_F(PrecisionConversion, ConversionRounds)
+{
+    gko::Array<float> tmp;
+    gko::Array<double> out;
+
+    tmp = vals2;  // double -> float, drops the low mantissa bits
+    out = tmp;    // float -> double, dropped bits stay zero
+
+    GKO_ASSERT_ARRAY_EQ(tmp, expected_float);
+    GKO_ASSERT_ARRAY_EQ(out, expected_double);
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealWithSetExecutor)
+{
+    gko::Array<double> tmp{ref};  // executor given up front this time
+    gko::Array<float> out{ref};
+
+    tmp = vals;  // float -> double
+    out = tmp;   // double -> float
+
+    GKO_ASSERT_ARRAY_EQ(vals, out);  // round trip keeps the float values
+}
+
+
+TEST_F(PrecisionConversion, ConvertsRealFromView)
+{
+    gko::Array<double> tmp{ref};
+    gko::Array<float> out{ref};
+
+    tmp = gko::Array<float>::view(ref, vals.get_num_elems(), vals.get_data());
+    out = tmp;  // double -> float
+
+    GKO_ASSERT_ARRAY_EQ(vals, out);  // source here was a view over vals
+}
+
+
+TEST_F(PrecisionConversion, ConvertsComplex)
+{
+    gko::Array<std::complex<double>> tmp;
+    gko::Array<std::complex<float>> out;
+
+    tmp = cvals;  // complex<float> -> complex<double>
+    out = tmp;    // complex<double> -> complex<float>
+
+    GKO_ASSERT_ARRAY_EQ(cvals, out);  // round trip keeps the values
+}
+
+
+}  // namespace
diff --git a/reference/test/components/prefix_sum.cpp b/reference/test/components/prefix_sum.cpp
new file mode 100644
index 00000000000..2766326bc4a
--- /dev/null
+++ b/reference/test/components/prefix_sum.cpp
@@ -0,0 +1,77 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>  // T: index type the prefix sum is tested with
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = T;
+    PrefixSum()
+        : exec(gko::ReferenceExecutor::create()),
+          vals{3, 5, 6, 7, 1, 5, 9, 7, 2, 0, 5},
+          expected{0, 3, 8, 14, 21, 22, 27, 36, 43, 45, 45}
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::vector<index_type> vals;      // input, overwritten by the test
+    std::vector<index_type> expected;  // exclusive running sum of vals
+};
+
+TYPED_TEST_CASE(PrefixSum, gko::test::IndexTypes);
+
+
+TYPED_TEST(PrefixSum, Works)
+{
+    gko::kernels::reference::components::prefix_sum(
+        this->exec, this->vals.data(), this->vals.size());  // in place
+
+    ASSERT_EQ(this->vals, this->expected);
+}
+
+
+}  // namespace
diff --git a/reference/test/factorization/CMakeLists.txt b/reference/test/factorization/CMakeLists.txt
index 36c21b93eea..b52c2d938d7 100644
--- a/reference/test/factorization/CMakeLists.txt
+++ b/reference/test/factorization/CMakeLists.txt
@@ -1 +1,3 @@
+ginkgo_create_test(par_ict_kernels)
 ginkgo_create_test(par_ilu_kernels)
+ginkgo_create_test(par_ilut_kernels)
diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp
new file mode 100644
index 00000000000..9be9045492b
--- /dev/null
+++ b/reference/test/factorization/par_ict_kernels.cpp
@@ -0,0 +1,390 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/par_ict.hpp>
+
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/factorization/par_ict_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+class DummyLinOp : public gko::EnableLinOp<DummyLinOp>,  // no-op operator
+                   public gko::EnableCreateMethod<DummyLinOp> {
+public:
+    DummyLinOp(std::shared_ptr<const gko::Executor> exec,
+               gko::dim<2> size = gko::dim<2>{})
+        : EnableLinOp<DummyLinOp>(exec, size)
+    {}
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override {}
+
+    void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b,
+                    const gko::LinOp *beta, gko::LinOp *x) const override
+    {}  // both overloads have empty bodies
+};
+
+
+template <typename ValueIndexType>  // tuple of (value type, index type)
+class ParIct : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using factorization_type =
+        gko::factorization::ParIct<value_type, index_type>;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+
+    ParIct()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(std::static_pointer_cast<const gko::Executor>(ref)),
+          identity(gko::initialize<Csr>(
+              {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)),
+          lower_tri(gko::initialize<Csr>(
+              {{1., 0., 0.}, {1., 1., 0.}, {1., 1., 1.}}, ref)),
+          upper_tri(gko::initialize<Csr>(
+              {{2., 1., 1.}, {0., -3., 1.}, {0., 0., 4.}}, ref)),
+          mtx_system(gko::initialize<Csr>({{9., 0., -6., 3.},
+                                           {0., 36., 18., 24.},
+                                           {-6., 18., 17., 14.},
+                                           {-3., 24., 14., 18.}},
+                                          ref)),
+          mtx_init(gko::initialize<Csr>({{9., 0., -6., 3.},
+                                         {0., 0., 18., 24.},
+                                         {-6., 18., 17., 14.},
+                                         {-3., 24., 14., 18.}},
+                                        ref)),
+          mtx_l_system(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                             {0., 1., 0., 0.},
+                                             {1., 1., 1., 0.},
+                                             {1., 1., 0., 1.}},
+                                            ref)),
+          mtx_l(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                      {1., 2., 0., 0.},
+                                      {0., 0., 3., 0.},
+                                      {-2., 0., -3., 4.}},
+                                     ref)),
+          mtx_llt(gko::initialize<Csr>({{1., 1., 0., -2.},
+                                        {1., 5., 0., -2.},
+                                        {0., 0., 9., -9.},
+                                        {-2., -2., -9., 29.}},
+                                       ref)),
+          mtx_l_init_expect(gko::initialize<Csr>(
+              {{3., 0., 0., 0.},
+               {0., 1., 0., 0.},
+               {-6., 18., static_cast<value_type>(sqrt(17.)), 0.},
+               {-3., 24., 14., static_cast<value_type>(sqrt(18.))}},
+              ref)),
+          mtx_l_add_expect(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                                 {1., 2., 0., 0.},
+                                                 {-6., 9., 3., 0.},
+                                                 {-2., 13., -3., 4.}},
+                                                ref)),
+          mtx_l_it_expect(gko::initialize<Csr>({{3., 0., 0., 0.},
+                                                {0., 6., 0., 0.},
+                                                {-2., 3., 2., 0.},
+                                                {-1., 4., 0., 1.}},
+                                               ref)),
+          mtx_l_small_expect(gko::initialize<Csr>(
+              {{3., 0., 0., 0.},
+               {0., 6., 0., 0.},
+               {-2., 3., 2., 0.},
+               {0., 4., 0., static_cast<value_type>(sqrt(2.))}},
+              ref)),
+          mtx_l_large_expect(gko::initialize<Csr>({{3., 0., 0., 0.},
+                                                   {0., 6., 0., 0.},
+                                                   {-2., 3., 2., 0.},
+                                                   {-1., 4., 0., 1.}},
+                                                  ref)),
+          fact_fact(factorization_type::build().on(exec)),
+          tol{r<value_type>::value}
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::Executor> exec;  // same object as ref
+    std::shared_ptr<Csr> identity;
+    std::shared_ptr<Csr> lower_tri;
+    std::shared_ptr<Csr> upper_tri;
+    std::shared_ptr<Csr> mtx_system;
+    std::unique_ptr<Csr> mtx_init;  // mtx_system with entry (1, 1) set to 0
+    std::unique_ptr<Csr> mtx_l_system;  // declared in initializer order
+    std::unique_ptr<Csr> mtx_l;
+    std::unique_ptr<Csr> mtx_llt;  // equals mtx_l * mtx_l^T
+    std::unique_ptr<Csr> mtx_l_init_expect;
+    std::unique_ptr<Csr> mtx_l_add_expect;
+    std::unique_ptr<Csr> mtx_l_it_expect;
+    std::unique_ptr<Csr> mtx_l_small_expect;
+    std::unique_ptr<Csr> mtx_l_large_expect;  // same values as mtx_l_it_expect
+    std::unique_ptr<typename factorization_type::Factory> fact_fact;
+    gko::remove_complex<value_type> tol;  // set from r<value_type>::value
+};
+
+TYPED_TEST_CASE(ParIct, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(ParIct, KernelInitializeRowPtrsL)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size());
+    auto row_ptrs = res_mtx_l->get_const_row_ptrs();  // read back below
+
+    gko::kernels::reference::factorization::initialize_row_ptrs_l(
+        this->ref, this->mtx_system.get(), res_mtx_l->get_row_ptrs());
+
+    ASSERT_EQ(row_ptrs[0], 0);
+    ASSERT_EQ(row_ptrs[1], 1);
+    ASSERT_EQ(row_ptrs[2], 2);
+    ASSERT_EQ(row_ptrs[3], 5);
+    ASSERT_EQ(row_ptrs[4], 9);  // 1 + 1 + 3 + 4 lower-triangle nonzeros
+}
+
+
+TYPED_TEST(ParIct, KernelInitializeL)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size(), 9);
+    auto row_ptrs = res_mtx_l->get_const_row_ptrs();  // unused in this test
+
+    gko::kernels::reference::factorization::initialize_row_ptrs_l(
+        this->ref, this->mtx_init.get(), res_mtx_l->get_row_ptrs());
+    gko::kernels::reference::factorization::initialize_l(
+        this->ref, this->mtx_init.get(), res_mtx_l.get(), true);
+
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, this->mtx_l_init_expect, this->tol);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, this->mtx_l_init_expect);
+}
+
+
+TYPED_TEST(ParIct, KernelAddCandidates)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size());
+
+    gko::kernels::reference::par_ict_factorization::add_candidates(
+        this->ref, this->mtx_llt.get(), this->mtx_system.get(),
+        this->mtx_l.get(), res_mtx_l.get());  // last argument is the output
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, this->mtx_l_add_expect);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, this->mtx_l_add_expect, this->tol);
+}
+
+
+TYPED_TEST(ParIct, KernelComputeLU)  // exercises compute_factor
+{
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    auto mtx_l_coo = Coo::create(this->exec, this->mtx_system->get_size());
+    this->mtx_l_system->convert_to(mtx_l_coo.get());  // COO copy of L
+
+    gko::kernels::reference::par_ict_factorization::compute_factor(
+        this->ref, this->mtx_system.get(), this->mtx_l_system.get(),
+        mtx_l_coo.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx_l_system, this->mtx_l_it_expect, this->tol);
+}
+
+
+TYPED_TEST(ParIct, ThrowNotSupportedForWrongLinOp)
+{
+    auto lin_op = DummyLinOp::create(this->ref);  // bare LinOp, default size
+
+    ASSERT_THROW(this->fact_fact->generate(gko::share(lin_op)),
+                 gko::NotSupported);
+}
+
+
+TYPED_TEST(ParIct, ThrowDimensionMismatch)
+{
+    using Csr = typename TestFixture::Csr;
+    auto matrix = Csr::create(this->ref, gko::dim<2>{2, 3}, 4);  // not square
+
+    ASSERT_THROW(this->fact_fact->generate(gko::share(matrix)),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(ParIct, SetStrategies)
+{
+    using Csr = typename TestFixture::Csr;
+    using factorization_type = typename TestFixture::factorization_type;
+    auto l_strategy = std::make_shared<typename Csr::merge_path>();
+    auto lt_strategy = std::make_shared<typename Csr::classical>();
+
+    auto factory = factorization_type::build()
+                       .with_l_strategy(l_strategy)
+                       .with_lt_strategy(lt_strategy)
+                       .on(this->ref);
+    auto fact = factory->generate(this->mtx_system);
+
+    ASSERT_EQ(factory->get_parameters().l_strategy, l_strategy);  // same ptr
+    ASSERT_EQ(fact->get_l_factor()->get_strategy()->get_name(),
+              l_strategy->get_name());  // factors are compared by name only
+    ASSERT_EQ(factory->get_parameters().lt_strategy, lt_strategy);
+    ASSERT_EQ(fact->get_lt_factor()->get_strategy()->get_name(),
+              lt_strategy->get_name());
+}
+
+
+TYPED_TEST(ParIct, IsConsistentWithComposition)
+{
+    auto fact = this->fact_fact->generate(this->mtx_system);
+
+    // The factor accessors must hand out the same objects that are stored
+    // as operators 0 and 1; the pointers are compared as plain LinOp.
+    const gko::LinOp *lin_op_l_factor = gko::lend(fact->get_l_factor());
+    const gko::LinOp *lin_op_lt_factor = gko::lend(fact->get_lt_factor());
+    const gko::LinOp *first_operator = gko::lend(fact->get_operators()[0]);
+    const gko::LinOp *second_operator = gko::lend(fact->get_operators()[1]);
+
+    ASSERT_EQ(lin_op_l_factor, first_operator);
+    ASSERT_EQ(lin_op_lt_factor, second_operator);
+}
+
+
+TYPED_TEST(ParIct, GenerateIdentity)
+{
+    auto factors = this->fact_fact->generate(this->identity);
+    // both factors of the identity are expected to be the identity again
+    GKO_ASSERT_MTX_NEAR(factors->get_l_factor(), this->identity, this->tol);
+    GKO_ASSERT_MTX_NEAR(factors->get_lt_factor(), this->identity, this->tol);
+}
+
+
+TYPED_TEST(ParIct, GenerateDenseIdentity)
+{
+    using Dense = typename TestFixture::Dense;  // format of the input copy
+    auto dense_id = Dense::create(this->exec, this->identity->get_size());
+    this->identity->convert_to(gko::lend(dense_id));
+    auto factors = this->fact_fact->generate(gko::share(dense_id));
+
+    GKO_ASSERT_MTX_NEAR(factors->get_l_factor(), this->identity, this->tol);
+    GKO_ASSERT_MTX_NEAR(factors->get_lt_factor(), this->identity, this->tol);
+}
+
+
+TYPED_TEST(ParIct, GenerateWithExactSmallLimit)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using Csr = typename TestFixture::Csr;
+    // exact (non-approximate) selection with a fill-in limit of 0.6
+    auto factory = factorization_type::build()
+                       .with_approximate_select(false)
+                       .with_fill_in_limit(0.6)
+                       .on(this->exec);
+    auto fact = factory->generate(this->mtx_system);
+    auto lt_expect = gko::as<Csr>(this->mtx_l_small_expect->transpose());
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), lt_expect, this->tol);
+}
+
+
+TYPED_TEST(ParIct, GenerateWithApproxSmallLimit)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using Csr = typename TestFixture::Csr;
+    // approximate selection, fill-in limit 0.6; same expectation as exact
+    auto factory = factorization_type::build()
+                       .with_approximate_select(true)
+                       .with_fill_in_limit(0.6)
+                       .on(this->exec);
+    auto fact = factory->generate(this->mtx_system);
+    auto lt_expect = gko::as<Csr>(this->mtx_l_small_expect->transpose());
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), lt_expect, this->tol);
+}
+
+
+TYPED_TEST(ParIct, GenerateWithExactLargeLimit)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using Csr = typename TestFixture::Csr;
+    // exact (non-approximate) selection with a fill-in limit of 1.2
+    auto factory = factorization_type::build()
+                       .with_approximate_select(false)
+                       .with_fill_in_limit(1.2)
+                       .on(this->exec);
+    auto fact = factory->generate(this->mtx_system);
+    auto lt_expect = gko::as<Csr>(this->mtx_l_large_expect->transpose());
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), lt_expect, this->tol);
+}
+
+
+TYPED_TEST(ParIct, GenerateWithApproxLargeLimit)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using Csr = typename TestFixture::Csr;
+    // approximate selection, fill-in limit 1.2; same expectation as exact
+    auto factory = factorization_type::build()
+                       .with_approximate_select(true)
+                       .with_fill_in_limit(1.2)
+                       .on(this->exec);
+    auto fact = factory->generate(this->mtx_system);
+    auto lt_expect = gko::as<Csr>(this->mtx_l_large_expect->transpose());
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), lt_expect, this->tol);
+}
+
+
+}  // namespace
diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp
index f64895a1613..b24309de53b 100644
--- a/reference/test/factorization/par_ilu_kernels.cpp
+++ b/reference/test/factorization/par_ilu_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
+#include <initializer_list>
 #include <memory>
 #include <vector>
 
@@ -47,8 +48,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/factorization/factorization_kernels.hpp"
 #include "core/factorization/par_ilu_kernels.hpp"
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
@@ -71,17 +73,25 @@ class DummyLinOp : public gko::EnableLinOp<DummyLinOp>,
 };
 
 
+template <typename ValueIndexType>
 class ParIlu : public ::testing::Test {
 protected:
-    using value_type = gko::default_precision;
-    using index_type = gko::int32;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using Dense = gko::matrix::Dense<value_type>;
     using Coo = gko::matrix::Coo<value_type, index_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
+    using par_ilu_type = gko::factorization::ParIlu<value_type, index_type>;
     ParIlu()
         : ref(gko::ReferenceExecutor::create()),
           exec(std::static_pointer_cast<const gko::Executor>(ref)),
           // clang-format off
+          empty_csr(gko::initialize<Csr>(
+              {{0., 0., 0.},
+               {0., 0., 0.},
+               {0., 0., 0.}}, exec)),
           identity(gko::initialize<Dense>(
               {{1., 0., 0.},
                {0., 1., 0.},
@@ -107,6 +117,19 @@ class ParIlu : public ::testing::Test {
               {{4., 6., 8.},
                {0., -1., 1.},
                {0., 0., -1.5}}, exec)),
+          mtx_small2(gko::initialize<Dense>(
+              {{8., 8., 0},
+              {2., 0., 5.},
+              {1., 1., 1}}, exec)),
+          mtx_csr_small2(nullptr),
+          small2_l_expected(gko::initialize<Dense>(
+              {{1., 0., 0},
+              {.25, 1., 0.},
+              {.125, 0., 1}}, exec)),
+          small2_u_expected(gko::initialize<Dense>(
+              {{8., 8., 0},
+              {0., -2., 5.},
+              {0., 0., 1}}, exec)),
           mtx_big(gko::initialize<Dense>({{1., 1., 1., 0., 1., 3.},
                                           {1., 2., 2., 0., 2., 0.},
                                           {0., 2., 3., 3., 3., 5.},
@@ -128,21 +151,46 @@ class ParIlu : public ::testing::Test {
                                                  {0., 0., 0., 0., 5., -15.},
                                                  {0., 0., 0., 0., 0., 6.}},
                                                 exec)),
+          mtx_big_nodiag(gko::initialize<Csr>({{1., 1., 1., 0., 1., 3.},
+                                               {1., 2., 2., 0., 2., 0.},
+                                               {0., 2., 0., 3., 3., 5.},
+                                               {1., 0., 3., 4., 4., 4.},
+                                               {1., 2., 0., 4., 1., 6.},
+                                               {0., 2., 3., 4., 5., 8.}},
+                                         exec)),
+          big_nodiag_l_expected(gko::initialize<Dense>(
+            {{1., 0., 0., 0., 0., 0.},
+             {1., 1., 0., 0., 0., 0.},
+             {0., 2., 1., 0., 0., 0.},
+             {1., 0., -1., 1., 0., 0.},
+             {1., 1., 0., 0.571428571428571, 1., 0.},
+             {0., 2., -0.5, 0.785714285714286, -0.108695652173913, 1.}},
+            exec)),
+          big_nodiag_u_expected(gko::initialize<Dense>(
+            {{1., 1., 1., 0., 1., 3.},
+             {0., 1., 1., 0., 1., 0.},
+             {0., 0., -2., 3., 1., 5.},
+             {0., 0., 0., 7., 4., 6.},
+             {0., 0., 0., 0., -3.28571428571429, -0.428571428571429},
+             {0., 0., 0., 0., 0., 5.73913043478261}},
+            exec)),
           // clang-format on
           ilu_factory_skip(
-              gko::factorization::ParIlu<>::build().with_skip_sorting(true).on(
-                  exec)),
+              par_ilu_type::build().with_skip_sorting(true).on(exec)),
           ilu_factory_sort(
-              gko::factorization::ParIlu<>::build().with_skip_sorting(false).on(
-                  exec))
+              par_ilu_type::build().with_skip_sorting(false).on(exec))
     {
         auto tmp_csr = Csr::create(exec);
         mtx_small->convert_to(gko::lend(tmp_csr));
         mtx_csr_small = std::move(tmp_csr);
+        auto tmp_csr2 = Csr::create(exec);
+        mtx_small2->convert_to(gko::lend(tmp_csr2));
+        mtx_csr_small2 = std::move(tmp_csr2);
     }
 
     std::shared_ptr<const gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<const Csr> empty_csr;
     std::shared_ptr<const Dense> identity;
     std::shared_ptr<const Dense> lower_triangular;
     std::shared_ptr<const Dense> upper_triangular;
@@ -150,28 +198,127 @@ class ParIlu : public ::testing::Test {
     std::shared_ptr<const Csr> mtx_csr_small;
     std::shared_ptr<const Dense> small_l_expected;
     std::shared_ptr<const Dense> small_u_expected;
+    std::shared_ptr<const Dense> mtx_small2;
+    std::shared_ptr<const Csr> mtx_csr_small2;
+    std::shared_ptr<const Dense> small2_l_expected;
+    std::shared_ptr<const Dense> small2_u_expected;
     std::shared_ptr<const Dense> mtx_big;
     std::shared_ptr<const Dense> big_l_expected;
     std::shared_ptr<const Dense> big_u_expected;
-    std::unique_ptr<gko::factorization::ParIlu<>::Factory> ilu_factory_skip;
-    std::unique_ptr<gko::factorization::ParIlu<>::Factory> ilu_factory_sort;
+    std::shared_ptr<const Csr> mtx_big_nodiag;
+    std::shared_ptr<const Dense> big_nodiag_l_expected;
+    std::shared_ptr<const Dense> big_nodiag_u_expected;
+    std::unique_ptr<typename par_ilu_type::Factory> ilu_factory_skip;
+    std::unique_ptr<typename par_ilu_type::Factory> ilu_factory_sort;
 };
 
+TYPED_TEST_CASE(ParIlu, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(ParIlu, KernelAddDiagonalElementsEmpty)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_list = std::initializer_list<typename TestFixture::value_type>;
+    using index_list = std::initializer_list<typename TestFixture::index_type>;
+    // Expected result for the all-zero 3x3 input: one explicitly stored
+    // zero per row, located on the diagonal.
+    auto expected_mtx = Csr::create(
+        this->ref, this->empty_csr->get_size(), value_list{0., 0., 0.},
+        index_list{0, 1, 2}, index_list{0, 1, 2, 3});
+    auto empty_mtx = this->empty_csr->clone();
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(empty_mtx), true);
+
+    GKO_ASSERT_MTX_NEAR(empty_mtx, expected_mtx, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(empty_mtx, expected_mtx);
+}
+
+
+TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquare)
+{
+    using Csr = typename TestFixture::Csr;
+    auto matrix = gko::initialize<Csr>(
+        {{0., 0., 0.}, {1., 0., 0.}, {1., 1., 1.}, {1., 1., 1.}}, this->ref);
+    auto exp_values = {0., 1., 0., 1., 1., 1., 1., 1., 1.};
+    auto exp_col_idxs = {0, 0, 1, 0, 1, 2, 0, 1, 2};
+    auto exp_row_ptrs = {0, 1, 3, 6, 9};
+    auto expected_mtx =
+        Csr::create(this->ref, matrix->get_size(), std::move(exp_values),
+                    std::move(exp_col_idxs), std::move(exp_row_ptrs));
+    // expect explicit zeros added at (0, 0) and (1, 1), none for row 3
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(matrix), true);
+
+    GKO_ASSERT_MTX_NEAR(matrix, expected_mtx, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(matrix, expected_mtx);
+}
+
+
+TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquare2)
+{
+    using Csr = typename TestFixture::Csr;
+    auto matrix = gko::initialize<Csr>({{1., 0., 0.}, {1., 0., 0.}}, this->ref);
+    auto exp_values = {1., 1., 0.};
+    auto exp_col_idxs = {0, 0, 1};
+    auto exp_row_ptrs = {0, 1, 3};
+    auto expected_mtx =
+        Csr::create(this->ref, matrix->get_size(), std::move(exp_values),
+                    std::move(exp_col_idxs), std::move(exp_row_ptrs));
+    // expect an explicit zero added at (1, 1); row 0 keeps its diagonal
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(matrix), true);
+
+    GKO_ASSERT_MTX_NEAR(matrix, expected_mtx, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(matrix, expected_mtx);
+}
+
 
-TEST_F(ParIlu, KernelInitializeRowPtrsLU)
+TYPED_TEST(ParIlu, KernelAddDiagonalElementsUnsorted)
 {
-    auto small_csr_l_expected = Csr::create(ref);
-    small_l_expected->convert_to(gko::lend(small_csr_l_expected));
-    auto small_csr_u_expected = Csr::create(ref);
-    small_u_expected->convert_to(gko::lend(small_csr_u_expected));
-    auto num_row_ptrs = mtx_csr_small->get_size()[0] + 1;
+    using Csr = typename TestFixture::Csr;
+    auto size = gko::dim<2>{3, 3};
+    /* matrix:
+    1 2 3
+    1 0 3
+    1 2 0
+    */
+    auto mtx_values = {3., 2., 1., 3., 1., 2., 1.};
+    auto mtx_col_idxs = {2, 1, 0, 2, 0, 1, 0};
+    auto mtx_row_ptrs = {0, 3, 5, 7};
+    auto matrix = Csr::create(this->ref, size, std::move(mtx_values),
+                              std::move(mtx_col_idxs), std::move(mtx_row_ptrs));
+    auto exp_values = {1., 2., 3., 1., 0., 3., 1., 2., 0.};
+    auto exp_col_idxs = {0, 1, 2, 0, 1, 2, 0, 1, 2};
+    auto exp_row_ptrs = {0, 3, 6, 9};
+    auto expected_mtx =
+        Csr::create(this->ref, size, std::move(exp_values),
+                    std::move(exp_col_idxs), std::move(exp_row_ptrs));
+
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(matrix), false);
+
+    GKO_ASSERT_MTX_NEAR(matrix, expected_mtx, 0.);
+    GKO_ASSERT_MTX_EQ_SPARSITY(matrix, expected_mtx);
+}
+
+
+TYPED_TEST(ParIlu, KernelInitializeRowPtrsLU)
+{
+    using Csr = typename TestFixture::Csr;
+    using index_type = typename TestFixture::index_type;
+    auto small_csr_l_expected = Csr::create(this->ref);
+    this->small_l_expected->convert_to(gko::lend(small_csr_l_expected));
+    auto small_csr_u_expected = Csr::create(this->ref);
+    this->small_u_expected->convert_to(gko::lend(small_csr_u_expected));
+    auto num_row_ptrs = this->mtx_csr_small->get_size()[0] + 1;
     std::vector<index_type> l_row_ptrs_vector(num_row_ptrs);
     std::vector<index_type> u_row_ptrs_vector(num_row_ptrs);
     auto l_row_ptrs = l_row_ptrs_vector.data();
     auto u_row_ptrs = u_row_ptrs_vector.data();
 
-    gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u(
-        ref, gko::lend(mtx_csr_small), l_row_ptrs, u_row_ptrs);
+    gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
+        this->ref, gko::lend(this->mtx_csr_small), l_row_ptrs, u_row_ptrs);
 
     ASSERT_TRUE(std::equal(l_row_ptrs, l_row_ptrs + num_row_ptrs,
                            small_csr_l_expected->get_const_row_ptrs()));
@@ -180,20 +327,51 @@ TEST_F(ParIlu, KernelInitializeRowPtrsLU)
 }
 
 
-TEST_F(ParIlu, KernelInitializeLU)
+TYPED_TEST(ParIlu, KernelInitializeRowPtrsLUZeroMatrix)
 {
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Csr;
+    auto empty_mtx = this->empty_csr->clone();
+    gko::kernels::reference::factorization::add_diagonal_elements(
+        this->ref, gko::lend(empty_mtx), true);
+    auto empty_mtx_l_expected = Csr::create(this->ref);
+    this->identity->convert_to(gko::lend(empty_mtx_l_expected));
+    auto empty_mtx_u_expected = Csr::create(this->ref);
+    this->identity->convert_to(gko::lend(empty_mtx_u_expected));
+    auto num_row_ptrs = empty_mtx->get_size()[0] + 1;
+    std::vector<index_type> l_row_ptrs_vector(num_row_ptrs);
+    std::vector<index_type> u_row_ptrs_vector(num_row_ptrs);
+    auto l_row_ptrs = l_row_ptrs_vector.data();
+    auto u_row_ptrs = u_row_ptrs_vector.data();
+
+    gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
+        this->ref, gko::lend(empty_mtx), l_row_ptrs, u_row_ptrs);
+
+    ASSERT_TRUE(std::equal(l_row_ptrs, l_row_ptrs + num_row_ptrs,
+                           empty_mtx_l_expected->get_const_row_ptrs()));
+    ASSERT_TRUE(std::equal(u_row_ptrs, u_row_ptrs + num_row_ptrs,
+                           empty_mtx_u_expected->get_const_row_ptrs()));
+}
+
+
+TYPED_TEST(ParIlu, KernelInitializeLU)
+{
+    using Dense = typename TestFixture::Dense;
+    using Csr = typename TestFixture::Csr;
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
     // clang-format off
     auto expected_l =
         gko::initialize<Dense>({{1., 0., 0.},
                                 {2., 1., 0.},
-                                {1., 1., 1.}}, ref);
+                                {1., 1., 1.}}, this->ref);
     auto expected_u =
         gko::initialize<Dense>({{4., 6., 8.},
                                 {0., 2., 5.},
-                                {0., 0., 1.}}, ref);
+                                {0., 0., 1.}}, this->ref);
     // clang-format on
-    auto actual_l = Csr::create(ref, mtx_csr_small->get_size(), 6);
-    auto actual_u = Csr::create(ref, mtx_csr_small->get_size(), 6);
+    auto actual_l = Csr::create(this->ref, this->mtx_csr_small->get_size(), 6);
+    auto actual_u = Csr::create(this->ref, this->mtx_csr_small->get_size(), 6);
     // Copy row_ptrs into matrices, which usually come from the
     // `initialize_row_ptrs_l_u` kernel
     std::vector<index_type> l_row_ptrs{0, 1, 3, 6};
@@ -201,258 +379,359 @@ TEST_F(ParIlu, KernelInitializeLU)
     std::copy(l_row_ptrs.begin(), l_row_ptrs.end(), actual_l->get_row_ptrs());
     std::copy(u_row_ptrs.begin(), u_row_ptrs.end(), actual_u->get_row_ptrs());
 
-    gko::kernels::reference::par_ilu_factorization::initialize_l_u(
-        ref, gko::lend(mtx_csr_small), gko::lend(actual_l),
+    gko::kernels::reference::factorization::initialize_l_u(
+        this->ref, gko::lend(this->mtx_csr_small), gko::lend(actual_l),
+        gko::lend(actual_u));
+
+    GKO_ASSERT_MTX_NEAR(actual_l, expected_l, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(actual_u, expected_u, r<value_type>::value);
+}
+
+
+TYPED_TEST(ParIlu, KernelInitializeLUZeroMatrix)
+{
+    using value_type = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Csr;
+    auto actual_l = Csr::create(this->ref);
+    auto actual_u = Csr::create(this->ref);
+    actual_l->copy_from(gko::lend(this->identity));
+    actual_u->copy_from(gko::lend(this->identity));
+
+    gko::kernels::reference::factorization::initialize_l_u(
+        this->ref, gko::lend(this->empty_csr), gko::lend(actual_l),
         gko::lend(actual_u));
 
-    GKO_ASSERT_MTX_NEAR(actual_l, expected_l, 1e-14);
-    GKO_ASSERT_MTX_NEAR(actual_u, expected_u, 1e-14);
+    GKO_ASSERT_MTX_NEAR(actual_l, this->identity, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(actual_u, this->identity, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, KernelComputeLU)
+TYPED_TEST(ParIlu, KernelComputeLU)
 {
+    using value_type = typename TestFixture::value_type;
+    using Dense = typename TestFixture::Dense;
+    using Coo = typename TestFixture::Coo;
+    using Csr = typename TestFixture::Csr;
     // clang-format off
     auto l_dense =
         gko::initialize<Dense>({{1., 0., 0.},
                                 {2., 1., 0.},
-                                {1., 1., 1.}}, ref);
+                                {1., 1., 1.}}, this->ref);
     // U must be transposed before calling the kernel, so we simply create it
     // transposed
     auto u_dense =
         gko::initialize<Dense>({{4., 0., 0.},
                                 {6., 2., 0.},
-                                {8., 5., 1.}}, ref);
+                                {8., 5., 1.}}, this->ref);
     // clang-format on
-    auto l_csr = Csr::create(ref);
-    auto u_csr = Csr::create(ref);
-    auto mtx_coo = Coo::create(ref);
+    auto l_csr = Csr::create(this->ref);
+    auto u_csr = Csr::create(this->ref);
+    auto mtx_coo = Coo::create(this->ref);
     constexpr unsigned int iterations = 1;
     l_dense->convert_to(gko::lend(l_csr));
     u_dense->convert_to(gko::lend(u_csr));
-    mtx_small->convert_to(gko::lend(mtx_coo));
+    this->mtx_small->convert_to(gko::lend(mtx_coo));
     // The expected result of U also needs to be transposed
-    auto u_expected_lin_op = small_u_expected->transpose();
+    auto u_expected_lin_op = this->small_u_expected->transpose();
     auto u_expected = std::unique_ptr<Dense>(
         static_cast<Dense *>(u_expected_lin_op.release()));
 
     gko::kernels::reference::par_ilu_factorization::compute_l_u_factors(
-        ref, iterations, gko::lend(mtx_coo), gko::lend(l_csr),
+        this->ref, iterations, gko::lend(mtx_coo), gko::lend(l_csr),
         gko::lend(u_csr));
 
-    GKO_ASSERT_MTX_NEAR(l_csr, small_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_csr, u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_csr, this->small_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_csr, u_expected, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, ThrowNotSupportedForWrongLinOp1)
+TYPED_TEST(ParIlu, ThrowNotSupportedForWrongLinOp1)
 {
-    auto linOp = DummyLinOp::create(ref);
+    auto linOp = DummyLinOp::create(this->ref);
 
-    ASSERT_THROW(ilu_factory_skip->generate(gko::share(linOp)),
+    ASSERT_THROW(this->ilu_factory_skip->generate(gko::share(linOp)),
                  gko::NotSupported);
 }
 
 
-TEST_F(ParIlu, ThrowNotSupportedForWrongLinOp2)
+TYPED_TEST(ParIlu, ThrowNotSupportedForWrongLinOp2)
 {
-    auto linOp = DummyLinOp::create(ref);
+    auto linOp = DummyLinOp::create(this->ref);
 
-    ASSERT_THROW(ilu_factory_sort->generate(gko::share(linOp)),
+    ASSERT_THROW(this->ilu_factory_sort->generate(gko::share(linOp)),
                  gko::NotSupported);
 }
 
 
-TEST_F(ParIlu, ThrowDimensionMismatch)
+TYPED_TEST(ParIlu, ThrowDimensionMismatch)
 {
-    auto matrix = Csr::create(ref, gko::dim<2>{2, 3}, 4);
+    using Csr = typename TestFixture::Csr;
+    auto matrix = Csr::create(this->ref, gko::dim<2>{2, 3}, 4);
 
-    ASSERT_THROW(ilu_factory_sort->generate(gko::share(matrix)),
+    ASSERT_THROW(this->ilu_factory_sort->generate(gko::share(matrix)),
                  gko::DimensionMismatch);
 }
 
 
-TEST_F(ParIlu, LUFactorFunctionsSetProperly)
+TYPED_TEST(ParIlu, SetLStrategy)
+{
+    using Csr = typename TestFixture::Csr;
+    using par_ilu_type = typename TestFixture::par_ilu_type;
+    // only the L strategy is passed to the factory builder
+    auto l_strategy = std::make_shared<typename Csr::classical>();
+    auto factory =
+        par_ilu_type::build().with_l_strategy(l_strategy).on(this->ref);
+    auto factorization = factory->generate(this->mtx_small);
+
+    ASSERT_EQ(factory->get_parameters().l_strategy, l_strategy);
+    ASSERT_EQ(factorization->get_l_factor()->get_strategy()->get_name(),
+              l_strategy->get_name());
+}
+
+
+TYPED_TEST(ParIlu, SetUStrategy)
 {
-    auto factors = ilu_factory_skip->generate(mtx_small);
+    using Csr = typename TestFixture::Csr;
+    using par_ilu_type = typename TestFixture::par_ilu_type;
+    auto u_strategy = std::make_shared<typename Csr::classical>();
+
+    auto factory =
+        par_ilu_type::build().with_u_strategy(u_strategy).on(this->ref);
+    auto par_ilu = factory->generate(this->mtx_small);
+
+    ASSERT_EQ(factory->get_parameters().u_strategy, u_strategy);
+    ASSERT_EQ(par_ilu->get_u_factor()->get_strategy()->get_name(),
+              u_strategy->get_name());
+}
+
+
+TYPED_TEST(ParIlu, LUFactorFunctionsSetProperly)
+{
+    auto factors = this->ilu_factory_skip->generate(this->mtx_small);
 
     auto lin_op_l_factor =
-        static_cast<const gko::LinOp *>(factors->get_l_factor().get());
+        static_cast<const gko::LinOp *>(gko::lend(factors->get_l_factor()));
     auto lin_op_u_factor =
-        static_cast<const gko::LinOp *>(factors->get_u_factor().get());
-    auto first_operator = factors->get_operators()[0].get();
-    auto second_operator = factors->get_operators()[1].get();
+        static_cast<const gko::LinOp *>(gko::lend(factors->get_u_factor()));
+    auto first_operator = gko::lend(factors->get_operators()[0]);
+    auto second_operator = gko::lend(factors->get_operators()[1]);
 
     ASSERT_EQ(lin_op_l_factor, first_operator);
     ASSERT_EQ(lin_op_u_factor, second_operator);
 }
 
 
-TEST_F(ParIlu, GenerateForCooIdentity)
+TYPED_TEST(ParIlu, GenerateForCooIdentity)
 {
-    auto coo_mtx = gko::share(Coo::create(exec));
-    identity->convert_to(coo_mtx.get());
+    using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    auto coo_mtx = gko::share(Coo::create(this->exec));
+    this->identity->convert_to(gko::lend(coo_mtx));
 
-    auto factors = ilu_factory_skip->generate(coo_mtx);
+    auto factors = this->ilu_factory_skip->generate(coo_mtx);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForCsrIdentity)
+TYPED_TEST(ParIlu, GenerateForCsrIdentity)
 {
-    auto csr_mtx = gko::share(Csr::create(exec));
-    identity->convert_to(csr_mtx.get());
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    auto csr_mtx = gko::share(Csr::create(this->exec));
+    this->identity->convert_to(gko::lend(csr_mtx));
 
-    auto factors = ilu_factory_skip->generate(csr_mtx);
+    auto factors = this->ilu_factory_skip->generate(csr_mtx);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForDenseIdentity)
+TYPED_TEST(ParIlu, GenerateForDenseIdentity)
 {
-    auto factors = ilu_factory_skip->generate(identity);
+    using value_type = typename TestFixture::value_type;
+    auto factors = this->ilu_factory_skip->generate(this->identity);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForDenseLowerTriangular)
+TYPED_TEST(ParIlu, GenerateForDenseLowerTriangular)
 {
-    auto factors = ilu_factory_skip->generate(lower_triangular);
+    using value_type = typename TestFixture::value_type;
+    auto factors = this->ilu_factory_skip->generate(this->lower_triangular);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, lower_triangular, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->lower_triangular, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForDenseUpperTriangular)
+TYPED_TEST(ParIlu, GenerateForDenseUpperTriangular)
 {
-    auto factors = ilu_factory_skip->generate(upper_triangular);
+    using value_type = typename TestFixture::value_type;
+    auto factors = this->ilu_factory_skip->generate(this->upper_triangular);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, upper_triangular, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->upper_triangular, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, ApplyMethodDenseSmall)
+TYPED_TEST(ParIlu, ApplyMethodDenseSmall)
 {
-    const auto x = gko::initialize<Dense>({1., 2., 3.}, exec);
+    using value_type = typename TestFixture::value_type;
+    using Dense = typename TestFixture::Dense;
+    const auto x = gko::initialize<Dense>({1., 2., 3.}, this->exec);
     auto b_lu = Dense::create_with_config_of(gko::lend(x));
     auto b_ref = Dense::create_with_config_of(gko::lend(x));
 
-    auto factors = ilu_factory_skip->generate(mtx_small);
+    auto factors = this->ilu_factory_skip->generate(this->mtx_small);
     factors->apply(gko::lend(x), gko::lend(b_lu));
-    mtx_small->apply(gko::lend(x), gko::lend(b_ref));
+    this->mtx_small->apply(gko::lend(x), gko::lend(b_ref));
+
+    GKO_ASSERT_MTX_NEAR(b_lu, b_ref, r<value_type>::value);
+}
+
+
+TYPED_TEST(ParIlu, GenerateForDenseSmall)
+{
+    // r<...>::value serves as the comparison tolerance for this value type
+    const auto tol = r<typename TestFixture::value_type>::value;
+    // uses the fixture's factory that was built with skip_sorting(true)
+    auto factors = this->ilu_factory_skip->generate(this->mtx_small);
+
+    GKO_ASSERT_MTX_NEAR(factors->get_l_factor(), this->small_l_expected, tol);
+    GKO_ASSERT_MTX_NEAR(factors->get_u_factor(), this->small_u_expected, tol);
+}
+
+
+TYPED_TEST(ParIlu, GenerateForCsrSmall)
+{
+    using value_type = typename TestFixture::value_type;
+    auto factors = this->ilu_factory_skip->generate(this->mtx_csr_small);
+    auto l_factor = factors->get_l_factor();
+    auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(b_lu, b_ref, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForDenseSmall)
+TYPED_TEST(ParIlu, GenerateForCsrSmall2ZeroDiagonal)
 {
-    auto factors = ilu_factory_skip->generate(mtx_small);
+    using value_type = typename TestFixture::value_type;
+    auto factors = this->ilu_factory_skip->generate(this->mtx_csr_small2);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->small2_l_expected,
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->small2_u_expected,
+                        r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForCsrSmall)
+TYPED_TEST(ParIlu, GenerateForCsrBigWithDiagonalZeros)
 {
-    auto factors = ilu_factory_skip->generate(mtx_csr_small);
+    using value_type = typename TestFixture::value_type;  // for r<value_type>
+    auto factors = this->ilu_factory_skip->generate(this->mtx_big_nodiag);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->big_nodiag_l_expected,
+                        r<value_type>::value);  // tolerance per value_type
+    GKO_ASSERT_MTX_NEAR(u_factor, this->big_nodiag_u_expected,
+                        r<value_type>::value);  // tolerance per value_type
 }
 
 
-TEST_F(ParIlu, GenerateForDenseSmallWithMultipleIterations)
+TYPED_TEST(ParIlu, GenerateForDenseSmallWithMultipleIterations)
 {
-    auto multiple_iter_factory = gko::factorization::ParIlu<>::build()
-                                     .with_iterations(5u)
-                                     .with_skip_sorting(true)
-                                     .on(exec);
-    auto factors = multiple_iter_factory->generate(mtx_small);
+    using value_type = typename TestFixture::value_type;  // for r<value_type>
+    using par_ilu_type = typename TestFixture::par_ilu_type;
+    auto multiple_iter_factory =
+        par_ilu_type::build().with_iterations(5u).with_skip_sorting(true).on(
+            this->exec);  // 5 iterations, sorting skipped
+    auto factors = multiple_iter_factory->generate(this->mtx_small);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForDenseBig)
+TYPED_TEST(ParIlu, GenerateForDenseBig)
 {
-    auto factors = ilu_factory_skip->generate(mtx_big);
+    using value_type = typename TestFixture::value_type;  // for r<value_type>
+    auto factors = this->ilu_factory_skip->generate(this->mtx_big);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, big_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, big_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->big_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->big_u_expected, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForDenseBigSort)
+TYPED_TEST(ParIlu, GenerateForDenseBigSort)
 {
-    auto factors = ilu_factory_skip->generate(mtx_big);
+    using value_type = typename TestFixture::value_type;  // for r<value_type>
+    auto factors = this->ilu_factory_sort->generate(this->mtx_big);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, big_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, big_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->big_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->big_u_expected, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForReverseCooSmall)
+TYPED_TEST(ParIlu, GenerateForReverseCooSmall)
 {
-    const auto size = mtx_small->get_size();
+    using value_type = typename TestFixture::value_type;  // for r<value_type>
+    using Coo = typename TestFixture::Coo;  // holds the column-reversed copy
+    const auto size = this->mtx_small->get_size();
     const auto nnz = size[0] * size[1];
-    auto reverse_coo = gko::share(Coo::create(exec, size, nnz));
+    auto reverse_coo = gko::share(Coo::create(this->exec, size, nnz));
     // Fill the Coo matrix in reversed row order (right to left)
     for (size_t i = 0; i < size[0]; ++i) {
         for (size_t j = 0; j < size[1]; ++j) {
             const auto coo_idx = i * size[1] + (size[1] - 1 - j);
             reverse_coo->get_row_idxs()[coo_idx] = i;
             reverse_coo->get_col_idxs()[coo_idx] = j;
-            reverse_coo->get_values()[coo_idx] = mtx_small->at(i, j);
+            reverse_coo->get_values()[coo_idx] = this->mtx_small->at(i, j);
         }
     }
 
-    auto factors = ilu_factory_sort->generate(reverse_coo);
+    auto factors = this->ilu_factory_sort->generate(reverse_coo);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(reverse_coo, mtx_small, 1e-14);
-    GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(reverse_coo, this->mtx_small, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r<value_type>::value);
 }
 
 
-TEST_F(ParIlu, GenerateForReverseCsrSmall)
+TYPED_TEST(ParIlu, GenerateForReverseCsrSmall)
 {
-    const auto size = mtx_csr_small->get_size();
+    using value_type = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Csr;
+    const auto size = this->mtx_csr_small->get_size();
     const auto nnz = size[0] * size[1];
-    auto reverse_csr = gko::share(Csr::create(exec));
-    reverse_csr->copy_from(mtx_csr_small.get());
+    auto reverse_csr = gko::share(Csr::create(this->exec));
+    reverse_csr->copy_from(gko::lend(this->mtx_csr_small));
     // Fill the Csr matrix rows in reverse order
     for (size_t i = 0; i < size[0]; ++i) {
         const auto row_start = reverse_csr->get_row_ptrs()[i];
@@ -460,18 +739,18 @@ TEST_F(ParIlu, GenerateForReverseCsrSmall)
         for (size_t j = row_start; j < row_end; ++j) {
             const auto reverse_j = row_end - 1 - (j - row_start);
             reverse_csr->get_values()[reverse_j] =
-                mtx_csr_small->get_const_values()[j];
+                this->mtx_csr_small->get_const_values()[j];
             reverse_csr->get_col_idxs()[reverse_j] =
-                mtx_csr_small->get_const_col_idxs()[j];
+                this->mtx_csr_small->get_const_col_idxs()[j];
         }
     }
 
-    auto factors = ilu_factory_sort->generate(reverse_csr);
+    auto factors = this->ilu_factory_sort->generate(reverse_csr);
     auto l_factor = factors->get_l_factor();
     auto u_factor = factors->get_u_factor();
 
-    GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14);
-    GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14);
+    GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r<value_type>::value);
 }
 
 
diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp
new file mode 100644
index 00000000000..a72dd6206f7
--- /dev/null
+++ b/reference/test/factorization/par_ilut_kernels.cpp
@@ -0,0 +1,675 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/factorization/par_ilut.hpp>
+
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+class DummyLinOp : public gko::EnableLinOp<DummyLinOp>,
+                   public gko::EnableCreateMethod<DummyLinOp> {
+public:
+    DummyLinOp(std::shared_ptr<const gko::Executor> exec,
+               gko::dim<2> size = gko::dim<2>{})
+        : EnableLinOp<DummyLinOp>(exec, size)
+    {}  // only forwards executor and size to the EnableLinOp base
+
+protected:  // both apply_impl overloads below have empty bodies
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override {}
+
+    void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b,
+                    const gko::LinOp *beta, gko::LinOp *x) const override
+    {}
+};
+
+
+template <typename ValueIndexType>  // tuple-like: (value, index) type
+class ParIlut : public ::testing::Test {
+protected:  // type aliases used throughout the fixture and its tests
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using factorization_type =
+        gko::factorization::ParIlut<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using ComplexCsr =
+        gko::matrix::Csr<std::complex<gko::remove_complex<value_type>>,
+                         index_type>;  // always complex-valued CSR
+
+    ParIlut()  // builds all fixture matrices on the reference executor
+        : ref(gko::ReferenceExecutor::create()),
+          exec(std::static_pointer_cast<const gko::Executor>(ref)),
+
+          mtx1(gko::initialize<Csr>({{.1, 0., 0., 0.},
+                                     {.1, .1, 0., 0.},
+                                     {-1., -2., -1., 0.},
+                                     {-2., -3., -1., 1.}},
+                                    ref)),
+          mtx1_expect_thrm2(gko::initialize<Csr>({{.1, 0., 0., 0.},
+                                                  {0., .1, 0., 0.},
+                                                  {0., -2., -1., 0.},
+                                                  {-2., -3., 0., 1.}},
+                                                 ref)),
+          mtx1_expect_thrm3(gko::initialize<Csr>({{.1, 0., 0., 0.},
+                                                  {0., .1, 0., 0.},
+                                                  {0., 0., -1., 0.},
+                                                  {0., -3., 0., 1.}},
+                                                 ref)),
+          mtx1_complex(gko::initialize<ComplexCsr>(
+              {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}},
+               {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}},
+               {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}},
+               {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}},
+              ref)),
+          mtx1_expect_complex_thrm(gko::initialize<ComplexCsr>(
+              {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}},
+               {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}},
+               {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}},
+               {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}},
+              ref)),
+          identity(gko::initialize<Csr>(
+              {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)),
+          lower_tri(gko::initialize<Csr>(
+              {{1., 0., 0.}, {1., 1., 0.}, {1., 1., 1.}}, ref)),
+          upper_tri(gko::initialize<Csr>(
+              {{2., 1., 1.}, {0., -3., 1.}, {0., 0., 4.}}, ref)),
+          mtx_system(gko::initialize<Csr>({{1., 6., 4., 7.},
+                                           {2., -5., 0., 8.},
+                                           {.5, -3., 6., 0.},
+                                           {.2, -.5, -9., 0.}},
+                                          ref)),
+          mtx_l_system(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                             {2., 1., 0., 0.},
+                                             {.5, -3., 1., 0.},
+                                             {.2, -.5, -9., 1.}},
+                                            ref)),
+          mtx_u_system(gko::initialize<Csr>({{1., 6., 4., 7.},
+                                             {0., 1., 0., 8.},
+                                             {0., 0., 6., 0.},
+                                             {0., 0., 0., 1.}},
+                                            ref)),
+          mtx_l(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                      {4., 1., 0., 0.},
+                                      {-1., 0., 1., 0.},
+                                      {0., -3., -1., 1.}},
+                                     ref)),
+          mtx_u(gko::initialize<Csr>({{2., 0., 1., 1.},
+                                      {0., 3., 0., 2.},
+                                      {0., 0., .5, 0.},
+                                      {0., 0., 0., 4.}},
+                                     ref)),
+          mtx_lu(gko::initialize<Csr>({{1., 2., 3., 4.},
+                                       {0., 6., 7., 8.},
+                                       {9., .1, .2, 0.},
+                                       {.3, .4, .5, .6}},
+                                      ref)),
+          mtx_l_add_expect(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                                 {4., 1., 0., 0.},
+                                                 {-1., -3.1 / 3., 1., 0.},
+                                                 {-.05, -3., -1., 1.}},
+                                                ref)),
+          mtx_u_add_expect(gko::initialize<Csr>({{2., 4., 1., 1.},
+                                                 {0., 3., -7., 2.},
+                                                 {0., 0., .5, 0.},
+                                                 {0., 0., 0., 4.}},
+                                                ref)),
+          mtx_l_it_expect(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                                {2., 1., 0., 0.},
+                                                {.5, 6. / 17., 1., 0.},
+                                                {.2, .1, -2.45, 1.}},
+                                               ref)),
+          mtx_u_it_expect(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                                {6., -17., 0., 0.},
+                                                {4., 0., 4., 0.},
+                                                {7., -6., 0., -.8}},
+                                               ref)),
+          mtx_l_small_expect(gko::initialize<Csr>({{1., 0., 0., 0.},
+                                                   {2., 1., 0., 0.},
+                                                   {.5, 6. / 17., 1., 0.},
+                                                   {0., 0., -153. / 116., 1.}},
+                                                  ref)),
+          mtx_u_small_expect(gko::initialize<Csr>({{1., 6., 4., 7.},
+                                                   {0., -17., -8., -6.},
+                                                   {0., 0., 116. / 17., 0.},
+                                                   {0., 0., 0., .0}},
+                                                  ref)),
+          mtx_l_large_expect(
+              gko::initialize<Csr>({{1., 0., 0., 0.},
+                                    {2., 1., 0., 0.},
+                                    {.5, 6. / 17., 1., 0.},
+                                    {0.2, 0.1, -153. / 116., 1.}},
+                                   ref)),
+          mtx_u_large_expect(
+              gko::initialize<Csr>({{1., 6., 4., 7.},
+                                    {0., -17., -8., -6.},
+                                    {0., 0., 116. / 17., -47. / 34.},
+                                    {0., 0., 0., -3043. / 1160.}},
+                                   ref)),
+          fact_fact(factorization_type::build().on(exec)),  // all defaults
+          tol{r<value_type>::value}  // comparison tolerance for value_type
+    {}
+
+    // Runs threshold_select on `mtx` for the given `rank` and asserts that the
+    // selected threshold lies within `tolerance` of `expected`.
+    template <typename Mtx>
+    void test_select(const std::unique_ptr<Mtx> &mtx, index_type rank,
+                     gko::remove_complex<value_type> expected,
+                     gko::remove_complex<value_type> tolerance = 0.0)
+    {
+        using ValueType = typename Mtx::value_type;
+        gko::remove_complex<ValueType> result{};
+
+        gko::Array<ValueType> tmp(ref);
+        gko::Array<gko::remove_complex<ValueType>> tmp2(ref);
+        gko::kernels::reference::par_ilut_factorization::threshold_select(
+            ref, mtx.get(), rank, tmp, tmp2, result);
+
+        ASSERT_NEAR(result, expected, tolerance);
+    }
+
+    template <typename Mtx,  // Mtx: type of the input and expected matrices
+              typename Coo = gko::matrix::Coo<typename Mtx::value_type,
+                                              typename Mtx::index_type>>
+    void test_filter(const std::unique_ptr<Mtx> &mtx,
+                     gko::remove_complex<value_type> threshold,
+                     const std::unique_ptr<Mtx> &expected, bool lower)
+    {
+        auto res_mtx = Mtx::create(exec, mtx->get_size());
+        auto res_mtx_coo = Coo::create(exec, mtx->get_size());
+        // the upper case (lower == false) runs the kernel on the transposes
+        auto local_mtx = gko::as<Mtx>(lower ? mtx->clone() : mtx->transpose());
+        auto local_expected =
+            gko::as<Mtx>(lower ? expected->clone() : expected->transpose());
+
+        gko::kernels::reference::par_ilut_factorization::threshold_filter(
+            ref, local_mtx.get(), threshold, res_mtx.get(), res_mtx_coo.get(),
+            lower);
+        // results must match exactly, and the COO copy must equal the CSR one
+        GKO_ASSERT_MTX_EQ_SPARSITY(local_expected, res_mtx);
+        GKO_ASSERT_MTX_NEAR(local_expected, res_mtx, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx, res_mtx_coo);
+        GKO_ASSERT_MTX_NEAR(res_mtx, res_mtx_coo, 0);
+    }
+
+    template <typename Mtx,  // Mtx: type of the input and expected matrices
+              typename Coo = gko::matrix::Coo<typename Mtx::value_type,
+                                              typename Mtx::index_type>>
+    void test_filter_approx(const std::unique_ptr<Mtx> &mtx, index_type rank,
+                            const std::unique_ptr<Mtx> &expected)
+    {
+        auto res_mtx = Mtx::create(exec, mtx->get_size());
+        auto res_mtx_coo = Coo::create(exec, mtx->get_size());
+        auto res_mtx2 = Mtx::create(exec, mtx->get_size());
+        auto res_mtx_coo2 = Coo::create(exec, mtx->get_size());
+        // threshold is presumably set by the approx kernel, then reused below
+        auto tmp = gko::Array<typename Mtx::value_type>{exec};
+        gko::remove_complex<typename Mtx::value_type> threshold{};
+        gko::kernels::reference::par_ilut_factorization::
+            threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold,
+                                    res_mtx.get(), res_mtx_coo.get());
+        gko::kernels::reference::par_ilut_factorization::threshold_filter(
+            ref, mtx.get(), threshold, res_mtx2.get(), res_mtx_coo2.get(),
+            true);
+        // both the approximate and the exact filter must reproduce expected
+        GKO_ASSERT_MTX_EQ_SPARSITY(expected, res_mtx);
+        GKO_ASSERT_MTX_EQ_SPARSITY(expected, res_mtx2);
+        GKO_ASSERT_MTX_NEAR(expected, res_mtx, 0);
+        GKO_ASSERT_MTX_NEAR(expected, res_mtx2, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx, res_mtx_coo);
+        GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx, res_mtx_coo2);
+        GKO_ASSERT_MTX_NEAR(res_mtx, res_mtx_coo, 0);
+        GKO_ASSERT_MTX_NEAR(res_mtx, res_mtx_coo2, 0);
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::Executor> exec;  // ref, cast to Executor
+    std::unique_ptr<Csr> mtx1;
+    std::unique_ptr<Csr> mtx1_expect_thrm2;  // mtx1 filtered with threshold 2
+    std::unique_ptr<Csr> mtx1_expect_thrm3;  // mtx1 filtered with threshold 3
+    std::unique_ptr<ComplexCsr> mtx1_complex;
+    std::unique_ptr<ComplexCsr> mtx1_expect_complex_thrm;  // threshold 1.01
+    std::shared_ptr<Csr> identity;
+    std::shared_ptr<Csr> lower_tri;
+    std::shared_ptr<Csr> upper_tri;
+    std::shared_ptr<Csr> mtx_system;  // 4x4 input of kernel/generate tests
+    std::unique_ptr<Csr> mtx_l_system;
+    std::unique_ptr<Csr> mtx_u_system;
+    std::unique_ptr<Csr> mtx_l;
+    std::unique_ptr<Csr> mtx_u;
+    std::unique_ptr<Csr> mtx_lu;
+    std::unique_ptr<Csr> mtx_l_add_expect;  // expected add_candidates L
+    std::unique_ptr<Csr> mtx_u_add_expect;  // expected add_candidates U
+    std::unique_ptr<Csr> mtx_l_it_expect;  // expected compute_l_u_factors L
+    std::unique_ptr<Csr> mtx_u_it_expect;  // compared against transposed U
+    std::unique_ptr<Csr> mtx_l_small_expect;  // fill_in_limit 0.75 result
+    std::unique_ptr<Csr> mtx_u_small_expect;  // fill_in_limit 0.75 result
+    std::unique_ptr<Csr> mtx_l_large_expect;  // fill_in_limit 1.2 result
+    std::unique_ptr<Csr> mtx_u_large_expect;  // fill_in_limit 1.2 result
+    std::unique_ptr<typename factorization_type::Factory> fact_fact;
+    gko::remove_complex<value_type> tol;
+};
+
+TYPED_TEST_CASE(ParIlut, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(ParIlut, KernelThresholdSelect)
+{
+    this->test_select(this->mtx1, 7, 2.0);  // rank 7 -> magnitude 2.0
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdSelectMin)
+{
+    this->test_select(this->mtx1, 0, 0.1);  // rank 0: smallest magnitude
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdSelectMax)
+{
+    this->test_select(this->mtx1, 9, 3.0);  // rank 9: largest magnitude
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdSelect)
+{
+    // |-1 + 1i| = sqrt(2) has 0-based rank 5 among mtx1_complex's magnitudes
+    this->test_select(this->mtx1_complex, 5, sqrt(2), this->tol);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdSelectMin)
+{
+    // 0.1 is the smallest magnitude (|0.1 + 0i|) stored in mtx1_complex
+    this->test_select(this->mtx1_complex, 0, 0.1, this->tol);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdSelectMax)
+{
+    // |-3 - 0.1i| = sqrt(9.01) is the largest magnitude in mtx1_complex
+    this->test_select(this->mtx1_complex, 9, sqrt(9.01), this->tol);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCoo)
+{  // threshold 0.0 with a null COO output must still reproduce mtx1
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    auto res_mtx = Csr::create(this->exec, this->mtx1->get_size());
+    Coo *null_coo = nullptr;  // passed in place of the COO result matrix
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter(
+        this->ref, this->mtx1.get(), 0.0, res_mtx.get(), null_coo, true);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(this->mtx1, res_mtx);
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res_mtx, 0);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterNoneLower)
+{  // threshold 0.0 must keep every entry of mtx1
+    this->test_filter(this->mtx1, 0.0, this->mtx1, true);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterNoneUpper)
+{  // same check on the transpose of mtx1 (lower == false)
+    this->test_filter(this->mtx1, 0.0, this->mtx1, false);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterSomeAtThresholdLower)
+{  // threshold 2.0: expects off-diagonals with magnitude below 2 removed
+    this->test_filter(this->mtx1, 2.0, this->mtx1_expect_thrm2, true);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterSomeAtThresholdUpper)
+{  // threshold 2.0 applied to the transposes of input and expectation
+    this->test_filter(this->mtx1, 2.0, this->mtx1_expect_thrm2, false);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterSomeAboveThresholdLower)
+{  // threshold 3.0: expects only the diagonal and the entry -3 to remain
+    this->test_filter(this->mtx1, 3.0, this->mtx1_expect_thrm3, true);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterSomeAboveThresholdUpper)
+{  // threshold 3.0 applied to the transposes of input and expectation
+    this->test_filter(this->mtx1, 3.0, this->mtx1_expect_thrm3, false);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdFilterNoneLower)
+{  // threshold 0.0 must keep every entry of the complex matrix
+    this->test_filter(this->mtx1_complex, 0.0, this->mtx1_complex, true);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdFilterNoneUpper)
+{  // same check on the transpose of the complex matrix (lower == false)
+    this->test_filter(this->mtx1_complex, 0.0, this->mtx1_complex, false);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdFilterSomeAtThresholdLower)
+{  // threshold 1.01: expects off-diagonals of magnitude sqrt(1.01) removed
+    this->test_filter(this->mtx1_complex, 1.01, this->mtx1_expect_complex_thrm,
+                      true);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdFilterSomeAtThresholdUpper)
+{  // threshold 1.01 applied to the transposes of input and expectation
+    this->test_filter(this->mtx1_complex, 1.01, this->mtx1_expect_complex_thrm,
+                      false);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCoo)
+{  // rank 0 with a null COO output must still reproduce mtx1
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto res_mtx = Csr::create(this->exec, this->mtx1->get_size());
+    auto tmp = gko::Array<value_type>{this->ref};
+    gko::remove_complex<value_type> threshold{};
+    Coo *null_coo = nullptr;  // passed in place of the COO result matrix
+    index_type rank{};        // value-initialized, i.e. rank 0
+
+    gko::kernels::reference::par_ilut_factorization::threshold_filter_approx(
+        this->ref, this->mtx1.get(), rank, tmp, threshold, res_mtx.get(),
+        null_coo);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(this->mtx1, res_mtx);
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res_mtx, 0);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterSomeApprox1)
+{  // rank 7 must reproduce mtx1_expect_thrm2
+    this->test_filter_approx(this->mtx1, 7, this->mtx1_expect_thrm2);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterSomeApprox2)
+{  // rank 8 must reproduce mtx1_expect_thrm2 as well
+    this->test_filter_approx(this->mtx1, 8, this->mtx1_expect_thrm2);
+}
+
+
+TYPED_TEST(ParIlut, KernelThresholdFilterNoneApprox)
+{  // rank 0 must keep every entry of mtx1
+    this->test_filter_approx(this->mtx1, 0, this->mtx1);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdFilterSomeApprox)
+{  // rank 4 must reproduce mtx1_expect_complex_thrm
+    this->test_filter_approx(this->mtx1_complex, 4,
+                             this->mtx1_expect_complex_thrm);
+}
+
+
+TYPED_TEST(ParIlut, KernelComplexThresholdFilterNoneApprox)
+{  // rank 0 must keep every entry of the complex matrix
+    this->test_filter_approx(this->mtx1_complex, 0, this->mtx1_complex);
+}
+
+
+TYPED_TEST(ParIlut, KernelAddCandidates)
+{
+    using Csr = typename TestFixture::Csr;
+    // result matrices are created with the size of mtx_system only
+    auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size());
+    auto res_mtx_u = Csr::create(this->exec, this->mtx_system->get_size());
+
+    gko::kernels::reference::par_ilut_factorization::add_candidates(
+        this->ref, this->mtx_lu.get(), this->mtx_system.get(),
+        this->mtx_l.get(), this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, this->mtx_l_add_expect);
+    GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, this->mtx_u_add_expect);
+    GKO_ASSERT_MTX_NEAR(res_mtx_l, this->mtx_l_add_expect, this->tol);
+    GKO_ASSERT_MTX_NEAR(res_mtx_u, this->mtx_u_add_expect, this->tol);
+}
+
+
+TYPED_TEST(ParIlut, KernelComputeLU)
+{
+    using Csr = typename TestFixture::Csr;
+    using Coo = typename TestFixture::Coo;
+    // U is handed over as CSR, as COO and, via its transpose, as CSC
+    auto mtx_l_coo = Coo::create(this->exec, this->mtx_system->get_size());
+    this->mtx_l_system->convert_to(mtx_l_coo.get());
+    auto mtx_u_transp = this->mtx_u_system->transpose();
+    auto mtx_u_coo = Coo::create(this->exec, this->mtx_system->get_size());
+    this->mtx_u_system->convert_to(mtx_u_coo.get());
+    auto mtx_u_csc = gko::as<Csr>(mtx_u_transp.get());
+
+    gko::kernels::reference::par_ilut_factorization::compute_l_u_factors(
+        this->ref, this->mtx_system.get(), this->mtx_l_system.get(),
+        mtx_l_coo.get(), this->mtx_u_system.get(), mtx_u_coo.get(), mtx_u_csc);
+    auto mtx_utt = gko::as<Csr>(mtx_u_csc->transpose());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx_l_system, this->mtx_l_it_expect, this->tol);
+    GKO_ASSERT_MTX_NEAR(mtx_u_csc, this->mtx_u_it_expect, this->tol);
+    GKO_ASSERT_MTX_NEAR(this->mtx_u_system, mtx_utt, 0);
+}
+
+
+TYPED_TEST(ParIlut, ThrowNotSupportedForWrongLinOp)
+{  // generate() on a DummyLinOp must throw gko::NotSupported
+    auto lin_op = DummyLinOp::create(this->ref);
+
+    ASSERT_THROW(this->fact_fact->generate(gko::share(lin_op)),
+                 gko::NotSupported);
+}
+
+
+TYPED_TEST(ParIlut, ThrowDimensionMismatch)
+{  // generate() on a 2x3 matrix must throw gko::DimensionMismatch
+    using Csr = typename TestFixture::Csr;
+    auto matrix = Csr::create(this->ref, gko::dim<2>{2, 3}, 4);
+
+    ASSERT_THROW(this->fact_fact->generate(gko::share(matrix)),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(ParIlut, SetStrategies)
+{  // strategies given to the factory must show up on the generated factors
+    using Csr = typename TestFixture::Csr;
+    using factorization_type = typename TestFixture::factorization_type;
+    auto l_strategy = std::make_shared<typename Csr::merge_path>();
+    auto u_strategy = std::make_shared<typename Csr::classical>();
+
+    auto factory = factorization_type::build()
+                       .with_l_strategy(l_strategy)
+                       .with_u_strategy(u_strategy)
+                       .on(this->ref);
+    auto fact = factory->generate(this->mtx_system);
+
+    ASSERT_EQ(factory->get_parameters().l_strategy, l_strategy);
+    ASSERT_EQ(fact->get_l_factor()->get_strategy()->get_name(),
+              l_strategy->get_name());  // factor strategies compared by name
+    ASSERT_EQ(factory->get_parameters().u_strategy, u_strategy);
+    ASSERT_EQ(fact->get_u_factor()->get_strategy()->get_name(),
+              u_strategy->get_name());
+}
+
+
+TYPED_TEST(ParIlut, IsConsistentWithComposition)
+{  // get_operators() must hold the L factor first and the U factor second
+    auto fact = this->fact_fact->generate(this->mtx_system);
+
+    auto lin_op_l_factor =
+        static_cast<const gko::LinOp *>(gko::lend(fact->get_l_factor()));
+    auto lin_op_u_factor =
+        static_cast<const gko::LinOp *>(gko::lend(fact->get_u_factor()));
+    auto first_operator = gko::lend(fact->get_operators()[0]);
+    auto second_operator = gko::lend(fact->get_operators()[1]);
+
+    ASSERT_EQ(lin_op_l_factor, first_operator);  // pointer identity
+    ASSERT_EQ(lin_op_u_factor, second_operator);
+}
+
+
+TYPED_TEST(ParIlut, GenerateIdentity)
+{  // expects L == U == identity for the identity matrix
+    auto fact = this->fact_fact->generate(this->identity);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->identity, this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateDenseIdentity)
+{  // same expectation when the identity is passed as a Dense matrix
+    using Dense = typename TestFixture::Dense;
+    auto dense_id = Dense::create(this->exec, this->identity->get_size());
+    this->identity->convert_to(dense_id.get());
+    auto fact = this->fact_fact->generate(gko::share(dense_id));
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->identity, this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateLowerTri)
+{  // expects L == lower_tri and U == identity
+    auto fact = this->fact_fact->generate(this->lower_tri);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->lower_tri, this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->identity, this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateUpperTri)
+{  // expects L == identity and U == upper_tri
+    auto fact = this->fact_fact->generate(this->upper_tri);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->upper_tri, this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateWithExactSmallLimit)
+{  // exact selection, fill_in_limit 0.75 -> the "small" reference factors
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::build()
+                    .with_approximate_select(false)
+                    .with_fill_in_limit(0.75)
+                    .on(this->exec)
+                    ->generate(this->mtx_system);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_small_expect,
+                        this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateWithApproxSmallLimit)
+{  // approximate selection must give the same "small" reference factors
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::build()
+                    .with_approximate_select(true)
+                    .with_fill_in_limit(0.75)
+                    .on(this->exec)
+                    ->generate(this->mtx_system);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_small_expect,
+                        this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateWithExactLargeLimit)
+{  // exact selection, fill_in_limit 1.2 -> the "large" reference factors
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::build()
+                    .with_approximate_select(false)
+                    .with_fill_in_limit(1.2)
+                    .on(this->exec)
+                    ->generate(this->mtx_system);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_large_expect,
+                        this->tol);
+}
+
+
+TYPED_TEST(ParIlut, GenerateWithApproxLargeLimit)
+{  // approximate selection must give the same "large" reference factors
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::build()
+                    .with_approximate_select(true)
+                    .with_fill_in_limit(1.2)
+                    .on(this->exec)
+                    ->generate(this->mtx_system);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect,
+                        this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_large_expect,
+                        this->tol);
+}
+
+
+}  // namespace
diff --git a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp
index 637761ee19b..01a9b17c303 100644
--- a/reference/test/log/convergence.cpp
+++ b/reference/test/log/convergence.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,37 +36,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
-TEST(Record, CatchesCriterionCheckCompleted)
+template <typename T>
+class Convergence : public ::testing::Test {};
+
+TYPED_TEST_CASE(Convergence, gko::test::ValueTypes);
+
+
+TYPED_TEST(Convergence, CatchesCriterionCheckCompleted)
 {
     auto exec = gko::ReferenceExecutor::create();
-    auto logger = gko::log::Convergence<>::create(
+    auto logger = gko::log::Convergence<TypeParam>::create(
         exec, gko::log::Logger::criterion_check_completed_mask);
     auto criterion =
         gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate(
             nullptr, nullptr, nullptr);
     constexpr gko::uint8 RelativeStoppingId{42};
     gko::Array<gko::stopping_status> stop_status(exec, 1);
-    using Mtx = gko::matrix::Dense<>;
+    using Mtx = gko::matrix::Dense<TypeParam>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<TypeParam>>;
     auto residual = gko::initialize<Mtx>({1.0, 2.0, 2.0}, exec);
 
-    logger->on<gko::log::Logger::criterion_check_completed>(
+    logger->template on<gko::log::Logger::criterion_check_completed>(
         criterion.get(), 1, residual.get(), nullptr, nullptr,
         RelativeStoppingId, true, &stop_status, true, true);
 
     ASSERT_EQ(logger->get_num_iterations(), 1);
     GKO_ASSERT_MTX_NEAR(gko::as<Mtx>(logger->get_residual()),
                         l({1.0, 2.0, 2.0}), 0.0);
-    GKO_ASSERT_MTX_NEAR(gko::as<Mtx>(logger->get_residual_norm()), l({3.0}),
-                        0.0);
+    GKO_ASSERT_MTX_NEAR(gko::as<NormVector>(logger->get_residual_norm()),
+                        l({3.0}), 0.0);
 }
 
 
diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp
index 3482f3aaa10..842b6214374 100644
--- a/reference/test/log/papi.cpp
+++ b/reference/test/log/papi.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include <ginkgo/core/log/papi.hpp>
 
 
@@ -38,18 +37,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <papi.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Papi : public ::testing::Test {
 protected:
-    using Dense = gko::matrix::Dense<>;
+    using Dense = gko::matrix::Dense<T>;
 
     Papi() : exec(gko::ReferenceExecutor::create()), eventset(PAPI_NULL) {}
 
@@ -67,11 +69,11 @@ class Papi : public ::testing::Test {
 
     void TearDown() { eventset = PAPI_NULL; }
 
-    template <typename T>
+    template <typename U>
     const std::string init(const gko::log::Logger::mask_type &event,
-                           const std::string &event_name, T *ptr)
+                           const std::string &event_name, U *ptr)
     {
-        logger = gko::log::Papi<>::create(exec, event);
+        logger = gko::log::Papi<T>::create(exec, event);
         std::ostringstream os;
         os << "sde:::" << logger->get_handle_name() << "::" << event_name << "_"
            << reinterpret_cast<gko::uintptr>(ptr);
@@ -108,29 +110,33 @@ class Papi : public ::testing::Test {
         }
     }
 
-    std::shared_ptr<const gko::log::Papi<>> logger;
+    std::shared_ptr<const gko::log::Papi<T>> logger;
     std::shared_ptr<const gko::Executor> exec;
     int eventset;
 };
 
+TYPED_TEST_CASE(Papi, gko::test::ValueTypes);
+
 
-TEST_F(Papi, CatchesCriterionCheckCompleted)
+TYPED_TEST(Papi, CatchesCriterionCheckCompleted)
 {
-    auto residual_norm = gko::initialize<Dense>({4.0}, exec);
-    auto criterion =
-        gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate(
-            nullptr, nullptr, nullptr);
-    auto str = init(gko::log::Logger::criterion_check_completed_mask,
-                    "criterion_check_completed", criterion.get());
-    add_event(str + ":CNT");
-    add_event(str);
-
-    start();
-    logger->on<gko::log::Logger::criterion_check_completed>(
+    using Dense = typename TestFixture::Dense;
+    auto residual_norm = gko::initialize<Dense>({4.0}, this->exec);
+    auto criterion = gko::stop::Iteration::build()
+                         .with_max_iters(3u)
+                         .on(this->exec)
+                         ->generate(nullptr, nullptr, nullptr);
+    auto str = this->init(gko::log::Logger::criterion_check_completed_mask,
+                          "criterion_check_completed", criterion.get());
+    this->add_event(str + ":CNT");
+    this->add_event(str);
+
+    this->start();
+    this->logger->template on<gko::log::Logger::criterion_check_completed>(
         criterion.get(), 0, nullptr, residual_norm.get(), nullptr, 0, false,
         nullptr, false, false);
     long long int values[2];
-    stop(values);
+    this->stop(values);
     double *sde_ptr = GET_SDE_RECORDER_ADDRESS(values[1], double);
 
     ASSERT_EQ(values[0], 1);
diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt
index 7c0b5742eed..d6878d864f3 100644
--- a/reference/test/matrix/CMakeLists.txt
+++ b/reference/test/matrix/CMakeLists.txt
@@ -4,6 +4,7 @@ ginkgo_create_test(dense_kernels)
 ginkgo_create_test(ell_kernels)
 ginkgo_create_test(hybrid_kernels)
 ginkgo_create_test(identity)
+ginkgo_create_test(permutation)
 ginkgo_create_test(sellp_kernels)
 ginkgo_create_test(sparsity_csr)
 ginkgo_create_test(sparsity_csr_kernels)
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index 0c0889df6ed..629a06dde23 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/coo_kernels.hpp"
+#include <ginkgo/core/matrix/coo.hpp>
 
 
 #include <memory>
@@ -41,22 +41,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/matrix/coo_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class Coo : public ::testing::Test {
 protected:
-    using Csr = gko::matrix::Csr<>;
-    using Mtx = gko::matrix::Coo<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Mtx = gko::matrix::Coo<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec))
     {
@@ -80,61 +85,110 @@ class Coo : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{3.0});
+        EXPECT_EQ(v[2], value_type{2.0});
+        EXPECT_EQ(v[3], value_type{5.0});
     }
 
     std::shared_ptr<const gko::Executor> exec;
     std::unique_ptr<Mtx> mtx;
 };
 
+TYPED_TEST_CASE(Coo, gko::test::ValueIndexTypes);
+
 
-TEST_F(Coo, ConvertsToCsr)
+TYPED_TEST(Coo, ConvertsToPrecision)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_merge);
-
-    mtx->convert_to(csr_mtx_c.get());
-    mtx->convert_to(csr_mtx_m.get());
-
-    assert_equal_to_mtx_in_csr_format(csr_mtx_c.get());
-    assert_equal_to_mtx_in_csr_format(csr_mtx_m.get());
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Tolerance = gko::remove_complex<ValueType>;
+    using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
+    auto other = OtherCoo::create(this->exec);
+    auto res = TestFixture::Mtx::create(this->exec);
+    // The round trip is exact when OtherType is the more precise type;
+    // otherwise the tolerance is r<OtherType>::value.
+    const bool lossless = r<OtherType>::value < r<ValueType>::value;
+    auto residual = lossless ? Tolerance{0} : Tolerance{r<OtherType>::value};
+
+    this->mtx->convert_to(other.get());
+    other->convert_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx, res, residual);
 }
 
 
-TEST_F(Coo, MovesToCsr)
+TYPED_TEST(Coo, MovesToPrecision)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_merge);
-    auto mtx_clone = mtx->clone();
-
-    mtx->move_to(csr_mtx_c.get());
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Tolerance = gko::remove_complex<ValueType>;
+    using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
+    auto other = OtherCoo::create(this->exec);
+    auto res = TestFixture::Mtx::create(this->exec);
+    // The round trip is exact when OtherType is the more precise type;
+    // otherwise the tolerance is r<OtherType>::value.
+    const bool lossless = r<OtherType>::value < r<ValueType>::value;
+    auto residual = lossless ? Tolerance{0} : Tolerance{r<OtherType>::value};
+
+    this->mtx->move_to(other.get());
+    other->move_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx, res, residual);
+}
+
+
+TYPED_TEST(Coo, ConvertsToCsr)
+{
+    // Convert into CSR targets created with two different strategies; each
+    // target must hold the matrix data and report its own strategy name.
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx->get_executor(), csr_s_merge);
+
+    this->mtx->convert_to(csr_mtx_c.get());
+    this->mtx->convert_to(csr_mtx_m.get());
+
+    this->assert_equal_to_mtx_in_csr_format(csr_mtx_c.get());
+    this->assert_equal_to_mtx_in_csr_format(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+}
+
+
+TYPED_TEST(Coo, MovesToCsr)
+{
+    // Move the matrix and a clone of it into CSR targets created with two
+    // different strategies; each must report its own strategy name.
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx->clone();
+
+    this->mtx->move_to(csr_mtx_c.get());
     mtx_clone->move_to(csr_mtx_m.get());
 
-    assert_equal_to_mtx_in_csr_format(csr_mtx_c.get());
-    assert_equal_to_mtx_in_csr_format(csr_mtx_m.get());
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    this->assert_equal_to_mtx_in_csr_format(csr_mtx_c.get());
+    this->assert_equal_to_mtx_in_csr_format(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Coo, ConvertsToDense)
+TYPED_TEST(Coo, ConvertsToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Dense = typename TestFixture::Vec;
+    auto dense_mtx = Dense::create(this->mtx->get_executor());
 
-    mtx->convert_to(dense_mtx.get());
+    this->mtx->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -144,11 +198,13 @@ TEST_F(Coo, ConvertsToDense)
 }
 
 
-TEST_F(Coo, MovesToDense)
+TYPED_TEST(Coo, MovesToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
+    using value_type = typename TestFixture::value_type;
+    using Dense = typename TestFixture::Vec;
+    auto dense_mtx = Dense::create(this->mtx->get_executor());
 
-    mtx->move_to(dense_mtx.get());
+    this->mtx->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -158,193 +214,309 @@ TEST_F(Coo, MovesToDense)
 }
 
 
-TEST_F(Coo, AppliesToDenseVector)
+TYPED_TEST(Coo, ConvertsEmptyToPrecision)
+{
+    // An empty matrix of the other precision must convert to an empty one.
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
+    auto empty_src = OtherCoo::create(this->exec);
+    auto res = TestFixture::Mtx::create(this->exec);
+
+    empty_src->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Coo, MovesEmptyToPrecision)
+{
+    // An empty matrix of the other precision must move to an empty one.
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
+    auto empty_src = OtherCoo::create(this->exec);
+    auto res = TestFixture::Mtx::create(this->exec);
+
+    empty_src->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Coo, ConvertsEmptyToCsr)
+{
+    // Converting an empty COO matrix must give an empty CSR matrix whose
+    // first row pointer is zero.
+    using Coo = typename TestFixture::Mtx;
+    using Csr = typename TestFixture::Csr;
+    auto empty_src = Coo::create(this->exec);
+    auto res = Csr::create(this->exec);
+
+    empty_src->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Coo, MovesEmptyToCsr)
+{
+    // Moving an empty COO matrix must give an empty CSR matrix whose
+    // first row pointer is zero.
+    using Coo = typename TestFixture::Mtx;
+    using Csr = typename TestFixture::Csr;
+    auto empty_src = Coo::create(this->exec);
+    auto res = Csr::create(this->exec);
+
+    empty_src->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Coo, ConvertsEmptyToDense)
+{
+    // Converting an empty COO matrix must give a Dense matrix of size zero.
+    // Only the fixture's matrix and dense-vector aliases are needed here.
+    using Coo = typename TestFixture::Mtx;
+    using Dense = typename TestFixture::Vec;
+    auto empty_src = Coo::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    empty_src->convert_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Coo, MovesEmptyToDense)
+{
+    // Moving an empty COO matrix must give a Dense matrix of size zero.
+    // Only the fixture's matrix and dense-vector aliases are needed here.
+    using Coo = typename TestFixture::Mtx;
+    using Dense = typename TestFixture::Vec;
+    auto empty_src = Coo::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    empty_src->move_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Coo, AppliesToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx->apply(x.get(), y.get());
+    this->mtx->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Coo, AppliesToDenseMatrix)
+TYPED_TEST(Coo, AppliesToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 2});
 
-    mtx->apply(x.get(), y.get());
+    this->mtx->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{13.0,  3.5},
-                       { 5.0, -7.5}}), 0.0);
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Coo, AppliesLinearCombinationToDenseVector)
+TYPED_TEST(Coo, AppliesLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Coo, AppliesLinearCombinationToDenseMatrix)
+TYPED_TEST(Coo, AppliesLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{-11.0, -2.5},
-                       { -1.0,  4.5}}), 0.0);
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Coo, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(Coo, ApplyFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Coo, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(Coo, ApplyFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Coo, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(Coo, ApplyFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Coo, AppliesAddToDenseVector)
+TYPED_TEST(Coo, AppliesAddToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({2.0, 1.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({2.0, 1.0}, this->exec);
 
-    mtx->apply2(x.get(), y.get());
+    this->mtx->apply2(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({15.0, 6.0}), 0.0);
 }
 
 
-TEST_F(Coo, AppliesAddToDenseMatrix)
+TYPED_TEST(Coo, AppliesAddToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx->apply2(x.get(), y.get());
+    this->mtx->apply2(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{14.0,  4.0},
-                       { 7.0, -9.0}}), 0.0);
+                        l({{14.0,  4.0},
+                           { 7.0, -9.0}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Coo, AppliesLinearCombinationAddToDenseVector)
+TYPED_TEST(Coo, AppliesLinearCombinationAddToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx->apply2(alpha.get(), x.get(), y.get());
+    this->mtx->apply2(alpha.get(), x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-12.0, -3.0}), 0.0);
 }
 
 
-TEST_F(Coo, AppliesLinearCombinationAddToDenseMatrix)
+TYPED_TEST(Coo, AppliesLinearCombinationAddToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx->apply2(alpha.get(), x.get(), y.get());
+    this->mtx->apply2(alpha.get(), x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{-12.0, -3.0},
-                       { -3.0,  6.0}}), 0.0);
+                        l({{-12.0, -3.0},
+                           { -3.0,  6.0}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Coo, ApplyAddFailsOnWrongInnerDimension)
+TYPED_TEST(Coo, ApplyAddFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply2(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply2(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Coo, ApplyAddFailsOnWrongNumberOfRows)
+TYPED_TEST(Coo, ApplyAddFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx->apply2(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply2(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Coo, ApplyAddFailsOnWrongNumberOfCols)
+TYPED_TEST(Coo, ApplyAddFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply2(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply2(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index f775ea018ee..736f90349ad 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/csr_kernels.hpp"
+#include <ginkgo/core/matrix/csr.hpp>
 
 
 #include <algorithm>
@@ -43,41 +43,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class Csr : public ::testing::Test {
 protected:
-    using Coo = gko::matrix::Coo<>;
-    using Mtx = gko::matrix::Csr<>;
-    using Sellp = gko::matrix::Sellp<>;
-    using SparsityCsr = gko::matrix::SparsityCsr<>;
-    using Ell = gko::matrix::Ell<>;
-    using Hybrid = gko::matrix::Hybrid<>;
-    using ComplexMtx = gko::matrix::Csr<std::complex<double>>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+    using Sellp = gko::matrix::Sellp<value_type, index_type>;
+    using SparsityCsr = gko::matrix::SparsityCsr<value_type, index_type>;
+    using Ell = gko::matrix::Ell<value_type, index_type>;
+    using Hybrid = gko::matrix::Hybrid<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Csr()
         : exec(gko::ReferenceExecutor::create()),
           mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4,
-                          std::make_shared<Mtx::load_balance>(2))),
+                          std::make_shared<typename Mtx::load_balance>(2))),
           mtx2(Mtx::create(exec, gko::dim<2>{2, 3}, 5,
-                           std::make_shared<Mtx::classical>())),
+                           std::make_shared<typename Mtx::classical>())),
           mtx3_sorted(Mtx::create(exec, gko::dim<2>(3, 3), 7,
-                                  std::make_shared<Mtx::classical>())),
-          mtx3_unsorted(Mtx::create(exec, gko::dim<2>(3, 3), 7,
-                                    std::make_shared<Mtx::classical>()))
+                                  std::make_shared<typename Mtx::classical>())),
+          mtx3_unsorted(
+              Mtx::create(exec, gko::dim<2>(3, 3), 7,
+                          std::make_shared<typename Mtx::classical>()))
     {
         this->create_mtx(mtx.get());
         this->create_mtx2(mtx2.get());
@@ -86,9 +92,9 @@ class Csr : public ::testing::Test {
 
     void create_mtx(Mtx *m)
     {
-        Mtx::value_type *v = m->get_values();
-        Mtx::index_type *c = m->get_col_idxs();
-        Mtx::index_type *r = m->get_row_ptrs();
+        value_type *v = m->get_values();
+        index_type *c = m->get_col_idxs();
+        index_type *r = m->get_row_ptrs();
         auto *s = m->get_srow();
         /*
          * 1   3   2
@@ -110,9 +116,9 @@ class Csr : public ::testing::Test {
 
     void create_mtx2(Mtx *m)
     {
-        Mtx::value_type *v = m->get_values();
-        Mtx::index_type *c = m->get_col_idxs();
-        Mtx::index_type *r = m->get_row_ptrs();
+        value_type *v = m->get_values();
+        index_type *c = m->get_col_idxs();
+        index_type *r = m->get_row_ptrs();
         // It keeps an explict zero
         /*
          *  1    3   2
@@ -206,10 +212,10 @@ class Csr : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{3.0});
+        EXPECT_EQ(v[2], value_type{2.0});
+        EXPECT_EQ(v[3], value_type{5.0});
     }
 
     void assert_equal_to_mtx(const Sellp *m)
@@ -232,12 +238,12 @@ class Csr : public ::testing::Test {
         EXPECT_EQ(c[65], 0);
         EXPECT_EQ(c[128], 2);
         EXPECT_EQ(c[129], 0);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 5.0);
-        EXPECT_EQ(v[64], 3.0);
-        EXPECT_EQ(v[65], 0.0);
-        EXPECT_EQ(v[128], 2.0);
-        EXPECT_EQ(v[129], 0.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{5.0});
+        EXPECT_EQ(v[64], value_type{3.0});
+        EXPECT_EQ(v[65], value_type{0.0});
+        EXPECT_EQ(v[128], value_type{2.0});
+        EXPECT_EQ(v[129], value_type{0.0});
     }
 
     void assert_equal_to_mtx(const SparsityCsr *m)
@@ -269,12 +275,12 @@ class Csr : public ::testing::Test {
         EXPECT_EQ(c[3], 0);
         EXPECT_EQ(c[4], 2);
         EXPECT_EQ(c[5], 0);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 5.0);
-        EXPECT_EQ(v[2], 3.0);
-        EXPECT_EQ(v[3], 0.0);
-        EXPECT_EQ(v[4], 2.0);
-        EXPECT_EQ(v[5], 0.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{5.0});
+        EXPECT_EQ(v[2], value_type{3.0});
+        EXPECT_EQ(v[3], value_type{0.0});
+        EXPECT_EQ(v[4], value_type{2.0});
+        EXPECT_EQ(v[5], value_type{0.0});
     }
 
     void assert_equal_to_mtx(const Hybrid *m)
@@ -298,10 +304,10 @@ class Csr : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], value_type{1.0});
+        EXPECT_EQ(v[1], value_type{3.0});
+        EXPECT_EQ(v[2], value_type{2.0});
+        EXPECT_EQ(v[3], value_type{5.0});
     }
 
     void assert_equal_to_mtx2(const Hybrid *m)
@@ -319,22 +325,21 @@ class Csr : public ::testing::Test {
         ASSERT_EQ(m->get_coo_num_stored_elements(), 1);
         EXPECT_EQ(r[0], 0);
         EXPECT_EQ(c[0], 2);
-        EXPECT_EQ(v[0], 2.0);
+        EXPECT_EQ(v[0], value_type{2.0});
         // Test Ell values
         ASSERT_EQ(m->get_ell_num_stored_elements(), 4);
         EXPECT_EQ(n, 2);
         EXPECT_EQ(p, 2);
-        EXPECT_EQ(ell_v[0], 1);
-        EXPECT_EQ(ell_v[1], 0);
-        EXPECT_EQ(ell_v[2], 3);
-        EXPECT_EQ(ell_v[3], 5);
+        EXPECT_EQ(ell_v[0], value_type{1});
+        EXPECT_EQ(ell_v[1], value_type{0});
+        EXPECT_EQ(ell_v[2], value_type{3});
+        EXPECT_EQ(ell_v[3], value_type{5});
         EXPECT_EQ(ell_c[0], 0);
         EXPECT_EQ(ell_c[1], 0);
         EXPECT_EQ(ell_c[2], 1);
         EXPECT_EQ(ell_c[3], 1);
     }
 
-    std::complex<double> i{0, 1};
     std::shared_ptr<const gko::ReferenceExecutor> exec;
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Mtx> mtx2;
@@ -342,232 +347,641 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3_unsorted;
 };
 
+TYPED_TEST_CASE(Csr, gko::test::ValueIndexTypes);  // run each TYPED_TEST(Csr, ...) once per type in ValueIndexTypes
+
 
-TEST_F(Csr, AppliesToDenseVector)
+TYPED_TEST(Csr, AppliesToDenseVector)  // checks mtx->apply(x, y) for a single right-hand side
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);  // this-> is needed: fixture members are dependent names in a typed test
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});  // 2x1 output
+
+    this->mtx->apply(x.get(), y.get());
+
+    EXPECT_EQ(y->at(0), T{13.0});  // 1*2 + 3*1 + 2*4 with mtx = [[1,3,2],[0,5,0]]
+    EXPECT_EQ(y->at(1), T{5.0});  // 5*1
+}
 
-    mtx->apply(x.get(), y.get());
 
-    EXPECT_EQ(y->at(0), 13.0);
-    EXPECT_EQ(y->at(1), 5.0);
+TYPED_TEST(Csr, AppliesToDenseMatrix)  // checks mtx->apply(x, y) when x has two columns
+{
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0}, I<T>{1.0, -1.5}, I<T>{4.0, 2.5}}, this->exec);  // three rows of two entries
+    auto y = Vec::create(this->exec, gko::dim<2>{2});  // presumably 2x2; entries up to (1, 1) are read below
+
+    this->mtx->apply(x.get(), y.get());
+
+    EXPECT_EQ(y->at(0, 0), T{13.0});
+    EXPECT_EQ(y->at(1, 0), T{5.0});
+    EXPECT_EQ(y->at(0, 1), T{3.5});  // 1*3 + 3*(-1.5) + 2*2.5
+    EXPECT_EQ(y->at(1, 1), T{-7.5});  // 5*(-1.5)
 }
 
 
-TEST_F(Csr, AppliesToDenseMatrix)
+TYPED_TEST(Csr, AppliesLinearCombinationToDenseVector)  // checks the four-argument apply(alpha, x, beta, y)
 {
-    auto x = gko::initialize<Vec>({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);  // scalars are passed as one-entry Dense objects
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);  // y is both input and output
 
-    mtx->apply(x.get(), y.get());
+    this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
 
-    EXPECT_EQ(y->at(0, 0), 13.0);
-    EXPECT_EQ(y->at(1, 0), 5.0);
-    EXPECT_EQ(y->at(0, 1), 3.5);
-    EXPECT_EQ(y->at(1, 1), -7.5);
+    EXPECT_EQ(y->at(0), T{-11.0});  // -1*13 + 2*1, i.e. alpha*(mtx*x) + beta*y
+    EXPECT_EQ(y->at(1), T{-1.0});  // -1*5 + 2*2
 }
 
 
-TEST_F(Csr, AppliesLinearCombinationToDenseVector)
+TYPED_TEST(Csr, AppliesLinearCombinationToDenseMatrix)  // four-argument apply with two-column x and y
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0}, I<T>{1.0, -1.5}, I<T>{4.0, 2.5}}, this->exec);
+    auto y =
+        gko::initialize<Vec>({I<T>{1.0, 0.5}, I<T>{2.0, -1.5}}, this->exec);  // y is both input and output
+
+    this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    EXPECT_EQ(y->at(0, 0), T{-11.0});  // -1*13 + 2*1
+    EXPECT_EQ(y->at(1, 0), T{-1.0});  // -1*5 + 2*2
+    EXPECT_EQ(y->at(0, 1), T{-2.5});  // -1*3.5 + 2*0.5
+    EXPECT_EQ(y->at(1, 1), T{4.5});  // -1*(-7.5) + 2*(-1.5)
+}
 
-    mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
 
-    EXPECT_EQ(y->at(0), -11.0);
-    EXPECT_EQ(y->at(1), -1.0);
+TYPED_TEST(Csr, AppliesToCsrMatrix)  // apply() with Csr operands: input mtx3_unsorted, output written into mtx2
+{
+    using T = typename TestFixture::value_type;
+    this->mtx->apply(this->mtx3_unsorted.get(), this->mtx2.get());
+
+    ASSERT_EQ(this->mtx2->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(this->mtx2->get_num_stored_elements(), 6);
+    ASSERT_TRUE(this->mtx2->is_sorted_by_column_index());  // output must be sorted although the input is the unsorted fixture
+    auto r = this->mtx2->get_const_row_ptrs();
+    auto c = this->mtx2->get_const_col_idxs();
+    auto v = this->mtx2->get_const_values();
+    // 13  5 31
+    // 15  5 40
+    EXPECT_EQ(r[0], 0);  // row pointers: two rows with three stored entries each
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 6);
+    EXPECT_EQ(c[0], 0);  // column indices 0, 1, 2 within each row
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 0);
+    EXPECT_EQ(c[4], 1);
+    EXPECT_EQ(c[5], 2);
+    EXPECT_EQ(v[0], T{13});  // values in the row-by-row order of the matrix sketched above
+    EXPECT_EQ(v[1], T{5});
+    EXPECT_EQ(v[2], T{31});
+    EXPECT_EQ(v[3], T{15});
+    EXPECT_EQ(v[4], T{5});
+    EXPECT_EQ(v[5], T{40});
 }
 
 
-TEST_F(Csr, AppliesLinearCombinationToDenseMatrix)
+TYPED_TEST(Csr, AppliesLinearCombinationToCsrMatrix)  // four-argument apply with Csr operands; mtx2 is input and output
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec);
-    auto y = gko::initialize<Vec>({{1.0, 0.5}, {2.0, -1.5}}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+
+    this->mtx->apply(alpha.get(), this->mtx3_unsorted.get(), beta.get(),
+                     this->mtx2.get());
+
+    ASSERT_EQ(this->mtx2->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(this->mtx2->get_num_stored_elements(), 6);
+    ASSERT_TRUE(this->mtx2->is_sorted_by_column_index());  // output must be sorted although one input is the unsorted fixture
+    auto r = this->mtx2->get_const_row_ptrs();
+    auto c = this->mtx2->get_const_col_idxs();
+    auto v = this->mtx2->get_const_values();
+    // -11 1 -27
+    // -15 5 -40
+    EXPECT_EQ(r[0], 0);  // row pointers: two rows with three stored entries each
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 6);
+    EXPECT_EQ(c[0], 0);  // column indices 0, 1, 2 within each row
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 0);
+    EXPECT_EQ(c[4], 1);
+    EXPECT_EQ(c[5], 2);
+    EXPECT_EQ(v[0], T{-11});  // values in the row-by-row order of the matrix sketched above
+    EXPECT_EQ(v[1], T{1});
+    EXPECT_EQ(v[2], T{-27});
+    EXPECT_EQ(v[3], T{-15});
+    EXPECT_EQ(v[4], T{5});
+    EXPECT_EQ(v[5], T{-40});
+}
 
-    mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
 
-    EXPECT_EQ(y->at(0, 0), -11.0);
-    EXPECT_EQ(y->at(1, 0), -1.0);
-    EXPECT_EQ(y->at(0, 1), -2.5);
-    EXPECT_EQ(y->at(1, 1), 4.5);
+TYPED_TEST(Csr, AppliesLinearCombinationToIdentityMatrix)  // four-argument apply where the operand is an Identity
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    auto alpha = gko::initialize<Vec>({-3.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto a = gko::initialize<Mtx>(
+        {I<T>{2.0, 0.0, 3.0}, I<T>{0.0, 1.0, -1.5}, I<T>{0.0, -2.0, 0.0},
+         I<T>{5.0, 0.0, 0.0}, I<T>{1.0, 0.0, 4.0}, I<T>{2.0, -2.0, 0.0},
+         I<T>{0.0, 0.0, 0.0}},
+        this->exec);  // seven rows of three entries, last row all zero
+    auto b = gko::initialize<Mtx>(
+        {I<T>{2.0, -2.0, 0.0}, I<T>{1.0, 0.0, 4.0}, I<T>{2.0, 0.0, 3.0},
+         I<T>{0.0, 1.0, -1.5}, I<T>{1.0, 0.0, 0.0}, I<T>{0.0, 0.0, 0.0},
+         I<T>{0.0, 0.0, 0.0}},
+        this->exec);  // b is both input and output of the apply below
+    auto expect = gko::initialize<Mtx>(
+        {I<T>{-2.0, -4.0, -9.0}, I<T>{2.0, -3.0, 12.5}, I<T>{4.0, 6.0, 6.0},
+         I<T>{-15.0, 2.0, -3.0}, I<T>{-1.0, 0.0, -12.0}, I<T>{-6.0, 6.0, 0.0},
+         I<T>{0.0, 0.0, 0.0}},
+        this->exec);  // entrywise -3*a + 2*b, e.g. first row: -6+4, 0-4, -9+0
+    auto id = gko::matrix::Identity<T>::create(this->exec, a->get_size()[1]);  // sized by a's second dimension
+
+    a->apply(gko::lend(alpha), gko::lend(id), gko::lend(beta), gko::lend(b));
+
+    GKO_ASSERT_MTX_NEAR(b, expect, r<T>::value);  // values within the per-type tolerance r<T>
+    GKO_ASSERT_MTX_EQ_SPARSITY(b, expect);  // and the same sparsity pattern
+    ASSERT_TRUE(b->is_sorted_by_column_index());
 }
 
 
-TEST_F(Csr, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(Csr, ApplyFailsOnWrongInnerDimension)  // the fixture's mtx is created with dim {2, 3}
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});  // presumably 2x2, so its row count differs from mtx's 3 columns
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);  // apply must reject the operands
 }
 
 
-TEST_F(Csr, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(Csr, ApplyFailsOnWrongNumberOfRows)  // the fixture's mtx is created with dim {2, 3}
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});  // 3 rows, while mtx has 2
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);  // apply must reject the operands
 }
 
 
-TEST_F(Csr, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(Csr, ApplyFailsOnWrongNumberOfCols)  // x and y are created with different sizes
 {
-    auto x = Vec::create(exec, gko::dim<2>{3});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3});  // presumably 3x3 — confirm single-argument dim semantics
+    auto y = Vec::create(this->exec, gko::dim<2>{2});  // presumably 2x2, so the column counts of x and y differ
+
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);  // apply must reject the operands
+}
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+
+TYPED_TEST(Csr, ConvertsToPrecision)  // round trip: mtx2 -> Csr<next_precision> -> Csr<value_type>
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Csr = typename TestFixture::Mtx;
+    using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;  // same index type, different value precision
+    auto tmp = OtherCsr::create(this->exec);
+    auto res = Csr::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};  // tolerance is a real-valued type
+
+    // use mtx2 as mtx's strategy would involve creating a CudaExecutor
+    this->mtx2->convert_to(tmp.get());
+    tmp->convert_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual);
+    ASSERT_EQ(typeid(*this->mtx2->get_strategy()),
+              typeid(*res->get_strategy()));  // the strategy's dynamic type must survive the round trip
 }
 
 
-TEST_F(Csr, ConvertsToDense)
+TYPED_TEST(Csr, MovesToPrecision)  // round trip via move_to: mtx2 -> Csr<next_precision> -> Csr<value_type>
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
-    auto dense_other = gko::initialize<gko::matrix::Dense<>>(
-        4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, exec);
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Csr = typename TestFixture::Mtx;
+    using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;  // same index type, different value precision
+    auto tmp = OtherCsr::create(this->exec);
+    auto res = Csr::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};  // tolerance is a real-valued type
+
+    // use mtx2 as mtx's strategy would involve creating a CudaExecutor
+    this->mtx2->move_to(tmp.get());
+    tmp->move_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual);  // NOTE(review): mtx2 is read after being moved from — confirm move_to leaves it intact here
+    ASSERT_EQ(typeid(*this->mtx2->get_strategy()),
+              typeid(*res->get_strategy()));  // the strategy's dynamic type must survive the round trip
+}
 
-    mtx->convert_to(dense_mtx.get());
+
+TYPED_TEST(Csr, ConvertsToDense)  // convert_to(Dense) must reproduce mtx exactly (tolerance 0.0)
+{
+    using Dense = typename TestFixture::Vec;
+    auto dense_mtx = Dense::create(this->mtx->get_executor());
+    auto dense_other = gko::initialize<Dense>(
+        4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, this->exec);  // leading 4 is presumably the stride — confirm
+
+    this->mtx->convert_to(dense_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(dense_mtx, dense_other, 0.0);
 }
 
 
-TEST_F(Csr, MovesToDense)
+TYPED_TEST(Csr, MovesToDense)  // move_to(Dense) must reproduce mtx exactly (tolerance 0.0)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
-    auto dense_other = gko::initialize<gko::matrix::Dense<>>(
-        4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, exec);
+    using Dense = typename TestFixture::Vec;
+    auto dense_mtx = Dense::create(this->mtx->get_executor());
+    auto dense_other = gko::initialize<Dense>(
+        4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, this->exec);  // leading 4 is presumably the stride — confirm
 
-    mtx->move_to(dense_mtx.get());
+    this->mtx->move_to(dense_mtx.get());  // moves straight out of the fixture matrix
 
     GKO_ASSERT_MTX_NEAR(dense_mtx, dense_other, 0.0);
 }
 
 
-TEST_F(Csr, ConvertsToCoo)
+TYPED_TEST(Csr, ConvertsToCoo)  // convert_to(Coo), verified by the fixture's Coo overload of assert_equal_to_mtx
 {
-    auto coo_mtx = gko::matrix::Coo<>::create(mtx->get_executor());
+    using Coo = typename TestFixture::Coo;
+    auto coo_mtx = Coo::create(this->mtx->get_executor());  // empty target on mtx's executor
 
-    mtx->convert_to(coo_mtx.get());
+    this->mtx->convert_to(coo_mtx.get());
 
-    assert_equal_to_mtx(coo_mtx.get());
+    this->assert_equal_to_mtx(coo_mtx.get());
 }
 
 
-TEST_F(Csr, MovesToCoo)
+TYPED_TEST(Csr, MovesToCoo)  // move_to(Coo), verified by the fixture's Coo overload of assert_equal_to_mtx
 {
-    auto coo_mtx = gko::matrix::Coo<>::create(mtx->get_executor());
+    using Coo = typename TestFixture::Coo;
+    auto coo_mtx = Coo::create(this->mtx->get_executor());  // empty target on mtx's executor
 
-    mtx->move_to(coo_mtx.get());
+    this->mtx->move_to(coo_mtx.get());  // moves straight out of the fixture matrix
 
-    assert_equal_to_mtx(coo_mtx.get());
+    this->assert_equal_to_mtx(coo_mtx.get());
 }
 
 
-TEST_F(Csr, ConvertsToSellp)
+TYPED_TEST(Csr, ConvertsToSellp)  // convert_to(Sellp), verified by the fixture's Sellp overload of assert_equal_to_mtx
 {
-    auto sellp_mtx = gko::matrix::Sellp<>::create(mtx->get_executor());
+    using Sellp = typename TestFixture::Sellp;
+    auto sellp_mtx = Sellp::create(this->mtx->get_executor());  // empty target on mtx's executor
 
-    mtx->convert_to(sellp_mtx.get());
+    this->mtx->convert_to(sellp_mtx.get());
 
-    assert_equal_to_mtx(sellp_mtx.get());
+    this->assert_equal_to_mtx(sellp_mtx.get());
 }
 
 
-TEST_F(Csr, MovesToSellp)
+TYPED_TEST(Csr, MovesToSellp)  // move_to(Sellp), verified by the fixture's Sellp overload of assert_equal_to_mtx
 {
-    auto sellp_mtx = gko::matrix::Sellp<>::create(mtx->get_executor());
-    auto csr_ref = gko::matrix::Csr<>::create(mtx->get_executor());
+    using Sellp = typename TestFixture::Sellp;
+    using Csr = typename TestFixture::Mtx;
+    auto sellp_mtx = Sellp::create(this->mtx->get_executor());
+    auto csr_ref = Csr::create(this->mtx->get_executor());  // scratch copy that will be moved from
 
-    csr_ref->copy_from(mtx.get());
+    csr_ref->copy_from(this->mtx.get());  // move from a copy rather than from the fixture matrix itself
     csr_ref->move_to(sellp_mtx.get());
 
-    assert_equal_to_mtx(sellp_mtx.get());
+    this->assert_equal_to_mtx(sellp_mtx.get());
 }
 
 
-TEST_F(Csr, ConvertsToSparsityCsr)
+TYPED_TEST(Csr, ConvertsToSparsityCsr)  // convert_to(SparsityCsr), verified by the fixture's SparsityCsr overload
 {
-    auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(mtx->get_executor());
+    using SparsityCsr = typename TestFixture::SparsityCsr;
+    auto sparsity_mtx = SparsityCsr::create(this->mtx->get_executor());  // empty target on mtx's executor
 
-    mtx->convert_to(sparsity_mtx.get());
+    this->mtx->convert_to(sparsity_mtx.get());
 
-    assert_equal_to_mtx(sparsity_mtx.get());
+    this->assert_equal_to_mtx(sparsity_mtx.get());
 }
 
 
-TEST_F(Csr, MovesToSparsityCsr)
+TYPED_TEST(Csr, MovesToSparsityCsr)  // move_to(SparsityCsr), verified by the fixture's SparsityCsr overload
 {
-    auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(mtx->get_executor());
-    auto csr_ref = gko::matrix::Csr<>::create(mtx->get_executor());
+    using SparsityCsr = typename TestFixture::SparsityCsr;
+    using Csr = typename TestFixture::Mtx;
+    auto sparsity_mtx = SparsityCsr::create(this->mtx->get_executor());
+    auto csr_ref = Csr::create(this->mtx->get_executor());  // scratch copy that will be moved from
 
-    csr_ref->copy_from(mtx.get());
+    csr_ref->copy_from(this->mtx.get());  // move from a copy rather than from the fixture matrix itself
     csr_ref->move_to(sparsity_mtx.get());
 
-    assert_equal_to_mtx(sparsity_mtx.get());
+    this->assert_equal_to_mtx(sparsity_mtx.get());
 }
 
 
-TEST_F(Csr, ConvertsToHybridAutomatically)
+TYPED_TEST(Csr, ConvertsToHybridAutomatically)  // convert_to(Hybrid) with a Hybrid created without an explicit strategy
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx->get_executor());
+    using Hybrid = typename TestFixture::Hybrid;
+    auto hybrid_mtx = Hybrid::create(this->mtx->get_executor());  // no strategy argument: Hybrid's default is used
 
-    mtx->convert_to(hybrid_mtx.get());
+    this->mtx->convert_to(hybrid_mtx.get());
 
-    assert_equal_to_mtx(hybrid_mtx.get());
+    this->assert_equal_to_mtx(hybrid_mtx.get());
 }
 
 
-TEST_F(Csr, MovesToHybridAutomatically)
+TYPED_TEST(Csr, MovesToHybridAutomatically)  // move_to(Hybrid) with a Hybrid created without an explicit strategy
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx->get_executor());
-    auto csr_ref = gko::matrix::Csr<>::create(mtx->get_executor());
+    using Hybrid = typename TestFixture::Hybrid;
+    using Csr = typename TestFixture::Mtx;
+    auto hybrid_mtx = Hybrid::create(this->mtx->get_executor());  // no strategy argument: Hybrid's default is used
+    auto csr_ref = Csr::create(this->mtx->get_executor());  // scratch copy that will be moved from
 
-    csr_ref->copy_from(mtx.get());
+    csr_ref->copy_from(this->mtx.get());  // move from a copy rather than from the fixture matrix itself
     csr_ref->move_to(hybrid_mtx.get());
 
-    assert_equal_to_mtx(hybrid_mtx.get());
+    this->assert_equal_to_mtx(hybrid_mtx.get());
 }
 
 
-TEST_F(Csr, ConvertsToHybridByColumn2)
+TYPED_TEST(Csr, ConvertsToHybridByColumn2)  // convert_to(Hybrid) using an explicit column_limit(2) strategy on mtx2
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(
-        mtx2->get_executor(),
-        std::make_shared<gko::matrix::Hybrid<>::column_limit>(2));
+    using Hybrid = typename TestFixture::Hybrid;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx2->get_executor(),
+                       std::make_shared<typename Hybrid::column_limit>(2));  // presumably caps the ELL part at 2 columns — confirm
 
-    mtx2->convert_to(hybrid_mtx.get());
+    this->mtx2->convert_to(hybrid_mtx.get());
 
-    assert_equal_to_mtx2(hybrid_mtx.get());
+    this->assert_equal_to_mtx2(hybrid_mtx.get());  // fixture helper checks both the COO and the ELL part
 }
 
 
-TEST_F(Csr, MovesToHybridByColumn2)
+TYPED_TEST(Csr, MovesToHybridByColumn2)  // move_to(Hybrid) using an explicit column_limit(2) strategy on a copy of mtx2
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(
-        mtx2->get_executor(),
-        std::make_shared<gko::matrix::Hybrid<>::column_limit>(2));
-    auto csr_ref = gko::matrix::Csr<>::create(mtx2->get_executor());
-
-    csr_ref->copy_from(mtx2.get());
+    using Hybrid = typename TestFixture::Hybrid;
+    using Csr = typename TestFixture::Mtx;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx2->get_executor(),
+                       std::make_shared<typename Hybrid::column_limit>(2));  // presumably caps the ELL part at 2 columns — confirm
+    auto csr_ref = Csr::create(this->mtx2->get_executor());  // scratch copy that will be moved from
+
+    csr_ref->copy_from(this->mtx2.get());  // move from a copy rather than from the fixture matrix itself
     csr_ref->move_to(hybrid_mtx.get());
 
-    assert_equal_to_mtx2(hybrid_mtx.get());
+    this->assert_equal_to_mtx2(hybrid_mtx.get());  // fixture helper checks both the COO and the ELL part
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToPrecision)  // converting a size-less Csr<next_precision> into Csr<value_type>
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Csr = typename TestFixture::Mtx;
+    using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
+    auto empty = OtherCsr::create(this->exec);  // created without size or nonzero count
+    empty->get_row_ptrs()[0] = 0;  // set the first row pointer explicitly; it is compared after conversion
+    auto res = Csr::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // first row pointer carried over
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToPrecision)  // moving a size-less Csr<next_precision> into Csr<value_type>
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Csr = typename TestFixture::Mtx;
+    using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
+    auto empty = OtherCsr::create(this->exec);  // created without size or nonzero count
+    empty->get_row_ptrs()[0] = 0;  // set the first row pointer explicitly; it is compared after the move
+    auto res = Csr::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // first row pointer carried over
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToDense)  // converting a size-less Csr into Dense
+{
+    using ValueType = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Dense::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToDense)  // moving a size-less Csr into Dense
+{
+    using ValueType = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Dense::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToCoo)  // converting a size-less Csr into Coo
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Coo = gko::matrix::Coo<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Coo::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToCoo)  // moving a size-less Csr into Coo
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Coo = gko::matrix::Coo<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Coo::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToEll)  // converting a size-less Csr into Ell
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Ell = gko::matrix::Ell<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Ell::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToEll)  // moving a size-less Csr into Ell
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Ell = gko::matrix::Ell<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Ell::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToSellp)  // converting a size-less Csr into Sellp
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Sellp = gko::matrix::Sellp<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Sellp::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_slice_sets(), 0);  // first slice_sets entry must be zero
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToSellp)  // moving a size-less Csr into Sellp
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Sellp = gko::matrix::Sellp<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Sellp::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_slice_sets(), 0);  // first slice_sets entry must be zero
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToSparsityCsr)  // converting a size-less Csr into SparsityCsr
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using SparsityCsr = gko::matrix::SparsityCsr<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    empty->get_row_ptrs()[0] = 0;  // set the first row pointer explicitly; it is compared after conversion
+    auto res = SparsityCsr::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_nonzeros(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // NOTE(review): unlike the sibling tests, res->get_size() is not checked here
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToSparsityCsr)  // moving a size-less Csr into SparsityCsr
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using SparsityCsr = gko::matrix::SparsityCsr<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    empty->get_row_ptrs()[0] = 0;  // set the first row pointer explicitly; it is compared after the move
+    auto res = SparsityCsr::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_nonzeros(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // NOTE(review): unlike the sibling tests, res->get_size() is not checked here
+}
+
+
+TYPED_TEST(Csr, ConvertsEmptyToHybrid)  // converting a size-less Csr into Hybrid
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Hybrid = gko::matrix::Hybrid<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Hybrid::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
+}
+
+
+TYPED_TEST(Csr, MovesEmptyToHybrid)  // moving a size-less Csr into Hybrid
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Mtx;
+    using Hybrid = gko::matrix::Hybrid<ValueType, IndexType>;
+    auto empty = Csr::create(this->exec);  // created without size or nonzero count
+    auto res = Hybrid::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // dim used as bool; false presumably means an empty size — confirm
 }
 
 
-TEST_F(Csr, CalculatesNonzerosPerRow)
+TYPED_TEST(Csr, CalculatesNonzerosPerRow)
 {
-    gko::Array<gko::size_type> row_nnz(exec, mtx->get_size()[0]);
+    gko::Array<gko::size_type> row_nnz(this->exec, this->mtx->get_size()[0]);
 
-    gko::kernels::reference::csr::calculate_nonzeros_per_row(exec, mtx.get(),
-                                                             &row_nnz);
+    gko::kernels::reference::csr::calculate_nonzeros_per_row(
+        this->exec, this->mtx.get(), &row_nnz);
 
     auto row_nnz_val = row_nnz.get_data();
     ASSERT_EQ(row_nnz_val[0], 3);
@@ -575,54 +989,59 @@ TEST_F(Csr, CalculatesNonzerosPerRow)
 }
 
 
-TEST_F(Csr, CalculatesTotalCols)
+TYPED_TEST(Csr, CalculatesTotalCols)  // calls the reference kernel calculate_total_cols directly and expects 3
 {
     gko::size_type total_cols;
     gko::size_type stride_factor = gko::matrix::default_stride_factor;
     gko::size_type slice_size = gko::matrix::default_slice_size;
 
     gko::kernels::reference::csr::calculate_total_cols(
-        exec, mtx.get(), &total_cols, stride_factor, slice_size);
+        this->exec, this->mtx.get(), &total_cols, stride_factor, slice_size);  // result is written through &total_cols
 
     ASSERT_EQ(total_cols, 3);
 }
 
 
-TEST_F(Csr, ConvertsToEll)
+TYPED_TEST(Csr, ConvertsToEll)  // convert_to(Ell), verified by the fixture's Ell overload of assert_equal_to_mtx
 {
-    auto ell_mtx = gko::matrix::Ell<>::create(mtx->get_executor());
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
-    auto ref_dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
+    using Ell = typename TestFixture::Ell;
+    using Dense = typename TestFixture::Vec;
+    auto ell_mtx = Ell::create(this->mtx->get_executor());
+    auto dense_mtx = Dense::create(this->mtx->get_executor());  // NOTE(review): not used anywhere in this test
+    auto ref_dense_mtx = Dense::create(this->mtx->get_executor());  // NOTE(review): not used anywhere in this test
 
-    mtx->convert_to(ell_mtx.get());
+    this->mtx->convert_to(ell_mtx.get());
 
-    assert_equal_to_mtx(ell_mtx.get());
+    this->assert_equal_to_mtx(ell_mtx.get());
 }
 
 
-TEST_F(Csr, MovesToEll)
+TYPED_TEST(Csr, MovesToEll)  // move_to(Ell), verified by the fixture's Ell overload of assert_equal_to_mtx
 {
-    auto ell_mtx = gko::matrix::Ell<>::create(mtx->get_executor());
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
-    auto ref_dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor());
+    using Ell = typename TestFixture::Ell;
+    using Dense = typename TestFixture::Vec;
+    auto ell_mtx = Ell::create(this->mtx->get_executor());
+    auto dense_mtx = Dense::create(this->mtx->get_executor());  // NOTE(review): not used anywhere in this test
+    auto ref_dense_mtx = Dense::create(this->mtx->get_executor());  // NOTE(review): not used anywhere in this test
 
-    mtx->move_to(ell_mtx.get());
+    this->mtx->move_to(ell_mtx.get());  // moves straight out of the fixture matrix
 
-    assert_equal_to_mtx(ell_mtx.get());
+    this->assert_equal_to_mtx(ell_mtx.get());
 }
 
 
-TEST_F(Csr, SquareMtxIsTransposable)
+TYPED_TEST(Csr, SquareMtxIsTransposable)
 {
+    using Csr = typename TestFixture::Mtx;
     // clang-format off
-    auto mtx2 = gko::initialize<gko::matrix::Csr<>>(
+    auto mtx2 = gko::initialize<Csr>(
                 {{1.0, 3.0, 2.0},
                  {0.0, 5.0, 0.0},
-                 {0.0, 1.5, 2.0}}, exec);
+                 {0.0, 1.5, 2.0}}, this->exec);
     // clang-format on
 
     auto trans = mtx2->transpose();
-    auto trans_as_csr = static_cast<gko::matrix::Csr<> *>(trans.get());
+    auto trans_as_csr = static_cast<Csr *>(trans.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(trans_as_csr,
@@ -633,10 +1052,11 @@ TEST_F(Csr, SquareMtxIsTransposable)
 }
 
 
-TEST_F(Csr, NonSquareMtxIsTransposable)
+TYPED_TEST(Csr, NonSquareMtxIsTransposable)
 {
-    auto trans = mtx->transpose();
-    auto trans_as_csr = static_cast<gko::matrix::Csr<> *>(trans.get());
+    using Csr = typename TestFixture::Mtx;
+    auto trans = this->mtx->transpose();
+    auto trans_as_csr = static_cast<Csr *>(trans.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(trans_as_csr,
@@ -647,59 +1067,266 @@ TEST_F(Csr, NonSquareMtxIsTransposable)
 }
 
 
-TEST_F(Csr, MtxIsConjugateTransposable)
+TYPED_TEST(Csr, SquareMatrixIsRowPermutable)  // row i <- row perm[i]
 {
+    using Csr = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
     // clang-format off
-    auto mtx2 = gko::initialize<gko::matrix::Csr<std::complex<double>>>(
-        {{1.0 + 2.0 * i, 3.0 + 0.0 * i, 2.0 + 0.0 * i},
-         {0.0 + 0.0 * i, 5.0 - 3.5 * i, 0.0 + 0.0 * i},
-         {0.0 + 0.0 * i, 0.0 + 1.5 * i, 2.0 + 0.0 * i}}, exec);
+    auto p_mtx = gko::initialize<Csr>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0},
+                                       {0.0, 1.5, 2.0}}, this->exec);
     // clang-format on
+    gko::Array<index_type> permute_idxs{this->exec, {1, 2, 0}};
 
-    auto trans = mtx2->conj_transpose();
-    auto trans_as_csr =
-        static_cast<gko::matrix::Csr<std::complex<double>> *>(trans.get());
+    auto row_permute = p_mtx->row_permute(&permute_idxs);
 
+    auto row_permute_csr = static_cast<Csr *>(row_permute.get());
     // clang-format off
-    GKO_ASSERT_MTX_NEAR(trans_as_csr,
-                    l({{1.0 - 2.0 * i, 0.0 + 0.0 * i, 0.0 + 0.0 * i},
-                       {3.0 + 0.0 * i, 5.0 + 3.5 * i, 0.0 - 1.5 * i},
-                       {2.0 + 0.0 * i, 0.0 + 0.0 * i, 2.0 + 0.0 * i}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(row_permute_csr,
+                        l({{0.0, 5.0, 0.0},     // input row 1
+                           {0.0, 1.5, 2.0},     // input row 2
+                           {1.0, 3.0, 2.0}}),   // input row 0
+                        0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, NonSquareMatrixIsRowPermutable)
+{
+    // Row i of the result must equal row perm[i] of the 2x3 input.
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->row_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{0.0, 5.0, 0.0},
+                           {1.0, 3.0, 2.0}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, SquareMatrixIsColPermutable)
+{
+    // Column j of the result must equal column perm[j] of the input.
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 2, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0},
+                                       {0.0, 1.5, 2.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->column_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{3.0, 2.0, 1.0},
+                           {5.0, 0.0, 0.0},
+                           {1.5, 2.0, 0.0}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, NonSquareMatrixIsColPermutable)
+{
+    // Column j of the result must equal column perm[j] of the 2x3 input.
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 2, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 0.0, 2.0},
+                                       {0.0, 5.0, 0.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->column_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{0.0, 2.0, 1.0},
+                           {5.0, 0.0, 0.0}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, SquareMatrixIsInverseRowPermutable)
+{
+    // Inverse permutation: row perm[i] of the result must equal row i of
+    // the input.
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 2, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0},
+                                       {0.0, 1.5, 2.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->inverse_row_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{0.0, 1.5, 2.0},
+                           {1.0, 3.0, 2.0},
+                           {0.0, 5.0, 0.0}}),
+                        0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, NonSquareMatrixIsInverseRowPermutable)
+{
+    // Inverse permutation: row perm[i] of the result must equal row i of
+    // the 2x3 input (for a swap this matches the forward permutation).
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->inverse_row_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{0.0, 5.0, 0.0},
+                           {1.0, 3.0, 2.0}}),
+                        0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, SquareMatrixIsInverseColPermutable)
+{
+    // Column perm[j] of the result must equal column j of the input.
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 2, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0},
+                                       {0.0, 1.5, 2.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->inverse_column_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{2.0, 1.0, 3.0},
+                           {0.0, 0.0, 5.0},
+                           {2.0, 0.0, 1.5}}),
+                        0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Csr, NonSquareMatrixIsInverseColPermutable)
+{
+    // Column perm[j] of the result must equal column j of the 2x3 input.
+    using Mtx = typename TestFixture::Mtx;
+    using IndexType = typename TestFixture::index_type;
+    gko::Array<IndexType> perm{this->exec, {1, 2, 0}};
+    // clang-format off
+    auto input = gko::initialize<Mtx>({{1.0, 3.0, 2.0},
+                                       {0.0, 5.0, 0.0}}, this->exec);
+    // clang-format on
+
+    auto permuted = input->inverse_column_permute(&perm);
+    auto permuted_csr = static_cast<Mtx *>(permuted.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted_csr,
+                        l({{2.0, 1.0, 3.0},
+                           {0.0, 0.0, 5.0}}),
+                        0.0);
     // clang-format on
 }
 
 
-TEST_F(Csr, RecognizeSortedMatrix)
+TYPED_TEST(Csr, RecognizeSortedMatrix)  // three fixtures must report sorted
 {
-    ASSERT_TRUE(mtx->is_sorted_by_column_index());
-    ASSERT_TRUE(mtx2->is_sorted_by_column_index());
-    ASSERT_TRUE(mtx3_sorted->is_sorted_by_column_index());
+    ASSERT_TRUE(this->mtx->is_sorted_by_column_index());
+    ASSERT_TRUE(this->mtx2->is_sorted_by_column_index());
+    ASSERT_TRUE(this->mtx3_sorted->is_sorted_by_column_index());
 }
 
 
-TEST_F(Csr, RecognizeUnsortedMatrix)
+TYPED_TEST(Csr, RecognizeUnsortedMatrix)  // mtx3_unsorted must report false
 {
-    ASSERT_FALSE(mtx3_unsorted->is_sorted_by_column_index());
+    ASSERT_FALSE(this->mtx3_unsorted->is_sorted_by_column_index());
 }
 
 
-TEST_F(Csr, SortSortedMatrix)
+TYPED_TEST(Csr, SortSortedMatrix)  // sorting a sorted matrix changes nothing
 {
-    auto matrix = mtx3_sorted->clone();
+    auto matrix = this->mtx3_sorted->clone();  // sort a copy of the fixture
 
     matrix->sort_by_column_index();
 
-    GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0);
+    GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0);
 }
 
 
-TEST_F(Csr, SortUnsortedMatrix)
+TYPED_TEST(Csr, SortUnsortedMatrix)  // sorted copy must equal mtx3_sorted
 {
-    auto matrix = mtx3_unsorted->clone();
+    auto matrix = this->mtx3_unsorted->clone();  // sort a copy of the fixture
 
     matrix->sort_by_column_index();
 
-    GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0);
+    GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0);
+}
+
+
+template <typename ValueIndexType>
+class CsrComplex : public ::testing::Test {
+protected:
+    // ValueIndexType is a tuple-like type: element 0 is the value type and
+    // element 1 the index type of the Csr matrix under test.
+    using value_type = typename std::tuple_element<0, ValueIndexType>::type;
+    using index_type = typename std::tuple_element<1, ValueIndexType>::type;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+};
+
+TYPED_TEST_CASE(CsrComplex, gko::test::ComplexValueIndexTypes);
+
+
+TYPED_TEST(CsrComplex, MtxIsConjugateTransposable)
+{
+    // Checks that conj_transpose() transposes and conjugates every entry.
+    using Csr = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+
+    auto exec = gko::ReferenceExecutor::create();
+    // clang-format off
+    auto mtx2 = gko::initialize<Csr>(
+        {{T{1.0, 2.0}, T{3.0, 0.0}, T{2.0, 0.0}},
+         {T{0.0, 0.0}, T{5.0, -3.5}, T{0.0, 0.0}},
+         {T{0.0, 0.0}, T{0.0, 1.5}, T{2.0, 0.0}}}, exec);
+    // clang-format on
+
+    auto trans = mtx2->conj_transpose();
+    auto trans_as_csr = static_cast<Csr *>(trans.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(trans_as_csr,
+                        l({{T{1.0, -2.0}, T{0.0, 0.0}, T{0.0, 0.0}},
+                           {T{3.0, 0.0}, T{5.0, 3.5}, T{0.0, -1.5}},
+                           {T{2.0, 0.0}, T{0.0, 0.0}, T{2.0, 0.0}}}), 0.0);
+    // clang-format on
 }
 
 
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index ce24a8ef42a..e851d9d0dfd 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/dense_kernels.hpp"
+#include <ginkgo/core/matrix/dense.hpp>
 
 
 #include <complex>
@@ -43,54 +43,51 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/matrix/dense_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Dense : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
     Dense()
         : exec(gko::ReferenceExecutor::create()),
           mtx1(gko::initialize<Mtx>(4, {{1.0, 2.0, 3.0}, {1.5, 2.5, 3.5}},
                                     exec)),
-          mtx2(gko::initialize<Mtx>({{1.0, -1.0}, {-2.0, 2.0}}, exec)),
+          mtx2(gko::initialize<Mtx>({I<T>({1.0, -1.0}), I<T>({-2.0, 2.0})},
+                                    exec)),
           mtx3(gko::initialize<Mtx>(4, {{1.0, 2.0, 3.0}, {0.5, 1.5, 2.5}},
                                     exec)),
           mtx4(gko::initialize<Mtx>(4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}},
                                     exec)),
           mtx5(gko::initialize<Mtx>(
               {{1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}, exec)),
-          mtx6(gko::initialize<gko::matrix::Dense<std::complex<double>>>(
-              {{1.0 + 2.0 * i, -1.0 + 2.1 * i},
-               {-2.0 + 1.5 * i, 4.5 + 0.0 * i},
-               {1.0 + 0.0 * i, i}},
-              exec)),
-          mtx7(gko::initialize<Mtx>({{1.0, 2.0, 0.0}, {0.0, 1.5, 0.0}}, exec)),
-          mtx8(gko::initialize<Mtx>({{1.0, 2.0, 3.0}, {0.0, 1.5, 0.0}}, exec))
+          mtx6(gko::initialize<Mtx>({{1.0, 2.0, 0.0}, {0.0, 1.5, 0.0}}, exec)),
+          mtx7(gko::initialize<Mtx>({{1.0, 2.0, 3.0}, {0.0, 1.5, 0.0}}, exec))
     {}
 
-    std::complex<double> i{0, 1};
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<gko::matrix::Dense<>> mtx1;
-    std::unique_ptr<gko::matrix::Dense<>> mtx2;
-    std::unique_ptr<gko::matrix::Dense<>> mtx3;
-    std::unique_ptr<gko::matrix::Dense<>> mtx4;
-    std::unique_ptr<gko::matrix::Dense<>> mtx5;
-    std::unique_ptr<gko::matrix::Dense<std::complex<double>>> mtx6;
-    std::unique_ptr<gko::matrix::Dense<>> mtx7;
-    std::unique_ptr<gko::matrix::Dense<>> mtx8;
+    std::unique_ptr<Mtx> mtx1;
+    std::unique_ptr<Mtx> mtx2;
+    std::unique_ptr<Mtx> mtx3;
+    std::unique_ptr<Mtx> mtx4;
+    std::unique_ptr<Mtx> mtx5;
+    std::unique_ptr<Mtx> mtx6;
+    std::unique_ptr<Mtx> mtx7;
 
     std::ranlux48 rand_engine;
 
@@ -99,192 +96,324 @@ class Dense : public ::testing::Test {
     {
         return gko::test::generate_random_matrix<MtxType>(
             num_rows, num_cols,
-            std::uniform_int_distribution<>(num_cols, num_cols),
-            std::normal_distribution<>(0.0, 1.0), rand_engine, exec);
+            std::uniform_int_distribution<gko::size_type>(num_cols, num_cols),
+            std::normal_distribution<gko::remove_complex<value_type>>(0.0, 1.0),
+            rand_engine, exec);
     }
 };
 
 
-TEST_F(Dense, AppliesToDense)
-{
-    mtx2->apply(mtx1.get(), mtx3.get());
+TYPED_TEST_CASE(Dense, gko::test::ValueTypes);
+
 
-    EXPECT_EQ(mtx3->at(0, 0), -0.5);
-    EXPECT_EQ(mtx3->at(0, 1), -0.5);
-    EXPECT_EQ(mtx3->at(0, 2), -0.5);
-    EXPECT_EQ(mtx3->at(1, 0), 1.0);
-    EXPECT_EQ(mtx3->at(1, 1), 1.0);
-    ASSERT_EQ(mtx3->at(1, 2), 1.0);
+TYPED_TEST(Dense, AppliesToDense)
+{
+    using T = typename TestFixture::value_type;
+    this->mtx2->apply(this->mtx1.get(), this->mtx3.get());
+
+    EXPECT_EQ(this->mtx3->at(0, 0), T{-0.5});
+    EXPECT_EQ(this->mtx3->at(0, 1), T{-0.5});
+    EXPECT_EQ(this->mtx3->at(0, 2), T{-0.5});
+    EXPECT_EQ(this->mtx3->at(1, 0), T{1.0});
+    EXPECT_EQ(this->mtx3->at(1, 1), T{1.0});
+    ASSERT_EQ(this->mtx3->at(1, 2), T{1.0});
 }
 
 
-TEST_F(Dense, AppliesLinearCombinationToDense)
+TYPED_TEST(Dense, AppliesLinearCombinationToDense)
 {
-    auto alpha = gko::initialize<Mtx>({-1.0}, exec);
-    auto beta = gko::initialize<Mtx>({2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({2.0}, this->exec);
+
+    this->mtx2->apply(alpha.get(), this->mtx1.get(), beta.get(),
+                      this->mtx3.get());
+
+    EXPECT_EQ(this->mtx3->at(0, 0), T{2.5});
+    EXPECT_EQ(this->mtx3->at(0, 1), T{4.5});
+    EXPECT_EQ(this->mtx3->at(0, 2), T{6.5});
+    EXPECT_EQ(this->mtx3->at(1, 0), T{0.0});
+    EXPECT_EQ(this->mtx3->at(1, 1), T{2.0});
+    ASSERT_EQ(this->mtx3->at(1, 2), T{4.0});
+}
+
 
-    mtx2->apply(alpha.get(), mtx1.get(), beta.get(), mtx3.get());
+TYPED_TEST(Dense, ApplyFailsOnWrongInnerDimension)
+{
+    using Mtx = typename TestFixture::Mtx;
+    auto res = Mtx::create(this->exec, gko::dim<2>{2});
 
-    EXPECT_EQ(mtx3->at(0, 0), 2.5);
-    EXPECT_EQ(mtx3->at(0, 1), 4.5);
-    EXPECT_EQ(mtx3->at(0, 2), 6.5);
-    EXPECT_EQ(mtx3->at(1, 0), 0.0);
-    EXPECT_EQ(mtx3->at(1, 1), 2.0);
-    ASSERT_EQ(mtx3->at(1, 2), 4.0);
+    ASSERT_THROW(this->mtx2->apply(this->mtx1.get(), res.get()),
+                 gko::DimensionMismatch);
 }
 
 
-TEST_F(Dense, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfRows)
 {
-    auto res = gko::matrix::Dense<>::create(exec, gko::dim<2>{2});
+    using Mtx = typename TestFixture::Mtx;
+    auto res = Mtx::create(this->exec, gko::dim<2>{3});
 
-    ASSERT_THROW(mtx2->apply(mtx1.get(), res.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(this->mtx2.get(), res.get()),
+                 gko::DimensionMismatch);
 }
 
 
-TEST_F(Dense, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfCols)
 {
-    auto res = gko::matrix::Dense<>::create(exec, gko::dim<2>{3});
+    using Mtx = typename TestFixture::Mtx;
+    auto res = Mtx::create(this->exec, gko::dim<2>{2}, 3);
 
-    ASSERT_THROW(mtx1->apply(mtx2.get(), res.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(this->mtx2.get(), res.get()),
+                 gko::DimensionMismatch);
 }
 
 
-TEST_F(Dense, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(Dense, ScalesData)
 {
-    auto res = gko::matrix::Dense<>::create(exec, gko::dim<2>{2}, 3);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({I<T>{2.0, -2.0}}, this->exec);
+
+    this->mtx2->scale(alpha.get());
 
-    ASSERT_THROW(mtx1->apply(mtx2.get(), res.get()), gko::DimensionMismatch);
+    EXPECT_EQ(this->mtx2->at(0, 0), T{2.0});
+    EXPECT_EQ(this->mtx2->at(0, 1), T{2.0});
+    EXPECT_EQ(this->mtx2->at(1, 0), T{-4.0});
+    EXPECT_EQ(this->mtx2->at(1, 1), T{-4.0});
 }
 
 
-TEST_F(Dense, ScalesData)
+TYPED_TEST(Dense, ScalesDataWithScalar)
 {
-    auto alpha = gko::initialize<Mtx>({{2.0, -2.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
 
-    mtx2->scale(alpha.get());
+    this->mtx2->scale(alpha.get());
 
-    EXPECT_EQ(mtx2->at(0, 0), 2.0);
-    EXPECT_EQ(mtx2->at(0, 1), 2.0);
-    EXPECT_EQ(mtx2->at(1, 0), -4.0);
-    EXPECT_EQ(mtx2->at(1, 1), -4.0);
+    EXPECT_EQ(this->mtx2->at(0, 0), T{2.0});
+    EXPECT_EQ(this->mtx2->at(0, 1), T{-2.0});
+    EXPECT_EQ(this->mtx2->at(1, 0), T{-4.0});
+    EXPECT_EQ(this->mtx2->at(1, 1), T{4.0});
 }
 
 
-TEST_F(Dense, ScalesDataWithScalar)
+TYPED_TEST(Dense, ScalesDataWithStride)
 {
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({{-1.0, 1.0, 2.0}}, this->exec);
+
+    this->mtx1->scale(alpha.get());
+
+    EXPECT_EQ(this->mtx1->at(0, 0), T{-1.0});
+    EXPECT_EQ(this->mtx1->at(0, 1), T{2.0});
+    EXPECT_EQ(this->mtx1->at(0, 2), T{6.0});
+    EXPECT_EQ(this->mtx1->at(1, 0), T{-1.5});
+    EXPECT_EQ(this->mtx1->at(1, 1), T{2.5});
+    ASSERT_EQ(this->mtx1->at(1, 2), T{7.0});
+}
 
-    mtx2->scale(alpha.get());
 
-    EXPECT_EQ(mtx2->at(0, 0), 2.0);
-    EXPECT_EQ(mtx2->at(0, 1), -2.0);
-    EXPECT_EQ(mtx2->at(1, 0), -4.0);
-    EXPECT_EQ(mtx2->at(1, 1), 4.0);
+TYPED_TEST(Dense, AddsScaled)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({{2.0, 1.0, -2.0}}, this->exec);
+
+    this->mtx1->add_scaled(alpha.get(), this->mtx3.get());
+
+    EXPECT_EQ(this->mtx1->at(0, 0), T{3.0});
+    EXPECT_EQ(this->mtx1->at(0, 1), T{4.0});
+    EXPECT_EQ(this->mtx1->at(0, 2), T{-3.0});
+    EXPECT_EQ(this->mtx1->at(1, 0), T{2.5});
+    EXPECT_EQ(this->mtx1->at(1, 1), T{4.0});
+    ASSERT_EQ(this->mtx1->at(1, 2), T{-1.5});
 }
 
 
-TEST_F(Dense, ScalesDataWithStride)
+TYPED_TEST(Dense, AddsScaledWithScalar)
 {
-    auto alpha = gko::initialize<Mtx>({{-1.0, 1.0, 2.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+
+    this->mtx1->add_scaled(alpha.get(), this->mtx3.get());
+
+    EXPECT_EQ(this->mtx1->at(0, 0), T{3.0});
+    EXPECT_EQ(this->mtx1->at(0, 1), T{6.0});
+    EXPECT_EQ(this->mtx1->at(0, 2), T{9.0});
+    EXPECT_EQ(this->mtx1->at(1, 0), T{2.5});
+    EXPECT_EQ(this->mtx1->at(1, 1), T{5.5});
+    ASSERT_EQ(this->mtx1->at(1, 2), T{8.5});
+}
+
 
-    mtx1->scale(alpha.get());
+TYPED_TEST(Dense, AddScaledFailsOnWrongSizes)
+{
+    using Mtx = typename TestFixture::Mtx;
+    auto alpha = Mtx::create(this->exec, gko::dim<2>{1, 2});
 
-    EXPECT_EQ(mtx1->at(0, 0), -1.0);
-    EXPECT_EQ(mtx1->at(0, 1), 2.0);
-    EXPECT_EQ(mtx1->at(0, 2), 6.0);
-    EXPECT_EQ(mtx1->at(1, 0), -1.5);
-    EXPECT_EQ(mtx1->at(1, 1), 2.5);
-    ASSERT_EQ(mtx1->at(1, 2), 7.0);
+    ASSERT_THROW(this->mtx1->add_scaled(alpha.get(), this->mtx2.get()),
+                 gko::DimensionMismatch);
 }
 
 
-TEST_F(Dense, AddsScaled)
+TYPED_TEST(Dense, ComputesDot)
 {
-    auto alpha = gko::initialize<Mtx>({{2.0, 1.0, -2.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto result = Mtx::create(this->exec, gko::dim<2>{1, 3});
 
-    mtx1->add_scaled(alpha.get(), mtx3.get());
+    this->mtx1->compute_dot(this->mtx3.get(), result.get());
 
-    EXPECT_EQ(mtx1->at(0, 0), 3.0);
-    EXPECT_EQ(mtx1->at(0, 1), 4.0);
-    EXPECT_EQ(mtx1->at(0, 2), -3.0);
-    EXPECT_EQ(mtx1->at(1, 0), 2.5);
-    EXPECT_EQ(mtx1->at(1, 1), 4.0);
-    ASSERT_EQ(mtx1->at(1, 2), -1.5);
+    EXPECT_EQ(result->at(0, 0), T{1.75});
+    EXPECT_EQ(result->at(0, 1), T{7.75});
+    ASSERT_EQ(result->at(0, 2), T{17.75});
 }
 
 
-TEST_F(Dense, AddsScaledWithScalar)
+TYPED_TEST(Dense, ComputesNorm2)
 {
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using T_nc = gko::remove_complex<T>;
+    using NormVector = gko::matrix::Dense<T_nc>;
+    auto mtx(gko::initialize<Mtx>(
+        {I<T>{1.0, 0.0}, I<T>{2.0, 3.0}, I<T>{2.0, 4.0}}, this->exec));
+    auto result = NormVector::create(this->exec, gko::dim<2>{1, 2});
 
-    mtx1->add_scaled(alpha.get(), mtx3.get());
+    mtx->compute_norm2(result.get());
 
-    EXPECT_EQ(mtx1->at(0, 0), 3.0);
-    EXPECT_EQ(mtx1->at(0, 1), 6.0);
-    EXPECT_EQ(mtx1->at(0, 2), 9.0);
-    EXPECT_EQ(mtx1->at(1, 0), 2.5);
-    EXPECT_EQ(mtx1->at(1, 1), 5.5);
-    ASSERT_EQ(mtx1->at(1, 2), 8.5);
+    EXPECT_EQ(result->at(0, 0), T_nc{3.0});
+    EXPECT_EQ(result->at(0, 1), T_nc{5.0});
 }
 
 
-TEST_F(Dense, AddScaledFailsOnWrongSizes)
+TYPED_TEST(Dense, ComputDotFailsOnWrongInputSize)
 {
-    auto alpha = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 2});
+    using Mtx = typename TestFixture::Mtx;
+    auto result = Mtx::create(this->exec, gko::dim<2>{1, 3});
 
-    ASSERT_THROW(mtx1->add_scaled(alpha.get(), mtx2.get()),
+    ASSERT_THROW(this->mtx1->compute_dot(this->mtx2.get(), result.get()),
                  gko::DimensionMismatch);
 }
 
 
-TEST_F(Dense, ComputesDot)
+TYPED_TEST(Dense, ComputDotFailsOnWrongResultSize)
 {
-    auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 3});
-
-    mtx1->compute_dot(mtx3.get(), result.get());
+    using Mtx = typename TestFixture::Mtx;
+    auto result = Mtx::create(this->exec, gko::dim<2>{1, 2});
 
-    EXPECT_EQ(result->at(0, 0), 1.75);
-    EXPECT_EQ(result->at(0, 1), 7.75);
-    ASSERT_EQ(result->at(0, 2), 17.75);
+    ASSERT_THROW(this->mtx1->compute_dot(this->mtx3.get(), result.get()),
+                 gko::DimensionMismatch);
 }
 
 
-TEST_F(Dense, ComputesNorm2)
+TYPED_TEST(Dense, ConvertsToPrecision)
 {
-    auto mtx(gko::initialize<Mtx>({{1.0, 0.0}, {2.0, 3.0}, {2.0, 4.0}}, exec));
-    auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 2});
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherDense = typename gko::matrix::Dense<OtherT>;
+    auto tmp = OtherDense::create(this->exec);
+    auto res = Dense::create(this->exec);
+    // Round-trip tolerance: 0 if r<OtherT> < r<T>, otherwise r<OtherT>
+    auto residual = r<OtherT>::value < r<T>::value
+                        ? gko::remove_complex<T>{0}
+                        : gko::remove_complex<T>{r<OtherT>::value};
+
+    this->mtx1->convert_to(tmp.get());  // T -> OtherT
+    tmp->convert_to(res.get());  // OtherT -> T
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
+}
 
-    mtx->compute_norm2(result.get());
 
-    EXPECT_EQ(result->at(0, 0), 3.0);
-    EXPECT_EQ(result->at(0, 1), 5.0);
+TYPED_TEST(Dense, MovesToPrecision)
+{
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherDense = typename gko::matrix::Dense<OtherT>;
+    auto tmp = OtherDense::create(this->exec);
+    auto res = Dense::create(this->exec);
+    // Round-trip tolerance: 0 if r<OtherT> < r<T>, otherwise r<OtherT>
+    auto residual = r<OtherT>::value < r<T>::value
+                        ? gko::remove_complex<T>{0}
+                        : gko::remove_complex<T>{r<OtherT>::value};
+
+    this->mtx1->move_to(tmp.get());  // NOTE(review): mtx1 is still read below
+    tmp->move_to(res.get());  // OtherT -> T
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
 }
 
 
-TEST_F(Dense, ComputDotFailsOnWrongInputSize)
+TYPED_TEST(Dense, ConvertsToCoo32)
 {
-    auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 3});
+    using T = typename TestFixture::value_type;
+    using Coo = typename gko::matrix::Coo<T, gko::int32>;
+    auto coo_mtx = Coo::create(this->mtx4->get_executor());
 
-    ASSERT_THROW(mtx1->compute_dot(mtx2.get(), result.get()),
-                 gko::DimensionMismatch);
+    this->mtx4->convert_to(coo_mtx.get());
+    auto v = coo_mtx->get_const_values();
+    auto c = coo_mtx->get_const_col_idxs();
+    auto r = coo_mtx->get_const_row_idxs();
+
+    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 0);
+    EXPECT_EQ(r[2], 0);
+    EXPECT_EQ(r[3], 1);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, ComputDotFailsOnWrongResultSize)
+TYPED_TEST(Dense, MovesToCoo32)
 {
-    auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 2});
+    using T = typename TestFixture::value_type;
+    using Coo = typename gko::matrix::Coo<T, gko::int32>;
+    auto coo_mtx = Coo::create(this->mtx4->get_executor());
 
-    ASSERT_THROW(mtx1->compute_dot(mtx3.get(), result.get()),
-                 gko::DimensionMismatch);
+    this->mtx4->move_to(coo_mtx.get());
+    auto v = coo_mtx->get_const_values();
+    auto c = coo_mtx->get_const_col_idxs();
+    auto r = coo_mtx->get_const_row_idxs();
+
+    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 0);
+    EXPECT_EQ(r[2], 0);
+    EXPECT_EQ(r[3], 1);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, ConvertsToCoo)
+TYPED_TEST(Dense, ConvertsToCoo64)
 {
-    auto coo_mtx = gko::matrix::Coo<>::create(mtx4->get_executor());
+    using T = typename TestFixture::value_type;
+    using Coo = typename gko::matrix::Coo<T, gko::int64>;
+    auto coo_mtx = Coo::create(this->mtx4->get_executor());
 
-    mtx4->convert_to(coo_mtx.get());
+    this->mtx4->convert_to(coo_mtx.get());
     auto v = coo_mtx->get_const_values();
     auto c = coo_mtx->get_const_col_idxs();
     auto r = coo_mtx->get_const_row_idxs();
@@ -299,18 +428,20 @@ TEST_F(Dense, ConvertsToCoo)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, MovesToCoo)
+TYPED_TEST(Dense, MovesToCoo64)
 {
-    auto coo_mtx = gko::matrix::Coo<>::create(mtx4->get_executor());
+    using T = typename TestFixture::value_type;
+    using Coo = typename gko::matrix::Coo<T, gko::int64>;
+    auto coo_mtx = Coo::create(this->mtx4->get_executor());
 
-    mtx4->move_to(coo_mtx.get());
+    this->mtx4->move_to(coo_mtx.get());
     auto v = coo_mtx->get_const_values();
     auto c = coo_mtx->get_const_col_idxs();
     auto r = coo_mtx->get_const_row_idxs();
@@ -325,50 +456,93 @@ TEST_F(Dense, MovesToCoo)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, ConvertsEmptyMatrixToCsr)
+TYPED_TEST(Dense, ConvertsToCsr32)
 {
-    auto strategy = std::make_shared<gko::matrix::Csr<>::load_balance>(0);
-    auto from_mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{0, 0});
-    auto to_mtx =
-        gko::matrix::Csr<>::create(exec, gko::dim<2>{0, 0}, 0, strategy);
+    using T = typename TestFixture::value_type;
+    using Csr = typename gko::matrix::Csr<T, gko::int32>;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
 
-    from_mtx->convert_to(to_mtx.get());
+    this->mtx4->convert_to(csr_mtx_c.get());
+    this->mtx4->convert_to(csr_mtx_m.get());
 
-    ASSERT_FALSE(to_mtx->get_size());
+    auto v = csr_mtx_c->get_const_values();
+    auto c = csr_mtx_c->get_const_col_idxs();
+    auto r = csr_mtx_c->get_const_row_ptrs();
+    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 4);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Dense, MovesEmptyMatrixToCsr)
+TYPED_TEST(Dense, MovesToCsr32)
 {
-    auto strategy = std::make_shared<gko::matrix::Csr<>::load_balance>(0);
-    auto from_mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{0, 0});
-    auto to_mtx =
-        gko::matrix::Csr<>::create(exec, gko::dim<2>{0, 0}, 0, strategy);
-
-    from_mtx->move_to(to_mtx.get());
+    using T = typename TestFixture::value_type;
+    using Csr = typename gko::matrix::Csr<T, gko::int32>;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx4->clone();
+
+    this->mtx4->move_to(csr_mtx_c.get());
+    mtx_clone->move_to(csr_mtx_m.get());
 
-    ASSERT_FALSE(to_mtx->get_size());
+    auto v = csr_mtx_c->get_const_values();
+    auto c = csr_mtx_c->get_const_col_idxs();
+    auto r = csr_mtx_c->get_const_row_ptrs();
+    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 4);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Dense, ConvertsToCsr)
+TYPED_TEST(Dense, ConvertsToCsr64)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_merge);
+    using T = typename TestFixture::value_type;
+    using Csr = typename gko::matrix::Csr<T, gko::int64>;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
 
-    mtx4->convert_to(csr_mtx_c.get());
-    mtx4->convert_to(csr_mtx_m.get());
+    this->mtx4->convert_to(csr_mtx_c.get());
+    this->mtx4->convert_to(csr_mtx_m.get());
 
     auto v = csr_mtx_c->get_const_values();
     auto c = csr_mtx_c->get_const_col_idxs();
@@ -382,27 +556,27 @@ TEST_F(Dense, ConvertsToCsr)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
     GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Dense, MovesToCsr)
+TYPED_TEST(Dense, MovesToCsr64)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_merge);
-    auto mtx_clone = mtx4->clone();
-
-    mtx4->move_to(csr_mtx_c.get());
+    using T = typename TestFixture::value_type;
+    using Csr = typename gko::matrix::Csr<T, gko::int64>;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx4->clone();
+
+    this->mtx4->move_to(csr_mtx_c.get());
     mtx_clone->move_to(csr_mtx_m.get());
 
     auto v = csr_mtx_c->get_const_values();
@@ -417,22 +591,23 @@ TEST_F(Dense, MovesToCsr)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
     GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Dense, ConvertsToSparsityCsr)
+TYPED_TEST(Dense, ConvertsToSparsityCsr32)
 {
-    auto sparsity_csr_mtx =
-        gko::matrix::SparsityCsr<>::create(mtx4->get_executor());
+    using T = typename TestFixture::value_type;
+    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int32>;
+    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    mtx4->convert_to(sparsity_csr_mtx.get());
+    this->mtx4->convert_to(sparsity_csr_mtx.get());
     auto v = sparsity_csr_mtx->get_const_value();
     auto c = sparsity_csr_mtx->get_const_col_idxs();
     auto r = sparsity_csr_mtx->get_const_row_ptrs();
@@ -446,16 +621,17 @@ TEST_F(Dense, ConvertsToSparsityCsr)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
+    EXPECT_EQ(v[0], T{1.0});
 }
 
 
-TEST_F(Dense, MovesToSparsityCsr)
+TYPED_TEST(Dense, MovesToSparsityCsr32)
 {
-    auto sparsity_csr_mtx =
-        gko::matrix::SparsityCsr<>::create(mtx4->get_executor());
+    using T = typename TestFixture::value_type;
+    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int32>;
+    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    mtx4->move_to(sparsity_csr_mtx.get());
+    this->mtx4->move_to(sparsity_csr_mtx.get());
     auto v = sparsity_csr_mtx->get_const_value();
     auto c = sparsity_csr_mtx->get_const_col_idxs();
     auto r = sparsity_csr_mtx->get_const_row_ptrs();
@@ -469,15 +645,115 @@ TEST_F(Dense, MovesToSparsityCsr)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
+    EXPECT_EQ(v[0], T{1.0});
 }
 
 
-TEST_F(Dense, ConvertsToEll)
+TYPED_TEST(Dense, ConvertsToSparsityCsr64)
 {
-    auto ell_mtx = gko::matrix::Ell<>::create(mtx7->get_executor());
+    using T = typename TestFixture::value_type;
+    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int64>;
+    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
+
+    this->mtx4->convert_to(sparsity_csr_mtx.get());
+    auto v = sparsity_csr_mtx->get_const_value();
+    auto c = sparsity_csr_mtx->get_const_col_idxs();
+    auto r = sparsity_csr_mtx->get_const_row_ptrs();
+
+    ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 4);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], T{1.0});
+}
 
-    mtx7->convert_to(ell_mtx.get());
+
+TYPED_TEST(Dense, MovesToSparsityCsr64)
+{
+    using ValueType = typename TestFixture::value_type;
+    using Pattern = gko::matrix::SparsityCsr<ValueType, gko::int64>;
+    auto result = Pattern::create(this->mtx4->get_executor());
+
+    this->mtx4->move_to(result.get());
+    const auto value = result->get_const_value();
+    const auto col_idxs = result->get_const_col_idxs();
+    const auto row_ptrs = result->get_const_row_ptrs();
+    // Expected result: 2x3 pattern, 4 nonzeros, 64-bit indices, value 1.
+    ASSERT_EQ(result->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(result->get_num_nonzeros(), 4);
+    EXPECT_EQ(row_ptrs[0], 0);
+    EXPECT_EQ(row_ptrs[1], 3);
+    EXPECT_EQ(row_ptrs[2], 4);
+    EXPECT_EQ(col_idxs[0], 0);
+    EXPECT_EQ(col_idxs[1], 1);
+    EXPECT_EQ(col_idxs[2], 2);
+    EXPECT_EQ(col_idxs[3], 1);
+    EXPECT_EQ(value[0], ValueType{1.0});
+}
+
+
+TYPED_TEST(Dense, ConvertsToEll32)
+{
+    using ValueType = typename TestFixture::value_type;
+    using EllMtx = gko::matrix::Ell<ValueType, gko::int32>;
+    auto result = EllMtx::create(this->mtx6->get_executor());
+
+    this->mtx6->convert_to(result.get());
+    const auto vals = result->get_const_values();
+    const auto cols = result->get_const_col_idxs();
+    // Expected ELL result: 2x3, 2 stored entries per row, stride 2.
+    ASSERT_EQ(result->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(result->get_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(result->get_num_stored_elements(), 4);
+    ASSERT_EQ(result->get_stride(), 2);
+    EXPECT_EQ(cols[0], 0);
+    EXPECT_EQ(cols[1], 1);
+    EXPECT_EQ(cols[2], 1);
+    EXPECT_EQ(cols[3], 0);
+    EXPECT_EQ(vals[0], ValueType{1.0});
+    EXPECT_EQ(vals[1], ValueType{1.5});
+    EXPECT_EQ(vals[2], ValueType{2.0});
+    EXPECT_EQ(vals[3], ValueType{0.0});
+}
+
+
+TYPED_TEST(Dense, MovesToEll32)
+{
+    using ValueType = typename TestFixture::value_type;
+    using EllMtx = gko::matrix::Ell<ValueType, gko::int32>;
+    auto result = EllMtx::create(this->mtx6->get_executor());
+
+    this->mtx6->move_to(result.get());
+    const auto vals = result->get_const_values();
+    const auto cols = result->get_const_col_idxs();
+    // Same expectations as the convert_to variant, reached via move_to.
+    ASSERT_EQ(result->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(result->get_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(result->get_num_stored_elements(), 4);
+    ASSERT_EQ(result->get_stride(), 2);
+    EXPECT_EQ(cols[0], 0);
+    EXPECT_EQ(cols[1], 1);
+    EXPECT_EQ(cols[2], 1);
+    EXPECT_EQ(cols[3], 0);
+    EXPECT_EQ(vals[0], ValueType{1.0});
+    EXPECT_EQ(vals[1], ValueType{1.5});
+    EXPECT_EQ(vals[2], ValueType{2.0});
+    EXPECT_EQ(vals[3], ValueType{0.0});
+}
+
+
+TYPED_TEST(Dense, ConvertsToEll64)
+{
+    using T = typename TestFixture::value_type;
+    using Ell = typename gko::matrix::Ell<T, gko::int64>;
+    auto ell_mtx = Ell::create(this->mtx6->get_executor());
+
+    this->mtx6->convert_to(ell_mtx.get());
     auto v = ell_mtx->get_const_values();
     auto c = ell_mtx->get_const_col_idxs();
 
@@ -489,18 +765,20 @@ TEST_F(Dense, ConvertsToEll)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 1);
     EXPECT_EQ(c[3], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{0.0});
 }
 
 
-TEST_F(Dense, MovesToEll)
+TYPED_TEST(Dense, MovesToEll64)
 {
-    auto ell_mtx = gko::matrix::Ell<>::create(mtx7->get_executor());
+    using T = typename TestFixture::value_type;
+    using Ell = typename gko::matrix::Ell<T, gko::int64>;
+    auto ell_mtx = Ell::create(this->mtx6->get_executor());
 
-    mtx7->move_to(ell_mtx.get());
+    this->mtx6->move_to(ell_mtx.get());
     auto v = ell_mtx->get_const_values();
     auto c = ell_mtx->get_const_col_idxs();
 
@@ -512,19 +790,20 @@ TEST_F(Dense, MovesToEll)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 1);
     EXPECT_EQ(c[3], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{0.0});
 }
 
 
-TEST_F(Dense, ConvertsToEllWithStride)
+TYPED_TEST(Dense, ConvertsToEllWithStride)
 {
-    auto ell_mtx =
-        gko::matrix::Ell<>::create(mtx7->get_executor(), gko::dim<2>{}, 0, 3);
+    using T = typename TestFixture::value_type;
+    using Ell = typename gko::matrix::Ell<T, gko::int32>;
+    auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{}, 0, 3);
 
-    mtx7->convert_to(ell_mtx.get());
+    this->mtx6->convert_to(ell_mtx.get());
     auto v = ell_mtx->get_const_values();
     auto c = ell_mtx->get_const_col_idxs();
 
@@ -538,21 +817,22 @@ TEST_F(Dense, ConvertsToEllWithStride)
     EXPECT_EQ(c[3], 1);
     EXPECT_EQ(c[4], 0);
     EXPECT_EQ(c[5], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[2], 0.0);
-    EXPECT_EQ(v[3], 2.0);
-    EXPECT_EQ(v[4], 0.0);
-    EXPECT_EQ(v[5], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[2], T{0.0});
+    EXPECT_EQ(v[3], T{2.0});
+    EXPECT_EQ(v[4], T{0.0});
+    EXPECT_EQ(v[5], T{0.0});
 }
 
 
-TEST_F(Dense, MovesToEllWithStride)
+TYPED_TEST(Dense, MovesToEllWithStride)
 {
-    auto ell_mtx =
-        gko::matrix::Ell<>::create(mtx7->get_executor(), gko::dim<2>{}, 0, 3);
+    using T = typename TestFixture::value_type;
+    using Ell = typename gko::matrix::Ell<T, gko::int32>;
+    auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{}, 0, 3);
 
-    mtx7->move_to(ell_mtx.get());
+    this->mtx6->move_to(ell_mtx.get());
     auto v = ell_mtx->get_const_values();
     auto c = ell_mtx->get_const_col_idxs();
 
@@ -566,20 +846,88 @@ TEST_F(Dense, MovesToEllWithStride)
     EXPECT_EQ(c[3], 1);
     EXPECT_EQ(c[4], 0);
     EXPECT_EQ(c[5], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[2], 0.0);
-    EXPECT_EQ(v[3], 2.0);
-    EXPECT_EQ(v[4], 0.0);
-    EXPECT_EQ(v[5], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[2], T{0.0});
+    EXPECT_EQ(v[3], T{2.0});
+    EXPECT_EQ(v[4], T{0.0});
+    EXPECT_EQ(v[5], T{0.0});
+}
+
+
+TYPED_TEST(Dense, MovesToHybridAutomatically32)
+{
+    using ValueType = typename TestFixture::value_type;
+    using HybridMtx = gko::matrix::Hybrid<ValueType, gko::int32>;
+    auto result = HybridMtx::create(this->mtx4->get_executor());
+
+    this->mtx4->move_to(result.get());
+    const auto coo_vals = result->get_const_coo_values();
+    const auto coo_cols = result->get_const_coo_col_idxs();
+    const auto coo_rows = result->get_const_coo_row_idxs();
+    const auto ell_width = result->get_ell_num_stored_elements_per_row();
+    const auto ell_stride = result->get_ell_stride();
+    // Expected split: nothing stored in the ELL part, 4 entries in COO.
+    ASSERT_EQ(result->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(result->get_ell_num_stored_elements(), 0);
+    ASSERT_EQ(result->get_coo_num_stored_elements(), 4);
+    EXPECT_EQ(ell_width, 0);
+    EXPECT_EQ(ell_stride, 2);
+    EXPECT_EQ(coo_rows[0], 0);
+    EXPECT_EQ(coo_rows[1], 0);
+    EXPECT_EQ(coo_rows[2], 0);
+    EXPECT_EQ(coo_rows[3], 1);
+    EXPECT_EQ(coo_cols[0], 0);
+    EXPECT_EQ(coo_cols[1], 1);
+    EXPECT_EQ(coo_cols[2], 2);
+    EXPECT_EQ(coo_cols[3], 1);
+    EXPECT_EQ(coo_vals[0], ValueType{1.0});
+    EXPECT_EQ(coo_vals[1], ValueType{3.0});
+    EXPECT_EQ(coo_vals[2], ValueType{2.0});
+    EXPECT_EQ(coo_vals[3], ValueType{5.0});
+}
+
+
+TYPED_TEST(Dense, ConvertsToHybridAutomatically32)
+{
+    using ValueType = typename TestFixture::value_type;
+    using HybridMtx = gko::matrix::Hybrid<ValueType, gko::int32>;
+    auto result = HybridMtx::create(this->mtx4->get_executor());
+
+    this->mtx4->convert_to(result.get());
+    const auto coo_vals = result->get_const_coo_values();
+    const auto coo_cols = result->get_const_coo_col_idxs();
+    const auto coo_rows = result->get_const_coo_row_idxs();
+    const auto ell_width = result->get_ell_num_stored_elements_per_row();
+    const auto ell_stride = result->get_ell_stride();
+    // Expected split: nothing stored in the ELL part, 4 entries in COO.
+    ASSERT_EQ(result->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(result->get_ell_num_stored_elements(), 0);
+    ASSERT_EQ(result->get_coo_num_stored_elements(), 4);
+    EXPECT_EQ(ell_width, 0);
+    EXPECT_EQ(ell_stride, 2);
+    EXPECT_EQ(coo_rows[0], 0);
+    EXPECT_EQ(coo_rows[1], 0);
+    EXPECT_EQ(coo_rows[2], 0);
+    EXPECT_EQ(coo_rows[3], 1);
+    EXPECT_EQ(coo_cols[0], 0);
+    EXPECT_EQ(coo_cols[1], 1);
+    EXPECT_EQ(coo_cols[2], 2);
+    EXPECT_EQ(coo_cols[3], 1);
+    EXPECT_EQ(coo_vals[0], ValueType{1.0});
+    EXPECT_EQ(coo_vals[1], ValueType{3.0});
+    EXPECT_EQ(coo_vals[2], ValueType{2.0});
+    EXPECT_EQ(coo_vals[3], ValueType{5.0});
 }
 
 
-TEST_F(Dense, MovesToHybridAutomatically)
+TYPED_TEST(Dense, MovesToHybridAutomatically64)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor());
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int64>;
+    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
 
-    mtx4->move_to(hybrid_mtx.get());
+    this->mtx4->move_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_coo_values();
     auto c = hybrid_mtx->get_const_coo_col_idxs();
     auto r = hybrid_mtx->get_const_coo_row_idxs();
@@ -599,18 +947,20 @@ TEST_F(Dense, MovesToHybridAutomatically)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, ConvertsToHybridAutomatically)
+TYPED_TEST(Dense, ConvertsToHybridAutomatically64)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor());
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int64>;
+    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
 
-    mtx4->convert_to(hybrid_mtx.get());
+    this->mtx4->convert_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_coo_values();
     auto c = hybrid_mtx->get_const_coo_col_idxs();
     auto r = hybrid_mtx->get_const_coo_row_idxs();
@@ -630,19 +980,21 @@ TEST_F(Dense, ConvertsToHybridAutomatically)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, MovesToHybridWithStrideAutomatically)
+TYPED_TEST(Dense, MovesToHybridWithStrideAutomatically)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor(),
-                                                    gko::dim<2>{}, 0, 3);
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3);
 
-    mtx4->move_to(hybrid_mtx.get());
+    this->mtx4->move_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_coo_values();
     auto c = hybrid_mtx->get_const_coo_col_idxs();
     auto r = hybrid_mtx->get_const_coo_row_idxs();
@@ -662,19 +1014,21 @@ TEST_F(Dense, MovesToHybridWithStrideAutomatically)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, ConvertsToHybridWithStrideAutomatically)
+TYPED_TEST(Dense, ConvertsToHybridWithStrideAutomatically)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor(),
-                                                    gko::dim<2>{}, 0, 3);
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3);
 
-    mtx4->convert_to(hybrid_mtx.get());
+    this->mtx4->convert_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_coo_values();
     auto c = hybrid_mtx->get_const_coo_col_idxs();
     auto r = hybrid_mtx->get_const_coo_row_idxs();
@@ -694,20 +1048,22 @@ TEST_F(Dense, ConvertsToHybridWithStrideAutomatically)
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 2);
     EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 3.0);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 5.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{3.0});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{5.0});
 }
 
 
-TEST_F(Dense, MovesToHybridWithStrideAndCooLengthByColumns2)
+TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(
-        mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3,
-        std::make_shared<gko::matrix::Hybrid<>::column_limit>(2));
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3,
+                       std::make_shared<typename Hybrid::column_limit>(2));
 
-    mtx4->move_to(hybrid_mtx.get());
+    this->mtx4->move_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_ell_values();
     auto c = hybrid_mtx->get_const_ell_col_idxs();
     auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
@@ -724,15 +1080,15 @@ TEST_F(Dense, MovesToHybridWithStrideAndCooLengthByColumns2)
     EXPECT_EQ(c[3], 1);
     EXPECT_EQ(c[4], 0);
     EXPECT_EQ(c[5], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 5.0);
-    EXPECT_EQ(v[2], 0.0);
-    EXPECT_EQ(v[3], 3.0);
-    EXPECT_EQ(v[4], 0.0);
-    EXPECT_EQ(v[5], 0.0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], 2.0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], 0.0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{5.0});
+    EXPECT_EQ(v[2], T{0.0});
+    EXPECT_EQ(v[3], T{3.0});
+    EXPECT_EQ(v[4], T{0.0});
+    EXPECT_EQ(v[5], T{0.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], T{0.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], T{0.0});
     EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2);
     EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[1], 0);
     EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[2], 0);
@@ -742,13 +1098,15 @@ TEST_F(Dense, MovesToHybridWithStrideAndCooLengthByColumns2)
 }
 
 
-TEST_F(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2)
+TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(
-        mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3,
-        std::make_shared<gko::matrix::Hybrid<>::column_limit>(2));
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3,
+                       std::make_shared<typename Hybrid::column_limit>(2));
 
-    mtx4->convert_to(hybrid_mtx.get());
+    this->mtx4->convert_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_ell_values();
     auto c = hybrid_mtx->get_const_ell_col_idxs();
     auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
@@ -765,15 +1123,15 @@ TEST_F(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2)
     EXPECT_EQ(c[3], 1);
     EXPECT_EQ(c[4], 0);
     EXPECT_EQ(c[5], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 5.0);
-    EXPECT_EQ(v[2], 0.0);
-    EXPECT_EQ(v[3], 3.0);
-    EXPECT_EQ(v[4], 0.0);
-    EXPECT_EQ(v[5], 0.0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], 2.0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], 0.0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{5.0});
+    EXPECT_EQ(v[2], T{0.0});
+    EXPECT_EQ(v[3], T{3.0});
+    EXPECT_EQ(v[4], T{0.0});
+    EXPECT_EQ(v[5], T{0.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], T{0.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], T{0.0});
     EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2);
     EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[1], 0);
     EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[2], 0);
@@ -783,13 +1141,15 @@ TEST_F(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2)
 }
 
 
-TEST_F(Dense, MovesToHybridWithStrideByPercent40)
+TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(
-        mtx4->get_executor(), gko::dim<2>{}, 0, 3,
-        std::make_shared<gko::matrix::Hybrid<>::imbalance_limit>(0.4));
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3,
+                       std::make_shared<typename Hybrid::imbalance_limit>(0.4));
 
-    mtx4->move_to(hybrid_mtx.get());
+    this->mtx4->move_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_ell_values();
     auto c = hybrid_mtx->get_const_ell_col_idxs();
     auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
@@ -805,12 +1165,12 @@ TEST_F(Dense, MovesToHybridWithStrideByPercent40)
     EXPECT_EQ(c[0], 0);
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 5.0);
-    EXPECT_EQ(v[2], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{5.0});
+    EXPECT_EQ(v[2], T{0.0});
     ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2);
-    EXPECT_EQ(coo_v[0], 3.0);
-    EXPECT_EQ(coo_v[1], 2.0);
+    EXPECT_EQ(coo_v[0], T{3.0});
+    EXPECT_EQ(coo_v[1], T{2.0});
     EXPECT_EQ(coo_c[0], 1);
     EXPECT_EQ(coo_c[1], 2);
     EXPECT_EQ(coo_r[0], 0);
@@ -818,13 +1178,15 @@ TEST_F(Dense, MovesToHybridWithStrideByPercent40)
 }
 
 
-TEST_F(Dense, ConvertsToHybridWithStrideByPercent40)
+TYPED_TEST(Dense, ConvertsToHybridWithStrideByPercent40)
 {
-    auto hybrid_mtx = gko::matrix::Hybrid<>::create(
-        mtx4->get_executor(), gko::dim<2>{}, 0, 3,
-        std::make_shared<gko::matrix::Hybrid<>::imbalance_limit>(0.4));
+    using T = typename TestFixture::value_type;
+    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3,
+                       std::make_shared<typename Hybrid::imbalance_limit>(0.4));
 
-    mtx4->convert_to(hybrid_mtx.get());
+    this->mtx4->convert_to(hybrid_mtx.get());
     auto v = hybrid_mtx->get_const_ell_values();
     auto c = hybrid_mtx->get_const_ell_col_idxs();
     auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
@@ -840,12 +1202,12 @@ TEST_F(Dense, ConvertsToHybridWithStrideByPercent40)
     EXPECT_EQ(c[0], 0);
     EXPECT_EQ(c[1], 1);
     EXPECT_EQ(c[2], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 5.0);
-    EXPECT_EQ(v[2], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{5.0});
+    EXPECT_EQ(v[2], T{0.0});
     ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2);
-    EXPECT_EQ(coo_v[0], 3.0);
-    EXPECT_EQ(coo_v[1], 2.0);
+    EXPECT_EQ(coo_v[0], T{3.0});
+    EXPECT_EQ(coo_v[1], T{2.0});
     EXPECT_EQ(coo_c[0], 1);
     EXPECT_EQ(coo_c[1], 2);
     EXPECT_EQ(coo_r[0], 0);
@@ -853,11 +1215,13 @@ TEST_F(Dense, ConvertsToHybridWithStrideByPercent40)
 }
 
 
-TEST_F(Dense, ConvertsToSellp)
+TYPED_TEST(Dense, ConvertsToSellp32)
 {
-    auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor());
+    using T = typename TestFixture::value_type;
+    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
+    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
 
-    mtx8->convert_to(sellp_mtx.get());
+    this->mtx7->convert_to(sellp_mtx.get());
     auto v = sellp_mtx->get_const_values();
     auto c = sellp_mtx->get_const_col_idxs();
     auto s = sellp_mtx->get_const_slice_sets();
@@ -876,23 +1240,25 @@ TEST_F(Dense, ConvertsToSellp)
     EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0);
     EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
     EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[gko::matrix::default_slice_size], 2.0);
-    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], 0.0);
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], 3.0);
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
+    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
     EXPECT_EQ(s[0], 0);
     EXPECT_EQ(s[1], 3);
     EXPECT_EQ(l[0], 3);
 }
 
 
-TEST_F(Dense, MovesToSellp)
+TYPED_TEST(Dense, MovesToSellp32)
 {
-    auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor());
+    using T = typename TestFixture::value_type;
+    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
+    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
 
-    mtx8->move_to(sellp_mtx.get());
+    this->mtx7->move_to(sellp_mtx.get());
     auto v = sellp_mtx->get_const_values();
     auto c = sellp_mtx->get_const_col_idxs();
     auto s = sellp_mtx->get_const_slice_sets();
@@ -911,24 +1277,100 @@ TEST_F(Dense, MovesToSellp)
     EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0);
     EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
     EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[gko::matrix::default_slice_size], 2.0);
-    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], 0.0);
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], 3.0);
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
+    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
     EXPECT_EQ(s[0], 0);
     EXPECT_EQ(s[1], 3);
     EXPECT_EQ(l[0], 3);
 }
 
 
-TEST_F(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor)
+TYPED_TEST(Dense, ConvertsToSellp64)
 {
-    auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor(),
-                                                  gko::dim<2>{}, 2, 2, 0);
+    using T = typename TestFixture::value_type;
+    using Sellp = typename gko::matrix::Sellp<T, gko::int64>;
+    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
 
-    mtx8->convert_to(sellp_mtx.get());
+    this->mtx7->convert_to(sellp_mtx.get());
+    auto v = sellp_mtx->get_const_values();
+    auto c = sellp_mtx->get_const_col_idxs();
+    auto s = sellp_mtx->get_const_slice_sets();
+    auto l = sellp_mtx->get_const_slice_lengths();
+
+    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(sellp_mtx->get_total_cols(), 3);
+    ASSERT_EQ(sellp_mtx->get_num_stored_elements(),
+              3 * gko::matrix::default_slice_size);
+    ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size);
+    ASSERT_EQ(sellp_mtx->get_stride_factor(),
+              gko::matrix::default_stride_factor);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[gko::matrix::default_slice_size], 1);
+    EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0);
+    EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
+    EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
+    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
+    EXPECT_EQ(s[0], 0);
+    EXPECT_EQ(s[1], 3);
+    EXPECT_EQ(l[0], 3);
+}
+
+
+TYPED_TEST(Dense, MovesToSellp64)
+{
+    using ValueType = typename TestFixture::value_type;
+    using SellpMtx = gko::matrix::Sellp<ValueType, gko::int64>;
+    auto result = SellpMtx::create(this->mtx7->get_executor());
+    const auto slice = gko::matrix::default_slice_size;
+
+    this->mtx7->move_to(result.get());
+    const auto vals = result->get_const_values();
+    const auto cols = result->get_const_col_idxs();
+    const auto slice_sets = result->get_const_slice_sets();
+    const auto slice_lengths = result->get_const_slice_lengths();
+    // A default-created target must report the default slice parameters.
+    ASSERT_EQ(result->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(result->get_total_cols(), 3);
+    ASSERT_EQ(result->get_num_stored_elements(), 3 * slice);
+    ASSERT_EQ(result->get_slice_size(), slice);
+    ASSERT_EQ(result->get_stride_factor(),
+              gko::matrix::default_stride_factor);
+    EXPECT_EQ(cols[0], 0);
+    EXPECT_EQ(cols[1], 1);
+    EXPECT_EQ(cols[slice], 1);
+    EXPECT_EQ(cols[slice + 1], 0);
+    EXPECT_EQ(cols[2 * slice], 2);
+    EXPECT_EQ(cols[2 * slice + 1], 0);
+    EXPECT_EQ(vals[0], ValueType{1.0});
+    EXPECT_EQ(vals[1], ValueType{1.5});
+    EXPECT_EQ(vals[slice], ValueType{2.0});
+    EXPECT_EQ(vals[slice + 1], ValueType{0.0});
+    EXPECT_EQ(vals[2 * slice], ValueType{3.0});
+    EXPECT_EQ(vals[2 * slice + 1], ValueType{0.0});
+    EXPECT_EQ(slice_sets[0], 0);
+    EXPECT_EQ(slice_sets[1], 3);
+    EXPECT_EQ(slice_lengths[0], 3);
+}
+
+
+TYPED_TEST(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor)
+{
+    using T = typename TestFixture::value_type;
+    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
+    auto sellp_mtx =
+        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
+
+    this->mtx7->convert_to(sellp_mtx.get());
     auto v = sellp_mtx->get_const_values();
     auto c = sellp_mtx->get_const_col_idxs();
     auto s = sellp_mtx->get_const_slice_sets();
@@ -947,26 +1389,28 @@ TEST_F(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor)
     EXPECT_EQ(c[5], 0);
     EXPECT_EQ(c[6], 0);
     EXPECT_EQ(c[7], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 0.0);
-    EXPECT_EQ(v[4], 3.0);
-    EXPECT_EQ(v[5], 0.0);
-    EXPECT_EQ(v[6], 0.0);
-    EXPECT_EQ(v[7], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{0.0});
+    EXPECT_EQ(v[4], T{3.0});
+    EXPECT_EQ(v[5], T{0.0});
+    EXPECT_EQ(v[6], T{0.0});
+    EXPECT_EQ(v[7], T{0.0});
     EXPECT_EQ(s[0], 0);
     EXPECT_EQ(s[1], 4);
     EXPECT_EQ(l[0], 4);
 }
 
 
-TEST_F(Dense, MovesToSellpWithSliceSizeAndStrideFactor)
+TYPED_TEST(Dense, MovesToSellpWithSliceSizeAndStrideFactor)
 {
-    auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor(),
-                                                  gko::dim<2>{}, 2, 2, 0);
+    using T = typename TestFixture::value_type;
+    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
+    auto sellp_mtx =
+        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
 
-    mtx8->move_to(sellp_mtx.get());
+    this->mtx7->move_to(sellp_mtx.get());
     auto v = sellp_mtx->get_const_values();
     auto c = sellp_mtx->get_const_col_idxs();
     auto s = sellp_mtx->get_const_slice_sets();
@@ -985,63 +1429,562 @@ TEST_F(Dense, MovesToSellpWithSliceSizeAndStrideFactor)
     EXPECT_EQ(c[5], 0);
     EXPECT_EQ(c[6], 0);
     EXPECT_EQ(c[7], 0);
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 1.5);
-    EXPECT_EQ(v[2], 2.0);
-    EXPECT_EQ(v[3], 0.0);
-    EXPECT_EQ(v[4], 3.0);
-    EXPECT_EQ(v[5], 0.0);
-    EXPECT_EQ(v[6], 0.0);
-    EXPECT_EQ(v[7], 0.0);
+    EXPECT_EQ(v[0], T{1.0});
+    EXPECT_EQ(v[1], T{1.5});
+    EXPECT_EQ(v[2], T{2.0});
+    EXPECT_EQ(v[3], T{0.0});
+    EXPECT_EQ(v[4], T{3.0});
+    EXPECT_EQ(v[5], T{0.0});
+    EXPECT_EQ(v[6], T{0.0});
+    EXPECT_EQ(v[7], T{0.0});
     EXPECT_EQ(s[0], 0);
     EXPECT_EQ(s[1], 4);
     EXPECT_EQ(l[0], 4);
 }
 
 
-TEST_F(Dense, SquareMatrixIsTransposable)
+TYPED_TEST(Dense, ConvertsToAndFromSellpWithMoreThanOneSlice)
+{
+    // Round trip: Dense -> Sellp -> Dense must reproduce the generated matrix.
+    using DenseMtx = typename TestFixture::Mtx;
+    using SellpMtx =
+        gko::matrix::Sellp<typename TestFixture::value_type, gko::int32>;
+    auto original = this->template gen_mtx<DenseMtx>(65, 25);
+    auto via_sellp = SellpMtx::create(this->exec);
+    auto round_trip = DenseMtx::create(this->exec);
+
+    original->convert_to(via_sellp.get());
+    via_sellp->convert_to(round_trip.get());
+    GKO_ASSERT_MTX_NEAR(round_trip.get(), original.get(), r<TypeParam>::value);
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyToPrecision)
+{
+    // Source: next-precision Dense created without dimensions; target: Mtx.
+    using TargetMtx = typename TestFixture::Mtx;
+    using SourceT = gko::next_precision<typename TestFixture::value_type>;
+    using SourceMtx = gko::matrix::Dense<SourceT>;
+    auto source = SourceMtx::create(this->exec);
+    auto target = TargetMtx::create(this->exec);
+
+    source->convert_to(target.get());
+
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyToPrecision)
+{
+    // Source: next-precision Dense created without dimensions; target: Mtx.
+    using TargetMtx = typename TestFixture::Mtx;
+    using SourceT = gko::next_precision<typename TestFixture::value_type>;
+    using SourceMtx = gko::matrix::Dense<SourceT>;
+    auto source = SourceMtx::create(this->exec);
+    auto target = TargetMtx::create(this->exec);
+
+    source->move_to(target.get());
+
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyToCoo)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using CooMtx = gko::matrix::Coo<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = CooMtx::create(this->exec);
+
+    source->convert_to(target.get());
+    // The Coo result must store nothing and report a falsy (empty) size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyToCoo)
 {
-    auto trans = mtx5->transpose();
-    auto trans_as_dense = static_cast<gko::matrix::Dense<> *>(trans.get());
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using Coo = typename gko::matrix::Coo<T, gko::int32>;
+    auto empty = Dense::create(this->exec);
+    auto res = Coo::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyMatrixToCsr)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using CsrMtx = gko::matrix::Csr<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = CsrMtx::create(this->exec);
+
+    source->convert_to(target.get());
+    // Expect no stored values, a first row pointer of 0 and an empty size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyMatrixToCsr)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using CsrMtx = gko::matrix::Csr<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = CsrMtx::create(this->exec);
+
+    source->move_to(target.get());
+    // Expect no stored values, a first row pointer of 0 and an empty size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyToSparsityCsr)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using PatternMtx = gko::matrix::SparsityCsr<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = PatternMtx::create(this->exec);
+
+    source->convert_to(target.get());
+    // Expect zero nonzeros, a first row pointer of 0 and an empty size.
+    ASSERT_EQ(target->get_num_nonzeros(), 0);
+    ASSERT_EQ(*target->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyToSparsityCsr)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using PatternMtx = gko::matrix::SparsityCsr<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = PatternMtx::create(this->exec);
+
+    source->move_to(target.get());
+    // Expect zero nonzeros, a first row pointer of 0 and an empty size.
+    ASSERT_EQ(target->get_num_nonzeros(), 0);
+    ASSERT_EQ(*target->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyToEll)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using EllMtx = gko::matrix::Ell<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = EllMtx::create(this->exec);
+
+    source->convert_to(target.get());
+    // The Ell result must store nothing and report a falsy (empty) size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyToEll)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using EllMtx = gko::matrix::Ell<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = EllMtx::create(this->exec);
+
+    source->move_to(target.get());
+    // The Ell result must store nothing and report a falsy (empty) size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyToHybrid)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using HybridMtx = gko::matrix::Hybrid<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = HybridMtx::create(this->exec);
+
+    source->convert_to(target.get());
+    // The Hybrid result must store nothing and report a falsy (empty) size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyToHybrid)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using HybridMtx = gko::matrix::Hybrid<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = HybridMtx::create(this->exec);
+
+    source->move_to(target.get());
+    // The Hybrid result must store nothing and report a falsy (empty) size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, ConvertsEmptyToSellp)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using SellpMtx = gko::matrix::Sellp<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = SellpMtx::create(this->exec);
+
+    source->convert_to(target.get());
+    // Expect no stored values, a first slice set of 0 and an empty size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_slice_sets(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, MovesEmptyToSellp)
+{
+    using DenseMtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using SellpMtx = gko::matrix::Sellp<value_type, gko::int32>;
+    auto source = DenseMtx::create(this->exec);
+    auto target = SellpMtx::create(this->exec);
+
+    source->move_to(target.get());
+    // Expect no stored values, a first slice set of 0 and an empty size.
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_slice_sets(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Dense, SquareMatrixIsTransposable)
+{
+    using Mtx = typename TestFixture::Mtx;
+    auto trans = this->mtx5->transpose();
+    auto trans_as_dense = static_cast<Mtx *>(trans.get());
 
     GKO_ASSERT_MTX_NEAR(
         trans_as_dense,
-        l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), 0.0);
+        l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}),
+        r<TypeParam>::value);
 }
 
 
-TEST_F(Dense, NonSquareMatrixIsTransposable)
+TYPED_TEST(Dense, NonSquareMatrixIsTransposable)
 {
-    auto trans = mtx4->transpose();
-    auto trans_as_dense = static_cast<gko::matrix::Dense<> *>(trans.get());
+    using Mtx = typename TestFixture::Mtx;
+    auto trans = this->mtx4->transpose();
+    auto trans_as_dense = static_cast<Mtx *>(trans.get());
 
     GKO_ASSERT_MTX_NEAR(trans_as_dense, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}),
-                        0.0);
+                        r<TypeParam>::value);
 }
 
 
-TEST_F(Dense, NonSquareMatrixIsConjugateTransposable)
+TYPED_TEST(Dense, SquareMatrixIsRowPermutable)
 {
-    auto trans = mtx6->conj_transpose();
-    auto trans_as_dense =
-        static_cast<gko::matrix::Dense<std::complex<double>> *>(trans.get());
+    // clang-format off
+    // {1.0, -1.0, -0.5},
+    // {-2.0, 2.0, 4.5},
+    // {2.1, 3.4, 1.2}
+    // clang-format on
+    using Mtx = typename TestFixture::Mtx;
+    auto exec = this->mtx5->get_executor();
+    gko::Array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    auto row_permute = this->mtx5->row_permute(&permute_idxs);
+
+    auto row_permute_dense = static_cast<Mtx *>(row_permute.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(row_permute_dense,
+                        l({{-2.0, 2.0, 4.5},
+                           {2.1, 3.4, 1.2},
+                           {1.0, -1.0, -0.5}}), r<TypeParam>::value);
+    // clang-format on
+}
 
-    GKO_ASSERT_MTX_NEAR(trans_as_dense,
-                        l({{1.0 - 2.0 * i, -2.0 - 1.5 * i, 1.0 + 0.0 * i},
-                           {-1.0 - 2.1 * i, 4.5 + 0.0 * i, -i}}),
-                        0.0);
+
+TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable)
+{
+    // clang-format off
+    // {1.0, 3.0, 2.0},
+    // {0.0, 5.0, 0.0}
+    // clang-format on
+    using Mtx = typename TestFixture::Mtx;
+    auto exec = this->mtx4->get_executor();
+    gko::Array<gko::int32> permute_idxs{exec, {1, 0}};
+    auto row_permute = this->mtx4->row_permute(&permute_idxs);
+
+    auto row_permute_dense = static_cast<Mtx *>(row_permute.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(row_permute_dense,
+                        l({{0.0, 5.0, 0.0},
+                           {1.0, 3.0, 2.0}}), r<TypeParam>::value);
+    // clang-format on
 }
 
-TEST_F(Dense, ConvertsToAndFromSellpWithMoreThanOneSlice)
+
+TYPED_TEST(Dense, SquareMatrixIsColPermutable)
 {
-    auto x = gen_mtx<Mtx>(65, 25);
+    // clang-format off
+    // {1.0, -1.0, -0.5},
+    // {-2.0, 2.0, 4.5},
+    // {2.1, 3.4, 1.2}
+    // clang-format on
+    using Mtx = typename TestFixture::Mtx;
+    auto exec = this->mtx5->get_executor();
+    gko::Array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    auto c_permute = this->mtx5->column_permute(&permute_idxs);
+
+    auto c_permute_dense = static_cast<Mtx *>(c_permute.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(c_permute_dense,
+                        l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, NonSquareMatrixIsColPermutable)
+{
+    // Input mtx4; output column j is expected to be input column idxs[j]:
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int32> idxs{executor, {1, 2, 0}};
+
+    auto permuted_linop = this->mtx4->column_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{3.0, 2.0, 1.0},
+                           {5.0, 0.0, 0.0}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutable)
+{
+    // Checks inverse_row_permute with 32-bit indices on the square mtx5.
+    // Input mtx5; input row i is expected to land in output row idxs[i]:
+    //   { 1.0, -1.0, -0.5},
+    //   {-2.0,  2.0,  4.5},
+    //   { 2.1,  3.4,  1.2}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx5->get_executor();
+    gko::Array<gko::int32> idxs{executor, {1, 2, 0}};
+
+    auto permuted_linop = this->mtx5->inverse_row_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{ 2.1,  3.4,  1.2},
+                           { 1.0, -1.0, -0.5},
+                           {-2.0,  2.0,  4.5}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable)
+{
+    // Checks inverse_row_permute with 32-bit indices on the non-square mtx4.
+    // Input mtx4; input row i is expected to land in output row idxs[i]:
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int32> idxs{executor, {1, 0}};
+
+    auto permuted_linop = this->mtx4->inverse_row_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{0.0, 5.0, 0.0},
+                           {1.0, 3.0, 2.0}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, SquareMatrixIsInverseColPermutable)
+{
+    // Input mtx5; input column j is expected to land in column idxs[j]:
+    //   { 1.0, -1.0, -0.5},
+    //   {-2.0,  2.0,  4.5},
+    //   { 2.1,  3.4,  1.2}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx5->get_executor();
+    gko::Array<gko::int32> idxs{executor, {1, 2, 0}};
+
+    auto permuted_linop = this->mtx5->inverse_column_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{-0.5,  1.0, -1.0},
+                           { 4.5, -2.0,  2.0},
+                           { 1.2,  2.1,  3.4}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable)
+{
+    // Checks inverse_column_permute with 32-bit indices on non-square mtx4.
+    // Input mtx4; input column j is expected to land in column idxs[j]:
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int32> idxs{executor, {1, 2, 0}};
+
+    auto permuted_linop = this->mtx4->inverse_column_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{2.0, 1.0, 3.0},
+                           {0.0, 0.0, 5.0}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable64)
+{
+    // Input mtx4; output row i is expected to be input row idxs[i] (int64):
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int64> idxs{executor, {1, 0}};
+
+    auto permuted_linop = this->mtx4->row_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{0.0, 5.0, 0.0},
+                           {1.0, 3.0, 2.0}}), r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, NonSquareMatrixIsColPermutable64)
+{
+    // Input mtx4; output column j is expected to be column idxs[j] (int64):
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int64> idxs{executor, {1, 2, 0}};
+
+    auto permuted_linop = this->mtx4->column_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{3.0, 2.0, 1.0},
+                           {5.0, 0.0, 0.0}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
 
-    auto sellp_mtx = gko::matrix::Sellp<>::create(exec);
-    auto dense_mtx = gko::matrix::Dense<>::create(exec);
-    x->convert_to(sellp_mtx.get());
-    sellp_mtx->convert_to(dense_mtx.get());
 
-    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), x.get(), 1e-14);
+TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable64)
+{
+    // Checks inverse_row_permute with 64-bit indices on the non-square mtx4.
+    // Input mtx4; input row i is expected to land in output row idxs[i]:
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int64> idxs{executor, {1, 0}};
+
+    auto permuted_linop = this->mtx4->inverse_row_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{0.0, 5.0, 0.0},
+                           {1.0, 3.0, 2.0}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable64)
+{
+    // Checks inverse_column_permute with 64-bit indices on non-square mtx4.
+    // Input mtx4; input column j is expected to land in column idxs[j]:
+    //   {1.0, 3.0, 2.0},
+    //   {0.0, 5.0, 0.0}
+    using DenseMtx = typename TestFixture::Mtx;
+    auto executor = this->mtx4->get_executor();
+    gko::Array<gko::int64> idxs{executor, {1, 2, 0}};
+
+    auto permuted_linop = this->mtx4->inverse_column_permute(&idxs);
+    auto permuted = static_cast<DenseMtx *>(permuted_linop.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l({{2.0, 1.0, 3.0},
+                           {0.0, 0.0, 5.0}}),
+                        r<TypeParam>::value);
+    // clang-format on
+}
+
+
+template <typename ValueType>
+class DenseComplex : public ::testing::Test {
+protected:
+    using value_type = ValueType;  // scalar type the typed test runs with
+    using Mtx = gko::matrix::Dense<ValueType>;
+};
+
+
+TYPED_TEST_CASE(DenseComplex, gko::test::ComplexValueTypes);
+
+
+TYPED_TEST(DenseComplex, NonSquareMatrixIsConjugateTransposable)
+{
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto exec = gko::ReferenceExecutor::create();
+    auto mtx = gko::initialize<Dense>({{T{1.0, 2.0}, T{-1.0, 2.1}},
+                                       {T{-2.0, 1.5}, T{4.5, 0.0}},
+                                       {T{1.0, 0.0}, T{0.0, 1.0}}},
+                                      exec);
+    auto trans = mtx->conj_transpose();
+    auto trans_as_dense = static_cast<Dense *>(trans.get());
+
+    GKO_ASSERT_MTX_NEAR(trans_as_dense,
+                        l({{T{1.0, -2.0}, T{-2.0, -1.5}, T{1.0, 0.0}},
+                           {T{-1.0, -2.1}, T{4.5, 0.0}, T{0.0, -1.0}}}),
+                        0.0);
 }
 
 
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 806949caf9b..f9a4f401dc2 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -47,14 +46,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Ell : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Ell<>;
-    using Csr = gko::matrix::Csr<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using T = value_type;
+    using Mtx = gko::matrix::Ell<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Ell()
         : exec(gko::ReferenceExecutor::create()),
@@ -85,10 +93,10 @@ class Ell : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], T{1.0});
+        EXPECT_EQ(v[1], T{3.0});
+        EXPECT_EQ(v[2], T{2.0});
+        EXPECT_EQ(v[3], T{5.0});
     }
 
     std::shared_ptr<const gko::Executor> exec;
@@ -96,107 +104,161 @@ class Ell : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
 };
 
+TYPED_TEST_CASE(Ell, gko::test::ValueIndexTypes);
+
 
-TEST_F(Ell, AppliesToDenseVector)
+TYPED_TEST(Ell, AppliesToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx1->apply(x.get(), y.get());
+    this->mtx1->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Ell, AppliesToDenseMatrix)
+TYPED_TEST(Ell, AppliesToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    mtx1->apply(x.get(), y.get());
+    this->mtx1->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{13.0,  3.5},
-                       { 5.0, -7.5}}), 0.0);
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Ell, AppliesLinearCombinationToDenseVector)
+TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Ell, AppliesLinearCombinationToDenseMatrix)
+TYPED_TEST(Ell, AppliesLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{-11.0, -2.5},
-                       { -1.0,  4.5}}), 0.0);
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Ell, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
+{
+    // mtx1 must reject these dim<2>{2} operands with a DimensionMismatch.
+    auto rhs = TestFixture::Vec::create(this->exec, gko::dim<2>{2});
+    auto sol = TestFixture::Vec::create(this->exec, gko::dim<2>{2});
+    ASSERT_THROW(this->mtx1->apply(rhs.get(), sol.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Ell, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3}, 2);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Ell, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(Ell, ConvertsToPrecision)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3}, 2);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Ell = typename TestFixture::Mtx;
+    using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
+    auto tmp = OtherEll::create(this->exec);
+    auto res = Ell::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+
+    this->mtx1->convert_to(tmp.get());
+    tmp->convert_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
+}
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+
+TYPED_TEST(Ell, MovesToPrecision)
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Ell = typename TestFixture::Mtx;
+    using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
+    auto tmp = OtherEll::create(this->exec);
+    auto res = Ell::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+
+    this->mtx1->move_to(tmp.get());
+    tmp->move_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
 }
 
 
-TEST_F(Ell, ConvertsToDense)
+TYPED_TEST(Ell, ConvertsToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx1->get_executor());
 
-    mtx1->convert_to(dense_mtx.get());
+    this->mtx1->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -206,11 +268,12 @@ TEST_F(Ell, ConvertsToDense)
 }
 
 
-TEST_F(Ell, MovesToDense)
+TYPED_TEST(Ell, MovesToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx1->get_executor());
 
-    mtx1->move_to(dense_mtx.get());
+    this->mtx1->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -220,111 +283,121 @@ TEST_F(Ell, MovesToDense)
 }
 
 
-TEST_F(Ell, AppliesWithStrideToDenseVector)
+TYPED_TEST(Ell, AppliesWithStrideToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx2->apply(x.get(), y.get());
+    this->mtx2->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Ell, AppliesWithStrideToDenseMatrix)
+TYPED_TEST(Ell, AppliesWithStrideToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    mtx2->apply(x.get(), y.get());
+    this->mtx2->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{13.0, 3.5},
-                       {5.0, -7.5}}), 0.0);
+                        l({{13.0, 3.5},
+                           {5.0, -7.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Ell, AppliesWithStrideLinearCombinationToDenseVector)
+TYPED_TEST(Ell, AppliesWithStrideLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Ell, AppliesWithStrideLinearCombinationToDenseMatrix)
+TYPED_TEST(Ell, AppliesWithStrideLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{-11.0, -2.5},
-                       {-1.0, 4.5}}), 0.0);
+                        l({{-11.0, -2.5},
+                           {-1.0, 4.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Ell, ApplyWithStrideFailsOnWrongInnerDimension)
+TYPED_TEST(Ell, ApplyWithStrideFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Ell, ApplyWithStrideFailsOnWrongNumberOfRows)
+TYPED_TEST(Ell, ApplyWithStrideFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Ell, ApplyWithStrideFailsOnWrongNumberOfCols)
+TYPED_TEST(Ell, ApplyWithStrideFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3}, 2);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3}, 2);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Ell, ConvertsWithStrideToDense)
+TYPED_TEST(Ell, ConvertsWithStrideToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx2->get_executor());
     // clang-format off
-    auto dense_other = gko::initialize<gko::matrix::Dense<>>(
+    auto dense_other = gko::initialize<Vec>(
         4, {{1.0, 3.0, 2.0},
-            {0.0, 5.0, 0.0}}, exec);
+            {0.0, 5.0, 0.0}}, this->exec);
     // clang-format on
 
-    mtx2->convert_to(dense_mtx.get());
+    this->mtx2->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -334,11 +407,12 @@ TEST_F(Ell, ConvertsWithStrideToDense)
 }
 
 
-TEST_F(Ell, MovesWithStrideToDense)
+TYPED_TEST(Ell, MovesWithStrideToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx2->get_executor());
 
-    mtx2->move_to(dense_mtx.get());
+    this->mtx2->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -348,42 +422,179 @@ TEST_F(Ell, MovesWithStrideToDense)
 }
 
 
-TEST_F(Ell, ConvertsToCsr)
+TYPED_TEST(Ell, ConvertsToCsr)
+{
+    // Converts mtx1 into two CSR targets, one per strategy, and checks both.
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge);
+
+    this->mtx1->convert_to(csr_mtx_c.get());
+    this->mtx1->convert_to(csr_mtx_m.get());
+
+    this->assert_equal_to_mtx(csr_mtx_c.get());
+    this->assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+}
+
+
+TYPED_TEST(Ell, MovesToCsr)
+{
+    // Each move_to gets its own source so no moved-from matrix is reused.
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge);
+
+    this->mtx1->clone()->move_to(csr_mtx_c.get());  // consumes a copy only
+    this->mtx1->move_to(csr_mtx_m.get());
+
+    this->assert_equal_to_mtx(csr_mtx_c.get());
+    this->assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+}
+
+
+TYPED_TEST(Ell, ConvertsWithStrideToCsr)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
-
-    mtx1->convert_to(csr_mtx_c.get());
-    mtx1->convert_to(csr_mtx_m.get());
-
-    assert_equal_to_mtx(csr_mtx_c.get());
-    assert_equal_to_mtx(csr_mtx_m.get());
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    using Vec = typename TestFixture::Vec;
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx2->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx2->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx2->clone();
+
+    this->mtx2->convert_to(csr_mtx_c.get());
+    mtx_clone->convert_to(csr_mtx_m.get());
+
+    this->assert_equal_to_mtx(csr_mtx_c.get());
+    this->assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Ell, ConvertsWithStrideToCsr)
+TYPED_TEST(Ell, MovesWithStrideToCsr)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx2->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx2->get_executor(), csr_s_merge);
-    auto mtx_clone = mtx2->clone();
-
-    mtx2->move_to(csr_mtx_c.get());
+    using Vec = typename TestFixture::Vec;
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx2->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx2->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx2->clone();
+
+    this->mtx2->move_to(csr_mtx_c.get());
     mtx_clone->move_to(csr_mtx_m.get());
 
-    assert_equal_to_mtx(csr_mtx_c.get());
-    assert_equal_to_mtx(csr_mtx_m.get());
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    this->assert_equal_to_mtx(csr_mtx_c.get());
+    this->assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+}
+
+
+TYPED_TEST(Ell, ConvertsEmptyToPrecision)
+{  // convert_to from an Ell created without a size, to another value type
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Ell = typename TestFixture::Mtx;
+    using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
+    auto empty = Ell::create(this->exec);  // only an executor, no size
+    auto res = OtherEll::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // the size must test false
+}
+
+
+TYPED_TEST(Ell, MovesEmptyToPrecision)
+{  // move_to from an Ell created without a size, to another value type
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Ell = typename TestFixture::Mtx;
+    using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
+    auto empty = Ell::create(this->exec);  // only an executor, no size
+    auto res = OtherEll::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());  // the size must test false
+}
+
+
+TYPED_TEST(Ell, ConvertsEmptyToDense)
+{
+    // An Ell created without a size must convert to a Dense whose size is false.
+    using ValueType = typename TestFixture::value_type;
+    using Ell = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto empty = Ell::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Ell, MovesEmptyToDense)
+{
+    // An Ell created without a size must move into a Dense whose size is false.
+    using ValueType = typename TestFixture::value_type;
+    using Ell = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto empty = Ell::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Ell, ConvertsEmptyToCsr)
+{  // convert_to from an Ell created without a size, into a Csr
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Ell = typename TestFixture::Mtx;
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto empty = Ell::create(this->exec);  // only an executor, no size
+    auto res = Csr::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // first row pointer is read
+    ASSERT_FALSE(res->get_size());  // the size must test false
+}
+
+
+TYPED_TEST(Ell, MovesEmptyToCsr)
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Ell = typename TestFixture::Mtx;
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto empty = Ell::create(this->exec);
+    auto res = Csr::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index 1ca6b61cd20..c1f98b67f4f 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/hybrid_kernels.hpp"
+#include <ginkgo/core/matrix/hybrid.hpp>
 
 
 #include <memory>
@@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -47,14 +46,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/matrix/hybrid_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Hybrid : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Hybrid<>;
-    using Vec = gko::matrix::Dense<>;
-    using Csr = gko::matrix::Csr<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using T = value_type;
+    using Mtx = gko::matrix::Hybrid<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
 
     Hybrid()
         : exec(gko::ReferenceExecutor::create()),
@@ -109,10 +118,10 @@ class Hybrid : public ::testing::Test {
         EXPECT_EQ(c[1], 1);
         EXPECT_EQ(c[2], 2);
         EXPECT_EQ(c[3], 1);
-        EXPECT_EQ(v[0], 1.0);
-        EXPECT_EQ(v[1], 3.0);
-        EXPECT_EQ(v[2], 2.0);
-        EXPECT_EQ(v[3], 5.0);
+        EXPECT_EQ(v[0], T{1.0});
+        EXPECT_EQ(v[1], T{3.0});
+        EXPECT_EQ(v[2], T{2.0});
+        EXPECT_EQ(v[3], T{5.0});
     }
 
     std::shared_ptr<const gko::ReferenceExecutor> exec;
@@ -121,66 +130,74 @@ class Hybrid : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3;
 };
 
+TYPED_TEST_CASE(Hybrid, gko::test::ValueIndexTypes);
+
 
-TEST_F(Hybrid, AppliesToDenseVector)
+TYPED_TEST(Hybrid, AppliesToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx1->apply(x.get(), y.get());
+    this->mtx1->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Hybrid, AppliesToDenseMatrix)
+TYPED_TEST(Hybrid, AppliesToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    mtx1->apply(x.get(), y.get());
+    this->mtx1->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{13.0,  3.5},
-                       { 5.0, -7.5}}), 0.0);
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Hybrid, AppliesLinearCombinationToDenseVector)
+TYPED_TEST(Hybrid, AppliesLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Hybrid, AppliesLinearCombinationToDenseMatrix)
+TYPED_TEST(Hybrid, AppliesLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
@@ -190,38 +207,84 @@ TEST_F(Hybrid, AppliesLinearCombinationToDenseMatrix)
 }
 
 
-TEST_F(Hybrid, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(Hybrid, ApplyFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Hybrid, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(Hybrid, ApplyFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Hybrid, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(Hybrid, ApplyFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3}, 2);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3}, 2);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Hybrid, ConvertsToDense)
+TYPED_TEST(Hybrid, ConvertsToPrecision)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor());
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Hybrid = typename TestFixture::Mtx;
+    using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
+    auto tmp = OtherHybrid::create(this->exec);
+    auto res = Hybrid::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+
+    this->mtx1->convert_to(tmp.get());
+    tmp->convert_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
+}
 
-    mtx1->convert_to(dense_mtx.get());
+
+TYPED_TEST(Hybrid, MovesToPrecision)
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Hybrid = typename TestFixture::Mtx;
+    using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
+    auto tmp = OtherHybrid::create(this->exec);
+    auto res = Hybrid::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+
+    this->mtx1->clone()->move_to(tmp.get());  // keep mtx1 as the reference
+    tmp->move_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
+}
+
+
+TYPED_TEST(Hybrid, ConvertsToDense)
+{
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx1->get_executor());
+
+    this->mtx1->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -231,11 +294,12 @@ TEST_F(Hybrid, ConvertsToDense)
 }
 
 
-TEST_F(Hybrid, MovesToDense)
+TYPED_TEST(Hybrid, MovesToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx1->get_executor());
 
-    mtx1->move_to(dense_mtx.get());
+    this->mtx1->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -245,98 +309,199 @@ TEST_F(Hybrid, MovesToDense)
 }
 
 
-TEST_F(Hybrid, ConvertsToCsr)
+TYPED_TEST(Hybrid, ConvertsToCsr)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
-
-    mtx1->convert_to(csr_mtx_c.get());
-    mtx1->convert_to(csr_mtx_m.get());
-
-    assert_equal_to_mtx(csr_mtx_c.get());
-    assert_equal_to_mtx(csr_mtx_m.get());
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge);
+
+    this->mtx1->convert_to(csr_mtx_c.get());
+    this->mtx1->convert_to(csr_mtx_m.get());
+
+    this->assert_equal_to_mtx(csr_mtx_c.get());
+    this->assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Hybrid, MovesToCsr)
+TYPED_TEST(Hybrid, MovesToCsr)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
-    auto mtx_clone = mtx1->clone();
-
-    mtx1->move_to(csr_mtx_c.get());
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx1->clone();
+
+    this->mtx1->move_to(csr_mtx_c.get());
     mtx_clone->move_to(csr_mtx_m.get());
 
-    assert_equal_to_mtx(csr_mtx_c.get());
-    assert_equal_to_mtx(csr_mtx_m.get());
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    this->assert_equal_to_mtx(csr_mtx_c.get());
+    this->assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+}
+
+
+TYPED_TEST(Hybrid, ConvertsToCsrWithoutZeros)
+{  // NOTE(review): mtx3 is built by the fixture, which is not shown here
+    using Csr = typename TestFixture::Csr;
+    auto csr_mtx = Csr::create(this->mtx3->get_executor());
+
+    this->mtx3->convert_to(csr_mtx.get());
+
+    this->assert_equal_to_mtx(csr_mtx.get());  // helper also used for mtx1
+}
+
+
+TYPED_TEST(Hybrid, MovesToCsrWithoutZeros)
+{
+    using Csr = typename TestFixture::Csr;
+    auto csr_mtx = Csr::create(this->mtx3->get_executor());
+
+    this->mtx3->move_to(csr_mtx.get());
+
+    this->assert_equal_to_mtx(csr_mtx.get());
 }
 
 
-TEST_F(Hybrid, ConvertsToCsrWithoutZeros)
+TYPED_TEST(Hybrid, ConvertsEmptyToPrecision)
 {
-    auto csr_mtx = Csr::create(mtx3->get_executor());
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Hybrid = typename TestFixture::Mtx;
+    using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
+    auto other = Hybrid::create(this->exec);
+    auto res = OtherHybrid::create(this->exec);
+
+    other->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
+}
 
-    mtx3->convert_to(csr_mtx.get());
 
-    assert_equal_to_mtx(csr_mtx.get());
+TYPED_TEST(Hybrid, MovesEmptyToPrecision)
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Hybrid = typename TestFixture::Mtx;
+    using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
+    auto other = Hybrid::create(this->exec);
+    auto res = OtherHybrid::create(this->exec);
+
+    other->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TEST_F(Hybrid, MovesToCsrWithoutZeros)
+TYPED_TEST(Hybrid, ConvertsEmptyToDense)
 {
-    auto csr_mtx = Csr::create(mtx3->get_executor());
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Hybrid = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto other = Hybrid::create(this->exec);
+    auto res = Dense::create(this->exec);
 
-    mtx3->move_to(csr_mtx.get());
+    other->convert_to(res.get());
 
-    assert_equal_to_mtx(csr_mtx.get());
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TEST_F(Hybrid, CountsNonzeros)
+TYPED_TEST(Hybrid, MovesEmptyToDense)
+{
+    // A Hybrid created without a size must move into a falsy-sized Dense.
+    using ValueType = typename TestFixture::value_type;
+    using Hybrid = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto other = Hybrid::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    other->move_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Hybrid, ConvertsEmptyToCsr)
+{  // convert_to from a Hybrid created without a size, into a Csr
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Hybrid = typename TestFixture::Mtx;
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto other = Hybrid::create(this->exec);  // only an executor, no size
+    auto res = Csr::create(this->exec);
+
+    other->convert_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // first row pointer is read
+    ASSERT_FALSE(res->get_size());  // the size must test false
+}
+
+
+TYPED_TEST(Hybrid, MovesEmptyToCsr)
+{  // move_to from a Hybrid created without a size, into a Csr
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using Hybrid = typename TestFixture::Mtx;
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto other = Hybrid::create(this->exec);  // only an executor, no size
+    auto res = Csr::create(this->exec);
+
+    other->move_to(res.get());
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);  // first row pointer is read
+    ASSERT_FALSE(res->get_size());  // the size must test false
+}
+
+
+TYPED_TEST(Hybrid, CountsNonzeros)
 {
     gko::size_type nonzeros;
 
-    gko::kernels::reference::hybrid::count_nonzeros(exec, mtx1.get(),
-                                                    &nonzeros);
+    gko::kernels::reference::hybrid::count_nonzeros(
+        this->exec, this->mtx1.get(), &nonzeros);
 
     ASSERT_EQ(nonzeros, 4);
 }
 
 
-TEST_F(Hybrid, AppliesWithStrideToDenseVector)
+TYPED_TEST(Hybrid, AppliesWithStrideToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx2->apply(x.get(), y.get());
+    this->mtx2->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Hybrid, AppliesWithStrideToDenseMatrix)
+TYPED_TEST(Hybrid, AppliesWithStrideToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    mtx2->apply(x.get(), y.get());
+    this->mtx2->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
@@ -346,80 +511,87 @@ TEST_F(Hybrid, AppliesWithStrideToDenseMatrix)
 }
 
 
-TEST_F(Hybrid, AppliesWithStrideLinearCombinationToDenseVector)
+TYPED_TEST(Hybrid, AppliesWithStrideLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Hybrid, AppliesWithStrideLinearCombinationToDenseMatrix)
+TYPED_TEST(Hybrid, AppliesWithStrideLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                    l({{-11.0, -2.5},
-                       {-1.0, 4.5}}), 0.0);
+                        l({{-11.0, -2.5},
+                           {-1.0, 4.5}}), 0.0);
     // clang-format on
 }
 
 
-TEST_F(Hybrid, ApplyWithStrideFailsOnWrongInnerDimension)
+TYPED_TEST(Hybrid, ApplyWithStrideFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Hybrid, ApplyWithStrideFailsOnWrongNumberOfRows)
+TYPED_TEST(Hybrid, ApplyWithStrideFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Hybrid, ApplyWithStrideFailsOnWrongNumberOfCols)
+TYPED_TEST(Hybrid, ApplyWithStrideFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3}, 2);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3}, 2);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Hybrid, ConvertsWithStrideToDense)
+TYPED_TEST(Hybrid, ConvertsWithStrideToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx2->get_executor());
     // clang-format off
-    auto dense_other = gko::initialize<gko::matrix::Dense<>>(
+    auto dense_other = gko::initialize<Vec>(
         4, {{1.0, 3.0, 2.0},
-            {0.0, 5.0, 0.0}}, exec);
+            {0.0, 5.0, 0.0}}, this->exec);
     // clang-format on
 
-    mtx2->convert_to(dense_mtx.get());
+    this->mtx2->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -429,11 +601,12 @@ TEST_F(Hybrid, ConvertsWithStrideToDense)
 }
 
 
-TEST_F(Hybrid, MovesWithStrideToDense)
+TYPED_TEST(Hybrid, MovesWithStrideToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx2->get_executor());
 
-    mtx2->move_to(dense_mtx.get());
+    this->mtx2->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp
index b725b83f83b..0ade3bce9f1 100644
--- a/reference/test/matrix/identity.cpp
+++ b/reference/test/matrix/identity.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,17 +36,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Identity : public ::testing::Test {
 protected:
-    using Id = gko::matrix::Identity<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type = T;
+    using Id = gko::matrix::Identity<value_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Identity() : exec(gko::ReferenceExecutor::create()) {}
 
@@ -54,13 +58,18 @@ class Identity : public ::testing::Test {
 };
 
 
-TEST_F(Identity, AppliesLinearCombinationToVector)
+TYPED_TEST_CASE(Identity, gko::test::ValueTypes);
+
+
+TYPED_TEST(Identity, AppliesLinearCombinationToVector)
 {
-    auto identity = Id::create(exec, 3);
-    auto alpha = gko::initialize<Vec>({2.0}, exec);
-    auto beta = gko::initialize<Vec>({1.0}, exec);
-    auto x = gko::initialize<Vec>({3.0, -1.0, 2.0}, exec);
-    auto b = gko::initialize<Vec>({2.0, 1.0, 5.0}, exec);
+    using Id = typename TestFixture::Id;
+    using Vec = typename TestFixture::Vec;
+    auto identity = Id::create(this->exec, 3);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({1.0}, this->exec);
+    auto x = gko::initialize<Vec>({3.0, -1.0, 2.0}, this->exec);
+    auto b = gko::initialize<Vec>({2.0, 1.0, 5.0}, this->exec);
 
     identity->apply(alpha.get(), b.get(), beta.get(), x.get());
 
@@ -68,19 +77,22 @@ TEST_F(Identity, AppliesLinearCombinationToVector)
 }
 
 
-TEST_F(Identity, AppliesLinearCombinationToMultipleVectors)
+TYPED_TEST(Identity, AppliesLinearCombinationToMultipleVectors)
 {
-    auto identity = Id::create(exec, 3);
-    auto alpha = gko::initialize<Vec>({2.0}, exec);
-    auto beta = gko::initialize<Vec>({1.0}, exec);
-    auto x =
-        gko::initialize<Vec>(3, {{3.0, 0.5}, {-1.0, 2.5}, {2.0, 3.4}}, exec);
-    auto b =
-        gko::initialize<Vec>(3, {{2.0, 3.0}, {1.0, 2.0}, {5.0, -1.0}}, exec);
+    using Id = typename TestFixture::Id;
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto identity = Id::create(this->exec, 3);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({1.0}, this->exec);
+    auto x = gko::initialize<Vec>(
+        3, {I<T>{3.0, 0.5}, I<T>{-1.0, 2.5}, I<T>{2.0, 3.5}}, this->exec);
+    auto b = gko::initialize<Vec>(
+        3, {I<T>{2.0, 3.0}, I<T>{1.0, 2.0}, I<T>{5.0, -1.0}}, this->exec);
 
     identity->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{7.0, 6.5}, {1.0, 6.5}, {12.0, 1.4}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(x, l({{7.0, 6.5}, {1.0, 6.5}, {12.0, 1.5}}), 0.0);
 }
 
 
diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp
new file mode 100644
index 00000000000..1c7e93fe115
--- /dev/null
+++ b/reference/test/matrix/permutation.cpp
@@ -0,0 +1,499 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+// Typed fixture: ValueIndexType is tuple-like, (value type, index type).
+template <typename ValueIndexType>
+class Permutation : public ::testing::Test {
+protected:
+    using v_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using i_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Vec = gko::matrix::Dense<v_type>;
+    using Csr = gko::matrix::Csr<v_type, i_type>;
+
+    Permutation() : exec(gko::ReferenceExecutor::create()) {}
+
+    std::shared_ptr<const gko::Executor> exec;
+};
+
+TYPED_TEST_CASE(Permutation, gko::test::ValueIndexTypes);  // one run per type
+
+// Dense, no mask argument given: expects the two rows of x exchanged.
+TYPED_TEST(Permutation, AppliesRowPermutationToDense)
+{
+    using i_type = typename TestFixture::i_type;
+    using T = typename TestFixture::v_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0},
+         I<T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+    i_type rdata[] = {1, 0};  // indices wrapped by the Array view below
+
+    auto perm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{2},
+        gko::Array<i_type>::view(this->exec, 2, rdata));
+
+    perm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                        l({{4.0, 2.5},
+                           {2.0, 3.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Dense with column_permute: expects the two columns of x exchanged.
+TYPED_TEST(Permutation, AppliesColPermutationToDense)
+{
+    using i_type = typename TestFixture::i_type;
+    using T = typename TestFixture::v_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0},
+         I<T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+    i_type rdata[] = {1, 0};  // used as column indices despite the name
+
+    auto perm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{2},
+        gko::Array<i_type>::view(this->exec, 2, rdata),
+        gko::matrix::column_permute);
+
+    perm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                        l({{3.0, 2.0},
+                           {2.5, 4.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Dense: a row permutation followed by a separate column permutation.
+TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense)
+{
+    using i_type = typename TestFixture::i_type;
+    using T = typename TestFixture::v_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0},
+         I<T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y1 = Vec::create(this->exec, gko::dim<2>{2});
+    auto y2 = Vec::create(this->exec, gko::dim<2>{2});
+    i_type cdata[] = {1, 0};
+    i_type rdata[] = {1, 0};
+
+    auto rperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{2},
+        gko::Array<i_type>::view(this->exec, 2, rdata));
+    auto cperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{2},
+        gko::Array<i_type>::view(this->exec, 2, cdata),
+        gko::matrix::column_permute);
+
+    rperm->apply(x.get(), y1.get());
+    cperm->apply(y1.get(), y2.get());  // consumes the row-permuted y1
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y2.get(),
+                        l({{2.5, 4.0},
+                           {3.0, 2.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Dense: one Permutation with row_permute | column_permute set together.
+TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray)
+{
+    using i_type = typename TestFixture::i_type;
+    using T = typename TestFixture::v_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0},
+         I<T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y1 = Vec::create(this->exec, gko::dim<2>{2});
+    i_type data[] = {1, 0};  // one index array for rows and columns
+
+    auto perm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{2},
+        gko::Array<i_type>::view(this->exec, 2, data),
+        gko::matrix::row_permute | gko::matrix::column_permute);
+
+    perm->apply(x.get(), y1.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y1.get(),
+                        l({{2.5, 4.0},
+                           {3.0, 2.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Dense 3x3: inverse row permutation followed by inverse column permutation.
+TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense)
+{
+    using i_type = typename TestFixture::i_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
+                                  {0.0, 1.0, 0.0},
+                                  {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y1 = Vec::create(this->exec, gko::dim<2>{3});
+    auto y2 = Vec::create(this->exec, gko::dim<2>{3});
+    i_type cdata[] = {1, 2, 0};
+    i_type rdata[] = {1, 2, 0};
+
+    auto rperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, rdata),
+        gko::matrix::row_permute | gko::matrix::inverse_permute);
+    auto cperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, cdata),
+        gko::matrix::inverse_permute | gko::matrix::column_permute);
+
+    rperm->apply(x.get(), y1.get());
+    cperm->apply(y1.get(), y2.get());  // consumes the row-permuted y1
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y2.get(),
+                        l({{2.5, 0.0, 4.0},
+                           {0.0, 2.0, 3.0},
+                           {0.0, 0.0, 1.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Dense 3x3: one Permutation with the row, column and inverse flags all set.
+TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray)
+{
+    using i_type = typename TestFixture::i_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                 this->exec);
+    // clang-format on
+    auto y1 = Vec::create(this->exec, gko::dim<2>{3});
+    i_type data[] = {1, 2, 0};  // one index array for rows and columns
+
+    auto perm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, data),
+        gko::matrix::column_permute | gko::matrix::row_permute |
+            gko::matrix::inverse_permute);
+
+    perm->apply(x.get(), y1.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y1.get(),
+                        l({{2.5, 0.0, 4.0},
+                           {0.0, 2.0, 3.0},
+                           {0.0, 0.0, 1.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Dense 3x3, inverse row permutation: expects row i of x at row rdata[i] of y.
+TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense)
+{
+    using i_type = typename TestFixture::i_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
+                                 {0.0, 1.0, 0.0},
+                                 {0.0, 4.0, 2.5}},
+                                this->exec);
+    // clang-format on
+    auto y = Vec::create(this->exec, gko::dim<2>{3});
+    i_type rdata[] = {1, 2, 0};
+
+    auto rperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, rdata),
+        gko::matrix::row_permute | gko::matrix::inverse_permute);
+
+    rperm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                        l({{0.0, 4.0, 2.5},
+                           {2.0, 3.0, 0.0},
+                           {0.0, 1.0, 0.0}}),
+                          0.0);
+    // clang-format on
+}
+
+// Dense 3x3, inverse column permutation: column j of x moves to cdata[j].
+TYPED_TEST(Permutation, AppliesInverseColPermutationToDense)
+{
+    using i_type = typename TestFixture::i_type;
+    using Vec = typename TestFixture::Vec;
+    // clang-format off
+    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y = Vec::create(this->exec, gko::dim<2>{3});
+    i_type cdata[] = {1, 2, 0};
+
+    auto cperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, cdata),
+        gko::matrix::inverse_permute | gko::matrix::column_permute);
+
+    cperm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                      l({{0.0, 2.0, 3.0},
+                         {0.0, 0.0, 1.0},
+                         {2.5, 0.0, 4.0}}),
+                      0.0);
+    // clang-format on
+}
+
+// Csr 3x3, no mask argument given: expects row i of y to be row rdata[i] of x.
+TYPED_TEST(Permutation, AppliesRowPermutationToCsr)
+{
+    using i_type = typename TestFixture::i_type;
+    using Csr = typename TestFixture::Csr;
+    // clang-format off
+    auto x = gko::initialize<Csr>(
+                                  {{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y = Csr::create(this->exec, gko::dim<2>{3});
+    i_type rdata[] = {1, 2, 0};
+
+    auto perm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, rdata));
+
+    perm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                        l({{0.0, 1.0, 0.0},
+                           {0.0, 4.0, 2.5},
+                           {2.0, 3.0, 0.0}}),
+                        0.0);
+    // clang-format on
+}
+
+// Csr 3x3, column_permute: expects column j of y to be column cdata[j] of x.
+TYPED_TEST(Permutation, AppliesColPermutationToCsr)
+{
+    using i_type = typename TestFixture::i_type;
+    using Csr = typename TestFixture::Csr;
+    // clang-format off
+    auto x = gko::initialize<Csr>(
+                                  {{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y = Csr::create(this->exec, gko::dim<2>{3});
+    i_type cdata[] = {1, 2, 0};
+
+    auto perm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, cdata),
+        gko::matrix::column_permute);
+
+    perm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                      l({{3.0, 0.0, 2.0},
+                         {1.0, 0.0, 0.0},
+                         {4.0, 2.5, 0.0}}),
+                      0.0);
+    // clang-format on
+}
+
+// Csr 3x3: a row permutation followed by a separate column permutation.
+TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr)
+{
+    using i_type = typename TestFixture::i_type;
+    using Csr = typename TestFixture::Csr;
+    // clang-format off
+    auto x = gko::initialize<Csr>(
+                                  {{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y1 = Csr::create(this->exec, gko::dim<2>{3});
+    auto y2 = Csr::create(this->exec, gko::dim<2>{3});
+    i_type cdata[] = {1, 2, 0};
+    i_type rdata[] = {1, 2, 0};
+
+    auto rperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, rdata));
+    auto cperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, cdata),
+        gko::matrix::column_permute);
+
+    rperm->apply(x.get(), y1.get());
+    cperm->apply(y1.get(), y2.get());  // consumes the row-permuted y1
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y2.get(),
+                      l({{1.0, 0.0, 0.0},
+                         {4.0, 2.5, 0.0},
+                         {3.0, 0.0, 2.0}}),
+                      0.0);
+    // clang-format on
+}
+
+// Csr 3x3, inverse row permutation: expects row i of x at row rdata[i] of y.
+TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr)
+{
+    using i_type = typename TestFixture::i_type;
+    using Csr = typename TestFixture::Csr;
+    // clang-format off
+    auto x = gko::initialize<Csr>({{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y = Csr::create(this->exec, gko::dim<2>{3});
+    i_type rdata[] = {1, 2, 0};
+
+    auto rperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, rdata),
+        gko::matrix::row_permute | gko::matrix::inverse_permute);
+
+    rperm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                        l({{0.0, 4.0, 2.5},
+                           {2.0, 3.0, 0.0},
+                           {0.0, 1.0, 0.0}}),
+                          0.0);
+    // clang-format on
+}
+
+// Csr 3x3, inverse column permutation: column j of x moves to cdata[j].
+TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr)
+{
+    using i_type = typename TestFixture::i_type;
+    using Csr = typename TestFixture::Csr;
+    // clang-format off
+    auto x = gko::initialize<Csr>({{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y = Csr::create(this->exec, gko::dim<2>{3});
+    i_type cdata[] = {1, 2, 0};
+
+    auto cperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, cdata),
+        gko::matrix::inverse_permute | gko::matrix::column_permute);
+
+    cperm->apply(x.get(), y.get());
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y.get(),
+                      l({{0.0, 2.0, 3.0},
+                         {0.0, 0.0, 1.0},
+                         {2.5, 0.0, 4.0}}),
+                      0.0);
+    // clang-format on
+}
+
+// Csr 3x3: inverse row permutation followed by inverse column permutation.
+TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr)
+{
+    using i_type = typename TestFixture::i_type;
+    using Csr = typename TestFixture::Csr;
+    // clang-format off
+    auto x = gko::initialize<Csr>({{2.0, 3.0, 0.0},
+                                   {0.0, 1.0, 0.0},
+                                   {0.0, 4.0, 2.5}},
+                                  this->exec);
+    // clang-format on
+    auto y1 = Csr::create(this->exec, gko::dim<2>{3});
+    auto y2 = Csr::create(this->exec, gko::dim<2>{3});
+    i_type cdata[] = {1, 2, 0};
+    i_type rdata[] = {1, 2, 0};
+
+    auto rperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, rdata),
+        gko::matrix::row_permute | gko::matrix::inverse_permute);
+    auto cperm = gko::matrix::Permutation<i_type>::create(
+        this->exec, gko::dim<2>{3},
+        gko::Array<i_type>::view(this->exec, 3, cdata),
+        gko::matrix::inverse_permute | gko::matrix::column_permute);
+
+    rperm->apply(x.get(), y1.get());
+    cperm->apply(y1.get(), y2.get());  // consumes the row-permuted y1
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y2.get(),
+                        l({{2.5, 0.0, 4.0},
+                           {0.0, 2.0, 3.0},
+                           {0.0, 0.0, 1.0}}),
+                        0.0);
+    // clang-format on
+}
+
+
+}  // namespace
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 102e218dbb4..ba28e1d127c 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,7 +36,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -45,15 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/matrix/sellp_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class Sellp : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Sellp<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Sellp<value_type, index_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Sellp()
         : exec(gko::ReferenceExecutor::create()),
@@ -74,29 +80,34 @@ class Sellp : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
 };
 
+TYPED_TEST_CASE(Sellp, gko::test::ValueIndexTypes);
 
-TEST_F(Sellp, AppliesToDenseVector)
+
+TYPED_TEST(Sellp, AppliesToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx1->apply(x.get(), y.get());
+    this->mtx1->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Sellp, AppliesToDenseMatrix)
+TYPED_TEST(Sellp, AppliesToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    mtx1->apply(x.get(), y.get());
+    this->mtx1->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
@@ -106,34 +117,37 @@ TEST_F(Sellp, AppliesToDenseMatrix)
 }
 
 
-TEST_F(Sellp, AppliesLinearCombinationToDenseVector)
+TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Sellp, AppliesLinearCombinationToDenseMatrix)
+TYPED_TEST(Sellp, AppliesLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
@@ -143,38 +157,84 @@ TEST_F(Sellp, AppliesLinearCombinationToDenseMatrix)
 }
 
 
-TEST_F(Sellp, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(Sellp, ApplyFailsOnWrongInnerDimension)
+{
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Sellp, ApplyFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Sellp, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(Sellp, ApplyFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3}, 2);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Sellp, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(Sellp, ConvertsToPrecision)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3}, 2);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Sellp = typename TestFixture::Mtx;
+    using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
+    auto tmp = OtherSellp::create(this->exec);
+    auto res = Sellp::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+
+    this->mtx1->convert_to(tmp.get());
+    tmp->convert_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
+}
+
 
-    ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch);
+TYPED_TEST(Sellp, MovesToPrecision)
+{
+    using ValueType = typename TestFixture::value_type;
+    using IndexType = typename TestFixture::index_type;
+    using OtherType = typename gko::next_precision<ValueType>;
+    using Sellp = typename TestFixture::Mtx;
+    using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
+    auto tmp = OtherSellp::create(this->exec);
+    auto res = Sellp::create(this->exec);
+    // If OtherType is more precise: 0, otherwise r
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+
+    this->mtx1->move_to(tmp.get());
+    tmp->move_to(res.get());
+
+    GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual);
 }
 
 
-TEST_F(Sellp, ConvertsToDense)
+TYPED_TEST(Sellp, ConvertsToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx1->get_executor());
 
-    mtx1->convert_to(dense_mtx.get());
+    this->mtx1->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -184,11 +244,12 @@ TEST_F(Sellp, ConvertsToDense)
 }
 
 
-TEST_F(Sellp, MovesToDense)
+TYPED_TEST(Sellp, MovesToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx1->get_executor());
 
-    mtx1->move_to(dense_mtx.get());
+    this->mtx1->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -198,17 +259,16 @@ TEST_F(Sellp, MovesToDense)
 }
 
 
-TEST_F(Sellp, ConvertsToCsr)
+TYPED_TEST(Sellp, ConvertsToCsr)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge);
 
-    mtx1->convert_to(csr_mtx_c.get());
-    mtx1->convert_to(csr_mtx_m.get());
+    this->mtx1->convert_to(csr_mtx_c.get());
+    this->mtx1->convert_to(csr_mtx_m.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(csr_mtx_c,
@@ -216,22 +276,21 @@ TEST_F(Sellp, ConvertsToCsr)
                            {0.0, 5.0, 0.0}}), 0.0);
     // clang-format on
     GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TEST_F(Sellp, MovesToCsr)
+TYPED_TEST(Sellp, MovesToCsr)
 {
-    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
-    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
-    auto csr_mtx_c =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
-    auto csr_mtx_m =
-        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
-    auto mtx_clone = mtx1->clone();
-
-    mtx1->move_to(csr_mtx_c.get());
+    using Csr = typename TestFixture::Csr;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx1->clone();
+
+    this->mtx1->move_to(csr_mtx_c.get());
     mtx_clone->move_to(csr_mtx_m.get());
 
     // clang-format off
@@ -240,33 +299,138 @@ TEST_F(Sellp, MovesToCsr)
                            {0.0, 5.0, 0.0}}), 0.0);
     // clang-format on
     GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
-    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
-    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+}
+
+
+TYPED_TEST(Sellp, ConvertsEmptyToPrecision)
+{
+    // Source is an empty Sellp in the next precision; target uses Mtx.
+    using Target = typename TestFixture::Mtx;
+    using Source = gko::matrix::Sellp<
+        gko::next_precision<typename TestFixture::value_type>,
+        typename TestFixture::index_type>;
+    auto source = Source::create(this->exec);
+    source->get_slice_sets()[0] = 0;
+    auto target = Target::create(this->exec);
+
+    source->convert_to(target.get());
+
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_slice_sets(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Sellp, MovesEmptyToPrecision)
+{
+    // Source is an empty Sellp in the next precision; target uses Mtx.
+    using Target = typename TestFixture::Mtx;
+    using Source = gko::matrix::Sellp<
+        gko::next_precision<typename TestFixture::value_type>,
+        typename TestFixture::index_type>;
+    auto source = Source::create(this->exec);
+    source->get_slice_sets()[0] = 0;
+    auto target = Target::create(this->exec);
+
+    source->move_to(target.get());
+
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_slice_sets(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Sellp, ConvertsEmptyToDense)
+{
+    // Converting a Sellp created without a size must give an empty Dense.
+    using ValueType = typename TestFixture::value_type;
+    using Sellp = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto empty = Sellp::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Sellp, MovesEmptyToDense)
+{
+    // Moving a Sellp created without a size must give an empty Dense.
+    using ValueType = typename TestFixture::value_type;
+    using Sellp = typename TestFixture::Mtx;
+    using Dense = gko::matrix::Dense<ValueType>;
+    auto empty = Sellp::create(this->exec);
+    auto res = Dense::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(Sellp, ConvertsEmptyToCsr)
+{
+    // Converting a Sellp created without a size must give an empty Csr.
+    using Source = typename TestFixture::Mtx;
+    using Target = gko::matrix::Csr<typename TestFixture::value_type,
+                                    typename TestFixture::index_type>;
+    auto source = Source::create(this->exec);
+    auto target = Target::create(this->exec);
+
+    source->convert_to(target.get());
+
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(target->get_size());
+}
+
+
+TYPED_TEST(Sellp, MovesEmptyToCsr)
+{
+    // Moving a Sellp created without a size must give an empty Csr.
+    using Source = typename TestFixture::Mtx;
+    using Target = gko::matrix::Csr<typename TestFixture::value_type,
+                                    typename TestFixture::index_type>;
+    auto source = Source::create(this->exec);
+    auto target = Target::create(this->exec);
+
+    source->move_to(target.get());
+
+    ASSERT_EQ(target->get_num_stored_elements(), 0);
+    ASSERT_EQ(*target->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(target->get_size());
 }
 
 
-TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseVector)
+TYPED_TEST(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx2->apply(x.get(), y.get());
+    this->mtx2->apply(x.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
 }
 
 
-TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseMatrix)
+TYPED_TEST(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseMatrix)
 {
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    mtx2->apply(x.get(), y.get());
+    this->mtx2->apply(x.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
@@ -276,34 +440,39 @@ TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseMatrix)
 }
 
 
-TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseVector)
+TYPED_TEST(Sellp,
+           AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
 }
 
 
-TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseMatrix)
+TYPED_TEST(Sellp,
+           AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
     // clang-format off
     auto x = gko::initialize<Vec>(
-        {{2.0, 3.0},
-         {1.0, -1.5},
-         {4.0, 2.5}}, exec);
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
     auto y = gko::initialize<Vec>(
-        {{1.0, 0.5},
-         {2.0, -1.5}}, exec);
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
     // clang-format on
 
-    mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
@@ -313,43 +482,47 @@ TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseMatrix)
 }
 
 
-TEST_F(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongInnerDimension)
+TYPED_TEST(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfRows)
+TYPED_TEST(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfCols)
+TYPED_TEST(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3}, 2);
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3}, 2);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToDense)
+TYPED_TEST(Sellp, ConvertsWithSliceSizeAndStrideFactorToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx2->get_executor());
     // clang-format off
-    auto dense_other = gko::initialize<gko::matrix::Dense<>>(
+    auto dense_other = gko::initialize<Vec>(
         4, {{1.0, 3.0, 2.0},
-            {0.0, 5.0, 0.0}}, exec);
+            {0.0, 5.0, 0.0}}, this->exec);
     // clang-format on
 
-    mtx2->convert_to(dense_mtx.get());
+    this->mtx2->convert_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -359,11 +532,12 @@ TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToDense)
 }
 
 
-TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToDense)
+TYPED_TEST(Sellp, MovesWithSliceSizeAndStrideFactorToDense)
 {
-    auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor());
+    using Vec = typename TestFixture::Vec;
+    auto dense_mtx = Vec::create(this->mtx2->get_executor());
 
-    mtx2->move_to(dense_mtx.get());
+    this->mtx2->move_to(dense_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense_mtx,
@@ -373,11 +547,12 @@ TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToDense)
 }
 
 
-TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToCsr)
+TYPED_TEST(Sellp, ConvertsWithSliceSizeAndStrideFactorToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx2->get_executor());
+    using Csr = typename TestFixture::Csr;
+    auto csr_mtx = Csr::create(this->mtx2->get_executor());
 
-    mtx2->convert_to(csr_mtx.get());
+    this->mtx2->convert_to(csr_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(csr_mtx,
@@ -387,11 +562,12 @@ TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToCsr)
 }
 
 
-TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToCsr)
+TYPED_TEST(Sellp, MovesWithSliceSizeAndStrideFactorToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx2->get_executor());
+    using Csr = typename TestFixture::Csr;
+    auto csr_mtx = Csr::create(this->mtx2->get_executor());
 
-    mtx2->move_to(csr_mtx.get());
+    this->mtx2->move_to(csr_mtx.get());
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(csr_mtx,
@@ -401,11 +577,12 @@ TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToCsr)
 }
 
 
-TEST_F(Sellp, CountsNonzeros)
+TYPED_TEST(Sellp, CountsNonzeros)
 {
     gko::size_type nonzeros;
 
-    gko::kernels::reference::sellp::count_nonzeros(exec, mtx1.get(), &nonzeros);
+    gko::kernels::reference::sellp::count_nonzeros(this->exec, this->mtx1.get(),
+                                                   &nonzeros);
 
     ASSERT_EQ(nonzeros, 4);
 }
diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp
index f20e8ba3134..69c8f580ffd 100644
--- a/reference/test/matrix/sparsity_csr.cpp
+++ b/reference/test/matrix/sparsity_csr.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -43,16 +43,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class SparsityCsr : public ::testing::Test {
 protected:
-    using v_type = double;
-    using i_type = int;
+    using v_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using i_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using Mtx = gko::matrix::SparsityCsr<v_type, i_type>;
     using Csr = gko::matrix::Csr<v_type, i_type>;
     using DenseMtx = gko::matrix::Dense<v_type>;
@@ -61,8 +64,8 @@ class SparsityCsr : public ::testing::Test {
         : exec(gko::ReferenceExecutor::create()),
           mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4))
     {
-        Mtx::index_type *c = mtx->get_col_idxs();
-        Mtx::index_type *r = mtx->get_row_ptrs();
+        i_type *c = mtx->get_col_idxs();
+        i_type *r = mtx->get_row_ptrs();
         r[0] = 0;
         r[1] = 3;
         r[2] = 4;
@@ -76,28 +79,35 @@ class SparsityCsr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
+TYPED_TEST_CASE(SparsityCsr, gko::test::ValueIndexTypes);
 
-TEST_F(SparsityCsr, CanBeCreatedFromExistingCsrMatrix)
+
+TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingCsrMatrix)
 {
+    using Csr = typename TestFixture::Csr;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using Mtx = typename TestFixture::Mtx;
     auto csr_mtx = gko::initialize<Csr>(
-        {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, exec);
+        {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, this->exec);
     auto comp_mtx = gko::initialize<DenseMtx>(
-        {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, exec);
+        {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, this->exec);
 
-    auto mtx = Mtx::create(exec, std::move(csr_mtx));
+    auto mtx = Mtx::create(this->exec, std::move(csr_mtx));
 
     GKO_ASSERT_MTX_NEAR(comp_mtx.get(), mtx.get(), 0.0);
 }
 
 
-TEST_F(SparsityCsr, CanBeCreatedFromExistingDenseMatrix)
+TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingDenseMatrix)
 {
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using Mtx = typename TestFixture::Mtx;
     auto dense_mtx = gko::initialize<DenseMtx>(
-        {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, exec);
+        {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, this->exec);
     auto comp_mtx = gko::initialize<DenseMtx>(
-        {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, exec);
+        {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, this->exec);
 
-    auto mtx = Mtx::create(exec, std::move(dense_mtx));
+    auto mtx = Mtx::create(this->exec, std::move(dense_mtx));
 
     GKO_ASSERT_MTX_NEAR(comp_mtx.get(), mtx.get(), 0.0);
 }
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index e63bca5453e..7c40ceb41bc 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/sparsity_csr_kernels.hpp"
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
 #include <algorithm>
@@ -43,19 +43,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/matrix/sparsity_csr_kernels.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class SparsityCsr : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::SparsityCsr<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Mtx = gko::matrix::SparsityCsr<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     SparsityCsr()
         : exec(gko::ReferenceExecutor::create()),
@@ -71,8 +77,8 @@ class SparsityCsr : public ::testing::Test {
 
     void create_mtx(Mtx *m)
     {
-        Mtx::index_type *c = m->get_col_idxs();
-        Mtx::index_type *r = m->get_row_ptrs();
+        index_type *c = m->get_col_idxs();
+        index_type *r = m->get_row_ptrs();
         /*
          * 1   1   1
          * 0   1   0
@@ -88,8 +94,8 @@ class SparsityCsr : public ::testing::Test {
 
     void create_mtx2(Mtx *m)
     {
-        Mtx::index_type *c = m->get_col_idxs();
-        Mtx::index_type *r = m->get_row_ptrs();
+        index_type *c = m->get_col_idxs();
+        index_type *r = m->get_row_ptrs();
         // It keeps an explict zero
         /*
          *  1    1   1
@@ -151,102 +157,118 @@ class SparsityCsr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3_unsorted;
 };
 
+TYPED_TEST_CASE(SparsityCsr, gko::test::ValueIndexTypes);
+
 
-TEST_F(SparsityCsr, AppliesToDenseVector)
+TYPED_TEST(SparsityCsr, AppliesToDenseVector)
 {
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2, 1});
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
 
-    mtx->apply(x.get(), y.get());
+    this->mtx->apply(x.get(), y.get());
 
-    EXPECT_EQ(y->at(0), 7.0);
-    EXPECT_EQ(y->at(1), 1.0);
+    EXPECT_EQ(y->at(0), T{7.0});
+    EXPECT_EQ(y->at(1), T{1.0});
 }
 
 
-TEST_F(SparsityCsr, AppliesToDenseMatrix)
+TYPED_TEST(SparsityCsr, AppliesToDenseMatrix)
 {
-    auto x = gko::initialize<Vec>({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec);
-    auto y = Vec::create(exec, gko::dim<2>{2});
-
-    mtx->apply(x.get(), y.get());
-
-    EXPECT_EQ(y->at(0, 0), 7.0);
-    EXPECT_EQ(y->at(1, 0), 1.0);
-    EXPECT_EQ(y->at(0, 1), 4.0);
-    EXPECT_EQ(y->at(1, 1), -1.5);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0}, I<T>{1.0, -1.5}, I<T>{4.0, 2.5}}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+
+    this->mtx->apply(x.get(), y.get());
+
+    EXPECT_EQ(y->at(0, 0), T{7.0});
+    EXPECT_EQ(y->at(1, 0), T{1.0});
+    EXPECT_EQ(y->at(0, 1), T{4.0});
+    EXPECT_EQ(y->at(1, 1), T{-1.5});
 }
 
 
-TEST_F(SparsityCsr, AppliesLinearCombinationToDenseVector)
+TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, exec);
-    auto y = gko::initialize<Vec>({1.0, 2.0}, exec);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
 
-    mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
+    this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
 
-    EXPECT_EQ(y->at(0), -5.0);
-    EXPECT_EQ(y->at(1), 3.0);
+    EXPECT_EQ(y->at(0), T{-5.0});
+    EXPECT_EQ(y->at(1), T{3.0});
 }
 
 
-TEST_F(SparsityCsr, AppliesLinearCombinationToDenseMatrix)
+TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseMatrix)
 {
-    auto alpha = gko::initialize<Vec>({-1.0}, exec);
-    auto beta = gko::initialize<Vec>({2.0}, exec);
-    auto x = gko::initialize<Vec>({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec);
-    auto y = gko::initialize<Vec>({{1.0, 0.5}, {2.0, -1.5}}, exec);
-
-    mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
-
-    EXPECT_EQ(y->at(0, 0), -5.0);
-    EXPECT_EQ(y->at(1, 0), 3.0);
-    EXPECT_EQ(y->at(0, 1), -3.0);
-    EXPECT_EQ(y->at(1, 1), -1.5);
+    using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0}, I<T>{1.0, -1.5}, I<T>{4.0, 2.5}}, this->exec);
+    auto y =
+        gko::initialize<Vec>({I<T>{1.0, 0.5}, I<T>{2.0, -1.5}}, this->exec);
+
+    this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    EXPECT_EQ(y->at(0, 0), T{-5.0});
+    EXPECT_EQ(y->at(1, 0), T{3.0});
+    EXPECT_EQ(y->at(0, 1), T{-3.0});
+    EXPECT_EQ(y->at(1, 1), T{-1.5});
 }
 
 
-TEST_F(SparsityCsr, ApplyFailsOnWrongInnerDimension)
+TYPED_TEST(SparsityCsr, ApplyFailsOnWrongInnerDimension)
 {
-    auto x = Vec::create(exec, gko::dim<2>{2});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{2});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(SparsityCsr, ApplyFailsOnWrongNumberOfRows)
+TYPED_TEST(SparsityCsr, ApplyFailsOnWrongNumberOfRows)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3, 2});
-    auto y = Vec::create(exec, gko::dim<2>{3, 2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3, 2});
+    auto y = Vec::create(this->exec, gko::dim<2>{3, 2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(SparsityCsr, ApplyFailsOnWrongNumberOfCols)
+TYPED_TEST(SparsityCsr, ApplyFailsOnWrongNumberOfCols)
 {
-    auto x = Vec::create(exec, gko::dim<2>{3});
-    auto y = Vec::create(exec, gko::dim<2>{2});
+    using Vec = typename TestFixture::Vec;
+    auto x = Vec::create(this->exec, gko::dim<2>{3});
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
 
-    ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch);
 }
 
 
-TEST_F(SparsityCsr, SquareMtxIsTransposable)
+TYPED_TEST(SparsityCsr, SquareMtxIsTransposable)
 {
+    using Mtx = typename TestFixture::Mtx;
     // clang-format off
-   auto mtx2 = gko::initialize<gko::matrix::SparsityCsr<>>(
+   auto mtx2 = gko::initialize<Mtx>(
                {{1.0, 1.0, 1.0},
                 {0.0, 1.0, 0.0},
-                {0.0, 1.0, 1.0}}, exec);
+                {0.0, 1.0, 1.0}}, this->exec);
     // clang-format on
 
     auto trans = mtx2->transpose();
-    auto trans_as_sparsity =
-        static_cast<gko::matrix::SparsityCsr<> *>(trans.get());
+    auto trans_as_sparsity = static_cast<Mtx *>(trans.get());
 
     // clang-format off
    GKO_ASSERT_MTX_NEAR(trans_as_sparsity,
@@ -257,11 +279,11 @@ TEST_F(SparsityCsr, SquareMtxIsTransposable)
 }
 
 
-TEST_F(SparsityCsr, NonSquareMtxIsTransposable)
+TYPED_TEST(SparsityCsr, NonSquareMtxIsTransposable)
 {
-    auto trans = mtx->transpose();
-    auto trans_as_sparsity =
-        static_cast<gko::matrix::SparsityCsr<> *>(trans.get());
+    using Mtx = typename TestFixture::Mtx;
+    auto trans = this->mtx->transpose();
+    auto trans_as_sparsity = static_cast<Mtx *>(trans.get());
 
     // clang-format off
    GKO_ASSERT_MTX_NEAR(trans_as_sparsity,
@@ -272,90 +294,86 @@ TEST_F(SparsityCsr, NonSquareMtxIsTransposable)
 }
 
 
-TEST_F(SparsityCsr, CountsCorrectNumberOfDiagonalElements)
+TYPED_TEST(SparsityCsr, CountsCorrectNumberOfDiagonalElements)
 {
+    using Mtx = typename TestFixture::Mtx;
     // clang-format off
-    auto mtx2 = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                         {{1.0, 1.0, 1.0},
-                                                          {0.0, 1.0, 0.0},
-                                                          {0.0, 1.0, 1.0}}, exec);
-    auto mtx_s = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                          {{1.0, 1.0, 1.0},
-                                                           {0.0, 0.0, 0.0},
-                                                           {0.0, 1.0, 1.0}}, exec);
+    auto mtx2 = gko::initialize<Mtx>({{1.0, 1.0, 1.0},
+                                      {0.0, 1.0, 0.0},
+                                      {0.0, 1.0, 1.0}}, this->exec);
+    auto mtx_s = gko::initialize<Mtx>({{1.0, 1.0, 1.0},
+                                       {0.0, 0.0, 0.0},
+                                       {0.0, 1.0, 1.0}}, this->exec);
     // clang-format on
     gko::size_type m2_num_diags = 0;
     gko::size_type ms_num_diags = 0;
 
     gko::kernels::reference::sparsity_csr::count_num_diagonal_elements(
-        exec, mtx2.get(), &m2_num_diags);
+        this->exec, mtx2.get(), &m2_num_diags);
     gko::kernels::reference::sparsity_csr::count_num_diagonal_elements(
-        exec, mtx_s.get(), &ms_num_diags);
+        this->exec, mtx_s.get(), &ms_num_diags);
 
     ASSERT_EQ(m2_num_diags, 3);
     ASSERT_EQ(ms_num_diags, 2);
 }
 
 
-TEST_F(SparsityCsr, RemovesDiagonalElementsForFullRankMatrix)
+TYPED_TEST(SparsityCsr, RemovesDiagonalElementsForFullRankMatrix)
 {
+    using Mtx = typename TestFixture::Mtx;
     // clang-format off
-    auto mtx2 = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                         {{1.0, 1.0, 1.0},
-                                                          {0.0, 1.0, 0.0},
-                                                          {0.0, 1.0, 1.0}}, exec);
-    auto mtx_s = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                          {{0.0, 1.0, 1.0},
-                                                           {0.0, 0.0, 0.0},
-                                                           {0.0, 1.0, 0.0}}, exec);
+    auto mtx2 = gko::initialize<Mtx>({{1.0, 1.0, 1.0},
+                                      {0.0, 1.0, 0.0},
+                                      {0.0, 1.0, 1.0}}, this->exec);
+    auto mtx_s = gko::initialize<Mtx>({{0.0, 1.0, 1.0},
+                                       {0.0, 0.0, 0.0},
+                                       {0.0, 1.0, 0.0}}, this->exec);
     // clang-format on
-    auto tmp_mtx = gko::matrix::SparsityCsr<>::create(
-        exec, mtx_s->get_size(), mtx_s->get_num_nonzeros());
+    auto tmp_mtx =
+        Mtx::create(this->exec, mtx_s->get_size(), mtx_s->get_num_nonzeros());
     tmp_mtx->copy_from(mtx2.get());
 
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
-        exec, tmp_mtx.get(), mtx2->get_const_row_ptrs(),
-        mtx2->get_const_col_idxs());
+        this->exec, mtx2->get_const_row_ptrs(), mtx2->get_const_col_idxs(),
+        tmp_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(tmp_mtx.get(), mtx_s.get(), 0.0);
 }
 
 
-TEST_F(SparsityCsr, RemovesDiagonalElementsForIncompleteRankMatrix)
+TYPED_TEST(SparsityCsr, RemovesDiagonalElementsForIncompleteRankMatrix)
 {
+    using Mtx = typename TestFixture::Mtx;
     // clang-format off
-    auto mtx2 = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                         {{1.0, 1.0, 1.0},
-                                                          {0.0, 0.0, 0.0},
-                                                          {0.0, 1.0, 1.0}}, exec);
-    auto mtx_s = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                          {{0.0, 1.0, 1.0},
-                                                           {0.0, 0.0, 0.0},
-                                                           {0.0, 1.0, 0.0}}, exec);
+    auto mtx2 = gko::initialize<Mtx>({{1.0, 1.0, 1.0},
+                                      {0.0, 0.0, 0.0},
+                                      {0.0, 1.0, 1.0}}, this->exec);
+    auto mtx_s = gko::initialize<Mtx>({{0.0, 1.0, 1.0},
+                                       {0.0, 0.0, 0.0},
+                                       {0.0, 1.0, 0.0}}, this->exec);
     // clang-format on
-    auto tmp_mtx = gko::matrix::SparsityCsr<>::create(
-        exec, mtx_s->get_size(), mtx_s->get_num_nonzeros());
+    auto tmp_mtx =
+        Mtx::create(this->exec, mtx_s->get_size(), mtx_s->get_num_nonzeros());
     tmp_mtx->copy_from(mtx2.get());
 
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
-        exec, tmp_mtx.get(), mtx2->get_const_row_ptrs(),
-        mtx2->get_const_col_idxs());
+        this->exec, mtx2->get_const_row_ptrs(), mtx2->get_const_col_idxs(),
+        tmp_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(tmp_mtx.get(), mtx_s.get(), 0.0);
 }
 
 
-TEST_F(SparsityCsr, SquareMtxIsConvertibleToAdjacencyMatrix)
+TYPED_TEST(SparsityCsr, SquareMtxIsConvertibleToAdjacencyMatrix)
 {
+    using Mtx = typename TestFixture::Mtx;
     // clang-format off
-    auto mtx2 = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                         {{1.0, 1.0, 1.0},
-                                                          {0.0, 1.0, 0.0},
-                                                          {0.0, 1.0, 1.0}}, exec);
-    auto mtx_s = gko::initialize<gko::matrix::SparsityCsr<>>(
-                                                          {{0.0, 1.0, 1.0},
-                                                           {0.0, 0.0, 0.0},
-                                                           {0.0, 1.0, 0.0}}, exec);
+    auto mtx2 = gko::initialize<Mtx>({{1.0, 1.0, 1.0},
+                                      {0.0, 1.0, 0.0},
+                                      {0.0, 1.0, 1.0}}, this->exec);
+    auto mtx_s = gko::initialize<Mtx>({{0.0, 1.0, 1.0},
+                                       {0.0, 0.0, 0.0},
+                                       {0.0, 1.0, 0.0}}, this->exec);
     // clang-format on
 
     auto adj_mat = mtx2->to_adjacency_matrix();
@@ -364,43 +382,43 @@ TEST_F(SparsityCsr, SquareMtxIsConvertibleToAdjacencyMatrix)
 }
 
 
-TEST_F(SparsityCsr, NonSquareMtxIsNotConvertibleToAdjacencyMatrix)
+TYPED_TEST(SparsityCsr, NonSquareMtxIsNotConvertibleToAdjacencyMatrix)
 {
-    ASSERT_THROW(mtx->to_adjacency_matrix(), gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx->to_adjacency_matrix(), gko::DimensionMismatch);
 }
 
 
-TEST_F(SparsityCsr, RecognizeSortedMatrix)
+TYPED_TEST(SparsityCsr, RecognizeSortedMatrix)
 {
-    ASSERT_TRUE(mtx->is_sorted_by_column_index());
-    ASSERT_TRUE(mtx2->is_sorted_by_column_index());
-    ASSERT_TRUE(mtx3_sorted->is_sorted_by_column_index());
+    ASSERT_TRUE(this->mtx->is_sorted_by_column_index());
+    ASSERT_TRUE(this->mtx2->is_sorted_by_column_index());
+    ASSERT_TRUE(this->mtx3_sorted->is_sorted_by_column_index());
 }
 
 
-TEST_F(SparsityCsr, RecognizeUnsortedMatrix)
+TYPED_TEST(SparsityCsr, RecognizeUnsortedMatrix)
 {
-    ASSERT_FALSE(mtx3_unsorted->is_sorted_by_column_index());
+    ASSERT_FALSE(this->mtx3_unsorted->is_sorted_by_column_index());
 }
 
 
-TEST_F(SparsityCsr, SortSortedMatrix)
+TYPED_TEST(SparsityCsr, SortSortedMatrix)
 {
-    auto matrix = mtx3_sorted->clone();
+    auto matrix = this->mtx3_sorted->clone();
 
     matrix->sort_by_column_index();
 
-    GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0);
+    GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0);
 }
 
 
-TEST_F(SparsityCsr, SortUnsortedMatrix)
+TYPED_TEST(SparsityCsr, SortUnsortedMatrix)
 {
-    auto matrix = mtx3_unsorted->clone();
+    auto matrix = this->mtx3_unsorted->clone();
 
     matrix->sort_by_column_index();
 
-    GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0);
+    GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0);
 }
 
 
diff --git a/reference/test/preconditioner/CMakeLists.txt b/reference/test/preconditioner/CMakeLists.txt
index 89232e6e55b..908ac83533e 100644
--- a/reference/test/preconditioner/CMakeLists.txt
+++ b/reference/test/preconditioner/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_test(ilu)
+ginkgo_create_test(isai_kernels)
 ginkgo_create_test(jacobi)
 ginkgo_create_test(jacobi_kernels)
diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp
index 2bea8d75c9c..b44791098c1 100644
--- a/reference/test/preconditioner/ilu.cpp
+++ b/reference/test/preconditioner/ilu.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -47,20 +47,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Ilu : public ::testing::Test {
 protected:
-    using value_type = gko::default_precision;
-    using index_type = gko::int32;
+    using value_type = T;
     using Mtx = gko::matrix::Dense<value_type>;
     using l_solver_type = gko::solver::Bicgstab<value_type>;
     using u_solver_type = gko::solver::Bicgstab<value_type>;
@@ -69,16 +69,17 @@ class Ilu : public ::testing::Test {
         gko::preconditioner::Ilu<l_solver_type, u_solver_type, false>;
     using ilu_rev_prec_type =
         gko::preconditioner::Ilu<l_solver_type, u_solver_type, true>;
-    using composition = gko::Composition<value_type>;
+    using Composition = gko::Composition<value_type>;
 
     Ilu()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::initialize<Mtx>({{2, 1, 1}, {2, 5, 2}, {2, 5, 5}}, exec)),
-          l_factor(
-              gko::initialize<Mtx>({{1, 0, 0}, {1, 1, 0}, {1, 1, 1}}, exec)),
-          u_factor(
-              gko::initialize<Mtx>({{2, 1, 1}, {0, 4, 1}, {0, 0, 3}}, exec)),
-          l_u_composition(composition::create(l_factor, u_factor)),
+          mtx(gko::initialize<Mtx>({{2., 1., 1.}, {2., 5., 2.}, {2., 5., 5.}},
+                                   exec)),
+          l_factor(gko::initialize<Mtx>(
+              {{1., 0., 0.}, {1., 1., 0.}, {1., 1., 1.}}, exec)),
+          u_factor(gko::initialize<Mtx>(
+              {{2., 1., 1.}, {0., 4., 1.}, {0., 0., 3.}}, exec)),
+          l_u_composition(Composition::create(l_factor, u_factor)),
           l_factory(
               l_solver_type::build()
                   .with_criteria(
@@ -87,8 +88,8 @@ class Ilu : public ::testing::Test {
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<T>::value)
                           .on(exec))
                   .on(exec)),
           u_factory(
@@ -99,8 +100,8 @@ class Ilu : public ::testing::Test {
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<T>::value)
                           .on(exec))
                   .on(exec)),
           ilu_pre_factory(ilu_prec_type::build()
@@ -117,79 +118,92 @@ class Ilu : public ::testing::Test {
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> l_factor;
     std::shared_ptr<Mtx> u_factor;
-    std::shared_ptr<composition> l_u_composition;
-    std::shared_ptr<l_solver_type::Factory> l_factory;
-    std::shared_ptr<u_solver_type::Factory> u_factory;
-    std::shared_ptr<ilu_prec_type::Factory> ilu_pre_factory;
-    std::shared_ptr<ilu_rev_prec_type::Factory> ilu_rev_pre_factory;
+    std::shared_ptr<Composition> l_u_composition;
+    std::shared_ptr<typename l_solver_type::Factory> l_factory;
+    std::shared_ptr<typename u_solver_type::Factory> u_factory;
+    std::shared_ptr<typename ilu_prec_type::Factory> ilu_pre_factory;
+    std::shared_ptr<typename ilu_rev_prec_type::Factory> ilu_rev_pre_factory;
 };
 
+TYPED_TEST_CASE(Ilu, gko::test::ValueTypes);
 
-TEST_F(Ilu, BuildsDefaultWithoutThrowing)
+
+TYPED_TEST(Ilu, BuildsDefaultWithoutThrowing)
 {
-    auto ilu_pre_default_factory = ilu_prec_type::build().on(exec);
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    auto ilu_pre_default_factory = ilu_prec_type::build().on(this->exec);
 
-    ASSERT_NO_THROW(ilu_pre_default_factory->generate(l_u_composition));
+    ASSERT_NO_THROW(ilu_pre_default_factory->generate(this->l_u_composition));
 }
 
 
-TEST_F(Ilu, BuildsCustomWithoutThrowing)
+TYPED_TEST(Ilu, BuildsCustomWithoutThrowing)
 {
-    ASSERT_NO_THROW(ilu_pre_factory->generate(l_u_composition));
+    ASSERT_NO_THROW(this->ilu_pre_factory->generate(this->l_u_composition));
 }
 
 
-TEST_F(Ilu, BuildsCustomWithoutThrowing2)
+TYPED_TEST(Ilu, BuildsCustomWithoutThrowing2)
 {
-    ASSERT_NO_THROW(ilu_pre_factory->generate(mtx));
+    ASSERT_NO_THROW(this->ilu_pre_factory->generate(this->mtx));
 }
 
 
-TEST_F(Ilu, ThrowOnWrongCompositionInput)
+TYPED_TEST(Ilu, ThrowOnWrongCompositionInput)
 {
-    std::shared_ptr<composition> composition = composition::create(l_factor);
+    using Composition = typename TestFixture::Composition;
+    std::shared_ptr<Composition> composition =
+        Composition::create(this->l_factor);
 
-    ASSERT_THROW(ilu_pre_factory->generate(composition), gko::NotSupported);
+    ASSERT_THROW(this->ilu_pre_factory->generate(composition),
+                 gko::NotSupported);
 }
 
 
-TEST_F(Ilu, ThrowOnWrongCompositionInput2)
+TYPED_TEST(Ilu, ThrowOnWrongCompositionInput2)
 {
-    std::shared_ptr<composition> composition =
-        composition::create(l_factor, u_factor, l_factor);
+    using Composition = typename TestFixture::Composition;
+    std::shared_ptr<Composition> composition =
+        Composition::create(this->l_factor, this->u_factor, this->l_factor);
 
-    ASSERT_THROW(ilu_pre_factory->generate(composition), gko::NotSupported);
+    ASSERT_THROW(this->ilu_pre_factory->generate(composition),
+                 gko::NotSupported);
 }
 
 
-TEST_F(Ilu, SetsCorrectMatrices)
+TYPED_TEST(Ilu, SetsCorrectMatrices)
 {
-    auto ilu = ilu_pre_factory->generate(l_u_composition);
+    using Mtx = typename TestFixture::Mtx;
+    auto ilu = this->ilu_pre_factory->generate(this->l_u_composition);
     auto internal_l_factor = ilu->get_l_solver()->get_system_matrix();
     auto internal_u_factor = ilu->get_u_solver()->get_system_matrix();
 
     // These convert steps are required since `get_system_matrix` usually
     // just returns `LinOp`, which `GKO_ASSERT_MTX_NEAR` can not use properly
-    std::unique_ptr<Mtx> converted_l_factor{Mtx::create(exec)};
-    std::unique_ptr<Mtx> converted_u_factor{Mtx::create(exec)};
+    std::unique_ptr<Mtx> converted_l_factor{Mtx::create(this->exec)};
+    std::unique_ptr<Mtx> converted_u_factor{Mtx::create(this->exec)};
     gko::as<gko::ConvertibleTo<Mtx>>(internal_l_factor.get())
         ->convert_to(converted_l_factor.get());
     gko::as<gko::ConvertibleTo<Mtx>>(internal_u_factor.get())
         ->convert_to(converted_u_factor.get());
-    GKO_ASSERT_MTX_NEAR(converted_l_factor, l_factor, 0);
-    GKO_ASSERT_MTX_NEAR(converted_u_factor, u_factor, 0);
+    GKO_ASSERT_MTX_NEAR(converted_l_factor, this->l_factor, 0);
+    GKO_ASSERT_MTX_NEAR(converted_u_factor, this->u_factor, 0);
 }
 
 
-TEST_F(Ilu, CanBeCopied)
+TYPED_TEST(Ilu, CanBeCopied)
 {
-    auto ilu = ilu_pre_factory->generate(l_u_composition);
+    using Mtx = typename TestFixture::Mtx;
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    using Composition = typename TestFixture::Composition;
+    auto ilu = this->ilu_pre_factory->generate(this->l_u_composition);
     auto before_l_solver = ilu->get_l_solver();
     auto before_u_solver = ilu->get_u_solver();
     // The switch up of matrices is intentional, to make sure they are distinct!
-    auto u_l_composition = composition::create(u_factor, l_factor);
-    auto copied =
-        ilu_prec_type::build().on(exec)->generate(gko::share(u_l_composition));
+    auto u_l_composition = Composition::create(this->u_factor, this->l_factor);
+    auto copied = ilu_prec_type::build()
+                      .on(this->exec)
+                      ->generate(gko::share(u_l_composition));
 
     copied->copy_from(ilu.get());
 
@@ -198,15 +212,18 @@ TEST_F(Ilu, CanBeCopied)
 }
 
 
-TEST_F(Ilu, CanBeMoved)
+TYPED_TEST(Ilu, CanBeMoved)
 {
-    auto ilu = ilu_pre_factory->generate(l_u_composition);
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    using Composition = typename TestFixture::Composition;
+    auto ilu = this->ilu_pre_factory->generate(this->l_u_composition);
     auto before_l_solver = ilu->get_l_solver();
     auto before_u_solver = ilu->get_u_solver();
     // The switch up of matrices is intentional, to make sure they are distinct!
-    auto u_l_composition = composition::create(u_factor, l_factor);
-    auto moved =
-        ilu_prec_type::build().on(exec)->generate(gko::share(u_l_composition));
+    auto u_l_composition = Composition::create(this->u_factor, this->l_factor);
+    auto moved = ilu_prec_type::build()
+                     .on(this->exec)
+                     ->generate(gko::share(u_l_composition));
 
     moved->copy_from(std::move(ilu));
 
@@ -215,9 +232,9 @@ TEST_F(Ilu, CanBeMoved)
 }
 
 
-TEST_F(Ilu, CanBeCloned)
+TYPED_TEST(Ilu, CanBeCloned)
 {
-    auto ilu = ilu_pre_factory->generate(l_u_composition);
+    auto ilu = this->ilu_pre_factory->generate(this->l_u_composition);
     auto before_l_solver = ilu->get_l_solver();
     auto before_u_solver = ilu->get_u_solver();
 
@@ -228,28 +245,60 @@ TEST_F(Ilu, CanBeCloned)
 }
 
 
-TEST_F(Ilu, SolvesDefaultSingleRhs)
+TYPED_TEST(Ilu, CanBeTransposed)
 {
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
-    x->copy_from(b.get());
+    using Ilu = typename TestFixture::ilu_prec_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto ilu = this->ilu_pre_factory->generate(this->l_u_composition);
+    auto l_ref = gko::as<Mtx>(ilu->get_l_solver()->get_system_matrix());
+    auto u_ref = gko::as<Mtx>(ilu->get_u_solver()->get_system_matrix());
+
+    auto transp = gko::as<Ilu>(ilu->transpose());
+
+    auto l_transp = gko::as<Mtx>(
+        gko::as<Mtx>(transp->get_u_solver()->get_system_matrix())->transpose());
+    auto u_transp = gko::as<Mtx>(
+        gko::as<Mtx>(transp->get_l_solver()->get_system_matrix())->transpose());
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_transp);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_transp);
+    GKO_ASSERT_MTX_NEAR(l_ref, l_transp, 0);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_transp, 0);
+}
 
-    auto preconditioner =
-        default_ilu_prec_type::build().on(exec)->generate(mtx);
-    preconditioner->apply(b.get(), x.get());
 
-    // Since it uses TRS per default, the result should be accurate
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14);
+TYPED_TEST(Ilu, CanBeConjTransposed)
+{
+    using Ilu = typename TestFixture::ilu_prec_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto ilu = this->ilu_pre_factory->generate(this->l_u_composition);
+    auto l_ref = gko::as<Mtx>(ilu->get_l_solver()->get_system_matrix());
+    auto u_ref = gko::as<Mtx>(ilu->get_u_solver()->get_system_matrix());
+
+    auto transp = gko::as<Ilu>(ilu->conj_transpose());
+
+    auto l_transp =
+        gko::as<Mtx>(gko::as<Mtx>(transp->get_u_solver()->get_system_matrix())
+                         ->conj_transpose());
+    auto u_transp =
+        gko::as<Mtx>(gko::as<Mtx>(transp->get_l_solver()->get_system_matrix())
+                         ->conj_transpose());
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_transp);
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_transp);
+    GKO_ASSERT_MTX_NEAR(l_ref, l_transp, 0);
+    GKO_ASSERT_MTX_NEAR(u_ref, u_transp, 0);
 }
 
 
-TEST_F(Ilu, SolvesCustomTypeDefaultFactorySingleRhs)
+TYPED_TEST(Ilu, SolvesCustomTypeDefaultFactorySingleRhs)
 {
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
+    using ilu_prec_type = typename TestFixture::ilu_prec_type;
+    using Mtx = typename TestFixture::Mtx;
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b.get());
 
-    auto preconditioner = ilu_prec_type::build().on(exec)->generate(mtx);
+    auto preconditioner =
+        ilu_prec_type::build().on(this->exec)->generate(this->mtx);
     preconditioner->apply(b.get(), x.get());
 
     // Since it uses Bicgstab with default parmeters, the result will not be
@@ -258,145 +307,201 @@ TEST_F(Ilu, SolvesCustomTypeDefaultFactorySingleRhs)
 }
 
 
-TEST_F(Ilu, SolvesSingleRhsWithParIlu)
+TYPED_TEST(Ilu, SolvesSingleRhsWithParIlu)
 {
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b.get());
     auto par_ilu_fact =
-        gko::factorization::ParIlu<value_type>::build().on(exec);
-    auto par_ilu = par_ilu_fact->generate(mtx);
+        gko::factorization::ParIlu<value_type>::build().on(this->exec);
+    auto par_ilu = par_ilu_fact->generate(this->mtx);
 
-    auto preconditioner = ilu_pre_factory->generate(gko::share(par_ilu));
+    auto preconditioner = this->ilu_pre_factory->generate(gko::share(par_ilu));
     preconditioner->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}),
+                        r<TypeParam>::value * 1e+1);
 }
 
 
-TEST_F(Ilu, SolvesSingleRhsWithComposition)
+TYPED_TEST(Ilu, SolvesSingleRhsWithComposition)
 {
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
+    using Mtx = typename TestFixture::Mtx;
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b.get());
 
-    auto preconditioner = ilu_pre_factory->generate(l_u_composition);
+    auto preconditioner =
+        this->ilu_pre_factory->generate(this->l_u_composition);
     preconditioner->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}),
+                        r<TypeParam>::value * 1e+1);
 }
 
 
-TEST_F(Ilu, SolvesSingleRhsWithMtx)
+TYPED_TEST(Ilu, SolvesSingleRhsWithMtx)
 {
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
+    using Mtx = typename TestFixture::Mtx;
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b.get());
 
-    auto preconditioner = ilu_pre_factory->generate(mtx);
+    auto preconditioner = this->ilu_pre_factory->generate(this->mtx);
     preconditioner->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}),
+                        r<TypeParam>::value * 1e+1);
 }
 
 
-TEST_F(Ilu, SolvesReverseSingleRhs)
+TYPED_TEST(Ilu, SolvesReverseSingleRhs)
 {
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
+    using Mtx = typename TestFixture::Mtx;
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b.get());
-    auto preconditioner = ilu_rev_pre_factory->generate(l_u_composition);
+    auto preconditioner =
+        this->ilu_rev_pre_factory->generate(this->l_u_composition);
 
     preconditioner->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.625, 0.875, 1.75}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.625, 0.875, 1.75}),
+                        r<TypeParam>::value * 1e+1);
 }
 
 
-TEST_F(Ilu, SolvesAdvancedSingleRhs)
+TYPED_TEST(Ilu, SolvesAdvancedSingleRhs)
 {
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
     const value_type alpha{2.0};
-    const auto alpha_linop = gko::initialize<Mtx>({alpha}, exec);
+    const auto alpha_linop = gko::initialize<Mtx>({alpha}, this->exec);
     const value_type beta{-1};
-    const auto beta_linop = gko::initialize<Mtx>({beta}, exec);
-    const auto b = gko::initialize<Mtx>({-3.0, 6.0, 9.0}, exec);
-    auto x = gko::initialize<Mtx>({1.0, 2.0, 3.0}, exec);
-    auto preconditioner = ilu_pre_factory->generate(l_u_composition);
+    const auto beta_linop = gko::initialize<Mtx>({beta}, this->exec);
+    const auto b = gko::initialize<Mtx>({-3.0, 6.0, 9.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0, 3.0}, this->exec);
+    auto preconditioner =
+        this->ilu_pre_factory->generate(this->l_u_composition);
 
     preconditioner->apply(alpha_linop.get(), b.get(), beta_linop.get(),
                           x.get());
 
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-7.0, 2.0, -1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-7.0, 2.0, -1.0}), r<TypeParam>::value);
 }
 
 
-TEST_F(Ilu, SolvesAdvancedReverseSingleRhs)
+TYPED_TEST(Ilu, SolvesAdvancedReverseSingleRhs)
 {
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
     const value_type alpha{2.0};
-    const auto alpha_linop = gko::initialize<Mtx>({alpha}, exec);
+    const auto alpha_linop = gko::initialize<Mtx>({alpha}, this->exec);
     const value_type beta{-1};
-    const auto beta_linop = gko::initialize<Mtx>({beta}, exec);
-    const auto b = gko::initialize<Mtx>({-3.0, 6.0, 9.0}, exec);
-    auto x = gko::initialize<Mtx>({1.0, 2.0, 3.0}, exec);
-    auto preconditioner = ilu_rev_pre_factory->generate(l_u_composition);
+    const auto beta_linop = gko::initialize<Mtx>({beta}, this->exec);
+    const auto b = gko::initialize<Mtx>({-3.0, 6.0, 9.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, 2.0, 3.0}, this->exec);
+    auto preconditioner =
+        this->ilu_rev_pre_factory->generate(this->l_u_composition);
 
     preconditioner->apply(alpha_linop.get(), b.get(), beta_linop.get(),
                           x.get());
 
-    GKO_ASSERT_MTX_NEAR(x.get(), l({-7.75, 6.25, 1.5}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-7.75, 6.25, 1.5}),
+                        r<TypeParam>::value * 1e+1);
 }
 
 
-TEST_F(Ilu, SolvesMultipleRhs)
+TYPED_TEST(Ilu, SolvesMultipleRhs)
 {
-    const auto b =
-        gko::initialize<Mtx>({{1.0, 8.0}, {3.0, 21.0}, {6.0, 24.0}}, exec);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 2});
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    const auto b = gko::initialize<Mtx>(
+        {I<T>{1.0, 8.0}, I<T>{3.0, 21.0}, I<T>{6.0, 24.0}}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 2});
     x->copy_from(b.get());
-    auto preconditioner = ilu_pre_factory->generate(l_u_composition);
+    auto preconditioner =
+        this->ilu_pre_factory->generate(this->l_u_composition);
 
     preconditioner->apply(b.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(x.get(), l({{-0.125, 2.0}, {0.25, 3.0}, {1.0, 1.0}}),
-                        1e-14);
+                        r<TypeParam>::value * 1e+1);
 }
 
 
-TEST_F(Ilu, SolvesDifferentNumberOfRhs)
+TYPED_TEST(Ilu, SolvesDifferentNumberOfRhs)
 {
-    const auto b1 = gko::initialize<Mtx>({-3.0, 6.0, 9.0}, exec);
-    auto x11 = Mtx::create(exec, gko::dim<2>{3, 1});
-    auto x12 = Mtx::create(exec, gko::dim<2>{3, 1});
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    const auto b1 = gko::initialize<Mtx>({-3.0, 6.0, 9.0}, this->exec);
+    auto x11 = Mtx::create(this->exec, gko::dim<2>{3, 1});
+    auto x12 = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x11->copy_from(b1.get());
     x12->copy_from(b1.get());
-    const auto b2 =
-        gko::initialize<Mtx>({{1.0, 8.0}, {3.0, 21.0}, {6.0, 24.0}}, exec);
-    auto x2 = Mtx::create(exec, gko::dim<2>{3, 2});
+    const auto b2 = gko::initialize<Mtx>(
+        {I<T>{1.0, 8.0}, I<T>{3.0, 21.0}, I<T>{6.0, 24.0}}, this->exec);
+    auto x2 = Mtx::create(this->exec, gko::dim<2>{3, 2});
     x2->copy_from(b2.get());
-    auto preconditioner = ilu_pre_factory->generate(l_u_composition);
+    auto preconditioner =
+        this->ilu_pre_factory->generate(this->l_u_composition);
 
     preconditioner->apply(b1.get(), x11.get());
     preconditioner->apply(b2.get(), x2.get());
     preconditioner->apply(b1.get(), x12.get());
 
-    GKO_ASSERT_MTX_NEAR(x11.get(), l({-3.0, 2.0, 1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x11.get(), l({-3.0, 2.0, 1.0}),
+                        r<TypeParam>::value * 1e+1);
     GKO_ASSERT_MTX_NEAR(x2.get(), l({{-0.125, 2.0}, {0.25, 3.0}, {1.0, 1.0}}),
-                        1e-14);
-    GKO_ASSERT_MTX_NEAR(x12.get(), x11.get(), 1e-14);
+                        r<TypeParam>::value * 1e+1);
+    GKO_ASSERT_MTX_NEAR(x12.get(), x11.get(), r<TypeParam>::value * 1e+1);
+}
+
+
+class DefaultIlu : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    using default_ilu_prec_type = gko::preconditioner::Ilu<>;
+
+    DefaultIlu()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::initialize<Mtx>({{2., 1., 1.}, {2., 5., 2.}, {2., 5., 5.}},
+                                   exec))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<Mtx> mtx;
+};
+
+
+TEST_F(DefaultIlu, SolvesDefaultSingleRhs)
+{
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
+    x->copy_from(b.get());
+
+    auto preconditioner =
+        default_ilu_prec_type::build().on(this->exec)->generate(this->mtx);
+    preconditioner->apply(b.get(), x.get());
+
+    // Since it uses TRS by default, the result should be accurate
+    GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14);
 }
 
 
-TEST_F(Ilu, CanBeUsedAsPreconditioner)
+TEST_F(DefaultIlu, CanBeUsedAsPreconditioner)
 {
     auto solver =
-        gko::solver::Bicgstab<value_type>::build()
+        gko::solver::Bicgstab<>::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
-            .with_preconditioner(default_ilu_prec_type::build().on(exec))
-            .on(exec)
-            ->generate(mtx);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
+                gko::stop::Iteration::build().with_max_iters(2u).on(this->exec))
+            .with_preconditioner(default_ilu_prec_type::build().on(this->exec))
+            .on(this->exec)
+            ->generate(this->mtx);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
     x->copy_from(b.get());
 
     solver->apply(b.get(), x.get());
@@ -405,19 +510,19 @@ TEST_F(Ilu, CanBeUsedAsPreconditioner)
 }
 
 
-TEST_F(Ilu, CanBeUsedAsGeneratedPreconditioner)
+TEST_F(DefaultIlu, CanBeUsedAsGeneratedPreconditioner)
 {
     std::shared_ptr<default_ilu_prec_type> precond =
-        default_ilu_prec_type::build().on(exec)->generate(mtx);
+        default_ilu_prec_type::build().on(this->exec)->generate(this->mtx);
     auto solver =
-        gko::solver::Bicgstab<value_type>::build()
+        gko::solver::Bicgstab<>::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+                gko::stop::Iteration::build().with_max_iters(2u).on(this->exec))
             .with_generated_preconditioner(precond)
-            .on(exec)
-            ->generate(mtx);
-    auto x = Mtx::create(exec, gko::dim<2>{3, 1});
-    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, exec);
+            .on(this->exec)
+            ->generate(this->mtx);
+    auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
+    const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
     x->copy_from(b.get());
 
     solver->apply(b.get(), x.get());
diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp
new file mode 100644
index 00000000000..06ad239732d
--- /dev/null
+++ b/reference/test/preconditioner/isai_kernels.cpp
@@ -0,0 +1,924 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "reference/preconditioner/isai_kernels.cpp"
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/preconditioner/ilu.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+
+#include "core/test/utils.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+// Fixture holding small L/U matrices with reference inverses for ISAI tests.
+template <typename ValueIndexType>
+class Isai : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using LowerIsai = gko::preconditioner::LowerIsai<value_type, index_type>;
+    using UpperIsai = gko::preconditioner::UpperIsai<value_type, index_type>;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    Isai()
+        : exec{gko::ReferenceExecutor::create()},
+          l_dense{gko::initialize<Dense>(
+              {{2., 0., 0.}, {1., -2., 0.}, {-1., 1., -1.}}, exec)},
+          l_dense_inv{gko::initialize<Dense>(
+              {{.5, 0., 0.}, {.25, -.5, 0.}, {-.25, -.5, -1.}}, exec)},
+          u_dense{gko::initialize<Dense>(
+              {{4., 1., -1.}, {0., -2., 4.}, {0., 0., 8.}}, exec)},
+          u_dense_inv{gko::initialize<Dense>(
+              {{.25, .125, -0.03125}, {0., -.5, .25}, {0., 0., .125}}, exec)},
+          l_csr{Csr::create(exec)},
+          l_csr_inv{Csr::create(exec)},
+          u_csr{Csr::create(exec)},
+          u_csr_inv{Csr::create(exec)},
+          l_sparse{Csr::create(exec, gko::dim<2>(4, 4),
+                               I<value_type>{-1., 2., 4., 5., -4., 8., -8.},
+                               I<index_type>{0, 0, 1, 1, 2, 2, 3},
+                               I<index_type>{0, 1, 3, 5, 7})},
+          l_s_unsorted{Csr::create(exec, gko::dim<2>(4, 4),
+                                   I<value_type>{-1., 4., 2., 5., -4., -8., 8.},
+                                   I<index_type>{0, 1, 0, 1, 2, 3, 2},
+                                   I<index_type>{0, 1, 3, 5, 7})},
+          l_sparse_inv{
+              Csr::create(exec, gko::dim<2>(4, 4),
+                          I<value_type>{-1., .5, .25, .3125, -.25, -.25, -.125},
+                          I<index_type>{0, 0, 1, 1, 2, 2, 3},
+                          I<index_type>{0, 1, 3, 5, 7})},
+          l_sparse_inv2{Csr::create(exec, gko::dim<2>(4, 4),
+                                    I<value_type>{-1., .5, .25, .625, .3125,
+                                                  -.25, .3125, -.25, -.125},
+                                    I<index_type>{0, 0, 1, 0, 1, 2, 1, 2, 3},
+                                    I<index_type>{0, 1, 3, 6, 9})},
+          l_sparse_inv3{
+              Csr::create(exec, gko::dim<2>(4, 4),
+                          I<value_type>{-1., .5, .25, .625, .3125, -.25, .625,
+                                        .3125, -.25, -.125},
+                          I<index_type>{0, 0, 1, 0, 1, 2, 0, 1, 2, 3},
+                          I<index_type>{0, 1, 3, 6, 10})},
+          l_sparse2{Csr::create(exec, gko::dim<2>(4, 4),
+                                I<value_type>{-2, 1, 4, 1, -2, 1, -1, 1, 2},
+                                I<index_type>{0, 0, 1, 1, 2, 0, 1, 2, 3},
+                                I<index_type>{0, 1, 3, 5, 9})},
+          l_sparse2_inv{Csr::create(exec, gko::dim<2>(4, 4),
+                                    I<value_type>{-.5, .125, .25, .125, -.5,
+                                                  .28125, .0625, 0.25, 0.5},
+                                    I<index_type>{0, 0, 1, 1, 2, 0, 1, 2, 3},
+                                    I<index_type>{0, 1, 3, 5, 9})},
+          u_sparse{
+              Csr::create(exec, gko::dim<2>(4, 4),
+                          I<value_type>{-2., 1., -1., 1., 4., 1., -2., 1., 2.},
+                          I<index_type>{0, 1, 2, 3, 1, 2, 2, 3, 3},
+                          I<index_type>{0, 4, 6, 8, 9})},
+          u_s_unsorted{
+              Csr::create(exec, gko::dim<2>(4, 4),
+                          I<value_type>{-2., -1., 1., 1., 1., 4., -2., 1., 2.},
+                          I<index_type>{0, 2, 1, 3, 2, 1, 2, 3, 3},
+                          I<index_type>{0, 4, 6, 8, 9})},
+          u_sparse_inv{Csr::create(
+              exec, gko::dim<2>(4, 4),
+              I<value_type>{-.5, .125, .3125, .09375, .25, .125, -.5, .25, .5},
+              I<index_type>{0, 1, 2, 3, 1, 2, 2, 3, 3},
+              I<index_type>{0, 4, 6, 8, 9})},
+          u_sparse_inv2{Csr::create(exec, gko::dim<2>(4, 4),
+                                    I<value_type>{-.5, .125, .3125, .09375, .25,
+                                                  .125, -.0625, -.5, .25, .5},
+                                    I<index_type>{0, 1, 2, 3, 1, 2, 3, 2, 3, 3},
+                                    I<index_type>{0, 4, 7, 9, 10})}
+    {
+        lower_isai_factory = LowerIsai::build().on(exec);
+        upper_isai_factory = UpperIsai::build().on(exec);
+        l_dense->convert_to(lend(l_csr));
+        l_dense_inv->convert_to(lend(l_csr_inv));
+        u_dense->convert_to(lend(u_csr));
+        u_dense_inv->convert_to(lend(u_csr_inv));
+        l_csr_longrow = read<Csr>("isai_l.mtx");
+        l_csr_longrow_e = read<Csr>("isai_l_excess.mtx");
+        l_csr_longrow_e_rhs = read<Dense>("isai_l_excess_rhs.mtx");
+        l_csr_longrow_inv_partial = read<Csr>("isai_l_inv_partial.mtx");
+        l_csr_longrow_inv = read<Csr>("isai_l_inv.mtx");
+        u_csr_longrow = read<Csr>("isai_u.mtx");
+        u_csr_longrow_e = read<Csr>("isai_u_excess.mtx");
+        u_csr_longrow_e_rhs = read<Dense>("isai_u_excess_rhs.mtx");
+        u_csr_longrow_inv_partial = read<Csr>("isai_u_inv_partial.mtx");
+        u_csr_longrow_inv = read<Csr>("isai_u_inv.mtx");
+    }
+
+    // Reads matrix `name` from gko::matrices::location_isai_mtxs. The files
+    // store 12345 where an explicit zero is meant, so map those back to 0.
+    template <typename ReadMtx>
+    std::unique_ptr<ReadMtx> read(const char *name)
+    {
+        std::ifstream mtxstream{std::string{gko::matrices::location_isai_mtxs} +
+                                name};
+        auto result = gko::read<ReadMtx>(mtxstream, exec);
+        const auto values = result->get_values();
+        std::replace(values, values + result->get_num_stored_elements(),
+                     static_cast<value_type>(12345.0),
+                     gko::zero<value_type>());
+        return result;
+    }
+
+    std::unique_ptr<Csr> clone_allocations(const Csr *csr_mtx)
+    {
+        const auto num_elems = csr_mtx->get_num_stored_elements();
+        auto sparsity = csr_mtx->clone();
+
+        // overwrite every value with -1 so entries a kernel skips stand out
+        std::fill_n(sparsity->get_values(), num_elems, -gko::one<value_type>());
+        return sparsity;
+    }
+
+    std::unique_ptr<Csr> transpose(const Csr *mtx)
+    {
+        return gko::as<Csr>(mtx->transpose());
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<typename LowerIsai::Factory> lower_isai_factory;
+    std::unique_ptr<typename UpperIsai::Factory> upper_isai_factory;
+    std::shared_ptr<Dense> l_dense;
+    std::shared_ptr<Dense> l_dense_inv;
+    std::shared_ptr<Dense> u_dense;
+    std::shared_ptr<Dense> u_dense_inv;
+    std::shared_ptr<Csr> l_csr;
+    std::shared_ptr<Csr> l_csr_inv;
+    std::shared_ptr<Csr> l_csr_longrow;
+    std::shared_ptr<Csr> l_csr_longrow_e;
+    std::shared_ptr<Dense> l_csr_longrow_e_rhs;
+    std::shared_ptr<Csr> l_csr_longrow_inv_partial;
+    std::shared_ptr<Csr> l_csr_longrow_inv;
+    std::shared_ptr<Csr> u_csr;
+    std::shared_ptr<Csr> u_csr_inv;
+    std::shared_ptr<Csr> u_csr_longrow;
+    std::shared_ptr<Csr> u_csr_longrow_e;
+    std::shared_ptr<Dense> u_csr_longrow_e_rhs;
+    std::shared_ptr<Csr> u_csr_longrow_inv_partial;
+    std::shared_ptr<Csr> u_csr_longrow_inv;
+    std::shared_ptr<Csr> l_sparse;
+    std::shared_ptr<Csr> l_s_unsorted;
+    std::shared_ptr<Csr> l_sparse_inv;
+    std::shared_ptr<Csr> l_sparse_inv2;
+    std::shared_ptr<Csr> l_sparse_inv3;
+    std::shared_ptr<Csr> l_sparse2;
+    std::shared_ptr<Csr> l_sparse2_inv;
+    std::shared_ptr<Csr> u_sparse;
+    std::shared_ptr<Csr> u_s_unsorted;
+    std::shared_ptr<Csr> u_sparse_inv;
+    std::shared_ptr<Csr> u_sparse_inv2;
+};
+
+TYPED_TEST_CASE(Isai, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(Isai, KernelGenerateL1)
+{
+    // generate_tri_inverse(..., true) on l_csr must reproduce l_csr_inv.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->l_csr));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->l_csr), lend(result), a1.get_data(),
+        a2.get_data(), true);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_csr_inv);
+    GKO_ASSERT_MTX_NEAR(result, this->l_csr_inv, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateL2)
+{
+    // Input is transpose(u_csr); result must equal transpose(u_csr_inv).
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    const auto l_mtx = this->transpose(lend(this->u_csr));
+    auto result = this->clone_allocations(lend(l_mtx));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(l_mtx), lend(result), a1.get_data(), a2.get_data(),
+        true);
+
+    const auto expected = this->transpose(lend(this->u_csr_inv));
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, expected);
+    GKO_ASSERT_MTX_NEAR(result, expected, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateLsparse1)
+{
+    // generate_tri_inverse(..., true) on l_sparse must give l_sparse_inv.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->l_sparse));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->l_sparse), lend(result), a1.get_data(),
+        a2.get_data(), true);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(result, this->l_sparse_inv, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateLsparse2)
+{
+    // generate_tri_inverse(..., true) on l_sparse2 must give l_sparse2_inv.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->l_sparse2));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->l_sparse2), lend(result), a1.get_data(),
+        a2.get_data(), true);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_sparse2_inv);
+    GKO_ASSERT_MTX_NEAR(result, this->l_sparse2_inv, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateLsparse3)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    const auto lower = this->transpose(this->u_sparse.get());
+    auto inverse = this->clone_allocations(lower.get());
+    const auto array_size = inverse->get_size()[0] + 1;
+    gko::Array<index_type> ptrs1(this->exec, array_size);
+    gko::Array<index_type> ptrs2(this->exec, array_size);
+    // all-zero reference the two kernel-filled arrays are compared against
+    gko::Array<index_type> all_zero(this->exec, array_size);
+    std::fill_n(all_zero.get_data(), array_size, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lower.get(), inverse.get(), ptrs1.get_data(),
+        ptrs2.get_data(), true);
+
+    // The reference is not simply u_sparse_inv->transpose(): `lower` has a
+    // different row-sparsity pattern than u_sparse, which changes exactly
+    // one value compared to u_sparse_inv->transpose().
+    const auto reference = Csr::create(
+        this->exec, gko::dim<2>(4, 4),
+        I<value_type>{-.5, .125, .25, .3125, .125, -.5, .125, .25, .5},
+        I<index_type>{0, 0, 1, 0, 1, 2, 0, 2, 3}, I<index_type>{0, 1, 3, 6, 9});
+    GKO_ASSERT_MTX_EQ_SPARSITY(inverse, reference);
+    GKO_ASSERT_MTX_NEAR(inverse, reference, r<value_type>::value);
+    // no row above the size limit -> both arrays remain zero
+    GKO_ASSERT_ARRAY_EQ(ptrs1, all_zero);
+    GKO_ASSERT_ARRAY_EQ(ptrs2, all_zero);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateLLongrow)
+{
+    // l_csr_longrow exceeds the row size limit: a1/a2 must come back non-zero.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->l_csr_longrow));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // zero-filled array, used as the base for both expected arrays
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+    // expected values step at entries 33 and 35 (presumably rows 32 and 34)
+    auto a1_expect = zeros;
+    a1_expect.get_data()[33] = 33;
+    a1_expect.get_data()[34] = 33;
+    a1_expect.get_data()[35] = 66;
+    auto a2_expect = zeros;
+    a2_expect.get_data()[33] = 124;
+    a2_expect.get_data()[34] = 124;
+    a2_expect.get_data()[35] = 248;
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->l_csr_longrow), lend(result), a1.get_data(),
+        a2.get_data(), true);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_csr_longrow_inv_partial);
+    GKO_ASSERT_MTX_NEAR(result, this->l_csr_longrow_inv_partial,
+                        r<value_type>::value);
+    // rows above the size limit -> arrays must match the non-zero expectation
+    GKO_ASSERT_ARRAY_EQ(a1, a1_expect);
+    GKO_ASSERT_ARRAY_EQ(a2, a2_expect);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateExcessLLongrow)
+{
+    // generate_excess_system on l_csr_longrow with hand-written a1/a2 input.
+    using Csr = typename TestFixture::Csr;
+    using Dense = typename TestFixture::Dense;
+    using index_type = typename TestFixture::index_type;
+    const auto num_rows = this->l_csr_longrow->get_size()[0];
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+    // input values step at entries 33 and 35 (presumably rows 32 and 34)
+    auto a1 = zeros;
+    a1.get_data()[33] = 33;
+    a1.get_data()[34] = 33;
+    a1.get_data()[35] = 66;
+    auto a2 = zeros;
+    a2.get_data()[33] = 124;
+    a2.get_data()[34] = 124;
+    a2.get_data()[35] = 248;
+    auto result = Csr::create(this->exec, gko::dim<2>(66, 66), 248);
+    auto result_rhs = Dense::create(this->exec, gko::dim<2>(66, 1));
+
+    gko::kernels::reference::isai::generate_excess_system(
+        this->exec, lend(this->l_csr_longrow), lend(this->l_csr_longrow),
+        a1.get_const_data(), a2.get_const_data(), lend(result),
+        lend(result_rhs));
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_csr_longrow_e);
+    GKO_ASSERT_MTX_NEAR(result, this->l_csr_longrow_e, 0);
+    GKO_ASSERT_MTX_NEAR(result_rhs, this->l_csr_longrow_e_rhs, 0);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateU1)
+{
+    // Input is transpose(l_csr); result must equal transpose(l_csr_inv).
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    const auto u_mtx = this->transpose(lend(this->l_csr));
+    auto result = this->clone_allocations(lend(u_mtx));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(u_mtx), lend(result), a1.get_data(), a2.get_data(),
+        false);
+
+    auto expected = this->transpose(lend(this->l_csr_inv));
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, expected);
+    GKO_ASSERT_MTX_NEAR(result, expected, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateU2)
+{
+    // generate_tri_inverse(..., false) on u_csr must reproduce u_csr_inv.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->u_csr));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->u_csr), lend(result), a1.get_data(),
+        a2.get_data(), false);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_csr_inv);
+    GKO_ASSERT_MTX_NEAR(result, this->u_csr_inv, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateUsparse1)
+{
+    // Input is transpose(l_sparse); result must equal transpose(l_sparse_inv).
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    const auto u_mtx = this->transpose(lend(this->l_sparse));
+    auto result = this->clone_allocations(lend(u_mtx));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(u_mtx), lend(result), a1.get_data(), a2.get_data(),
+        false);
+
+    const auto expected = this->transpose(lend(this->l_sparse_inv));
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, expected);
+    GKO_ASSERT_MTX_NEAR(result, expected, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateUsparse2)
+{
+    using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    const auto u_mtx = this->transpose(lend(this->l_sparse2));
+    auto result = this->clone_allocations(lend(u_mtx));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(u_mtx), lend(result), a1.get_data(), a2.get_data(),
+        false);
+
+    // Results in a slightly different version than l_sparse2_inv->transpose()
+    // because a different row-sparsity pattern is used in l_sparse2 vs. u_mtx
+    // (only one value changes compared to l_sparse2_inv->transpose())
+    const auto expected = Csr::create(
+        this->exec, gko::dim<2>(4, 4),
+        I<value_type>{-.5, .125, .3125, .25, .125, .0625, -.5, .25, .5},
+        I<index_type>{0, 1, 3, 1, 2, 3, 2, 3, 3}, I<index_type>{0, 3, 6, 8, 9});
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, expected);
+    GKO_ASSERT_MTX_NEAR(result, expected, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateUsparse3)
+{
+    // generate_tri_inverse(..., false) on u_sparse must give u_sparse_inv.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->u_sparse));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // all-zero reference that both output arrays are compared against
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->u_sparse), lend(result), a1.get_data(),
+        a2.get_data(), false);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(result, this->u_sparse_inv, r<value_type>::value);
+    // no row above the size limit -> both arrays must be all zero
+    GKO_ASSERT_ARRAY_EQ(a1, zeros);
+    GKO_ASSERT_ARRAY_EQ(a2, zeros);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateULongrow)
+{
+    // u_csr_longrow exceeds the row size limit: a1/a2 must come back non-zero.
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto result = this->clone_allocations(lend(this->u_csr_longrow));
+    const auto num_rows = result->get_size()[0];
+    gko::Array<index_type> a1(this->exec, num_rows + 1);
+    gko::Array<index_type> a2(this->exec, num_rows + 1);
+    // zero-filled array, used as the base for both expected arrays
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+    // expected values step once at entry 3 (presumably row 2), then stay flat
+    auto a1_expect = zeros;
+    std::fill_n(a1_expect.get_data() + 3, 33, 33);
+    auto a2_expect = zeros;
+    std::fill_n(a2_expect.get_data() + 3, 33, 153);
+
+    gko::kernels::reference::isai::generate_tri_inverse(
+        this->exec, lend(this->u_csr_longrow), lend(result), a1.get_data(),
+        a2.get_data(), false);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_csr_longrow_inv_partial);
+    GKO_ASSERT_MTX_NEAR(result, this->u_csr_longrow_inv_partial,
+                        r<value_type>::value);
+    // rows above the size limit -> arrays must match the non-zero expectation
+    GKO_ASSERT_ARRAY_EQ(a1, a1_expect);
+    GKO_ASSERT_ARRAY_EQ(a2, a2_expect);
+}
+
+
+TYPED_TEST(Isai, KernelGenerateExcessULongrow)
+{
+    // generate_excess_system on u_csr_longrow with hand-written a1/a2 input.
+    using Csr = typename TestFixture::Csr;
+    using Dense = typename TestFixture::Dense;
+    using index_type = typename TestFixture::index_type;
+    const auto num_rows = this->u_csr_longrow->get_size()[0];
+    gko::Array<index_type> zeros(this->exec, num_rows + 1);
+    std::fill_n(zeros.get_data(), num_rows + 1, 0);
+    // input values step once at entry 3 (presumably row 2), then stay flat
+    auto a1 = zeros;
+    std::fill_n(a1.get_data() + 3, 33, 33);
+    auto a2 = zeros;
+    std::fill_n(a2.get_data() + 3, 33, 153);
+    auto result = Csr::create(this->exec, gko::dim<2>(33, 33), 153);
+    auto result_rhs = Dense::create(this->exec, gko::dim<2>(33, 1));
+
+    gko::kernels::reference::isai::generate_excess_system(
+        this->exec, lend(this->u_csr_longrow), lend(this->u_csr_longrow),
+        a1.get_const_data(), a2.get_const_data(), lend(result),
+        lend(result_rhs));
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_csr_longrow_e);
+    GKO_ASSERT_MTX_NEAR(result, this->u_csr_longrow_e, 0);
+    GKO_ASSERT_MTX_NEAR(result_rhs, this->u_csr_longrow_e_rhs, 0);
+}
+
+
+TYPED_TEST(Isai, KernelScatterExcessSolution)
+{
+    using Csr = typename TestFixture::Csr;
+    using Dense = typename TestFixture::Dense;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto solution = Dense::create(this->exec, gko::dim<2>(7, 1),
+                                  I<value_type>{11, 12, 13, 14, 15, 16, 17}, 1);
+    gko::Array<index_type> offsets{this->exec,
+                                   I<index_type>{0, 0, 2, 2, 5, 7, 7}};
+    auto target = Csr::create(this->exec, gko::dim<2>{6, 6},
+                              I<value_type>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+                              I<index_type>{0, 0, 1, 0, 0, 1, 2, 0, 1, 0},
+                              I<index_type>{0, 1, 3, 4, 7, 9, 10});
+    auto reference = Csr::create(
+        this->exec, gko::dim<2>{6, 6},
+        I<value_type>{1, 11, 12, 4, 13, 14, 15, 16, 17, 10},
+        I<index_type>{0, 0, 1, 0, 0, 1, 2, 0, 1, 0},
+        I<index_type>{0, 1, 3, 4, 7, 9, 10});
+
+    gko::kernels::reference::isai::scatter_excess_solution(
+        this->exec, offsets.get_const_data(), lend(solution), lend(target));
+    GKO_ASSERT_MTX_NEAR(target, reference, 0);  // rows 1, 3, 4 overwritten
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseL)
+{
+    // Default-configured lower factory on l_sparse must give l_sparse_inv.
+    using value_type = typename TestFixture::value_type;
+    const auto isai = this->lower_isai_factory->generate(this->l_sparse);
+
+    auto l_inv = isai->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseLLongrow)
+{
+    // Lower factory on l_csr_longrow must reproduce l_csr_longrow_inv.
+    using value_type = typename TestFixture::value_type;
+    const auto isai = this->lower_isai_factory->generate(this->l_csr_longrow);
+
+    auto l_inv = isai->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_csr_longrow_inv);
+    GKO_ASSERT_MTX_NEAR(l_inv, this->l_csr_longrow_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseU)
+{
+    // Default-configured upper factory on u_sparse must give u_sparse_inv.
+    using value_type = typename TestFixture::value_type;
+    const auto isai = this->upper_isai_factory->generate(this->u_sparse);
+
+    auto u_inv = isai->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseULongrow)
+{
+    // Upper factory on u_csr_longrow must reproduce u_csr_longrow_inv.
+    using value_type = typename TestFixture::value_type;
+    const auto isai = this->upper_isai_factory->generate(this->u_csr_longrow);
+
+    auto u_inv = isai->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_csr_longrow_inv);
+    GKO_ASSERT_MTX_NEAR(u_inv, this->u_csr_longrow_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseLWithL2)
+{
+    // Lower ISAI with sparsity power 2 on l_sparse must give l_sparse_inv2.
+    using T = typename TestFixture::value_type;
+    using Lower = typename TestFixture::LowerIsai;
+    auto factory = Lower::build().with_sparsity_power(2).on(this->exec);
+    const auto precond = factory->generate(this->l_sparse);
+
+    auto approx_inv = precond->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(approx_inv, this->l_sparse_inv2);
+    GKO_ASSERT_MTX_NEAR(approx_inv, this->l_sparse_inv2, r<T>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseUWithU2)
+{
+    // Upper ISAI with sparsity power 2 on u_sparse must give u_sparse_inv2.
+    using T = typename TestFixture::value_type;
+    using Upper = typename TestFixture::UpperIsai;
+    auto factory = Upper::build().with_sparsity_power(2).on(this->exec);
+    const auto precond = factory->generate(this->u_sparse);
+
+    auto approx_inv = precond->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(approx_inv, this->u_sparse_inv2);
+    GKO_ASSERT_MTX_NEAR(approx_inv, this->u_sparse_inv2, r<T>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseLWithL3)
+{
+    // Lower ISAI with sparsity power 3 on l_sparse must give l_sparse_inv3.
+    using T = typename TestFixture::value_type;
+    using Lower = typename TestFixture::LowerIsai;
+    auto factory = Lower::build().with_sparsity_power(3).on(this->exec);
+    const auto precond = factory->generate(this->l_sparse);
+
+    auto approx_inv = precond->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(approx_inv, this->l_sparse_inv3);
+    GKO_ASSERT_MTX_NEAR(approx_inv, this->l_sparse_inv3, r<T>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsCorrectInverseUWithU3)
+{
+    // u_sparse_inv2 already has a full upper-triangular pattern: reused here.
+    using T = typename TestFixture::value_type;
+    using Upper = typename TestFixture::UpperIsai;
+    auto factory = Upper::build().with_sparsity_power(3).on(this->exec);
+    const auto precond = factory->generate(this->u_sparse);
+
+    auto approx_inv = precond->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(approx_inv, this->u_sparse_inv2);
+    GKO_ASSERT_MTX_NEAR(approx_inv, this->u_sparse_inv2, r<T>::value);
+}
+
+
+TYPED_TEST(Isai, GeneratesWithUnsortedCsr)
+{
+    // Unsorted inputs must give the same values as the sorted references.
+    using T = typename TestFixture::value_type;
+
+    const auto l_isai = this->lower_isai_factory->generate(this->l_s_unsorted);
+    const auto u_isai = this->upper_isai_factory->generate(this->u_s_unsorted);
+    auto l_inv = l_isai->get_approximate_inverse();
+    auto u_inv = u_isai->get_approximate_inverse();
+
+    GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r<T>::value);
+    GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r<T>::value);
+}
+
+
+TYPED_TEST(Isai, ApplyWithLMtx)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Dense;
+    const auto precond = this->lower_isai_factory->generate(this->l_dense);
+    const auto rhs = gko::initialize<Vec>({18., 16., 12.}, this->exec);
+    auto result = Vec::create_with_config_of(lend(rhs));
+
+    precond->apply(lend(rhs), lend(result));  // result = precond * rhs
+
+    GKO_ASSERT_MTX_NEAR(result, l({9., -3.5, -24.5}), r<T>::value);
+}
+
+
+TYPED_TEST(Isai, ApplyWithUMtx)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Dense;
+    const auto precond = this->upper_isai_factory->generate(this->u_dense);
+    const auto rhs = gko::initialize<Vec>({18., 16., 12.}, this->exec);
+    auto result = Vec::create_with_config_of(lend(rhs));
+
+    precond->apply(lend(rhs), lend(result));  // result = precond * rhs
+
+    GKO_ASSERT_MTX_NEAR(result, l({6.125, -5., 1.5}), r<T>::value);
+}
+
+
+TYPED_TEST(Isai, AdvancedApplyLMtx)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Dense;
+    const auto precond = this->lower_isai_factory->generate(this->l_dense);
+    const auto alpha = gko::initialize<Vec>({3.}, this->exec);
+    const auto beta = gko::initialize<Vec>({-4.}, this->exec);
+    const auto rhs = gko::initialize<Vec>({18., 16., 12.}, this->exec);
+    auto result = gko::initialize<Vec>({2., -3., 1.}, this->exec);  // in/out
+
+    precond->apply(lend(alpha), lend(rhs), lend(beta), lend(result));
+
+    GKO_ASSERT_MTX_NEAR(result, l({19., 1.5, -77.5}), r<T>::value);
+}
+
+
+TYPED_TEST(Isai, AdvancedApplyUMtx)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Dense;
+    const auto precond = this->upper_isai_factory->generate(this->u_dense);
+    const auto alpha = gko::initialize<Vec>({3.}, this->exec);
+    const auto beta = gko::initialize<Vec>({-4.}, this->exec);
+    const auto rhs = gko::initialize<Vec>({18., 16., 12.}, this->exec);
+    auto result = gko::initialize<Vec>({2., -3., 1.}, this->exec);  // in/out
+
+    precond->apply(lend(alpha), lend(rhs), lend(beta), lend(result));
+
+    GKO_ASSERT_MTX_NEAR(result, l({10.375, -3., 0.5}), r<T>::value);
+}
+
+
+TYPED_TEST(Isai, UseWithIluPreconditioner)
+{
+    using Dense = typename TestFixture::Dense;
+    using index_type = typename TestFixture::index_type;
+    using T = typename TestFixture::value_type;
+    using LowerIsai = typename TestFixture::LowerIsai;
+    using UpperIsai = typename TestFixture::UpperIsai;
+    using Ilu =
+        gko::preconditioner::Ilu<LowerIsai, UpperIsai, false, index_type>;
+    const auto rhs = gko::initialize<Dense>({128, -64, 32}, this->exec);
+    auto result = Dense::create(this->exec, rhs->get_size());
+    // System matrix handed to Ilu is the product l_dense * u_dense.
+    auto mtx = gko::share(Dense::create_with_config_of(lend(this->l_dense)));
+    this->l_dense->apply(lend(this->u_dense), lend(mtx));
+    auto ilu = Ilu::build().on(this->exec)->generate(mtx);
+
+    ilu->apply(lend(rhs), lend(result));
+
+    GKO_ASSERT_MTX_NEAR(result, l({25., -40., -4.}), r<T>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsTransposedCorrectInverseL)
+{
+    using value_type = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Csr;
+    using UpperIsai = typename TestFixture::UpperIsai;  // type after transpose
+    const auto precond = this->lower_isai_factory->generate(this->l_sparse);
+
+    const auto transposed = gko::as<UpperIsai>(precond->transpose());
+    auto l_inv =
+        gko::as<Csr>(transposed->get_approximate_inverse()->transpose());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsTransposedCorrectInverseU)
+{
+    using value_type = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Csr;
+    using LowerIsai = typename TestFixture::LowerIsai;  // type after transpose
+    const auto precond = this->upper_isai_factory->generate(this->u_sparse);
+
+    const auto transposed = gko::as<LowerIsai>(precond->transpose());
+    auto u_inv =
+        gko::as<Csr>(transposed->get_approximate_inverse()->transpose());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsConjTransposedCorrectInverseL)
+{
+    using value_type = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Csr;
+    using UpperIsai = typename TestFixture::UpperIsai;  // type after transpose
+    const auto precond = this->lower_isai_factory->generate(this->l_sparse);
+
+    const auto adjoint = gko::as<UpperIsai>(precond->conj_transpose());
+    auto l_inv =
+        gko::as<Csr>(adjoint->get_approximate_inverse()->conj_transpose());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r<value_type>::value);
+}
+
+
+TYPED_TEST(Isai, ReturnsConjTransposedCorrectInverseU)
+{
+    using value_type = typename TestFixture::value_type;
+    using Csr = typename TestFixture::Csr;
+    using LowerIsai = typename TestFixture::LowerIsai;  // type after transpose
+    const auto precond = this->upper_isai_factory->generate(this->u_sparse);
+
+    const auto adjoint = gko::as<LowerIsai>(precond->conj_transpose());
+    auto u_inv =
+        gko::as<Csr>(adjoint->get_approximate_inverse()->conj_transpose());
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv);
+    GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r<value_type>::value);
+}
+
+
+}  // namespace
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 767bcbd5028..29d5ddd477f 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -39,27 +39,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/base/extended_float.hpp>
-#include <core/preconditioner/jacobi_utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Jacobi : public ::testing::Test {
 protected:
-    using Bj = gko::preconditioner::Jacobi<>;
-    using Mtx = gko::matrix::Csr<>;
-    using Vec = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Bj = gko::preconditioner::Jacobi<value_type, index_type>;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Jacobi()
         : exec(gko::ReferenceExecutor::create()),
           bj_factory(Bj::build().with_max_block_size(3u).on(exec)),
           block_pointers(exec, 3),
           block_precisions(exec, 2),
-          mtx(gko::matrix::Csr<>::create(exec, gko::dim<2>{5}, 13))
+          mtx(Mtx::create(exec, gko::dim<2>{5}, 13))
     {
         block_pointers.get_data()[0] = 0;
         block_pointers.get_data()[1] = 2;
@@ -74,11 +82,12 @@ class Jacobi : public ::testing::Test {
                   |-1   4  -2
            -1     |    -1   4
          */
-        init_array(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13});
-        init_array(mtx->get_col_idxs(),
-                   {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4});
-        init_array(mtx->get_values(), {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0,
-                                       -1.0, 4.0, -2.0, -1.0, -1.0, 4.0});
+        init_array<index_type>(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13});
+        init_array<index_type>(mtx->get_col_idxs(),
+                               {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4});
+        init_array<value_type>(mtx->get_values(),
+                               {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0, -1.0,
+                                4.0, -2.0, -1.0, -1.0, 4.0});
         bj_factory = Bj::build()
                          .with_max_block_size(3u)
                          .with_block_pointers(block_pointers)
@@ -109,8 +118,8 @@ class Jacobi : public ::testing::Test {
     {
         for (int i = 0; i < block_size; ++i) {
             for (int j = 0; j < block_size; ++j) {
-                EXPECT_EQ(static_cast<double>(ptr_a[i * stride_a + j]),
-                          static_cast<double>(ptr_b[i * stride_b + j]))
+                EXPECT_EQ(static_cast<value_type>(ptr_a[i * stride_a + j]),
+                          static_cast<value_type>(ptr_b[i * stride_b + j]))
                     << "Mismatch at position (" << i << ", " << j << ")";
             }
         }
@@ -143,7 +152,7 @@ class Jacobi : public ::testing::Test {
             ASSERT_EQ(prec_a, prec_b);
             auto scheme = a->get_storage_scheme();
             GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-                Bj::value_type, prec_a,
+                value_type, prec_a,
                 assert_same_block(
                     b_ptr_a[i + 1] - b_ptr_a[i],
                     reinterpret_cast<const resolved_precision *>(
@@ -158,19 +167,21 @@ class Jacobi : public ::testing::Test {
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<Bj::Factory> bj_factory;
-    std::unique_ptr<Bj::Factory> adaptive_bj_factory;
-    gko::Array<gko::int32> block_pointers;
+    std::unique_ptr<typename Bj::Factory> bj_factory;
+    std::unique_ptr<typename Bj::Factory> adaptive_bj_factory;
+    gko::Array<index_type> block_pointers;
     gko::Array<gko::precision_reduction> block_precisions;
-    std::shared_ptr<gko::matrix::Csr<>> mtx;
+    std::shared_ptr<Mtx> mtx;
     std::unique_ptr<Bj> bj;
     std::unique_ptr<Bj> adaptive_bj;
 };
 
+TYPED_TEST_CASE(Jacobi, gko::test::ValueIndexTypes);
+
 
-TEST_F(Jacobi, GeneratesCorrectStorageScheme)
+TYPED_TEST(Jacobi, GeneratesCorrectStorageScheme)
 {
-    auto scheme = bj->get_storage_scheme();
+    auto scheme = this->bj->get_storage_scheme();
 
     ASSERT_EQ(scheme.group_power, 3);  // 8 3-by-3 blocks fit into 32-wide group
     ASSERT_EQ(scheme.block_offset, 3);
@@ -178,156 +189,185 @@ TEST_F(Jacobi, GeneratesCorrectStorageScheme)
 }
 
 
-TEST_F(Jacobi, CanBeCloned)
+TYPED_TEST(Jacobi, CanBeCloned)
 {
-    auto bj_clone = clone(bj);
+    auto bj_clone = clone(this->bj);  // this->: fixture is now a template
 
-    assert_same_precond(lend(bj_clone), lend(bj));
+    this->assert_same_precond(lend(bj_clone), lend(this->bj));
 }
 
 
-TEST_F(Jacobi, CanBeClonedWithAdaptvePrecision)
+TYPED_TEST(Jacobi, CanBeClonedWithAdaptvePrecision)  // sic: "Adaptve"
 {
-    auto bj_clone = clone(adaptive_bj);
-    assert_same_precond(lend(bj_clone), lend(adaptive_bj));
+    auto bj_clone = clone(this->adaptive_bj);
+    this->assert_same_precond(lend(bj_clone), lend(this->adaptive_bj));
 }
 
 
-TEST_F(Jacobi, CanBeCopied)
+TYPED_TEST(Jacobi, CanBeCopied)
 {
-    gko::Array<gko::int32> empty(exec, 1);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    gko::Array<index_type> empty(this->exec, 1);  // one entry, set to 0 below
     empty.get_data()[0] = 0;
-    auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate(
-        Mtx::create(exec));
+    auto copy = Bj::build()
+                    .with_block_pointers(empty)
+                    .on(this->exec)
+                    ->generate(Mtx::create(this->exec));  // placeholder target
 
-    copy->copy_from(lend(bj));
+    copy->copy_from(lend(this->bj));
 
-    assert_same_precond(lend(copy), lend(bj));
+    this->assert_same_precond(lend(copy), lend(this->bj));
 }
 
 
-TEST_F(Jacobi, CanBeCopiedWithAdaptivePrecision)
+TYPED_TEST(Jacobi, CanBeCopiedWithAdaptivePrecision)
 {
-    gko::Array<gko::int32> empty(exec, 1);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    gko::Array<index_type> empty(this->exec, 1);  // one entry, set to 0 below
     empty.get_data()[0] = 0;
-    auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate(
-        Mtx::create(exec));
+    auto copy = Bj::build()
+                    .with_block_pointers(empty)
+                    .on(this->exec)
+                    ->generate(Mtx::create(this->exec));  // placeholder target
 
-    copy->copy_from(lend(adaptive_bj));
+    copy->copy_from(lend(this->adaptive_bj));
 
-    assert_same_precond(lend(copy), lend(adaptive_bj));
+    this->assert_same_precond(lend(copy), lend(this->adaptive_bj));
 }
 
 
-TEST_F(Jacobi, CanBeMoved)
+TYPED_TEST(Jacobi, CanBeMoved)
 {
-    auto tmp = clone(bj);
-    gko::Array<gko::int32> empty(exec, 1);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    auto tmp = clone(this->bj);  // reference copy; bj is given away below
+    gko::Array<index_type> empty(this->exec, 1);
     empty.get_data()[0] = 0;
-    auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate(
-        Mtx::create(exec));
+    auto copy = Bj::build()
+                    .with_block_pointers(empty)
+                    .on(this->exec)
+                    ->generate(Mtx::create(this->exec));  // placeholder target
 
-    copy->copy_from(give(bj));
+    copy->copy_from(give(this->bj));
 
-    assert_same_precond(lend(copy), lend(tmp));
+    this->assert_same_precond(lend(copy), lend(tmp));
 }
 
 
-TEST_F(Jacobi, CanBeMovedWithAdaptivePrecision)
+TYPED_TEST(Jacobi, CanBeMovedWithAdaptivePrecision)
 {
-    auto tmp = clone(adaptive_bj);
-    gko::Array<gko::int32> empty(exec, 1);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    auto tmp = clone(this->adaptive_bj);  // reference copy for the final check
+    gko::Array<index_type> empty(this->exec, 1);
     empty.get_data()[0] = 0;
-    auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate(
-        Mtx::create(exec));
+    auto copy = Bj::build()
+                    .with_block_pointers(empty)
+                    .on(this->exec)
+                    ->generate(Mtx::create(this->exec));  // placeholder target
 
-    copy->copy_from(give(adaptive_bj));
+    copy->copy_from(give(this->adaptive_bj));
 
-    assert_same_precond(lend(copy), lend(tmp));
+    this->assert_same_precond(lend(copy), lend(tmp));
 }
 
 
-TEST_F(Jacobi, CanBeCleared)
+TYPED_TEST(Jacobi, CanBeCleared)
 {
-    bj->clear();
+    this->bj->clear();
 
-    ASSERT_EQ(bj->get_size(), gko::dim<2>(0, 0));
-    ASSERT_EQ(bj->get_num_stored_elements(), 0);
-    ASSERT_EQ(bj->get_parameters().max_block_size, 32);
-    ASSERT_EQ(bj->get_parameters().block_pointers.get_const_data(), nullptr);
-    ASSERT_EQ(bj->get_blocks(), nullptr);
+    ASSERT_EQ(this->bj->get_size(), gko::dim<2>(0, 0));
+    ASSERT_EQ(this->bj->get_num_stored_elements(), 0);
+    ASSERT_EQ(this->bj->get_parameters().max_block_size, 32);  // fixture set 3
+    ASSERT_EQ(this->bj->get_parameters().block_pointers.get_const_data(),
+              nullptr);
+    ASSERT_EQ(this->bj->get_blocks(), nullptr);
 }
 
 
-TEST_F(Jacobi, CanBeClearedWithAdaptivePrecision)
+TYPED_TEST(Jacobi, CanBeClearedWithAdaptivePrecision)
 {
-    adaptive_bj->clear();
-
-    ASSERT_EQ(adaptive_bj->get_size(), gko::dim<2>(0, 0));
-    ASSERT_EQ(adaptive_bj->get_num_stored_elements(), 0);
-    ASSERT_EQ(adaptive_bj->get_parameters().max_block_size, 32);
-    ASSERT_EQ(adaptive_bj->get_parameters().block_pointers.get_const_data(),
-              nullptr);
-    ASSERT_EQ(adaptive_bj->get_parameters()
+    this->adaptive_bj->clear();  // size, blocks and parameters checked below
+
+    ASSERT_EQ(this->adaptive_bj->get_size(), gko::dim<2>(0, 0));
+    ASSERT_EQ(this->adaptive_bj->get_num_stored_elements(), 0);
+    ASSERT_EQ(this->adaptive_bj->get_parameters().max_block_size, 32);
+    ASSERT_EQ(
+        this->adaptive_bj->get_parameters().block_pointers.get_const_data(),
+        nullptr);
+    ASSERT_EQ(this->adaptive_bj->get_parameters()
                   .storage_optimization.block_wise.get_const_data(),
               nullptr);
-    ASSERT_EQ(adaptive_bj->get_blocks(), nullptr);
+    ASSERT_EQ(this->adaptive_bj->get_blocks(), nullptr);
 }
 
 
 #define GKO_EXPECT_NONZERO_NEAR(first, second, tol) \
     EXPECT_EQ(first.row, second.row);               \
     EXPECT_EQ(first.column, second.column);         \
-    EXPECT_NEAR(first.value, second.value, tol)
+    GKO_EXPECT_NEAR(first.value, second.value, tol)
 
 
-TEST_F(Jacobi, GeneratesCorrectMatrixData)
+TYPED_TEST(Jacobi, GeneratesCorrectMatrixData)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    auto tol = r<value_type>::value;  // per-type bound replacing fixed 1e-14
+    gko::matrix_data<value_type, index_type> data;
 
-    bj->write(data);
+    this->bj->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>{5});
     ASSERT_EQ(data.nonzeros.size(), 13);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), 1e-14);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), tol);
 }
 
 
-TEST_F(Jacobi, GeneratesCorrectMatrixDataWithAdaptivePrecision)
+TYPED_TEST(Jacobi, GeneratesCorrectMatrixDataWithAdaptivePrecision)
 {
-    using tpl = gko::matrix_data<>::nonzero_type;
-    gko::matrix_data<> data;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    gko::matrix_data<value_type, index_type> data;
+    auto tol = r<value_type>::value;  // nonzeros 4-12 (was 1e-14)
+    auto half_tol = std::sqrt(r<value_type>::value);  // nonzeros 0-3 (was 1e-7)
 
-    adaptive_bj->write(data);
+    this->adaptive_bj->write(data);
 
     ASSERT_EQ(data.size, gko::dim<2>{5});
     ASSERT_EQ(data.nonzeros.size(), 13);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), 1e-7);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), 1e-7);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), 1e-7);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), 1e-7);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), 1e-14);
-    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), 1e-14);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), half_tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), half_tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), half_tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), half_tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), tol);
+    GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), tol);
 }
 
 
diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp
index be1cf1178f7..9fc4a028f27 100644
--- a/reference/test/preconditioner/jacobi_kernels.cpp
+++ b/reference/test/preconditioner/jacobi_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -34,31 +34,41 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
+#include <type_traits>
 
 
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/extended_float.hpp"
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename ValueIndexType>
 class Jacobi : public ::testing::Test {
 protected:
-    using Bj = gko::preconditioner::Jacobi<>;
-    using Mtx = gko::matrix::Csr<>;
-    using Vec = gko::matrix::Dense<>;
-    using mdata = gko::matrix_data<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Bj = gko::preconditioner::Jacobi<value_type, index_type>;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
+    using mdata = gko::matrix_data<value_type, index_type>;
 
     Jacobi()
         : exec(gko::ReferenceExecutor::create()),
           block_pointers(exec, 3),
           block_precisions(exec, 2),
-          mtx(gko::matrix::Csr<>::create(exec, gko::dim<2>{5}, 13))
+          mtx(gko::matrix::Csr<value_type, index_type>::create(
+              exec, gko::dim<2>{5}, 13))
     {
         block_pointers.get_data()[0] = 0;
         block_pointers.get_data()[1] = 2;
@@ -83,11 +93,12 @@ class Jacobi : public ::testing::Test {
                   |-1   4  -2
            -1     |    -1   4
          */
-        init_array(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13});
-        init_array(mtx->get_col_idxs(),
-                   {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4});
-        init_array(mtx->get_values(), {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0,
-                                       -1.0, 4.0, -2.0, -1.0, -1.0, 4.0});
+        init_array<index_type>(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13});
+        init_array<index_type>(mtx->get_col_idxs(),
+                               {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4});
+        init_array<value_type>(mtx->get_values(),
+                               {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0, -1.0,
+                                4.0, -2.0, -1.0, -1.0, 4.0});
     }
 
     template <typename T>
@@ -99,20 +110,22 @@ class Jacobi : public ::testing::Test {
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<Bj::Factory> bj_factory;
-    std::unique_ptr<Bj::Factory> adaptive_bj_factory;
-    gko::Array<gko::int32> block_pointers;
+    std::unique_ptr<typename Bj::Factory> bj_factory;
+    std::unique_ptr<typename Bj::Factory> adaptive_bj_factory;
+    gko::Array<index_type> block_pointers;
     gko::Array<gko::precision_reduction> block_precisions;
-    std::shared_ptr<gko::matrix::Csr<>> mtx;
+    std::shared_ptr<Mtx> mtx;  // Mtx = Csr<value_type, index_type>
 };
 
+TYPED_TEST_CASE(Jacobi, gko::test::ValueIndexTypes);
 
-TEST_F(Jacobi, CanBeGenerated)
+
+TYPED_TEST(Jacobi, CanBeGenerated)
 {
-    auto bj = bj_factory->generate(mtx);
+    auto bj = this->bj_factory->generate(this->mtx);
 
     ASSERT_NE(bj, nullptr);
-    EXPECT_EQ(bj->get_executor(), exec);
+    EXPECT_EQ(bj->get_executor(), this->exec);
     EXPECT_EQ(bj->get_parameters().max_block_size, 3);
     ASSERT_EQ(bj->get_size(), gko::dim<2>(5, 5));
     ASSERT_EQ(bj->get_num_blocks(), 2);
@@ -123,11 +136,11 @@ TEST_F(Jacobi, CanBeGenerated)
 }
 
 
-TEST_F(Jacobi, CanBeGeneratedWithAdaptivePrecision)
+TYPED_TEST(Jacobi, CanBeGeneratedWithAdaptivePrecision)
 {
-    auto bj = adaptive_bj_factory->generate(mtx);
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
 
-    EXPECT_EQ(bj->get_executor(), exec);
+    EXPECT_EQ(bj->get_executor(), this->exec);
     EXPECT_EQ(bj->get_parameters().max_block_size, 17);
     ASSERT_EQ(bj->get_size(), gko::dim<2>(5, 5));
     ASSERT_EQ(bj->get_num_blocks(), 2);
@@ -142,7 +155,7 @@ TEST_F(Jacobi, CanBeGeneratedWithAdaptivePrecision)
 }
 
 
-TEST_F(Jacobi, FindsNaturalBlocks)
+TYPED_TEST(Jacobi, FindsNaturalBlocks)
 {
     /* example matrix:
         1   1
@@ -150,11 +163,18 @@ TEST_F(Jacobi, FindsNaturalBlocks)
         1       1
         1       1
      */
-    auto mtx = Mtx::create(exec, gko::dim<2>{4}, 8);
-    init_array(mtx->get_row_ptrs(), {0, 2, 4, 6, 8});
-    init_array(mtx->get_col_idxs(), {0, 1, 0, 1, 0, 2, 0, 2});
-    init_array(mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
-    auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(give(mtx));
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    auto mtx = Mtx::create(this->exec, gko::dim<2>{4}, 8);
+    this->template init_array<index_type>(mtx->get_row_ptrs(), {0, 2, 4, 6, 8});
+    this->template init_array<index_type>(mtx->get_col_idxs(),
+                                          {0, 1, 0, 1, 0, 2, 0, 2});
+    this->template init_array<value_type>(
+        mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+    auto bj =
+        Bj::build().with_max_block_size(3u).on(this->exec)->generate(give(mtx));
 
     EXPECT_EQ(bj->get_parameters().max_block_size, 3);
     ASSERT_EQ(bj->get_num_blocks(), 2);
@@ -165,7 +185,7 @@ TEST_F(Jacobi, FindsNaturalBlocks)
 }
 
 
-TEST_F(Jacobi, ExecutesSupervariableAgglomeration)
+TYPED_TEST(Jacobi, ExecutesSupervariableAgglomeration)
 {
     /* example matrix:
         1   1
@@ -174,12 +194,19 @@ TEST_F(Jacobi, ExecutesSupervariableAgglomeration)
                 1   1
                         1
      */
-    auto mtx = Mtx::create(exec, gko::dim<2>{5}, 9);
-    init_array(mtx->get_row_ptrs(), {0, 2, 4, 6, 8, 9});
-    init_array(mtx->get_col_idxs(), {0, 1, 0, 1, 2, 3, 2, 3, 4});
-    init_array(mtx->get_values(),
-               {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
-    auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(give(mtx));
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    auto mtx = Mtx::create(this->exec, gko::dim<2>{5}, 9);
+    this->template init_array<index_type>(mtx->get_row_ptrs(),
+                                          {0, 2, 4, 6, 8, 9});
+    this->template init_array<index_type>(mtx->get_col_idxs(),
+                                          {0, 1, 0, 1, 2, 3, 2, 3, 4});
+    this->template init_array<value_type>(
+        mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+    auto bj =
+        Bj::build().with_max_block_size(3u).on(this->exec)->generate(give(mtx));
 
     EXPECT_EQ(bj->get_parameters().max_block_size, 3);
     ASSERT_EQ(bj->get_num_blocks(), 2);
@@ -190,7 +217,7 @@ TEST_F(Jacobi, ExecutesSupervariableAgglomeration)
 }
 
 
-TEST_F(Jacobi, AdheresToBlockSizeBound)
+TYPED_TEST(Jacobi, AdheresToBlockSizeBound)
 {
     /* example matrix:
         1
@@ -201,11 +228,19 @@ TEST_F(Jacobi, AdheresToBlockSizeBound)
                             1
                                 1
      */
-    auto mtx = Mtx::create(exec, gko::dim<2>{7}, 7);
-    init_array(mtx->get_row_ptrs(), {0, 1, 2, 3, 4, 5, 6, 7});
-    init_array(mtx->get_col_idxs(), {0, 1, 2, 3, 4, 5, 6});
-    init_array(mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
-    auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(give(mtx));
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    auto mtx = Mtx::create(this->exec, gko::dim<2>{7}, 7);
+    this->template init_array<index_type>(mtx->get_row_ptrs(),
+                                          {0, 1, 2, 3, 4, 5, 6, 7});
+    this->template init_array<index_type>(mtx->get_col_idxs(),
+                                          {0, 1, 2, 3, 4, 5, 6});
+    this->template init_array<value_type>(mtx->get_values(),
+                                          {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+    auto bj =
+        Bj::build().with_max_block_size(3u).on(this->exec)->generate(give(mtx));
 
     EXPECT_EQ(bj->get_parameters().max_block_size, 3);
     ASSERT_EQ(bj->get_num_blocks(), 3);
@@ -217,12 +252,14 @@ TEST_F(Jacobi, AdheresToBlockSizeBound)
 }
 
 
-TEST_F(Jacobi, CanBeGeneratedWithUnknownBlockSizes)
+TYPED_TEST(Jacobi, CanBeGeneratedWithUnknownBlockSizes)
 {
-    auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(mtx);
+    using Bj = typename TestFixture::Bj;
+    auto bj =
+        Bj::build().with_max_block_size(3u).on(this->exec)->generate(this->mtx);
 
     ASSERT_NE(bj, nullptr);
-    EXPECT_EQ(bj->get_executor(), exec);
+    EXPECT_EQ(bj->get_executor(), this->exec);
     EXPECT_EQ(bj->get_parameters().max_block_size, 3);
     ASSERT_EQ(bj->get_size(), gko::dim<2>(5, 5));
     ASSERT_EQ(bj->get_num_blocks(), 2);
@@ -233,397 +270,611 @@ TEST_F(Jacobi, CanBeGeneratedWithUnknownBlockSizes)
 }
 
 
-TEST_F(Jacobi, InvertsDiagonalBlocks)
+TYPED_TEST(Jacobi, InvertsDiagonalBlocks)
 {
-    auto bj = bj_factory->generate(mtx);
+    using T = typename TestFixture::value_type;
+    auto bj = this->bj_factory->generate(this->mtx);
 
     auto scheme = bj->get_storage_scheme();
     auto p = scheme.get_stride();
     auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0);
-    EXPECT_NEAR(b1[0 + 0 * p], 4.0 / 14.0, 1e-14);
-    EXPECT_NEAR(b1[0 + 1 * p], 2.0 / 14.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 0 * p], 1.0 / 14.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 1 * p], 4.0 / 14.0, 1e-14);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], T{2.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], T{1.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r<T>::value);
+
+    auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
+}
+
+
+TYPED_TEST(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecision)
+{
+    using T = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<T>::value);
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
+
+    auto scheme = bj->get_storage_scheme();
+    auto p = scheme.get_stride();
+    const auto b_prec_bj =
+        bj->get_parameters().storage_optimization.block_wise.get_const_data();
+    using reduced = ::gko::reduce_precision<T>;
+    auto b1 = reinterpret_cast<const reduced *>(
+        bj->get_blocks() + scheme.get_global_block_offset(0));
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{4.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{2.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{1.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{4.0 / 14.0}, half_tol);
+
+    auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
+}
 
+
+TYPED_TEST(Jacobi, CanTransposeDiagonalBlocks)
+{
+    using T = typename TestFixture::value_type;
+    using Bj = typename TestFixture::Bj;
+    auto tmp_bj = this->bj_factory->generate(this->mtx);
+
+    auto bj = gko::as<Bj>(tmp_bj->transpose());
+
+    auto scheme = bj->get_storage_scheme();
+    auto p = scheme.get_stride();
+    auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], T{2.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], T{1.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r<T>::value);
     auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
-    EXPECT_NEAR(b2[0 + 0 * p], 14.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[0 + 1 * p], 8.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[0 + 2 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 0 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 1 * p], 16.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 2 * p], 8.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 0 * p], 1.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 1 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 2 * p], 14.0 / 48.0, 1e-14);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
 }
 
-TEST_F(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecision)
+
+TYPED_TEST(Jacobi, CanTransposeDiagonalBlocksWithAdaptivePrecision)
 {
-    auto bj = adaptive_bj_factory->generate(mtx);
+    using T = typename TestFixture::value_type;
+    using Bj = typename TestFixture::Bj;
+    auto half_tol = std::sqrt(r<T>::value);
+    auto tmp_bj = this->adaptive_bj_factory->generate(this->mtx);
+
+    auto bj = gko::as<Bj>(tmp_bj->transpose());
 
     auto scheme = bj->get_storage_scheme();
     auto p = scheme.get_stride();
-    auto b1 = reinterpret_cast<const float *>(
+    using reduced = ::gko::reduce_precision<T>;
+    auto b1 = reinterpret_cast<const reduced *>(
         bj->get_blocks() + scheme.get_global_block_offset(0));
-    EXPECT_NEAR(b1[0 + 0 * p], 4.0 / 14.0, 1e-7);
-    EXPECT_NEAR(b1[0 + 1 * p], 2.0 / 14.0, 1e-7);
-    EXPECT_NEAR(b1[1 + 0 * p], 1.0 / 14.0, 1e-7);
-    EXPECT_NEAR(b1[1 + 1 * p], 4.0 / 14.0, 1e-7);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{4.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{2.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{1.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{4.0 / 14.0}, half_tol);
+    auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
+}
+
+
+TYPED_TEST(Jacobi, CanConjTransposeDiagonalBlocks)
+{
+    using T = typename TestFixture::value_type;
+    using Bj = typename TestFixture::Bj;
+    auto tmp_bj = this->bj_factory->generate(this->mtx);
+
+    auto bj = gko::as<Bj>(tmp_bj->conj_transpose());
+
+    auto scheme = bj->get_storage_scheme();
+    auto p = scheme.get_stride();
+    auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], T{2.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], T{1.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r<T>::value);
+    auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
+}
 
+
+TYPED_TEST(Jacobi, CanConjTransposeDiagonalBlocksWithAdaptivePrecision)
+{
+    using T = typename TestFixture::value_type;
+    using Bj = typename TestFixture::Bj;
+    auto half_tol = std::sqrt(r<T>::value);
+    auto tmp_bj = this->adaptive_bj_factory->generate(this->mtx);
+
+    auto bj = gko::as<Bj>(tmp_bj->conj_transpose());
+
+    auto scheme = bj->get_storage_scheme();
+    auto p = scheme.get_stride();
+    using reduced = ::gko::reduce_precision<T>;
+    auto b1 = reinterpret_cast<const reduced *>(
+        bj->get_blocks() + scheme.get_global_block_offset(0));
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{4.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{2.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{1.0 / 14.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{4.0 / 14.0}, half_tol);
     auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
-    EXPECT_NEAR(b2[0 + 0 * p], 14.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[0 + 1 * p], 8.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[0 + 2 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 0 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 1 * p], 16.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 2 * p], 8.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 0 * p], 1.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 1 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 2 * p], 14.0 / 48.0, 1e-14);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
 }
 
 
-TEST_F(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecisionAndSmallBlocks)
+TYPED_TEST(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecisionAndSmallBlocks)
 {
+    using Bj = typename TestFixture::Bj;
+    using T = typename TestFixture::value_type;
     auto bj = Bj::build()
                   .with_max_block_size(3u)
                   // group size will be > 1
-                  .with_block_pointers(block_pointers)
-                  .with_storage_optimization(block_precisions)
-                  .on(exec)
-                  ->generate(mtx);
+                  .with_block_pointers(this->block_pointers)
+                  .with_storage_optimization(this->block_precisions)
+                  .on(this->exec)
+                  ->generate(this->mtx);
 
     auto scheme = bj->get_storage_scheme();
     auto p = scheme.get_stride();
     auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0);
-    EXPECT_NEAR(b1[0 + 0 * p], 4.0 / 14.0, 1e-14);
-    EXPECT_NEAR(b1[0 + 1 * p], 2.0 / 14.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 0 * p], 1.0 / 14.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 1 * p], 4.0 / 14.0, 1e-14);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], T{2.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], T{1.0 / 14.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r<T>::value);
 
     auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1);
-    EXPECT_NEAR(b2[0 + 0 * p], 14.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[0 + 1 * p], 8.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[0 + 2 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 0 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 1 * p], 16.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[1 + 2 * p], 8.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 0 * p], 1.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 1 * p], 4.0 / 48.0, 1e-14);
-    EXPECT_NEAR(b2[2 + 2 * p], 14.0 / 48.0, 1e-14);
+    GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 1 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[0 + 2 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 0 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[1 + 2 * p], T{8.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 0 * p], T{1.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 1 * p], T{4.0 / 48.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r<T>::value);
 }
 
 
-TEST_F(Jacobi, PivotsWhenInvertingBlocks)
+TYPED_TEST(Jacobi, PivotsWhenInvertingBlocks)
 {
-    gko::Array<gko::int32> bp(exec, 2);
-    init_array(bp.get_data(), {0, 3});
-    auto mtx = Mtx::create(exec, gko::dim<2>{3}, 9);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    gko::Array<index_type> bp(this->exec, 2);
+    this->template init_array<index_type>(bp.get_data(), {0, 3});
+    auto mtx = Mtx::create(this->exec, gko::dim<2>{3}, 9);
     /* test matrix:
        0 2 0
        0 0 4
        1 0 0
      */
-    init_array(mtx->get_row_ptrs(), {0, 3, 6, 9});
-    init_array(mtx->get_col_idxs(), {0, 1, 2, 0, 1, 2, 0, 1, 2});
-    init_array(mtx->get_values(),
-               {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0});
+    this->template init_array<index_type>(mtx->get_row_ptrs(), {0, 3, 6, 9});
+    this->template init_array<index_type>(mtx->get_col_idxs(),
+                                          {0, 1, 2, 0, 1, 2, 0, 1, 2});
+    this->template init_array<T>(mtx->get_values(),
+                                 {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0});
 
     auto bj = Bj::build()
                   .with_max_block_size(3u)
                   .with_block_pointers(bp)
-                  .on(exec)
+                  .on(this->exec)
                   ->generate(give(mtx));
 
     auto scheme = bj->get_storage_scheme();
     auto p = scheme.get_stride();
     auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0);
-    EXPECT_NEAR(b1[0 + 0 * p], 0.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[0 + 1 * p], 0.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[0 + 2 * p], 4.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 0 * p], 2.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 1 * p], 0.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[1 + 2 * p], 0.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[2 + 0 * p], 0.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[2 + 1 * p], 1.0 / 4.0, 1e-14);
-    EXPECT_NEAR(b1[2 + 2 * p], 0.0 / 4.0, 1e-14);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], T{0.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], T{0.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[0 + 2 * p], T{4.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], T{2.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], T{0.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[1 + 2 * p], T{0.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[2 + 0 * p], T{0.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[2 + 1 * p], T{1.0 / 4.0}, r<T>::value);
+    GKO_EXPECT_NEAR(b1[2 + 2 * p], T{0.0 / 4.0}, r<T>::value);
 }
 
 
-TEST_F(Jacobi, PivotsWhenInvertingBlocksWithiAdaptivePrecision)
+TYPED_TEST(Jacobi, PivotsWhenInvertingBlocksWithiAdaptivePrecision)
 {
-    gko::Array<gko::int32> bp(exec, 2);
-    init_array(bp.get_data(), {0, 3});
-    auto mtx = Mtx::create(exec, gko::dim<2>{3}, 9);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    using T = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<T>::value);
+    gko::Array<index_type> bp(this->exec, 2);
+    this->template init_array<index_type>(bp.get_data(), {0, 3});
+    auto mtx = Mtx::create(this->exec, gko::dim<2>{3}, 9);
     /* test matrix:
        0 2 0
        0 0 4
        1 0 0
      */
-    init_array(mtx->get_row_ptrs(), {0, 3, 6, 9});
-    init_array(mtx->get_col_idxs(), {0, 1, 2, 0, 1, 2, 0, 1, 2});
-    init_array(mtx->get_values(),
-               {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0});
+    this->template init_array<index_type>(mtx->get_row_ptrs(), {0, 3, 6, 9});
+    this->template init_array<index_type>(mtx->get_col_idxs(),
+                                          {0, 1, 2, 0, 1, 2, 0, 1, 2});
+    this->template init_array<T>(mtx->get_values(),
+                                 {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0});
 
     auto bj = Bj::build()
                   .with_max_block_size(3u)
                   .with_block_pointers(bp)
-                  .with_storage_optimization(block_precisions)
-                  .on(exec)
+                  .with_storage_optimization(this->block_precisions)
+                  .on(this->exec)
                   ->generate(give(mtx));
 
     auto scheme = bj->get_storage_scheme();
     auto p = scheme.get_stride();
-    auto b1 = reinterpret_cast<const float *>(
+    using reduced = ::gko::reduce_precision<T>;
+    auto b1 = reinterpret_cast<const reduced *>(
         bj->get_blocks() + scheme.get_global_block_offset(0));
-    EXPECT_NEAR(b1[0 + 0 * p], 0.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[0 + 1 * p], 0.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[0 + 2 * p], 4.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[1 + 0 * p], 2.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[1 + 1 * p], 0.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[1 + 2 * p], 0.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[2 + 0 * p], 0.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[2 + 1 * p], 1.0 / 4.0, 1e-7);
-    EXPECT_NEAR(b1[2 + 2 * p], 0.0 / 4.0, 1e-7);
+    GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{0.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{0.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[0 + 2 * p], reduced{4.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{2.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{0.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[1 + 2 * p], reduced{0.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[2 + 0 * p], reduced{0.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[2 + 1 * p], reduced{1.0 / 4.0}, half_tol);
+    GKO_EXPECT_NEAR(b1[2 + 2 * p], reduced{0.0 / 4.0}, half_tol);
 }
 
 
-TEST_F(Jacobi, ComputesConditionNumbersOfBlocks)
+TYPED_TEST(Jacobi, ComputesConditionNumbersOfBlocks)
 {
-    auto bj = adaptive_bj_factory->generate(mtx);
+    using T = typename TestFixture::value_type;
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
 
     auto cond = bj->get_conditioning();
-    EXPECT_NEAR(cond[0], 6.0 * 6.0 / 14.0, 1e-14);
-    ASSERT_NEAR(cond[1], 7.0 * 28.0 / 48.0, 1e-14);
+    GKO_EXPECT_NEAR(cond[0], gko::remove_complex<T>{6.0 * 6.0 / 14.0},
+                    r<T>::value * 1e1);
+    GKO_ASSERT_NEAR(cond[1], gko::remove_complex<T>{7.0 * 28.0 / 48.0},
+                    r<T>::value * 1e1);
 }
 
 
-TEST_F(Jacobi, SelectsCorrectBlockPrecisions)
+TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions)
 {
+    using Bj = typename TestFixture::Bj;
+    using T = typename TestFixture::value_type;
     auto bj =
         Bj::build()
             .with_max_block_size(17u)
-            .with_block_pointers(block_pointers)
+            .with_block_pointers(this->block_pointers)
             .with_storage_optimization(gko::precision_reduction::autodetect())
-            .with_accuracy(1.5e-3)
-            .on(exec)
-            ->generate(give(mtx));
+            .with_accuracy(gko::remove_complex<T>{1.5e-3})
+            .on(this->exec)
+            ->generate(give(this->mtx));
 
     auto prec =
         bj->get_parameters().storage_optimization.block_wise.get_const_data();
+    auto precision2 = std::is_same<gko::remove_complex<T>, float>::value
+                          ? gko::precision_reduction(0, 0)   // float
+                          : gko::precision_reduction(0, 1);  // double
     EXPECT_EQ(prec[0], gko::precision_reduction(0, 2));  // u * cond = ~1.2e-3
-    ASSERT_EQ(prec[1], gko::precision_reduction(0, 1));  // u * cond = ~2.0e-3
+    ASSERT_EQ(prec[1], precision2);                      // u * cond = ~2.0e-3
 }
 
 
-TEST_F(Jacobi, AvoidsPrecisionsThatOverflow)
+TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow)
 {
-    auto mtx = gko::matrix::Csr<>::create(exec);
+    using Bj = typename TestFixture::Bj;
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    using mdata = typename TestFixture::mdata;
+    using T = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<T>::value);
+    auto mtx = Mtx::create(this->exec);
     // clang-format off
     mtx->read(mdata::diag({
-        // perfectly conditioned block, small value difference,
-        // can use fp16 (5, 10)
-        {{2.0, 1.0},
-         {1.0, 2.0}},
-        // perfectly conditioned block (scaled orthogonal),
-        // with large value difference, need fp16 (7, 8)
-        {{1e-7, -1e-14},
-         {1e-14,  1e-7}}
+                // perfectly conditioned block, small value difference,
+                // can use fp16 (5, 10)
+                {{2.0, 1.0},
+                 {1.0, 2.0}},
+                // perfectly conditioned block (scaled orthogonal),
+                // with large value difference, need fp16 (7, 8)
+                {{half_tol, -r<T>::value},
+                 {r<T>::value,  half_tol}}
     }));
     // clang-format on
 
     auto bj =
         Bj::build()
             .with_max_block_size(13u)
-            .with_block_pointers(gko::Array<gko::int32>(exec, {0, 2, 4}))
+            .with_block_pointers(gko::Array<index_type>(this->exec, {0, 2, 4}))
             .with_storage_optimization(gko::precision_reduction::autodetect())
-            .with_accuracy(0.1)
-            .on(exec)
+            .with_accuracy(gko::remove_complex<T>{1e-1})
+            .on(this->exec)
             ->generate(give(mtx));
 
     // both blocks are in the same group, both need (7, 8)
     auto prec =
         bj->get_parameters().storage_optimization.block_wise.get_const_data();
-    EXPECT_EQ(prec[0], gko::precision_reduction(1, 1));
-    ASSERT_EQ(prec[1], gko::precision_reduction(1, 1));
+    auto precision = std::is_same<gko::remove_complex<T>, float>::value
+                         ? gko::precision_reduction(0, 2)   // float
+                         : gko::precision_reduction(1, 1);  // double
+    EXPECT_EQ(prec[0], precision);
+    ASSERT_EQ(prec[1], precision);
 }
 
 
-TEST_F(Jacobi, AppliesToVector)
+TYPED_TEST(Jacobi, AppliesToVector)
 {
-    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, exec);
-    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, exec);
-    auto bj = bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
+    auto bj = this->bj_factory->generate(this->mtx);
 
     bj->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), r<value_type>::value);
 }
 
 
-TEST_F(Jacobi, AppliesToVectorWithAdaptivePrecision)
+TYPED_TEST(Jacobi, AppliesToVectorWithAdaptivePrecision)
 {
-    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, exec);
-    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, exec);
-    auto bj = adaptive_bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
 
     bj->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), 1e-7);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), half_tol);
 }
 
 
-TEST_F(Jacobi, AppliesToVectorWithAdaptivePrecisionAndSmallBlocks)
+TYPED_TEST(Jacobi, AppliesToVectorWithAdaptivePrecisionAndSmallBlocks)
 {
-    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, exec);
-    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, exec);
+    using Bj = typename TestFixture::Bj;
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
     auto bj = Bj::build()
                   .with_max_block_size(3u)
                   // group size will be > 1
-                  .with_block_pointers(block_pointers)
-                  .with_storage_optimization(block_precisions)
-                  .on(exec)
-                  ->generate(mtx);
+                  .with_block_pointers(this->block_pointers)
+                  .with_storage_optimization(this->block_precisions)
+                  .on(this->exec)
+                  ->generate(this->mtx);
 
     bj->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), 1e-7);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), half_tol);
 }
 
 
-TEST_F(Jacobi, AppliesToMultipleVectors)
+TYPED_TEST(Jacobi, AppliesToMultipleVectors)
 {
-    auto x = gko::initialize<Vec>(
-        3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}},
-        exec);
-    auto b = gko::initialize<Vec>(
-        3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}},
-        exec);
-    auto bj = bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
+    auto bj = this->bj_factory->generate(this->mtx);
 
     bj->apply(b.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(
         x, l({{1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}}),
-        1e-14);
+        r<value_type>::value);
 }
 
 
-TEST_F(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecision)
+TYPED_TEST(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecision)
 {
-    auto x = gko::initialize<Vec>(
-        3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}},
-        exec);
-    auto b = gko::initialize<Vec>(
-        3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}},
-        exec);
-    auto bj = adaptive_bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
 
     bj->apply(b.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(
         x, l({{1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}}),
-        1e-7);
+        half_tol);
 }
 
 
-TEST_F(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecisionAndSmallBlocks)
+TYPED_TEST(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecisionAndSmallBlocks)
 {
-    auto x = gko::initialize<Vec>(
-        3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}},
-        exec);
-    auto b = gko::initialize<Vec>(
-        3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}},
-        exec);
+    using Vec = typename TestFixture::Vec;
+    using Bj = typename TestFixture::Bj;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
     auto bj = Bj::build()
                   .with_max_block_size(3u)
                   // group size will be > 1
-                  .with_block_pointers(block_pointers)
-                  .with_storage_optimization(block_precisions)
-                  .on(exec)
-                  ->generate(mtx);
+                  .with_block_pointers(this->block_pointers)
+                  .with_storage_optimization(this->block_precisions)
+                  .on(this->exec)
+                  ->generate(this->mtx);
 
     bj->apply(b.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(
         x, l({{1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}}),
-        1e-7);
+        half_tol);
 }
 
 
-TEST_F(Jacobi, AppliesLinearCombinationToVector)
+TYPED_TEST(Jacobi, AppliesLinearCombinationToVector)
 {
-    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, exec);
-    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, exec);
-    auto alpha = gko::initialize<Vec>({2.0}, exec);
-    auto beta = gko::initialize<Vec>({-1.0}, exec);
-    auto bj = bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto bj = this->bj_factory->generate(this->mtx);
 
     bj->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}),
+                        r<value_type>::value);
 }
 
 
-TEST_F(Jacobi, AppliesLinearCombinationToVectorWithAdaptivePrecision)
+TYPED_TEST(Jacobi, AppliesLinearCombinationToVectorWithAdaptivePrecision)
 {
-    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, exec);
-    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, exec);
-    auto alpha = gko::initialize<Vec>({2.0}, exec);
-    auto beta = gko::initialize<Vec>({-1.0}, exec);
-    auto bj = adaptive_bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
 
     bj->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), 1e-7);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), half_tol);
 }
 
 
-TEST_F(Jacobi, AppliesLinearCombinationToMultipleVectors)
+TYPED_TEST(Jacobi, AppliesLinearCombinationToMultipleVectors)
 {
-    auto x = gko::initialize<Vec>(
-        3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}},
-        exec);
-    auto b = gko::initialize<Vec>(
-        3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}},
-        exec);
-    auto alpha = gko::initialize<Vec>({2.0}, exec);
-    auto beta = gko::initialize<Vec>({-1.0}, exec);
-    auto bj = bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto bj = this->bj_factory->generate(this->mtx);
 
     bj->apply(alpha.get(), b.get(), beta.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(
         x, l({{1.0, -0.5}, {1.0, 2.5}, {-2.0, -1.0}, {4.0, 1.0}, {-3.0, 0.5}}),
-        1e-14);
+        r<value_type>::value);
 }
 
 
-TEST_F(Jacobi, AppliesLinearCombinationToMultipleVectorsWithAdaptivePrecision)
+TYPED_TEST(Jacobi,
+           AppliesLinearCombinationToMultipleVectorsWithAdaptivePrecision)
 {
-    auto x = gko::initialize<Vec>(
-        3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}},
-        exec);
-    auto b = gko::initialize<Vec>(
-        3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}},
-        exec);
-    auto alpha = gko::initialize<Vec>({2.0}, exec);
-    auto beta = gko::initialize<Vec>({-1.0}, exec);
-    auto bj = adaptive_bj_factory->generate(mtx);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto bj = this->adaptive_bj_factory->generate(this->mtx);
 
     bj->apply(alpha.get(), b.get(), beta.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(
         x, l({{1.0, -0.5}, {1.0, 2.5}, {-2.0, -1.0}, {4.0, 1.0}, {-3.0, 0.5}}),
-        1e-7);
+        half_tol);
 }
 
 
-TEST_F(Jacobi, ConvertsToDense)
+TYPED_TEST(Jacobi, ConvertsToDense)
 {
-    auto dense = gko::matrix::Dense<>::create(exec);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto dense = Vec::create(this->exec);
 
-    dense->copy_from(bj_factory->generate(mtx));
+    dense->copy_from(this->bj_factory->generate(this->mtx));
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense,
@@ -631,16 +882,19 @@ TEST_F(Jacobi, ConvertsToDense)
            {1.0 / 14, 4.0 / 14,       0.0,       0.0,       0.0},
            {     0.0,      0.0, 14.0 / 48,  8.0 / 48,  4.0 / 48},
            {     0.0,      0.0,  4.0 / 48, 16.0 / 48,  8.0 / 48},
-           {     0.0,      0.0,  1.0 / 48,  4.0 / 48, 14.0 / 48}}), 1e-14);
+           {     0.0,      0.0,  1.0 / 48,  4.0 / 48, 14.0 / 48}}), r<value_type>::value);
     // clang-format on
 }
 
 
-TEST_F(Jacobi, ConvertsToDenseWithAdaptivePrecision)
+TYPED_TEST(Jacobi, ConvertsToDenseWithAdaptivePrecision)
 {
-    auto dense = gko::matrix::Dense<>::create(exec);
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto dense = Vec::create(this->exec);
 
-    dense->copy_from(adaptive_bj_factory->generate(mtx));
+    dense->copy_from(this->adaptive_bj_factory->generate(this->mtx));
 
     // clang-format off
     GKO_ASSERT_MTX_NEAR(dense,
@@ -648,9 +902,22 @@ TEST_F(Jacobi, ConvertsToDenseWithAdaptivePrecision)
            {1.0 / 14, 4.0 / 14,       0.0,       0.0,       0.0},
            {     0.0,      0.0, 14.0 / 48,  8.0 / 48,  4.0 / 48},
            {     0.0,      0.0,  4.0 / 48, 16.0 / 48,  8.0 / 48},
-           {     0.0,      0.0,  1.0 / 48,  4.0 / 48, 14.0 / 48}}), 1e-7);
+           {     0.0,      0.0,  1.0 / 48,  4.0 / 48, 14.0 / 48}}), half_tol);
     // clang-format on
 }
 
 
+TYPED_TEST(Jacobi, ConvertsEmptyToDense)
+{
+    using Vec = typename TestFixture::Vec;
+    auto res = Vec::create(this->exec);
+    auto empty_mtx = gko::share(Vec::create(this->exec));
+    auto factory = TestFixture::Bj::build().on(this->exec);
+
+    res->copy_from(factory->generate(empty_mtx));
+
+    ASSERT_FALSE(res->get_size());
+}
+
+
 }  // namespace
diff --git a/reference/test/solver/CMakeLists.txt b/reference/test/solver/CMakeLists.txt
index 86a6bbf576b..c86beb38b5c 100644
--- a/reference/test/solver/CMakeLists.txt
+++ b/reference/test/solver/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_test(bicg_kernels)
 ginkgo_create_test(bicgstab_kernels)
 ginkgo_create_test(cg_kernels)
 ginkgo_create_test(cgs_kernels)
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
new file mode 100644
index 00000000000..e1d5b35da65
--- /dev/null
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -0,0 +1,338 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/bicg.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+#include <ginkgo/core/stop/time.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+// Typed fixture for the reference BiCG tests: a 3x3 stencil matrix, a 6x6
+// dense matrix, a non-symmetric 3x3 matrix and the solver factories.
+template <typename T>
+class Bicg : public ::testing::Test {
+protected:
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Bicg<value_type>;
+    // Initializer order mirrors the member declaration order below.
+    Bicg()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::initialize<Mtx>(
+              {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
+          mtx_big(gko::initialize<Mtx>(
+              {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0},
+               {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5},
+               {4150.0, 1805.0, 6472.5, 2656.0, 2409.5, 3836.5},
+               {-3139.5, 73.0, 2656.0, 6048.0, 665.0, -132.0},
+               {3829.5, 1966.0, 2409.5, 665.0, 4240.5, 4373.5},
+               {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}},
+              exec)),
+          mtx_non_symmetric(gko::initialize<Mtx>(
+              {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
+          bicg_factory(
+              Solver::build()
+                  .with_criteria(
+                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
+                      gko::stop::Time::build()
+                          .with_time_limit(std::chrono::seconds(6))
+                          .on(exec),
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
+                          .on(exec))
+                  .on(exec)),
+          bicg_factory_big(
+              Solver::build()
+                  .with_criteria(
+                      gko::stop::Iteration::build().with_max_iters(100u).on(
+                          exec),
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
+                          .on(exec))
+                  .on(exec))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<Mtx> mtx;
+    std::shared_ptr<Mtx> mtx_big;
+    std::shared_ptr<Mtx> mtx_non_symmetric;
+    std::unique_ptr<typename Solver::Factory> bicg_factory;
+    std::unique_ptr<typename Solver::Factory> bicg_factory_big;
+};
+
+TYPED_TEST_CASE(Bicg, gko::test::ValueTypes);
+
+
+TYPED_TEST(Bicg, SolvesStencilSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto bicg = this->bicg_factory->generate(this->mtx);
+
+    bicg->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value);
+}
+
+
+TYPED_TEST(Bicg, SolvesMultipleStencilSystems)
+{
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
+    auto rhs = gko::initialize<Mtx>(
+        {I<T>{-1.0, 1.0}, I<T>{3.0, 0.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto bicg = this->bicg_factory->generate(this->mtx);
+
+    bicg->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}),
+                        r<value_type>::value);
+}
+
+
+TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto bicg = this->bicg_factory->generate(this->mtx);
+
+    bicg->apply(alpha.get(), rhs.get(), beta.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r<value_type>::value);
+}
+
+
+TYPED_TEST(Bicg, SolvesMultipleStencilSystemsUsingAdvancedApply)
+{
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
+    auto rhs = gko::initialize<Mtx>(
+        {I<T>{-1.0, 1.0}, I<T>{3.0, 0.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto bicg = this->bicg_factory->generate(this->mtx);
+
+    bicg->apply(alpha.get(), rhs.get(), beta.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}),
+                        r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Bicg, SolvesBigDenseSystem1)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>(
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto bicg = this->bicg_factory_big->generate(this->mtx_big);
+
+    bicg->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e2);
+}
+
+
+TYPED_TEST(Bicg, SolvesBigDenseSystem2)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>(
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);
+    auto bicg = this->bicg_factory_big->generate(this->mtx_big);
+
+    bicg->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
+                        r<value_type>::value * 1e2);
+}
+
+
+TYPED_TEST(Bicg, SolvesNonSymmetricStencilSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
+    auto bicg = this->bicg_factory->generate(this->mtx_non_symmetric);
+
+    bicg->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e2);
+}
+
+
+// Returns the largest absolute value found in column `col` of `mat`.
+template <typename T>
+gko::remove_complex<T> infNorm(gko::matrix::Dense<T> *mat, size_t col = 0)
+{
+    using std::abs;
+    gko::remove_complex<T> largest = 0.0;
+    for (size_t row = 0; row < mat->get_size()[0]; ++row) {
+        const gko::remove_complex<T> magnitude = abs(mat->at(row, col));
+        largest = (largest < magnitude) ? magnitude : largest;
+    }
+    return largest;
+}
+
+
+TYPED_TEST(Bicg, SolvesMultipleDenseSystemForDivergenceCheck)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto bicg = this->bicg_factory_big->generate(this->mtx_big);
+    auto alpha = gko::initialize<Mtx>({1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b1 = gko::initialize<Mtx>(
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto b2 = gko::initialize<Mtx>(
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);
+    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+
+    // Stack the two right-hand sides and initial guesses column by column.
+    auto bc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2});
+    auto xc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2});
+    for (size_t row = 0; row < bc->get_size()[0]; ++row) {
+        bc->at(row, 0) = b1->at(row);
+        xc->at(row, 0) = x1->at(row);
+        bc->at(row, 1) = b2->at(row);
+        xc->at(row, 1) = x2->at(row);
+    }
+
+    // Solve each system on its own, then both at once.
+    bicg->apply(b1.get(), x1.get());
+    bicg->apply(b2.get(), x2.get());
+    bicg->apply(bc.get(), xc.get());
+    auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2});
+    for (size_t row = 0; row < mergedRes->get_size()[0]; ++row) {
+        mergedRes->at(row, 0) = x1->at(row);
+        mergedRes->at(row, 1) = x2->at(row);
+    }
+
+    // Residuals start as copies of b and become A * x - b (alpha=1, beta=-1).
+    auto residual1 = Mtx::create(this->exec, b1->get_size());
+    auto residual2 = Mtx::create(this->exec, b2->get_size());
+    auto residualC = Mtx::create(this->exec, bc->get_size());
+    residual1->copy_from(b1.get());
+    residual2->copy_from(b2.get());
+    residualC->copy_from(bc.get());
+
+    this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
+    this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
+    this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
+
+    auto normB1 = infNorm(b1.get());
+    auto normB2 = infNorm(b2.get());
+    auto normS1 = infNorm(residual1.get());
+    auto normS2 = infNorm(residual2.get());
+    auto normC1 = infNorm(residualC.get(), 0);
+    auto normC2 = infNorm(residualC.get(), 1);
+
+    // The combined solve must not give a worse relative residual than the
+    // corresponding single-system solve (up to the type's tolerance).
+    ASSERT_LE(normC1 / normB1, normS1 / normB1 + r<value_type>::value);
+    ASSERT_LE(normC2 / normB2, normS2 / normB2 + r<value_type>::value);
+
+    // Possibly redundant with the residual checks above, but also compare
+    // the combined solution against the individually computed ones.
+    GKO_ASSERT_MTX_NEAR(xc, mergedRes, r<value_type>::value);
+}
+
+
+TYPED_TEST(Bicg, SolvesTransposedNonSymmetricStencilSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
+    auto bicg =
+        this->bicg_factory->generate(this->mtx_non_symmetric->transpose());
+
+    bicg->transpose()->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e2);
+}
+
+
+TYPED_TEST(Bicg, SolvesConjTransposedNonSymmetricStencilSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
+    auto bicg =
+        this->bicg_factory->generate(this->mtx_non_symmetric->conj_transpose());
+
+    bicg->conj_transpose()->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e2);
+}
+
+
+}  // namespace
diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp
index c6ceb0b88ba..45836e2ac01 100644
--- a/reference/test/solver/bicgstab_kernels.cpp
+++ b/reference/test/solver/bicgstab_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,23 +36,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Bicgstab : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Bicgstab<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Bicgstab<value_type>;
 
     Bicgstab()
         : exec(gko::ReferenceExecutor::create()),
@@ -65,91 +69,113 @@ class Bicgstab : public ::testing::Test {
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           bicgstab_factory_precision(
-              gko::solver::Bicgstab<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(50u).on(
                           exec),
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec))
     {}
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<gko::solver::Bicgstab<>::Factory> bicgstab_factory;
-    std::unique_ptr<gko::solver::Bicgstab<>::Factory>
-        bicgstab_factory_precision;
+    std::unique_ptr<typename Solver::Factory> bicgstab_factory;
+    std::unique_ptr<typename Solver::Factory> bicgstab_factory_precision;
 };
 
+TYPED_TEST_CASE(Bicgstab, gko::test::ValueTypes);
 
-TEST_F(Bicgstab, SolvesDenseSystem)
+
+TYPED_TEST(Bicgstab, SolvesDenseSystem)
 {
-    auto solver = bicgstab_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->bicgstab_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol);
 }
 
 
-TEST_F(Bicgstab, SolvesMultipleDenseSystems)
+TYPED_TEST(Bicgstab, SolvesMultipleDenseSystems)
 {
-    auto solver = bicgstab_factory->generate(mtx);
-    auto b =
-        gko::initialize<Mtx>({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->bicgstab_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{-1.0, -5.0}, I<T>{3.0, 1.0}, I<T>{1.0, -2.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}),
+                        half_tol);
 }
 
 
-TEST_F(Bicgstab, SolvesDenseSystemUsingAdvancedApply)
+TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply)
 {
-    auto solver = bicgstab_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->bicgstab_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
 
-    GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), half_tol);
 }
 
 
-TEST_F(Bicgstab, SolvesMultipleDenseSystemsUsingAdvancedApply)
+TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsUsingAdvancedApply)
 {
-    auto solver = bicgstab_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b =
-        gko::initialize<Mtx>({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->bicgstab_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{-1.0, -5.0}, I<T>{3.0, 1.0}, I<T>{1.0, -2.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
 
-    GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}),
+                        half_tol);
 }
 
 
 // The following test-data was generated and validated with MATLAB
-TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1)
+TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1)
 {
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
                               {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0},
@@ -157,10 +183,11 @@ TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1)
                               {60.0, -86.0, 54.0, -40.0, -93.0, 56.0},
                               {53.0, 94.0, -54.0, 86.0, -61.0, 4.0},
                               {-42.0, 57.0, 32.0, 89.0, 89.0, -39.0}},
-                             exec);
-    auto solver = bicgstab_factory_precision->generate(locmtx);
-    auto b = gko::initialize<Mtx>({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+                             this->exec);
+    auto solver = this->bicgstab_factory_precision->generate(locmtx);
+    auto b =
+        gko::initialize<Mtx>({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
@@ -168,12 +195,15 @@ TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1)
         x,
         l({0.13853406350816114, -0.08147485210505287, -0.0450299311807042,
            -0.0051264177562865719, 0.11609654300797841, 0.1018688746740561}),
-        1e-9);
+        half_tol * 5e-1);
 }
 
 
-TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2)
+TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2)
 {
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
                               {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0},
@@ -181,10 +211,11 @@ TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2)
                               {60.0, -86.0, 54.0, -40.0, -93.0, 56.0},
                               {53.0, 94.0, -54.0, 86.0, -61.0, 4.0},
                               {-42.0, 57.0, 32.0, 89.0, 89.0, -39.0}},
-                             exec);
-    auto solver = bicgstab_factory_precision->generate(locmtx);
-    auto b = gko::initialize<Mtx>({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+                             this->exec);
+    auto solver = this->bicgstab_factory_precision->generate(locmtx);
+    auto b =
+        gko::initialize<Mtx>({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
@@ -192,24 +223,29 @@ TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2)
         x,
         l({0.13517641417299162, 0.75117689075221139, 0.47572853185155239,
            -0.50927993095367852, 0.13463333820848167, 0.23126768306576015}),
-        1e-9);
+        half_tol * 1e-1);
 }
 
 
-double infNorm(gko::matrix::Dense<> *mat, size_t col = 0)
+template <typename T>
+gko::remove_complex<T> infNorm(gko::matrix::Dense<T> *mat, size_t col = 0)
 {
     using std::abs;
-    double norm = 0.0;
+    using no_cpx_t = gko::remove_complex<T>;
+    no_cpx_t norm = 0.0;
     for (size_t i = 0; i < mat->get_size()[0]; ++i) {
-        double absEntry = abs(mat->at(i, col));
+        no_cpx_t absEntry = abs(mat->at(i, col));
         if (norm < absEntry) norm = absEntry;
     }
     return norm;
 }
 
 
-TEST_F(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck)
+TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck)
 {
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
                               {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0},
@@ -217,16 +253,20 @@ TEST_F(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck)
                               {60.0, -86.0, 54.0, -40.0, -93.0, 56.0},
                               {53.0, 94.0, -54.0, 86.0, -61.0, 4.0},
                               {-42.0, 57.0, 32.0, 89.0, 89.0, -39.0}},
-                             exec);
-    auto solver = bicgstab_factory_precision->generate(locmtx);
-    auto b1 = gko::initialize<Mtx>({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, exec);
-    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto b2 = gko::initialize<Mtx>({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, exec);
-    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto bc = gko::initialize<Mtx>(
-        {{0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, exec);
-    auto xc = gko::initialize<Mtx>(
-        {{0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, exec);
+                             this->exec);
+    auto solver = this->bicgstab_factory_precision->generate(locmtx);
+    auto b1 =
+        gko::initialize<Mtx>({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, this->exec);
+    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto b2 =
+        gko::initialize<Mtx>({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, this->exec);
+    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto bc = gko::initialize<Mtx>({I<T>{0., 0.}, I<T>{0., 0.}, I<T>{0., 0.},
+                                    I<T>{0., 0.}, I<T>{0., 0.}, I<T>{0., 0.}},
+                                   this->exec);
+    auto xc = gko::initialize<Mtx>({I<T>{0., 0.}, I<T>{0., 0.}, I<T>{0., 0.},
+                                    I<T>{0., 0.}, I<T>{0., 0.}, I<T>{0., 0.}},
+                                   this->exec);
     for (size_t i = 0; i < xc->get_size()[0]; ++i) {
         bc->at(i, 0) = b1->at(i);
         bc->at(i, 1) = b2->at(i);
@@ -237,42 +277,74 @@ TEST_F(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck)
     solver->apply(b1.get(), x1.get());
     solver->apply(b2.get(), x2.get());
     solver->apply(bc.get(), xc.get());
-    auto testMtx = gko::initialize<Mtx>(
-        {{0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, exec);
+    auto testMtx =
+        gko::initialize<Mtx>({I<T>{0., 0.}, I<T>{0., 0.}, I<T>{0., 0.},
+                              I<T>{0., 0.}, I<T>{0., 0.}, I<T>{0., 0.}},
+                             this->exec);
 
     for (size_t i = 0; i < testMtx->get_size()[0]; ++i) {
         testMtx->at(i, 0) = x1->at(i);
         testMtx->at(i, 1) = x2->at(i);
     }
 
-    auto alpha = gko::initialize<Mtx>({1.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto residual1 = gko::initialize<Mtx>({0.}, exec);
+    auto alpha = gko::initialize<Mtx>({1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto residual1 = gko::initialize<Mtx>({0.}, this->exec);
     residual1->copy_from(b1->clone());
-    auto residual2 = gko::initialize<Mtx>({0.}, exec);
+    auto residual2 = gko::initialize<Mtx>({0.}, this->exec);
     residual2->copy_from(b2->clone());
-    auto residualC = gko::initialize<Mtx>({0.}, exec);
+    auto residualC = gko::initialize<Mtx>({0.}, this->exec);
     residualC->copy_from(bc->clone());
 
     locmtx->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
     locmtx->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
     locmtx->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
 
-    double normS1 = infNorm(residual1.get());
-    double normS2 = infNorm(residual2.get());
-    double normC1 = infNorm(residualC.get(), 0);
-    double normC2 = infNorm(residualC.get(), 1);
-    double normB1 = infNorm(bc.get(), 0);
-    double normB2 = infNorm(bc.get(), 1);
+    auto normS1 = infNorm(residual1.get());
+    auto normS2 = infNorm(residual2.get());
+    auto normC1 = infNorm(residualC.get(), 0);
+    auto normC2 = infNorm(residualC.get(), 1);
+    auto normB1 = infNorm(bc.get(), 0);
+    auto normB2 = infNorm(bc.get(), 1);
 
     // make sure that all combined solutions are as good or better than the
     // single solutions
-    ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-12);
-    ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-12);
+    ASSERT_LE(normC1 / normB1, normS1 / normB1 + r<value_type>::value * 1e2);
+    ASSERT_LE(normC2 / normB2, normS2 / normB2 + r<value_type>::value * 1e2);
 
     // Not sure if this is necessary, the assertions above should cover what is
     // needed.
-    GKO_ASSERT_MTX_NEAR(xc, testMtx, 1e-14);
+    GKO_ASSERT_MTX_NEAR(xc, testMtx, r<value_type>::value);
+}
+
+
+TYPED_TEST(Bicgstab, SolvesTransposedDenseSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto solver = this->bicgstab_factory->generate(this->mtx->transpose());
+    auto half_tol = std::sqrt(r<value_type>::value);
+
+    solver->transpose()->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol);
+}
+
+
+TYPED_TEST(Bicgstab, SolvesConjTransposedDenseSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto solver = this->bicgstab_factory->generate(this->mtx->conj_transpose());
+    auto half_tol = std::sqrt(r<value_type>::value);
+
+    solver->conj_transpose()->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol);
+}
 
 
diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp
index 349e83b93db..adeea72b0ae 100644
--- a/reference/test/solver/cg_kernels.cpp
+++ b/reference/test/solver/cg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,35 +36,40 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Cg : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Cg<value_type>;
     Cg()
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
           cg_factory(
-              gko::solver::Cg<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(4u).on(exec),
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           mtx_big(gko::initialize<Mtx>(
@@ -76,12 +81,12 @@ class Cg : public ::testing::Test {
                {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}},
               exec)),
           cg_factory_big(
-              gko::solver::Cg<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(100u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec))
     {}
@@ -89,114 +94,148 @@ class Cg : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> mtx_big;
-    std::unique_ptr<gko::solver::Cg<>::Factory> cg_factory;
-    std::unique_ptr<gko::solver::Cg<>::Factory> cg_factory_big;
+    std::unique_ptr<typename Solver::Factory> cg_factory;
+    std::unique_ptr<typename Solver::Factory> cg_factory_big;
 };
 
+TYPED_TEST_CASE(Cg, gko::test::ValueTypes);
+
 
-TEST_F(Cg, SolvesStencilSystem)
+TYPED_TEST(Cg, SolvesStencilSystem)
 {
-    auto solver = cg_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cg_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value);
 }
 
 
-TEST_F(Cg, SolvesMultipleStencilSystems)
+TYPED_TEST(Cg, SolvesMultipleStencilSystems)
 {
-    auto solver = cg_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto solver = this->cg_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{-1.0, 1.0}, I<T>{3.0, 0.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}),
+                        r<value_type>::value);
 }
 
 
-TEST_F(Cg, SolvesStencilSystemUsingAdvancedApply)
+TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply)
 {
-    auto solver = cg_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cg_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r<value_type>::value);
 }
 
 
-TEST_F(Cg, SolvesMultipleStencilSystemsUsingAdvancedApply)
+TYPED_TEST(Cg, SolvesMultipleStencilSystemsUsingAdvancedApply)
 {
-    auto solver = cg_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto solver = this->cg_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{-1.0, 1.0}, I<T>{3.0, 0.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}),
+                        r<value_type>::value * 1e1);
 }
 
 
-TEST_F(Cg, SolvesBigDenseSystem1)
+TYPED_TEST(Cg, SolvesBigDenseSystem1)
 {
-    auto solver = cg_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e2);
 }
 
 
-TEST_F(Cg, SolvesBigDenseSystem2)
+TYPED_TEST(Cg, SolvesBigDenseSystem2)
 {
-    auto solver = cg_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
+                        r<value_type>::value * 1e2);
 }
 
 
-double infNorm(gko::matrix::Dense<> *mat, size_t col = 0)
+template <typename T>
+gko::remove_complex<T> infNorm(gko::matrix::Dense<T> *mat, size_t col = 0)
 {
     using std::abs;
-    double norm = 0.0;
+    using no_cpx_t = gko::remove_complex<T>;
+    no_cpx_t norm = 0.0;
     for (size_t i = 0; i < mat->get_size()[0]; ++i) {
-        double absEntry = abs(mat->at(i, col));
+        no_cpx_t absEntry = abs(mat->at(i, col));
         if (norm < absEntry) norm = absEntry;
     }
     return norm;
 }
 
 
-TEST_F(Cg, SolvesMultipleDenseSystemForDivergenceCheck)
+TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck)
 {
-    auto solver = cg_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
-        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec);
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
     auto b2 = gko::initialize<Mtx>(
-        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec);
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);
 
-    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
-    auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2});
-    auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2});
+    auto bc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2});
+    auto xc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2});
     for (size_t i = 0; i < bc->get_size()[0]; ++i) {
         bc->at(i, 0) = b1->at(i);
         bc->at(i, 1) = b2->at(i);
@@ -208,41 +247,75 @@ TEST_F(Cg, SolvesMultipleDenseSystemForDivergenceCheck)
     solver->apply(b1.get(), x1.get());
     solver->apply(b2.get(), x2.get());
     solver->apply(bc.get(), xc.get());
-    auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2});
+    auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2});
     for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) {
         mergedRes->at(i, 0) = x1->at(i);
         mergedRes->at(i, 1) = x2->at(i);
     }
 
-    auto alpha = gko::initialize<Mtx>({1.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
+    auto alpha = gko::initialize<Mtx>({1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
 
-    auto residual1 = Mtx::create(exec, b1->get_size());
+    auto residual1 = Mtx::create(this->exec, b1->get_size());
     residual1->copy_from(b1.get());
-    auto residual2 = Mtx::create(exec, b2->get_size());
+    auto residual2 = Mtx::create(this->exec, b2->get_size());
     residual2->copy_from(b2.get());
-    auto residualC = Mtx::create(exec, bc->get_size());
+    auto residualC = Mtx::create(this->exec, bc->get_size());
     residualC->copy_from(bc.get());
 
-    mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
-    mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
-    mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
+    this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
+    this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
+    this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
 
-    double normS1 = infNorm(residual1.get());
-    double normS2 = infNorm(residual2.get());
-    double normC1 = infNorm(residualC.get(), 0);
-    double normC2 = infNorm(residualC.get(), 1);
-    double normB1 = infNorm(b1.get());
-    double normB2 = infNorm(b2.get());
+    auto normS1 = infNorm(residual1.get());
+    auto normS2 = infNorm(residual2.get());
+    auto normC1 = infNorm(residualC.get(), 0);
+    auto normC2 = infNorm(residualC.get(), 1);
+    auto normB1 = infNorm(b1.get());
+    auto normB2 = infNorm(b2.get());
 
     // make sure that all combined solutions are as good or better than the
     // single solutions
-    ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14);
-    ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14);
+    ASSERT_LE(normC1 / normB1, normS1 / normB1 + r<value_type>::value);
+    ASSERT_LE(normC2 / normB2, normS2 / normB2 + r<value_type>::value);
 
     // Not sure if this is necessary, the assertions above should cover what is
     // needed.
-    GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14);
+    GKO_ASSERT_MTX_NEAR(xc, mergedRes, r<value_type>::value);
+}
+
+
+TYPED_TEST(Cg, SolvesTransposedBigDenseSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto cg = this->cg_factory_big->generate(this->mtx_big->transpose());
+    auto b = gko::initialize<Mtx>(
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    // Generated on mtx_big^T, so the transposed solver targets mtx_big.
+    cg->transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e2);
+}
+
+
+TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto cg = this->cg_factory_big->generate(this->mtx_big->conj_transpose());
+    auto b = gko::initialize<Mtx>(
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    // Generated on mtx_big^H, so the conj-transposed solver targets mtx_big.
+    cg->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e2);
 }
 
 
diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp
index 277d73d6d20..8a15055bea9 100644
--- a/reference/test/solver/cgs_kernels.cpp
+++ b/reference/test/solver/cgs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,23 +36,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Cgs : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Cgs<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Cgs<value_type>;
 
     Cgs()
         : exec(gko::ReferenceExecutor::create()),
@@ -63,8 +67,8 @@ class Cgs : public ::testing::Test {
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(40u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           mtx_big(
@@ -76,12 +80,12 @@ class Cgs : public ::testing::Test {
                                     {69.0, 32.0, -68.0, 57.0, -30.0, -51.0}},
                                    exec)),
           cgs_factory_big(
-              gko::solver::Cgs<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(100u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec))
     {}
@@ -89,116 +93,148 @@ class Cgs : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> mtx_big;
-    std::unique_ptr<gko::solver::Cgs<>::Factory> cgs_factory;
-    std::unique_ptr<gko::solver::Cgs<>::Factory> cgs_factory_big;
+    std::unique_ptr<typename Solver::Factory> cgs_factory;
+    std::unique_ptr<typename Solver::Factory> cgs_factory_big;
 };
 
+TYPED_TEST_CASE(Cgs, gko::test::ValueTypes);
 
-TEST_F(Cgs, SolvesDenseSystem)
+
+TYPED_TEST(Cgs, SolvesDenseSystem)
 {
-    auto solver = cgs_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<T>::value);
+    auto solver = this->cgs_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol);
 }
 
 
-TEST_F(Cgs, SolvesMultipleDenseSystem)
+TYPED_TEST(Cgs, SolvesMultipleDenseSystem)
 {
-    auto solver = cgs_factory->generate(mtx);
-    auto b =
-        gko::initialize<Mtx>({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->cgs_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{-1.0, -5.0}, I<T>{3.0, 1.0}, I<T>{1.0, -2.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}),
+                        half_tol);
 }
 
 
-TEST_F(Cgs, SolvesDenseSystemUsingAdvancedApply)
+TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply)
 {
-    auto solver = cgs_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->cgs_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), half_tol);
 }
 
 
-TEST_F(Cgs, SolvesMultipleDenseSystemsUsingAdvancedApply)
+TYPED_TEST(Cgs, SolvesMultipleDenseSystemsUsingAdvancedApply)
 {
-    auto solver = cgs_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b =
-        gko::initialize<Mtx>({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto solver = this->cgs_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{-1.0, -5.0}, I<T>{3.0, 1.0}, I<T>{1.0, -2.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), 1e-8);
+    GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}),
+                        half_tol);
 }
 
 
-TEST_F(Cgs, SolvesBigDenseSystem1)
+TYPED_TEST(Cgs, SolvesBigDenseSystem1)
 {
-    auto solver = cgs_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cgs_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}),
+                        r<value_type>::value * 1e3);
 }
 
 
-TEST_F(Cgs, SolvesBigDenseSystem2)
+TYPED_TEST(Cgs, SolvesBigDenseSystem2)
 {
-    auto solver = cgs_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cgs_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-58.0, 98.0, -16.0, -58.0, 2.0, 76.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({-58.0, 98.0, -16.0, -58.0, 2.0, 76.0}),
+                        r<value_type>::value * 1e2);
 }
 
 
-double infNorm(gko::matrix::Dense<> *mat, size_t col = 0)
+template <typename T>
+gko::remove_complex<T> infNorm(gko::matrix::Dense<T> *mat, size_t col = 0)
 {
     using std::abs;
-    double norm = 0.0;
+    using no_cpx_t = gko::remove_complex<T>;
+    no_cpx_t norm = 0.0;
     for (size_t i = 0; i < mat->get_size()[0]; ++i) {
-        double absEntry = abs(mat->at(i, col));
+        no_cpx_t absEntry = abs(mat->at(i, col));
         if (norm < absEntry) norm = absEntry;
     }
     return norm;
 }
 
 
-TEST_F(Cgs, SolvesMultipleDenseSystems)
+TYPED_TEST(Cgs, SolvesMultipleDenseSystems)
 {
-    auto solver = cgs_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->cgs_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
-        {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, exec);
+        {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
     auto b2 = gko::initialize<Mtx>(
-        {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, exec);
+        {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec);
 
-    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
-    auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2});
-    auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2});
+    auto bc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2});
+    auto xc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2});
     for (size_t i = 0; i < bc->get_size()[0]; ++i) {
         bc->at(i, 0) = b1->at(i);
         bc->at(i, 1) = b2->at(i);
@@ -210,41 +246,74 @@ TEST_F(Cgs, SolvesMultipleDenseSystems)
     solver->apply(b1.get(), x1.get());
     solver->apply(b2.get(), x2.get());
     solver->apply(bc.get(), xc.get());
-    auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2});
+    auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2});
     for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) {
         mergedRes->at(i, 0) = x1->at(i);
         mergedRes->at(i, 1) = x2->at(i);
     }
 
-    auto alpha = gko::initialize<Mtx>({1.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
+    auto alpha = gko::initialize<Mtx>({1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
 
-    auto residual1 = Mtx::create(exec, b1->get_size());
+    auto residual1 = Mtx::create(this->exec, b1->get_size());
     residual1->copy_from(b1.get());
-    auto residual2 = Mtx::create(exec, b2->get_size());
+    auto residual2 = Mtx::create(this->exec, b2->get_size());
     residual2->copy_from(b2.get());
-    auto residualC = Mtx::create(exec, bc->get_size());
+    auto residualC = Mtx::create(this->exec, bc->get_size());
     residualC->copy_from(bc.get());
 
-    mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
-    mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
-    mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
+    this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
+    this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
+    this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
 
-    double normS1 = infNorm(residual1.get());
-    double normS2 = infNorm(residual2.get());
-    double normC1 = infNorm(residualC.get(), 0);
-    double normC2 = infNorm(residualC.get(), 1);
-    double normB1 = infNorm(b1.get());
-    double normB2 = infNorm(b2.get());
+    auto normS1 = infNorm(residual1.get());
+    auto normS2 = infNorm(residual2.get());
+    auto normC1 = infNorm(residualC.get(), 0);
+    auto normC2 = infNorm(residualC.get(), 1);
+    auto normB1 = infNorm(b1.get());
+    auto normB2 = infNorm(b2.get());
 
     // make sure that all combined solutions are as good or better than the
     // single solutions
-    ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14);
-    ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14);
+    ASSERT_LE(normC1 / normB1, normS1 / normB1 + r<value_type>::value);
+    ASSERT_LE(normC2 / normB2, normS2 / normB2 + r<value_type>::value);
 
     // Not sure if this is necessary, the assertions above should cover what is
     // needed.
-    GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14);
+    GKO_ASSERT_MTX_NEAR(xc, mergedRes, r<value_type>::value);
+}
+
+
+TYPED_TEST(Cgs, SolvesTransposedBigDenseSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>(
+        {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
+    auto cgs = this->cgs_factory_big->generate(this->mtx_big->transpose());
+    // Generated on mtx_big^T; the solver is transposed again for the solve.
+    cgs->transpose()->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}),
+                        r<value_type>::value * 1e3);
+}
+
+
+TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto rhs = gko::initialize<Mtx>(
+        {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
+    auto cgs =
+        this->cgs_factory_big->generate(this->mtx_big->conj_transpose());
+    // Generated on mtx_big^H; the solver is conj-transposed for the solve.
+    cgs->conj_transpose()->apply(rhs.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}),
+                        r<value_type>::value * 1e3);
 }
 
 
diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp
index 18c3fd69ff5..843ea5a6037 100644
--- a/reference/test/solver/fcg_kernels.cpp
+++ b/reference/test/solver/fcg_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,25 +30,33 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <ginkgo/core/solver/fcg.hpp>
+
+
 #include <gtest/gtest.h>
-#include <core/test/utils.hpp>
+
+
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/solver/fcg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Fcg : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::Fcg<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Fcg<value_type>;
 
     Fcg()
         : exec(gko::ReferenceExecutor::create()),
@@ -61,8 +69,8 @@ class Fcg : public ::testing::Test {
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           mtx_big(gko::initialize<Mtx>(
@@ -74,12 +82,12 @@ class Fcg : public ::testing::Test {
                {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}},
               exec)),
           fcg_factory_big(
-              gko::solver::Fcg<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(100u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec))
     {}
@@ -87,113 +95,148 @@ class Fcg : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> mtx_big;
-    std::unique_ptr<gko::solver::Fcg<>::Factory> fcg_factory;
-    std::unique_ptr<gko::solver::Fcg<>::Factory> fcg_factory_big;
+    std::unique_ptr<typename Solver::Factory> fcg_factory;
+    std::unique_ptr<typename Solver::Factory> fcg_factory_big;
 };
 
+TYPED_TEST_CASE(Fcg, gko::test::ValueTypes);
 
-TEST_F(Fcg, SolvesStencilSystem)
+
+TYPED_TEST(Fcg, SolvesStencilSystem)  // single right-hand side, plain apply
 {
-    auto solver = fcg_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+    using Mtx = typename TestFixture::Mtx;  // Dense<value_type> of the fixture
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);  // start at 0
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value);
 }
 
 
-TEST_F(Fcg, SolvesMultipleStencilSystems)
+TYPED_TEST(Fcg, SolvesMultipleStencilSystems)  // two right-hand sides at once
 {
-    auto solver = fcg_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;  // short name for the I<T>{...} rows below
+    auto solver = this->fcg_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>(  // 3 rows, one column per system
+        {I<T>{-1.0, 1.0}, I<T>{3.0, 0.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(  // zero initial guess
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}),
+                        r<value_type>::value);  // tolerance depends on T
 }
 
 
-TEST_F(Fcg, SolvesStencilSystemUsingAdvancedApply)
+TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply)  // 4-argument apply
 {
-    auto solver = fcg_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);  // weight of solve
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);  // weight of old x
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);  // nonzero x
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r<value_type>::value * 1e1);
 }
 
 
-TEST_F(Fcg, SolvesMultipleStencilSystemsUsingAdvancedApply)
+TYPED_TEST(Fcg, SolvesMultipleStencilSystemsUsingAdvancedApply)  // two RHS
 {
-    auto solver = fcg_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;  // short name for the I<T>{...} rows below
+    auto solver = this->fcg_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);  // weight of solve
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);  // weight of old x
+    auto b = gko::initialize<Mtx>(  // 3 rows, one column per system
+        {I<T>{-1.0, 1.0}, I<T>{3.0, 0.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(  // nonzero starting values
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}),
+                        r<value_type>::value * 1e1);  // 10x base tolerance
 }
 
-TEST_F(Fcg, SolvesBigDenseSystem1)
+
+TYPED_TEST(Fcg, SolvesBigDenseSystem1)  // 6 unknowns, one right-hand side
 {
-    auto solver = fcg_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;  // Dense<value_type> of the fixture
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);  // right-hand side
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
 }
 
 
-TEST_F(Fcg, SolvesBigDenseSystem2)
+TYPED_TEST(Fcg, SolvesBigDenseSystem2)  // mtx_big with another right-hand side
 {
-    auto solver = fcg_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;  // Dense<value_type> of the fixture
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);  // right-hand side
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
 }
 
 
-double infNorm(gko::matrix::Dense<> *mat, size_t col = 0)
+template <typename T>  // returns max |mat(i, col)| over all rows i
+gko::remove_complex<T> infNorm(gko::matrix::Dense<T> *mat, size_t col = 0)
 {
     using std::abs;
-    double norm = 0.0;
+    using no_cpx_t = gko::remove_complex<T>;  // type of the magnitudes
+    no_cpx_t norm = 0.0;  // running maximum
     for (size_t i = 0; i < mat->get_size()[0]; ++i) {
-        double absEntry = abs(mat->at(i, col));
+        no_cpx_t absEntry = abs(mat->at(i, col));  // |mat(i, col)|
         if (norm < absEntry) norm = absEntry;
     }
     return norm;
 }
 
 
-TEST_F(Fcg, SolvesMultipleBigDenseSystems)
+TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems)
 {
-    auto solver = fcg_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
-        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec);
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
     auto b2 = gko::initialize<Mtx>(
-        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec);
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);
 
-    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
-    auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2});
-    auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2});
+    auto bc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2});
+    auto xc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2});
     for (size_t i = 0; i < bc->get_size()[0]; ++i) {
         bc->at(i, 0) = b1->at(i);
         bc->at(i, 1) = b2->at(i);
@@ -205,25 +248,25 @@ TEST_F(Fcg, SolvesMultipleBigDenseSystems)
     solver->apply(b1.get(), x1.get());
     solver->apply(b2.get(), x2.get());
     solver->apply(bc.get(), xc.get());
-    auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2});
+    auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2});
     for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) {
         mergedRes->at(i, 0) = x1->at(i);
         mergedRes->at(i, 1) = x2->at(i);
     }
 
-    auto alpha = gko::initialize<Mtx>({1.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
+    auto alpha = gko::initialize<Mtx>({1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
 
-    auto residual1 = Mtx::create(exec, b1->get_size());
+    auto residual1 = Mtx::create(this->exec, b1->get_size());
     residual1->copy_from(b1.get());
-    auto residual2 = Mtx::create(exec, b2->get_size());
+    auto residual2 = Mtx::create(this->exec, b2->get_size());
     residual2->copy_from(b2.get());
-    auto residualC = Mtx::create(exec, bc->get_size());
+    auto residualC = Mtx::create(this->exec, bc->get_size());
     residualC->copy_from(bc.get());
 
-    mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
-    mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
-    mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
+    this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
+    this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
+    this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
 
     double normS1 = infNorm(residual1.get());
     double normS2 = infNorm(residual2.get());
@@ -234,12 +277,46 @@ TEST_F(Fcg, SolvesMultipleBigDenseSystems)
 
     // make sure that all combined solutions are as good or better than the
     // single solutions
-    ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14);
-    ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14);
+    ASSERT_LE(normC1 / normB1, normS1 / normB1 + r<value_type>::value);
+    ASSERT_LE(normC2 / normB2, normS2 / normB2 + r<value_type>::value);
 
     // Not sure if this is necessary, the assertions above should cover what is
     // needed.
-    GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14);
+    GKO_ASSERT_MTX_NEAR(xc, mergedRes, r<value_type>::value);
+}
+
+
+TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem)  // via solver->transpose()
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory_big->generate(this->mtx_big);
+    auto b = gko::initialize<Mtx>(  // same b as SolvesBigDenseSystem1
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+
+    solver->transpose()->apply(b.get(), x.get());  // assumes symmetric mtx_big
+
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
+}
+
+
+TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem)  // via conj_transpose()
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->fcg_factory_big->generate(this->mtx_big);
+    auto b = gko::initialize<Mtx>(  // same b as SolvesBigDenseSystem1
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+
+    solver->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
 }
 
 
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 13d28b641bf..8eb06cf8dce 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,36 +36,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class Gmres : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Gmres<value_type>;
     Gmres()
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
           gmres_factory(
-              gko::solver::Gmres<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(4u).on(exec),
                       gko::stop::Time::build()
                           .with_time_limit(std::chrono::seconds(6))
                           .on(exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           mtx_big(gko::initialize<Mtx>(
@@ -77,12 +83,12 @@ class Gmres : public ::testing::Test {
                {-848.1, -280.5, -381.8, -187.1, 51.2, -176.2}},
               exec)),
           gmres_factory_big(
-              gko::solver::Gmres<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(100u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec)),
           mtx_medium(
@@ -98,115 +104,148 @@ class Gmres : public ::testing::Test {
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> mtx_medium;
     std::shared_ptr<Mtx> mtx_big;
-    std::unique_ptr<gko::solver::Gmres<>::Factory> gmres_factory;
-    std::unique_ptr<gko::solver::Gmres<>::Factory> gmres_factory_big;
+    std::unique_ptr<typename Solver::Factory> gmres_factory;
+    std::unique_ptr<typename Solver::Factory> gmres_factory_big;
 };
 
+TYPED_TEST_CASE(Gmres, gko::test::ValueTypes);
+
 
-TEST_F(Gmres, SolvesStencilSystem)
+TYPED_TEST(Gmres, SolvesStencilSystem)  // single right-hand side, plain apply
 {
-    auto solver = gmres_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+    using Mtx = typename TestFixture::Mtx;  // Dense<value_type> of the fixture
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->gmres_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);  // mtx*(1,3,2)
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);  // start at 0
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
 }
 
 
-TEST_F(Gmres, SolvesMultipleStencilSystems)
+TYPED_TEST(Gmres, SolvesMultipleStencilSystems)  // two right-hand sides
 {
-    auto solver = gmres_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({{13.0, 6.0}, {7.0, 4.0}, {1.0, 1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;  // short name for the I<T>{...} rows below
+    auto solver = this->gmres_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>(  // 3 rows, one column per system
+        {I<T>{13.0, 6.0}, I<T>{7.0, 4.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(  // zero initial guess
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}),
+                        r<value_type>::value * 1e1);  // 10x base tolerance
 }
 
 
-TEST_F(Gmres, SolvesStencilSystemUsingAdvancedApply)
+TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply)  // 4-argument apply
 {
-    auto solver = gmres_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->gmres_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);  // weight of solve
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);  // weight of old x
+    auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);  // nonzero x
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r<value_type>::value * 1e1);
 }
 
 
-TEST_F(Gmres, SolvesMultipleStencilSystemsUsingAdvancedApply)
+TYPED_TEST(Gmres, SolvesMultipleStencilSystemsUsingAdvancedApply)  // two RHS
 {
-    auto solver = gmres_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({{13.0, 6.0}, {7.0, 4.0}, {1.0, 1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;  // short name for the I<T>{...} rows below
+    auto solver = this->gmres_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);  // weight of solve
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);  // weight of old x
+    auto b = gko::initialize<Mtx>(  // 3 rows, one column per system
+        {I<T>{13.0, 6.0}, I<T>{7.0, 4.0}, I<T>{1.0, 1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(  // nonzero starting values
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}),
+                        r<value_type>::value * 1e1);  // 10x base tolerance
 }
 
 
-TEST_F(Gmres, SolvesBigDenseSystem1)
+TYPED_TEST(Gmres, SolvesBigDenseSystem1)  // 6 unknowns, one right-hand side
 {
-    auto solver = gmres_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;  // Dense<value_type> of the fixture
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->gmres_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
-        {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
+        this->exec);  // right-hand side
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
 }
 
 
-TEST_F(Gmres, SolvesBigDenseSystem2)
+TYPED_TEST(Gmres, SolvesBigDenseSystem2)  // mtx_big, another right-hand side
 {
-    auto solver = gmres_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;  // Dense<value_type> of the fixture
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->gmres_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
-        exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        this->exec);  // right-hand side
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
 }
 
 
-double infNorm(gko::matrix::Dense<> *mat, size_t col = 0)
+template <typename T>  // returns max |mat(i, col)| over all rows i
+gko::remove_complex<T> infNorm(gko::matrix::Dense<T> *mat, size_t col = 0)
 {
     using std::abs;
-    double norm = 0.0;
+    using no_cpx_t = gko::remove_complex<T>;  // type of the magnitudes
+    no_cpx_t norm = 0.0;  // running maximum
     for (size_t i = 0; i < mat->get_size()[0]; ++i) {
-        double absEntry = abs(mat->at(i, col));
+        no_cpx_t absEntry = abs(mat->at(i, col));  // |mat(i, col)|
         if (norm < absEntry) norm = absEntry;
     }
     return norm;
 }
 
 
-TEST_F(Gmres, SolvesMultipleDenseSystemForDivergenceCheck)
+TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck)
 {
-    auto solver = gmres_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->gmres_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
-        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec);
+        {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
+        this->exec);
     auto b2 = gko::initialize<Mtx>(
-        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec);
+        {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
+        this->exec);
 
-    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+    auto x1 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto x2 = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
-    auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2});
-    auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2});
+    auto bc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2});
+    auto xc =
+        Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2});
     for (size_t i = 0; i < bc->get_size()[0]; ++i) {
         bc->at(i, 0) = b1->at(i);
         bc->at(i, 1) = b2->at(i);
@@ -218,88 +257,135 @@ TEST_F(Gmres, SolvesMultipleDenseSystemForDivergenceCheck)
     solver->apply(b1.get(), x1.get());
     solver->apply(b2.get(), x2.get());
     solver->apply(bc.get(), xc.get());
-    auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2});
+    auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2});
     for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) {
         mergedRes->at(i, 0) = x1->at(i);
         mergedRes->at(i, 1) = x2->at(i);
     }
 
-    auto alpha = gko::initialize<Mtx>({1.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
+    auto alpha = gko::initialize<Mtx>({1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
 
-    auto residual1 = Mtx::create(exec, b1->get_size());
+    auto residual1 = Mtx::create(this->exec, b1->get_size());
     residual1->copy_from(b1.get());
-    auto residual2 = Mtx::create(exec, b2->get_size());
+    auto residual2 = Mtx::create(this->exec, b2->get_size());
     residual2->copy_from(b2.get());
-    auto residualC = Mtx::create(exec, bc->get_size());
+    auto residualC = Mtx::create(this->exec, bc->get_size());
     residualC->copy_from(bc.get());
 
-    mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
-    mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
-    mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
+    this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get());
+    this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get());
+    this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get());
 
-    double normS1 = infNorm(residual1.get());
-    double normS2 = infNorm(residual2.get());
-    double normC1 = infNorm(residualC.get(), 0);
-    double normC2 = infNorm(residualC.get(), 1);
-    double normB1 = infNorm(b1.get());
-    double normB2 = infNorm(b2.get());
+    auto normS1 = infNorm(residual1.get());
+    auto normS2 = infNorm(residual2.get());
+    auto normC1 = infNorm(residualC.get(), 0);
+    auto normC2 = infNorm(residualC.get(), 1);
+    auto normB1 = infNorm(b1.get());
+    auto normB2 = infNorm(b2.get());
 
     // make sure that all combined solutions are as good or better than the
     // single solutions
-    ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14);
-    ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14);
+    ASSERT_LE(normC1 / normB1, normS1 / normB1 + r<value_type>::value);
+    ASSERT_LE(normC2 / normB2, normS2 / normB2 + r<value_type>::value);
 
     // Not sure if this is necessary, the assertions above should cover what is
     // needed.
-    GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14);
+    GKO_ASSERT_MTX_NEAR(xc, mergedRes, r<value_type>::value);
 }
 
 
-TEST_F(Gmres, SolvesBigDenseSystem1WithRestart)
+TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart)  // krylov_dim = 4
 {
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;  // Gmres<value_type>
+    using value_type = typename TestFixture::value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);  // sqrt of the base tol
     auto gmres_factory_restart =
-        gko::solver::Gmres<>::build()
+        Solver::build()
             .with_krylov_dim(4u)
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(200u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-15)
-                    .on(exec))
-            .on(exec);
-    auto solver = gmres_factory_restart->generate(mtx_medium);
+                gko::stop::Iteration::build().with_max_iters(200u).on(
+                    this->exec),  // at most 200 iterations
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(r<value_type>::value)
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = gmres_factory_restart->generate(this->mtx_medium);
     auto b = gko::initialize<Mtx>(
-        {-13945.16, 11205.66, 16132.96, 24342.18, -10910.98}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        {-13945.16, 11205.66, 16132.96, 24342.18, -10910.98}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-140.20, -142.20, 48.80, -17.70, -19.60}), 1e-5);
+    GKO_ASSERT_MTX_NEAR(x, l({-140.20, -142.20, 48.80, -17.70, -19.60}),
+                        half_tol * 1e2);  // 100x sqrt(base tolerance)
 }
 
 
-TEST_F(Gmres, SolvesWithPreconditioner)
+TYPED_TEST(Gmres, SolvesWithPreconditioner)  // GMRES with a Jacobi factory
 {
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;  // Gmres<value_type>
+    using value_type = typename TestFixture::value_type;
     auto gmres_factory_preconditioner =
-        gko::solver::Gmres<>::build()
+        Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(exec),
-                gko::stop::ResidualNormReduction<>::build()
-                    .with_reduction_factor(1e-15)
-                    .on(exec))
-            .with_preconditioner(gko::preconditioner::Jacobi<>::build()
-                                     .with_max_block_size(3u)
-                                     .on(exec))
-            .on(exec);
-    auto solver = gmres_factory_preconditioner->generate(mtx_big);
+                gko::stop::Iteration::build().with_max_iters(100u).on(
+                    this->exec),  // at most 100 iterations
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(r<value_type>::value)
+                    .on(this->exec))
+            .with_preconditioner(  // Jacobi, max_block_size 3
+                gko::preconditioner::Jacobi<value_type>::build()
+                    .with_max_block_size(3u)
+                    .on(this->exec))
+            .on(this->exec);
+    auto solver = gmres_factory_preconditioner->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
-        exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec);
+        this->exec);  // same b as SolvesBigDenseSystem2
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
+}
+
+
+TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem)  // solver built on mtx_big^T
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose());
+    auto b = gko::initialize<Mtx>(  // same b as SolvesBigDenseSystem1
+        {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+
+    solver->transpose()->apply(b.get(), x.get());  // transposed solver
+
+    GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
+}
+
+
+TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem)  // via conj_transpose()
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver =  // built on the conjugate transpose of mtx_big
+        this->gmres_factory_big->generate(this->mtx_big->conj_transpose());
+    auto b = gko::initialize<Mtx>(  // same b as SolvesBigDenseSystem1
+        {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
+        this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+
+    solver->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}),
+                        r<value_type>::value * 1e3);  // 1000x base tolerance
 }
 
 
diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp
index 18e3c3cf9b2..208d4f6b9d9 100644
--- a/reference/test/solver/ir_kernels.cpp
+++ b/reference/test/solver/ir_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -36,93 +36,277 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#include <core/test/utils/assertions.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename T>
 class Ir : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type = T;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::Ir<value_type>;
     Ir()
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{0.9, -1.0, 3.0}, {0.0, 1.0, 3.0}, {0.0, 0.0, 1.1}}, exec)),
           // Eigenvalues of mtx are 0.9, 1.0 and 1.1
-          // Richardson iteration, converges since | lambda - 1 | < 1
+          // Richardson iteration, converges since
+          // | relaxation_factor * lambda - 1 | < 1
           ir_factory(
-              gko::solver::Ir<>::build()
+              Solver::build()
                   .with_criteria(
                       gko::stop::Iteration::build().with_max_iters(30u).on(
                           exec),
-                      gko::stop::ResidualNormReduction<>::build()
-                          .with_reduction_factor(1e-15)
+                      gko::stop::ResidualNormReduction<value_type>::build()
+                          .with_reduction_factor(r<value_type>::value)
                           .on(exec))
                   .on(exec))
     {}
 
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
-    std::unique_ptr<gko::solver::Ir<>::Factory> ir_factory;
+    std::unique_ptr<typename Solver::Factory> ir_factory;
 };
 
+TYPED_TEST_CASE(Ir, gko::test::ValueTypes);
+
 
-TEST_F(Ir, SolvesTriangularSystem)
+TYPED_TEST(Ir, SolvesTriangularSystem)
 {
-    auto solver = ir_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->ir_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, SolvesTriangularSystemWithIterativeInnerSolver)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+
+    const gko::remove_complex<value_type> inner_reduction_factor = 1e-2;
+    auto inner_solver_factory =
+        gko::solver::Gmres<value_type>::build()
+            .with_criteria(gko::stop::ResidualNormReduction<value_type>::build()
+                               .with_reduction_factor(inner_reduction_factor)
+                               .on(this->exec))
+            .on(this->exec);
+
+    auto solver_factory =
+        gko::solver::Ir<value_type>::build()
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on(
+                               this->exec),
+                           gko::stop::ResidualNormReduction<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value)
+                               .on(this->exec))
+            .with_solver(gko::share(inner_solver_factory))
+            .on(this->exec);
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver_factory->generate(this->mtx)->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
 }
 
 
-TEST_F(Ir, SolvesMultipleTriangularSystems)
+TYPED_TEST(Ir, SolvesMultipleTriangularSystems)
 {
-    auto solver = ir_factory->generate(mtx);
-    auto b = gko::initialize<Mtx>({{3.9, 2.9}, {9.0, 4.0}, {2.2, 1.1}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto solver = this->ir_factory->generate(this->mtx);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{3.9, 2.9}, I<T>{9.0, 4.0}, I<T>{2.2, 1.1}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}),
+                        r<value_type>::value * 1e1);
 }
 
 
-TEST_F(Ir, SolvesTriangularSystemUsingAdvancedApply)
+TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApply)
 {
-    auto solver = ir_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, exec);
-    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->ir_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r<value_type>::value);
 }
 
 
-TEST_F(Ir, SolvesMultipleStencilSystemsUsingAdvancedApply)
+TYPED_TEST(Ir, SolvesMultipleStencilSystemsUsingAdvancedApply)
 {
-    auto solver = ir_factory->generate(mtx);
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    auto b = gko::initialize<Mtx>({{3.9, 2.9}, {9.0, 4.0}, {2.2, 1.1}}, exec);
-    auto x = gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto solver = this->ir_factory->generate(this->mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto b = gko::initialize<Mtx>(
+        {I<T>{3.9, 2.9}, I<T>{9.0, 4.0}, I<T>{2.2, 1.1}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.5, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}),
+                        r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, SolvesTransposedTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->ir_factory->generate(this->mtx->transpose());
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver->transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, SolvesConjTransposedTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = this->ir_factory->generate(this->mtx->conj_transpose());
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, RichardsonSolvesTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver = gko::solver::Ir<value_type>::build()
+                      .with_criteria(
+                          gko::stop::Iteration::build().with_max_iters(100u).on(
+                              this->exec),
+                          gko::stop::ResidualNormReduction<value_type>::build()
+                              .with_reduction_factor(r<value_type>::value)
+                              .on(this->exec))
+                      .with_relaxation_factor(value_type{0.9})
+                      .on(this->exec)
+                      ->generate(this->mtx);
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, RichardsonSolvesTriangularSystemWithIterativeInnerSolver)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    const gko::remove_complex<value_type> inner_reduction_factor = 1e-2;
+    auto inner_solver_factory =
+        gko::solver::Gmres<value_type>::build()
+            .with_criteria(gko::stop::ResidualNormReduction<value_type>::build()
+                               .with_reduction_factor(inner_reduction_factor)
+                               .on(this->exec))
+            .on(this->exec);
+    auto solver_factory =
+        gko::solver::Ir<value_type>::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(100u).on(
+                    this->exec),
+                gko::stop::ResidualNormReduction<value_type>::build()
+                    .with_reduction_factor(r<value_type>::value)
+                    .on(this->exec))
+            .with_relaxation_factor(value_type{0.9})
+            .with_solver(gko::share(inner_solver_factory))
+            .on(this->exec);
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver_factory->generate(this->mtx)->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, RichardsonTransposedSolvesTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver =
+        gko::solver::Ir<value_type>::build()
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on(
+                               this->exec),
+                           gko::stop::ResidualNormReduction<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value)
+                               .on(this->exec))
+            .with_relaxation_factor(value_type{0.9})
+            .on(this->exec)
+            ->generate(this->mtx->transpose());
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver->transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
+}
+
+
+TYPED_TEST(Ir, RichardsonConjTransposedSolvesTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto solver =
+        gko::solver::Ir<value_type>::build()
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on(
+                               this->exec),
+                           gko::stop::ResidualNormReduction<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value)
+                               .on(this->exec))
+            .with_relaxation_factor(value_type{0.9})
+            .on(this->exec)
+            ->generate(this->mtx->conj_transpose());
+    auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+
+    solver->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r<value_type>::value * 1e1);
 }
 
 
diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp
index 465b576a107..65a7aab0d42 100644
--- a/reference/test/solver/lower_trs.cpp
+++ b/reference/test/solver/lower_trs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -44,17 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class LowerTrs : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
-    using CsrMtx = gko::matrix::Csr<>;
-    using Solver = gko::solver::LowerTrs<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using CsrMtx = gko::matrix::Csr<value_type, index_type>;
+    using Solver = gko::solver::LowerTrs<value_type, index_type>;
 
     LowerTrs()
         : exec(gko::ReferenceExecutor::create()),
@@ -69,64 +74,72 @@ class LowerTrs : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<CsrMtx> csr_mtx;
-    std::unique_ptr<Solver::Factory> lower_trs_factory;
+    std::unique_ptr<typename Solver::Factory> lower_trs_factory;
     std::unique_ptr<Solver> solver;
 };
 
+TYPED_TEST_CASE(LowerTrs, gko::test::ValueIndexTypes);
 
-TEST_F(LowerTrs, LowerTrsFactoryCreatesCorrectSolver)
+
+TYPED_TEST(LowerTrs, LowerTrsFactoryCreatesCorrectSolver)
 {
-    auto sys_mtx = solver->get_system_matrix();
+    auto sys_mtx = this->solver->get_system_matrix();
 
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3));
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3));
     ASSERT_NE(sys_mtx, nullptr);
-    GKO_ASSERT_MTX_NEAR(sys_mtx, csr_mtx, 0);
+    GKO_ASSERT_MTX_NEAR(sys_mtx, this->csr_mtx, 0);
 }
 
 
-TEST_F(LowerTrs, CanBeCopied)
+TYPED_TEST(LowerTrs, CanBeCopied)
 {
-    auto copy = Solver::build().on(exec)->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy =
+        Solver::build().on(this->exec)->generate(Mtx::create(this->exec));
 
-    copy->copy_from(gko::lend(solver));
+    copy->copy_from(gko::lend(this->solver));
     auto copy_mtx = copy->get_system_matrix();
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
-    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0);
+    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0);
 }
 
 
-TEST_F(LowerTrs, CanBeMoved)
+TYPED_TEST(LowerTrs, CanBeMoved)
 {
-    auto copy = Solver::build().on(exec)->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy =
+        Solver::build().on(this->exec)->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(solver));
+    copy->copy_from(std::move(this->solver));
     auto copy_mtx = copy->get_system_matrix();
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
-    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0);
+    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0);
 }
 
 
-TEST_F(LowerTrs, CanBeCloned)
+TYPED_TEST(LowerTrs, CanBeCloned)
 {
-    auto clone = solver->clone();
+    auto clone = this->solver->clone();
 
     auto clone_mtx = clone->get_system_matrix();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
-    GKO_ASSERT_MTX_NEAR(clone_mtx.get(), csr_mtx.get(), 0);
+    GKO_ASSERT_MTX_NEAR(clone_mtx.get(), this->csr_mtx.get(), 0);
 }
 
 
-TEST_F(LowerTrs, CanBeCleared)
+TYPED_TEST(LowerTrs, CanBeCleared)
 {
-    solver->clear();
+    this->solver->clear();
 
-    auto solver_mtx = solver->get_system_matrix();
+    auto solver_mtx = this->solver->get_system_matrix();
 
     ASSERT_EQ(solver_mtx, nullptr);
-    ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0));
+    ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0));
 }
 
 
diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp
index 22ba58a8912..6379dca192d 100644
--- a/reference/test/solver/lower_trs_kernels.cpp
+++ b/reference/test/solver/lower_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,20 +45,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
 #include "core/solver/lower_trs_kernels.hpp"
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class LowerTrs : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::LowerTrs<value_type, index_type>;
     LowerTrs()
         : exec(gko::ReferenceExecutor::create()),
           ref(gko::ReferenceExecutor::create()),
@@ -66,16 +72,15 @@ class LowerTrs : public ::testing::Test {
               {{1, 0.0, 0.0}, {3.0, 1, 0.0}, {1.0, 2.0, 1}}, exec)),
           mtx2(gko::initialize<Mtx>(
               {{2, 0.0, 0.0}, {3.0, 3, 0.0}, {1.0, 2.0, 4}}, exec)),
-          lower_trs_factory(gko::solver::LowerTrs<>::build().on(exec)),
-          lower_trs_factory_mrhs(
-              gko::solver::LowerTrs<>::build().with_num_rhs(2u).on(exec)),
+          lower_trs_factory(Solver::build().on(exec)),
+          lower_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)),
           mtx_big(gko::initialize<Mtx>({{124.0, 0.0, 0.0, 0.0, 0.0},
                                         {43.0, -789.0, 0.0, 0.0, 0.0},
                                         {134.5, -651.0, 654.0, 0.0, 0.0},
                                         {-642.0, 684.0, 68.0, 387.0, 0.0},
                                         {365.0, 97.0, -654.0, 8.0, 91.0}},
                                        exec)),
-          lower_trs_factory_big(gko::solver::LowerTrs<>::build().on(exec))
+          lower_trs_factory_big(Solver::build().on(exec))
     {}
 
     std::shared_ptr<const gko::Executor> exec;
@@ -83,101 +88,149 @@ class LowerTrs : public ::testing::Test {
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> mtx2;
     std::shared_ptr<Mtx> mtx_big;
-    std::unique_ptr<gko::solver::LowerTrs<>::Factory> lower_trs_factory;
-    std::unique_ptr<gko::solver::LowerTrs<>::Factory> lower_trs_factory_mrhs;
-    std::unique_ptr<gko::solver::LowerTrs<>::Factory> lower_trs_factory_big;
+    std::unique_ptr<typename Solver::Factory> lower_trs_factory;
+    std::unique_ptr<typename Solver::Factory> lower_trs_factory_mrhs;
+    std::unique_ptr<typename Solver::Factory> lower_trs_factory_big;
 };
 
+TYPED_TEST_CASE(LowerTrs, gko::test::ValueIndexTypes);
 
-TEST_F(LowerTrs, RefLowerTrsFlagCheckIsCorrect)
+
+TYPED_TEST(LowerTrs, RefLowerTrsFlagCheckIsCorrect)
 {
     bool trans_flag = true;
     bool expected_flag = false;
 
-    gko::kernels::reference::lower_trs::should_perform_transpose(ref,
+    gko::kernels::reference::lower_trs::should_perform_transpose(this->ref,
                                                                  trans_flag);
 
     ASSERT_EQ(expected_flag, trans_flag);
 }
 
 
-TEST_F(LowerTrs, SolvesTriangularSystem)
+TYPED_TEST(LowerTrs, SolvesTriangularSystem)
 {
-    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
-    auto solver = lower_trs_factory->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_factory->generate(this->mtx);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 2.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 2.0}), r<value_type>::value);
 }
 
 
-TEST_F(LowerTrs, SolvesMultipleTriangularSystems)
+TYPED_TEST(LowerTrs, SolvesMultipleTriangularSystems)
 {
-    std::shared_ptr<Mtx> b =
-        gko::initialize<Mtx>({{3.0, 4.0}, {1.0, 0.0}, {1.0, -1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
-    auto solver = lower_trs_factory_mrhs->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
+        {I<T>{3.0, 4.0}, I<T>{1.0, 0.0}, I<T>{1.0, -1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
+    auto solver = this->lower_trs_factory_mrhs->generate(this->mtx);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{3.0, 4.0}, {-8.0, -12.0}, {14.0, 19.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{3.0, 4.0}, {-8.0, -12.0}, {14.0, 19.0}}),
+                        r<value_type>::value);
 }
 
 
-TEST_F(LowerTrs, SolvesNonUnitTriangularSystem)
+TYPED_TEST(LowerTrs, SolvesNonUnitTriangularSystem)
 {
-    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({2.0, 12.0, 3.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
-    auto solver = lower_trs_factory->generate(mtx2);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({2.0, 12.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_factory->generate(this->mtx2);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), r<value_type>::value);
 }
 
-TEST_F(LowerTrs, SolvesTriangularSystemUsingAdvancedApply)
+
+TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply)
 {
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, exec);
-    auto x = gko::initialize<Mtx>({1.0, -1.0, 1.0}, exec);
-    auto solver = lower_trs_factory->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, -1.0, 1.0}, this->exec);
+    auto solver = this->lower_trs_factory->generate(this->mtx);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 3.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 3.0}), r<value_type>::value);
 }
 
 
-TEST_F(LowerTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply)
+TYPED_TEST(LowerTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply)
 {
-    auto alpha = gko::initialize<Mtx>({-1.0}, exec);
-    auto beta = gko::initialize<Mtx>({2.0}, exec);
-    std::shared_ptr<Mtx> b =
-        gko::initialize<Mtx>({{3.0, 4.0}, {1.0, 0.0}, {1.0, -1.0}}, exec);
-    auto x =
-        gko::initialize<Mtx>({{1.0, 2.0}, {-1.0, -1.0}, {0.0, -2.0}}, exec);
-    auto solver = lower_trs_factory_mrhs->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto alpha = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({2.0}, this->exec);
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
+        {I<T>{3.0, 4.0}, I<T>{1.0, 0.0}, I<T>{1.0, -1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{1.0, 2.0}, I<T>{-1.0, -1.0}, I<T>{0.0, -2.0}}, this->exec);
+    auto solver = this->lower_trs_factory_mrhs->generate(this->mtx);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(x, l({{-1.0, 0.0}, {6.0, 10.0}, {-14.0, -23.0}}),
-                        1e-14);
+                        r<value_type>::value);
 }
 
 
-TEST_F(LowerTrs, SolvesBigDenseSystem)
+TYPED_TEST(LowerTrs, SolvesBigDenseSystem)
 {
-    std::shared_ptr<Mtx> b =
-        gko::initialize<Mtx>({-124.0, -3199.0, 3147.5, 5151.0, -6021.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto solver = lower_trs_factory_big->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
+        {-124.0, -3199.0, 3147.5, 5151.0, -6021.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_factory_big->generate(this->mtx_big);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}),
+                        r<value_type>::value * 1e3);
+}
+
+
+TYPED_TEST(LowerTrs, SolvesTransposedTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_factory->generate(this->mtx);
+
+    solver->transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r<value_type>::value);
+}
+
+
+TYPED_TEST(LowerTrs, SolvesConjTransposedTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_factory->generate(this->mtx);
+
+    solver->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r<value_type>::value);
 }
 
 
diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp
index 6f07b6960a4..178fa5aff3c 100644
--- a/reference/test/solver/upper_trs.cpp
+++ b/reference/test/solver/upper_trs.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -44,17 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class UpperTrs : public ::testing::Test {
 protected:
-    using CsrMtx = gko::matrix::Csr<double, int>;
-    using Mtx = gko::matrix::Dense<>;
-    using Solver = gko::solver::UpperTrs<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using CsrMtx = gko::matrix::Csr<value_type, index_type>;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::UpperTrs<value_type, index_type>;
 
     UpperTrs()
         : exec(gko::ReferenceExecutor::create()),
@@ -69,63 +74,69 @@ class UpperTrs : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<CsrMtx> csr_mtx;
-    std::unique_ptr<Solver::Factory> upper_trs_factory;
+    std::unique_ptr<typename Solver::Factory> upper_trs_factory;
     std::unique_ptr<Solver> upper_trs_solver;
 };
 
+TYPED_TEST_CASE(UpperTrs, gko::test::ValueIndexTypes);
 
-TEST_F(UpperTrs, UpperTrsFactoryCreatesCorrectSolver)
+
+TYPED_TEST(UpperTrs, UpperTrsFactoryCreatesCorrectSolver)
 {
-    auto sys_mtx = upper_trs_solver->get_system_matrix();
+    auto sys_mtx = this->upper_trs_solver->get_system_matrix();
 
-    ASSERT_EQ(upper_trs_solver->get_size(), gko::dim<2>(3, 3));
+    ASSERT_EQ(this->upper_trs_solver->get_size(), gko::dim<2>(3, 3));
     ASSERT_NE(sys_mtx, nullptr);
-    GKO_ASSERT_MTX_NEAR(sys_mtx, csr_mtx, 0);
+    GKO_ASSERT_MTX_NEAR(sys_mtx, this->csr_mtx, 0);
 }
 
 
-TEST_F(UpperTrs, CanBeCopied)
+TYPED_TEST(UpperTrs, CanBeCopied)
 {
-    auto copy = Solver::build().on(exec)->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy =
+        Solver::build().on(this->exec)->generate(Mtx::create(this->exec));
 
-    copy->copy_from(gko::lend(upper_trs_solver));
+    copy->copy_from(gko::lend(this->upper_trs_solver));
     auto copy_mtx = copy->get_system_matrix();
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
-    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0);
+    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0);
 }
 
 
-TEST_F(UpperTrs, CanBeMoved)
+TYPED_TEST(UpperTrs, CanBeMoved)
 {
-    auto copy = upper_trs_factory->generate(Mtx::create(exec));
+    using Mtx = typename TestFixture::Mtx;
+    auto copy = this->upper_trs_factory->generate(Mtx::create(this->exec));
 
-    copy->copy_from(std::move(upper_trs_solver));
+    copy->copy_from(std::move(this->upper_trs_solver));
     auto copy_mtx = copy->get_system_matrix();
 
     ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3));
-    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0);
+    GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0);
 }
 
 
-TEST_F(UpperTrs, CanBeCloned)
+TYPED_TEST(UpperTrs, CanBeCloned)
 {
-    auto clone = upper_trs_solver->clone();
+    auto clone = this->upper_trs_solver->clone();
 
     auto clone_mtx = clone->get_system_matrix();
 
     ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3));
-    GKO_ASSERT_MTX_NEAR(clone_mtx.get(), csr_mtx.get(), 0);
+    GKO_ASSERT_MTX_NEAR(clone_mtx.get(), this->csr_mtx.get(), 0);
 }
 
 
-TEST_F(UpperTrs, CanBeCleared)
+TYPED_TEST(UpperTrs, CanBeCleared)
 {
-    upper_trs_solver->clear();
+    this->upper_trs_solver->clear();
 
-    auto solver_mtx = upper_trs_solver->get_system_matrix();
+    auto solver_mtx = this->upper_trs_solver->get_system_matrix();
 
-    ASSERT_EQ(upper_trs_solver->get_size(), gko::dim<2>(0, 0));
+    ASSERT_EQ(this->upper_trs_solver->get_size(), gko::dim<2>(0, 0));
     ASSERT_EQ(solver_mtx, nullptr);
 }
 
diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp
index a9863054b94..81da3158442 100644
--- a/reference/test/solver/upper_trs_kernels.cpp
+++ b/reference/test/solver/upper_trs_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,20 +45,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 
 #include "core/solver/upper_trs_kernels.hpp"
-#include "core/test/utils/assertions.hpp"
+#include "core/test/utils.hpp"
 
 
 namespace {
 
 
+template <typename ValueIndexType>
 class UpperTrs : public ::testing::Test {
 protected:
-    using Mtx = gko::matrix::Dense<>;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Dense<value_type>;
+    using Solver = gko::solver::UpperTrs<value_type, index_type>;
     UpperTrs()
         : exec(gko::ReferenceExecutor::create()),
           ref(gko::ReferenceExecutor::create()),
@@ -66,9 +72,8 @@ class UpperTrs : public ::testing::Test {
               {{1, 3.0, 1.0}, {0.0, 1, 2.0}, {0.0, 0.0, 1}}, exec)),
           mtx2(gko::initialize<Mtx>(
               {{2, 3.0, 1.0}, {0.0, 3, 2.0}, {0.0, 0.0, 4}}, exec)),
-          upper_trs_factory(gko::solver::UpperTrs<>::build().on(exec)),
-          upper_trs_factory_mrhs(
-              gko::solver::UpperTrs<>::build().with_num_rhs(2u).on(exec)),
+          upper_trs_factory(Solver::build().on(exec)),
+          upper_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)),
           mtx_big(gko::initialize<Mtx>({{365.0, 97.0, -654.0, 8.0, 91.0},
                                         {0.0, -642.0, 684.0, 68.0, 387.0},
                                         {0.0, 0.0, 134, -651.0, 654.0},
@@ -82,100 +87,149 @@ class UpperTrs : public ::testing::Test {
     std::shared_ptr<Mtx> mtx;
     std::shared_ptr<Mtx> mtx2;
     std::shared_ptr<Mtx> mtx_big;
-    std::unique_ptr<gko::solver::UpperTrs<>::Factory> upper_trs_factory;
-    std::unique_ptr<gko::solver::UpperTrs<>::Factory> upper_trs_factory_mrhs;
+    std::unique_ptr<typename Solver::Factory> upper_trs_factory;
+    std::unique_ptr<typename Solver::Factory> upper_trs_factory_mrhs;
 };
 
+TYPED_TEST_CASE(UpperTrs, gko::test::ValueIndexTypes);
 
-TEST_F(UpperTrs, RefUpperTrsFlagCheckIsCorrect)
+
+TYPED_TEST(UpperTrs, RefUpperTrsFlagCheckIsCorrect)
 {
     bool trans_flag = true;
     bool expected_flag = false;
 
-    gko::kernels::reference::upper_trs::should_perform_transpose(ref,
+    gko::kernels::reference::upper_trs::should_perform_transpose(this->ref,
                                                                  trans_flag);
 
     ASSERT_EQ(expected_flag, trans_flag);
 }
 
 
-TEST_F(UpperTrs, SolvesTriangularSystem)
+TYPED_TEST(UpperTrs, SolvesTriangularSystem)
 {
-    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
-    auto solver = upper_trs_factory->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_factory->generate(this->mtx);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({13.0, -4.0, 3.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({13.0, -4.0, 3.0}), r<value_type>::value);
 }
 
 
-TEST_F(UpperTrs, SolvesMultipleTriangularSystems)
+TYPED_TEST(UpperTrs, SolvesMultipleTriangularSystems)
 {
-    std::shared_ptr<Mtx> b =
-        gko::initialize<Mtx>({{4.0, 2.0}, {2.0, 1.0}, {3.0, -1.0}}, exec);
-    auto x = gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
-    auto solver = upper_trs_factory_mrhs->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
+        {I<T>{4.0, 2.0}, I<T>{2.0, 1.0}, I<T>{3.0, -1.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{0.0, 0.0}, I<T>{0.0, 0.0}, I<T>{0.0, 0.0}}, this->exec);
+    auto solver = this->upper_trs_factory_mrhs->generate(this->mtx);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{13.0, -6.0}, {-4.0, 3.0}, {3.0, -1.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{13.0, -6.0}, {-4.0, 3.0}, {3.0, -1.0}}),
+                        r<value_type>::value);
 }
 
 
-TEST_F(UpperTrs, SolvesNonUnitTriangularSystem)
+TYPED_TEST(UpperTrs, SolvesNonUnitTriangularSystem)
 {
-    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({10.0, 7.0, -4.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
-    auto solver = upper_trs_factory->generate(mtx2);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b =
+        gko::initialize<Mtx>({10.0, 7.0, -4.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_factory->generate(this->mtx2);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), r<value_type>::value);
 }
 
 
-TEST_F(UpperTrs, SolvesTriangularSystemUsingAdvancedApply)
+TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply)
 {
-    auto alpha = gko::initialize<Mtx>({2.0}, exec);
-    auto beta = gko::initialize<Mtx>({-1.0}, exec);
-    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, exec);
-    auto x = gko::initialize<Mtx>({1.0, -1.0, 1.0}, exec);
-    auto solver = upper_trs_factory->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({1.0, -1.0, 1.0}, this->exec);
+    auto solver = this->upper_trs_factory->generate(this->mtx);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({25.0, -7.0, 5.0}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({25.0, -7.0, 5.0}), r<value_type>::value);
 }
 
 
-TEST_F(UpperTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply)
+TYPED_TEST(UpperTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply)
 {
-    auto alpha = gko::initialize<Mtx>({-1.0}, exec);
-    auto beta = gko::initialize<Mtx>({2.0}, exec);
-    std::shared_ptr<Mtx> b =
-        gko::initialize<Mtx>({{4.0, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
-    auto x =
-        gko::initialize<Mtx>({{1.0, 2.0}, {-1.0, -1.0}, {1.0, -2.0}}, exec);
-    auto solver = upper_trs_factory_mrhs->generate(mtx);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto alpha = gko::initialize<Mtx>({-1.0}, this->exec);
+    auto beta = gko::initialize<Mtx>({2.0}, this->exec);
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
+        {I<T>{4.0, 1.0}, I<T>{1.0, 2.0}, I<T>{2.0, 3.0}}, this->exec);
+    auto x = gko::initialize<Mtx>(
+        {I<T>{1.0, 2.0}, I<T>{-1.0, -1.0}, I<T>{1.0, -2.0}}, this->exec);
+    auto solver = this->upper_trs_factory_mrhs->generate(this->mtx);
 
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({{-9.0, -6.0}, {1.0, 2.0}, {0.0, -7.0}}), 1e-14);
+    GKO_ASSERT_MTX_NEAR(x, l({{-9.0, -6.0}, {1.0, 2.0}, {0.0, -7.0}}),
+                        r<value_type>::value);
 }
 
 
-TEST_F(UpperTrs, SolvesBigDenseSystem)
+TYPED_TEST(UpperTrs, SolvesBigDenseSystem)
 {
-    std::shared_ptr<Mtx> b =
-        gko::initialize<Mtx>({-6021.0, 3018.0, -2055.0, 1707.0, -248.0}, exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0}, exec);
-    auto solver = upper_trs_factory->generate(mtx_big);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
+        {-6021.0, 3018.0, -2055.0, 1707.0, -248.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_factory->generate(this->mtx_big);
 
     solver->apply(b.get(), x.get());
 
-    GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}), 1e-10);
+    GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}),
+                        r<value_type>::value * 1e3);
+}
+
+// Applies the solver's transpose() to b and checks x against fixed values.
+TYPED_TEST(UpperTrs, SolvesTransposedTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_factory->generate(this->mtx);
+
+    solver->transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r<value_type>::value);
+}
+
+// Like the transposed test, but via conj_transpose(); same expected x.
+TYPED_TEST(UpperTrs, SolvesConjTransposedTriangularSystem)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_factory->generate(this->mtx);
+
+    solver->conj_transpose()->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r<value_type>::value);
 }
 
 
diff --git a/reference/test/stop/CMakeLists.txt b/reference/test/stop/CMakeLists.txt
index 771a14696dd..fb27a843369 100644
--- a/reference/test/stop/CMakeLists.txt
+++ b/reference/test/stop/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_create_test(combined)
 ginkgo_create_test(criterion_kernels)
 ginkgo_create_test(iteration)
-ginkgo_create_test(residual_norm_reduction_kernels)
+ginkgo_create_test(residual_norm_kernels)
 ginkgo_create_test(time)
diff --git a/reference/test/stop/combined.cpp b/reference/test/stop/combined.cpp
index 9f8629b0068..08939c64392 100644
--- a/reference/test/stop/combined.cpp
+++ b/reference/test/stop/combined.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,11 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/stop/combined.hpp>
 
 
-#include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/time.hpp>
-
-
-#include <gtest/gtest.h>
 #include <chrono>
 #include <thread>
 #if defined(_WIN32) || defined(__CYGWIN__)
@@ -45,6 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/time.hpp>
+
+
 namespace {
 
 
diff --git a/reference/test/stop/criterion_kernels.cpp b/reference/test/stop/criterion_kernels.cpp
index 328dc79f4d1..b2fa160f8e3 100644
--- a/reference/test/stop/criterion_kernels.cpp
+++ b/reference/test/stop/criterion_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,12 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 #include <ginkgo/core/stop/criterion.hpp>
-#include <ginkgo/core/stop/iteration.hpp>
 
 
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/stop/iteration.hpp>
+
+
 namespace {
 
 
diff --git a/reference/test/stop/iteration.cpp b/reference/test/stop/iteration.cpp
index f2c8637c863..fd9d34114b5 100644
--- a/reference/test/stop/iteration.cpp
+++ b/reference/test/stop/iteration.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp
new file mode 100644
index 00000000000..bf2de2bc20c
--- /dev/null
+++ b/reference/test/stop/residual_norm_kernels.cpp
@@ -0,0 +1,431 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include <type_traits>
+
+
+#include <gtest/gtest-typed-test.h>
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+// Fixture: reference executor and a factory built with factor r<T>::value.
+template <typename T>
+class ResidualNormReduction : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<T>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+
+    ResidualNormReduction()
+    {
+        exec_ = gko::ReferenceExecutor::create();
+        factory_ = gko::stop::ResidualNormReduction<T>::build()
+                       .with_reduction_factor(r<T>::value)
+                       .on(exec_);
+    }
+
+    std::unique_ptr<typename gko::stop::ResidualNormReduction<T>::Factory>
+        factory_;  // criterion factory under test
+    std::shared_ptr<const gko::Executor> exec_;  // set in the constructor
+};
+
+TYPED_TEST_CASE(ResidualNormReduction, gko::test::ValueTypes);
+
+// The factory exists and reports the configured factor and executor.
+TYPED_TEST(ResidualNormReduction, CanCreateFactory)
+{
+    ASSERT_NE(this->factory_, nullptr);
+    ASSERT_EQ(this->factory_->get_parameters().reduction_factor,
+              r<TypeParam>::value);
+    ASSERT_EQ(this->factory_->get_executor(), this->exec_);
+}
+
+// generate() with all four arguments null must throw gko::NotSupported.
+TYPED_TEST(ResidualNormReduction, CannotCreateCriterionWithoutB)
+{
+    ASSERT_THROW(this->factory_->generate(nullptr, nullptr, nullptr, nullptr),
+                 gko::NotSupported);
+}
+
+// generate() succeeds when only the fourth argument (a Dense) is non-null.
+TYPED_TEST(ResidualNormReduction, CanCreateCriterionWithB)
+{
+    using Mtx = typename TestFixture::Mtx;
+    std::shared_ptr<gko::LinOp> scalar =
+        gko::initialize<Mtx>({1.0}, this->exec_);
+    auto criterion =
+        this->factory_->generate(nullptr, nullptr, nullptr, scalar.get());
+    ASSERT_NE(criterion, nullptr);
+}
+
+// check() stays false at norms 100 and 110 * r and turns true at 90 * r.
+TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoal)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    auto initial_res = gko::initialize<Mtx>({100.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({10.0}, this->exec_);
+    auto res_norm = gko::initialize<NormVector>({100.0}, this->exec_);
+    auto criterion =
+        this->factory_->generate(nullptr, rhs, nullptr, initial_res.get());
+    bool one_changed{};  // value-initialized, i.e. false
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(this->exec_, 1);
+    stop_status.get_data()[0].reset();
+
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+
+    res_norm->at(0) = r<TypeParam>::value * 1.1e+2;  // 110 * r: no stop yet
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
+    ASSERT_EQ(one_changed, false);
+
+    res_norm->at(0) = r<TypeParam>::value * 0.9e+2;  // 90 * r: must stop
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+}
+
+// Two columns: check() turns true only after both norms are set to 90 * r.
+TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = TypeParam;
+    using T_nc = gko::remove_complex<TypeParam>;
+    auto res = gko::initialize<Mtx>({I<T>{100.0, 100.0}}, this->exec_);
+    auto res_norm =
+        gko::initialize<NormVector>({I<T_nc>{100.0, 100.0}}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs =
+        gko::initialize<Mtx>({I<T>{10.0, 10.0}}, this->exec_);
+    auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get());
+    bool one_changed{};  // value-initialized, i.e. false
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(this->exec_, 2);
+    stop_status.get_data()[0].reset();
+    stop_status.get_data()[1].reset();
+
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+
+    res_norm->at(0, 0) = r<TypeParam>::value * 0.9e+2;  // 90 * r, column 0
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+
+    res_norm->at(0, 1) = r<TypeParam>::value * 0.9e+2;  // 90 * r, column 1
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[1].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+}
+
+// Fixture: reference executor and a factory built with tolerance r<T>::value.
+template <typename T>
+class RelativeResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<T>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+
+    RelativeResidualNorm()
+    {
+        exec_ = gko::ReferenceExecutor::create();
+        factory_ = gko::stop::RelativeResidualNorm<T>::build()
+                       .with_tolerance(r<T>::value)
+                       .on(exec_);
+    }
+
+    std::unique_ptr<typename gko::stop::RelativeResidualNorm<T>::Factory>
+        factory_;  // criterion factory under test
+    std::shared_ptr<const gko::Executor> exec_;  // set in the constructor
+};
+
+TYPED_TEST_CASE(RelativeResidualNorm, gko::test::ValueTypes);
+
+// The factory exists and reports the configured tolerance and executor.
+TYPED_TEST(RelativeResidualNorm, CanCreateFactory)
+{
+    ASSERT_NE(this->factory_, nullptr);
+    ASSERT_EQ(this->factory_->get_parameters().tolerance, r<TypeParam>::value);
+    ASSERT_EQ(this->factory_->get_executor(), this->exec_);
+}
+
+// generate() with all four arguments null must throw gko::NotSupported.
+TYPED_TEST(RelativeResidualNorm, CannotCreateCriterionWithoutB)
+{
+    ASSERT_THROW(this->factory_->generate(nullptr, nullptr, nullptr, nullptr),
+                 gko::NotSupported);
+}
+
+// generate() succeeds when only the second argument (a Dense) is non-null.
+TYPED_TEST(RelativeResidualNorm, CanCreateCriterionWithB)
+{
+    using Mtx = typename TestFixture::Mtx;
+    std::shared_ptr<gko::LinOp> scalar =
+        gko::initialize<Mtx>({1.0}, this->exec_);
+    auto criterion =
+        this->factory_->generate(nullptr, scalar, nullptr, nullptr);
+    ASSERT_NE(criterion, nullptr);
+}
+
+// check() stays false at norms 100 and 11 * r and turns true at 9 * r.
+TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoal)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    auto initial_res = gko::initialize<Mtx>({100.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({10.0}, this->exec_);
+    auto res_norm = gko::initialize<NormVector>({100.0}, this->exec_);
+    auto criterion =
+        this->factory_->generate(nullptr, rhs, nullptr, initial_res.get());
+    bool one_changed{};  // value-initialized, i.e. false
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(this->exec_, 1);
+    stop_status.get_data()[0].reset();
+
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+
+    res_norm->at(0) = r<TypeParam>::value * 1.1e+1;  // 11 * r: no stop yet
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
+    ASSERT_EQ(one_changed, false);
+
+    res_norm->at(0) = r<TypeParam>::value * 0.9e+1;  // 9 * r: must stop
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+}
+
+// Two columns: check() turns true only after both norms are set to 9 * r.
+TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = TypeParam;
+    using T_nc = gko::remove_complex<TypeParam>;
+    auto res = gko::initialize<Mtx>({I<T>{100.0, 100.0}}, this->exec_);
+    auto res_norm =
+        gko::initialize<NormVector>({I<T_nc>{100.0, 100.0}}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs =
+        gko::initialize<Mtx>({I<T>{10.0, 10.0}}, this->exec_);
+    auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get());
+    bool one_changed{};  // value-initialized, i.e. false
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(this->exec_, 2);
+    stop_status.get_data()[0].reset();
+    stop_status.get_data()[1].reset();
+
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+
+    res_norm->at(0, 0) = r<TypeParam>::value * 0.9e+1;  // 9 * r, column 0
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+
+    res_norm->at(0, 1) = r<TypeParam>::value * 0.9e+1;  // 9 * r, column 1
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[1].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+}
+
+// Fixture: reference executor and a factory built with tolerance r<T>::value.
+template <typename T>
+class AbsoluteResidualNorm : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<T>;
+    using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+
+    AbsoluteResidualNorm()
+    {
+        exec_ = gko::ReferenceExecutor::create();
+        factory_ = gko::stop::AbsoluteResidualNorm<T>::build()
+                       .with_tolerance(r<T>::value)
+                       .on(exec_);
+    }
+
+    std::unique_ptr<typename gko::stop::AbsoluteResidualNorm<T>::Factory>
+        factory_;  // criterion factory under test
+    std::shared_ptr<const gko::Executor> exec_;  // set in the constructor
+};
+
+TYPED_TEST_CASE(AbsoluteResidualNorm, gko::test::ValueTypes);
+
+// The factory exists and reports the configured tolerance and executor.
+TYPED_TEST(AbsoluteResidualNorm, CanCreateFactory)
+{
+    ASSERT_NE(this->factory_, nullptr);
+    ASSERT_EQ(this->factory_->get_parameters().tolerance, r<TypeParam>::value);
+    ASSERT_EQ(this->factory_->get_executor(), this->exec_);
+}
+
+// generate() with all four arguments null must throw gko::NotSupported.
+TYPED_TEST(AbsoluteResidualNorm, CannotCreateCriterionWithoutB)
+{
+    ASSERT_THROW(this->factory_->generate(nullptr, nullptr, nullptr, nullptr),
+                 gko::NotSupported);
+}
+
+// generate() succeeds when only the second argument (a Dense) is non-null.
+TYPED_TEST(AbsoluteResidualNorm, CanCreateCriterionWithB)
+{
+    using Mtx = typename TestFixture::Mtx;
+    std::shared_ptr<gko::LinOp> scalar =
+        gko::initialize<Mtx>({1.0}, this->exec_);
+    auto criterion =
+        this->factory_->generate(nullptr, scalar, nullptr, nullptr);
+    ASSERT_NE(criterion, nullptr);
+}
+
+// check() stays false at norms 100 and 1.1 * r and turns true at 0.9 * r.
+TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoal)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    auto initial_res = gko::initialize<Mtx>({100.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({10.0}, this->exec_);
+    auto res_norm = gko::initialize<NormVector>({100.0}, this->exec_);
+    auto criterion =
+        this->factory_->generate(nullptr, rhs, nullptr, initial_res.get());
+    bool one_changed{};  // value-initialized, i.e. false
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(this->exec_, 1);
+    stop_status.get_data()[0].reset();
+
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+
+    res_norm->at(0) = r<TypeParam>::value * 1.1;  // 1.1 * r: no stop yet
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
+    ASSERT_EQ(one_changed, false);
+
+    res_norm->at(0) = r<TypeParam>::value * 0.9;  // 0.9 * r: must stop
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+}
+
+// Two columns: check() turns true only after both norms are set to 0.9 * r.
+TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = TypeParam;
+    using T_nc = gko::remove_complex<TypeParam>;
+    auto res = gko::initialize<Mtx>({I<T>{100.0, 100.0}}, this->exec_);
+    auto res_norm =
+        gko::initialize<NormVector>({I<T_nc>{100.0, 100.0}}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs =
+        gko::initialize<Mtx>({I<T>{10.0, 10.0}}, this->exec_);
+    auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get());
+    bool one_changed{};  // value-initialized, i.e. false
+    constexpr gko::uint8 RelativeStoppingId{1};
+    gko::Array<gko::stopping_status> stop_status(this->exec_, 2);
+    stop_status.get_data()[0].reset();
+    stop_status.get_data()[1].reset();
+
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+
+    res_norm->at(0, 0) = r<TypeParam>::value * 0.9;  // 0.9 * r, column 0
+    ASSERT_FALSE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+
+    res_norm->at(0, 1) = r<TypeParam>::value * 0.9;  // 0.9 * r, column 1
+    ASSERT_TRUE(
+        criterion->update()
+            .residual_norm(res_norm.get())
+            .check(RelativeStoppingId, true, &stop_status, &one_changed));
+    ASSERT_EQ(stop_status.get_data()[1].has_converged(), true);
+    ASSERT_EQ(one_changed, true);
+}
+
+
+}  // namespace
diff --git a/reference/test/stop/residual_norm_reduction_kernels.cpp b/reference/test/stop/residual_norm_reduction_kernels.cpp
deleted file mode 100644
index c326280ae5e..00000000000
--- a/reference/test/stop/residual_norm_reduction_kernels.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
-
-
-#include <gtest/gtest.h>
-
-
-namespace {
-
-
-constexpr double reduction_factor = 1.0e-14;
-
-
-class ResidualNormReduction : public ::testing::Test {
-protected:
-    using Mtx = gko::matrix::Dense<>;
-
-    ResidualNormReduction()
-    {
-        exec_ = gko::ReferenceExecutor::create();
-        factory_ = gko::stop::ResidualNormReduction<>::build()
-                       .with_reduction_factor(reduction_factor)
-                       .on(exec_);
-    }
-
-    std::unique_ptr<gko::stop::ResidualNormReduction<>::Factory> factory_;
-    std::shared_ptr<const gko::Executor> exec_;
-};
-
-
-TEST_F(ResidualNormReduction, CanCreateFactory)
-{
-    ASSERT_NE(factory_, nullptr);
-    ASSERT_EQ(factory_->get_parameters().reduction_factor, reduction_factor);
-    ASSERT_EQ(factory_->get_executor(), exec_);
-}
-
-
-TEST_F(ResidualNormReduction, CannotCreateCriterionWithoutB)
-{
-    ASSERT_THROW(factory_->generate(nullptr, nullptr, nullptr, nullptr),
-                 gko::NotSupported);
-}
-
-
-TEST_F(ResidualNormReduction, CanCreateCriterionWithB)
-{
-    std::shared_ptr<gko::LinOp> scalar =
-        gko::initialize<gko::matrix::Dense<>>({1.0}, exec_);
-    auto criterion =
-        factory_->generate(nullptr, nullptr, nullptr, scalar.get());
-    ASSERT_NE(criterion, nullptr);
-}
-
-
-TEST_F(ResidualNormReduction, WaitsTillResidualGoal)
-{
-    auto scalar = gko::initialize<Mtx>({1.0}, exec_);
-    auto criterion =
-        factory_->generate(nullptr, nullptr, nullptr, scalar.get());
-    bool one_changed{};
-    constexpr gko::uint8 RelativeStoppingId{1};
-    gko::Array<gko::stopping_status> stop_status(exec_, 1);
-    stop_status.get_data()[0].reset();
-
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-
-    scalar->at(0) = reduction_factor * 1.0e+2;
-    ASSERT_FALSE(
-        criterion->update()
-            .residual_norm(scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
-    ASSERT_EQ(one_changed, false);
-
-    scalar->at(0) = reduction_factor * 1.0e-2;
-    ASSERT_TRUE(
-        criterion->update()
-            .residual_norm(scalar.get())
-            .check(RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-}
-
-
-TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS)
-{
-    auto mtx = gko::initialize<Mtx>({{1.0, 1.0}}, exec_);
-    auto criterion = factory_->generate(nullptr, nullptr, nullptr, mtx.get());
-    bool one_changed{};
-    constexpr gko::uint8 RelativeStoppingId{1};
-    gko::Array<gko::stopping_status> stop_status(exec_, 2);
-    // Array only does malloc, it *does not* construct the object
-    // therefore you get undefined values in your objects whatever you do.
-    // Proper fix is not easy, we can't just call memset. We can probably not
-    // call the placement constructor either
-    stop_status.get_data()[0].reset();
-    stop_status.get_data()[1].reset();
-
-    ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check(
-        RelativeStoppingId, true, &stop_status, &one_changed));
-
-    mtx->at(0, 0) = reduction_factor * 1.0e-2;
-    ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check(
-        RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[0].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-    one_changed = false;
-
-    mtx->at(0, 1) = reduction_factor * 1.0e-2;
-    ASSERT_TRUE(criterion->update().residual_norm(mtx.get()).check(
-        RelativeStoppingId, true, &stop_status, &one_changed));
-    ASSERT_EQ(stop_status.get_data()[1].has_converged(), true);
-    ASSERT_EQ(one_changed, true);
-}
-
-
-}  // namespace
diff --git a/reference/test/stop/time.cpp b/reference/test/stop/time.cpp
index 8d47b90ff2a..258db5d2854 100644
--- a/reference/test/stop/time.cpp
+++ b/reference/test/stop/time.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/stop/time.hpp>
 
 
-#include <gtest/gtest.h>
 #include <chrono>
 #include <thread>
 #if defined(_WIN32) || defined(__CYGWIN__)
@@ -41,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
+#include <gtest/gtest.h>
+
+
 namespace {
 
 
diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp
index 9423ce12e3d..ffa09c8d431 100644
--- a/reference/test/utils/assertions_test.cpp
+++ b/reference/test/utils/assertions_test.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <core/test/utils/assertions.hpp>
+#include "core/test/utils/assertions.hpp"
 
 
 #include <gtest/gtest.h>
@@ -40,16 +40,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
+template <typename T>
+class MatricesNear : public ::testing::Test {};
+
+TYPED_TEST_CASE(MatricesNear, gko::test::ValueTypes);
+
 
-TEST(MatricesNear, CanPassAnyMatrixType)
+TYPED_TEST(MatricesNear, CanPassAnyMatrixType)
 {
     auto exec = gko::ReferenceExecutor::create();
-    auto mtx = gko::initialize<gko::matrix::Dense<>>(
+    auto mtx = gko::initialize<gko::matrix::Dense<TypeParam>>(
         {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, exec);
 
-    auto csr_mtx = gko::matrix::Csr<>::create(exec);
+    auto csr_mtx = gko::matrix::Csr<TypeParam>::create(exec);
     csr_mtx->copy_from(mtx.get());
 
     GKO_EXPECT_MTX_NEAR(csr_mtx, mtx, 0.0);
diff --git a/sonar-project.properties b/sonar-project.properties
index ae7b444bc32..154dd932951 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -8,4 +8,5 @@ sonar.sources=.
 sonar.tests=.
 sonar.exclusions="third_party/**, build/**"
 sonar.test.exclusions="benchmark/**, doc/**, examples/**"
-sonar.test.inclusions="*/test/**"
\ No newline at end of file
+sonar.test.inclusions="*/test/**"
+sonar.coverage.exclusions="third_party/**, build/**, benchmark/**, doc/**, examples/**"
diff --git a/test_install/CMakeLists.txt b/test_install/CMakeLists.txt
index 7eef8b8b7dd..bc7cf6b63a1 100644
--- a/test_install/CMakeLists.txt
+++ b/test_install/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.9)
 
 project(TestInstall LANGUAGES CXX)
 
@@ -7,17 +7,48 @@ find_package(Ginkgo REQUIRED
             # Alternatively, use `cmake -DCMAKE_PREFIX_PATH=<ginkgo_install_dir>` to specify the install directory
             )
 
-if(GINKGO_HAVE_PAPI_SDE)
-    find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde)
-endif()
-
-# Needed because of a known issue with CUDA while linking statically.
-# For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614
-if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA)
-    enable_language(CUDA)
+if(MSVC)
+    if(GINKGO_BUILD_SHARED_LIBS)
+        ginkgo_switch_to_windows_dynamic("CXX")
+        ginkgo_switch_to_windows_dynamic("C")
+    else()
+        ginkgo_switch_to_windows_static("CXX")
+        ginkgo_switch_to_windows_static("C")
+    endif()
 endif()
 
+include(CheckLanguage)
+check_language(CUDA)
 
 add_executable(test_install test_install.cpp)
 target_compile_features(test_install PUBLIC cxx_std_11)
 target_link_libraries(test_install PRIVATE Ginkgo::ginkgo)
+
+if(GINKGO_BUILD_CUDA)
+    enable_language(CUDA)
+    if(MSVC)
+        if(GINKGO_BUILD_SHARED_LIBS)
+            ginkgo_switch_to_windows_dynamic("CUDA")
+        else()
+            ginkgo_switch_to_windows_static("CUDA")
+        endif()
+    endif()
+    add_executable(test_install_cuda test_install_cuda.cu)
+    target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo)
+endif()
+
+if(GINKGO_BUILD_HIP
+   AND GINKGO_HIP_PLATFORM MATCHES "hcc"
+   AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL "3.5"
+   AND NOT GINKGO_BUILD_SHARED_LIBS)
+    # Compile options somehow add hip-clang specific flags. Wipe them.
+    # Currently, the flags wiped out should be:
+    # -x;hip;--hip-device-lib-path=/opt/rocm/lib;--cuda-gpu-arch=gfx900;
+    # --cuda-gpu-arch=gfx906
+    set_target_properties(hip::device PROPERTIES INTERFACE_COMPILE_OPTIONS "")
+    # In addition, link libraries have a similar problem. We only keep
+    # `hip::host`. Currently, the flags should be:
+    # hip::host;--hip-device-lib-path=/opt/rocm/lib;--hip-link;
+    # --cuda-gpu-arch=gfx900;--cuda-gpu-arch=gfx906
+    set_target_properties(hip::device PROPERTIES INTERFACE_LINK_LIBRARIES "hip::host")
+endif()
diff --git a/test_install/test_install.cpp b/test_install/test_install.cpp
index 1479f74fcb8..5ea59794440 100644
--- a/test_install/test_install.cpp
+++ b/test_install/test_install.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2019, the Ginkgo authors
+Copyright (c) 2017-2020, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -196,10 +196,12 @@ int main(int, char **)
         auto test = gko::log::Stream<>::create(refExec);
     }
 
-    // core/log/convergence.hpp
+#if GKO_HAVE_PAPI_SDE
+    // core/log/papi.hpp
     {
-        auto test = gko::log::Convergence<>::create(refExec);
+        auto test = gko::log::Papi<>::create(refExec);
     }
+#endif  // GKO_HAVE_PAPI_SDE
 
     // core/matrix/coo.hpp
     {
@@ -238,6 +240,12 @@ int main(int, char **)
         auto test = Mtx::create(refExec);
     }
 
+    // core/matrix/permutation.hpp
+    {
+        using Mtx = gko::matrix::Permutation<>;
+        auto test = Mtx::create(refExec, gko::dim<2>{2, 2});
+    }
+
     // core/matrix/sellp.hpp
     {
         using Mtx = gko::matrix::Sellp<>;
@@ -255,6 +263,12 @@ int main(int, char **)
         auto test = gko::preconditioner::Ilu<>::build().on(refExec);
     }
 
+    // core/preconditioner/isai.hpp
+    {
+        auto test_l = gko::preconditioner::LowerIsai<>::build().on(refExec);
+        auto test_u = gko::preconditioner::UpperIsai<>::build().on(refExec);
+    }
+
     // core/preconditioner/jacobi.hpp
     {
         using Bj = gko::preconditioner::Jacobi<>;
@@ -337,11 +351,20 @@ int main(int, char **)
         auto time = gko::stop::Time::build()
                         .with_time_limit(std::chrono::milliseconds(10))
                         .on(refExec);
-        // residual_norm_reduction.hpp
+
+        // residual_norm.hpp
         auto res_red = gko::stop::ResidualNormReduction<>::build()
                            .with_reduction_factor(1e-10)
                            .on(refExec);
 
+        auto rel_res = gko::stop::RelativeResidualNorm<>::build()
+                           .with_tolerance(1e-10)
+                           .on(refExec);
+
+        auto abs_res = gko::stop::AbsoluteResidualNorm<>::build()
+                           .with_tolerance(1e-10)
+                           .on(refExec);
+
         // stopping_status.hpp
         auto stop_status = gko::stopping_status{};
 
diff --git a/test_install/test_install_cuda.cu b/test_install/test_install_cuda.cu
new file mode 100644
index 00000000000..ed2e18c307d
--- /dev/null
+++ b/test_install/test_install_cuda.cu
@@ -0,0 +1,376 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <chrono>
+#include <iostream>
+#include <map>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+
+// core/base/polymorphic_object.hpp
+class PolymorphicObjectTest : public gko::PolymorphicObject {};
+
+
+int main(int, char **)
+{
+    auto refExec = gko::ReferenceExecutor::create();
+    auto cudaExec = gko::CudaExecutor::create(0, refExec);
+    // core/base/abstract_factory.hpp
+    {
+        using type1 = int;
+        using type2 = double;
+        static_assert(
+            std::is_same<
+                gko::AbstractFactory<type1, type2>::abstract_product_type,
+                type1>::value,
+            "abstract_factory.hpp not included properly!");
+    }
+
+    // core/base/array.hpp
+    {
+        using type1 = int;
+        using ArrayType = gko::Array<type1>;
+        ArrayType{};
+    }
+
+    // core/base/combination.hpp
+    {
+        using type1 = int;
+        static_assert(
+            std::is_same<gko::Combination<type1>::value_type, type1>::value,
+            "combination.hpp not included properly!");
+    }
+
+    // core/base/composition.hpp
+    {
+        using type1 = int;
+        static_assert(
+            std::is_same<gko::Composition<type1>::value_type, type1>::value,
+            "composition.hpp not included properly");
+    }
+
+    // core/base/dim.hpp
+    {
+        using type1 = int;
+        gko::dim<3, type1>{4, 4, 4};
+    }
+
+    // core/base/exception.hpp
+    {
+        gko::Error(std::string("file"), 12,
+                   std::string("Test for an error class."));
+    }
+
+    // core/base/exception_helpers.hpp
+    {
+        auto test = gko::dim<2>{3};
+        GKO_ASSERT_IS_SQUARE_MATRIX(test);
+    }
+
+    // core/base/executor.hpp
+    {
+        gko::ReferenceExecutor::create();
+    }
+
+    // core/base/math.hpp
+    {
+        using testType = double;
+        static_assert(gko::is_complex<testType>() == false,
+                      "math.hpp not included properly!");
+    }
+
+    // core/base/matrix_data.hpp
+    {
+        gko::matrix_data<>{};
+    }
+
+    // core/base/mtx_io.hpp
+    {
+        static_assert(gko::layout_type::array != gko::layout_type::coordinate,
+                      "mtx_io.hpp not included properly!");
+    }
+
+    // core/base/name_demangling.hpp
+    {
+        auto testVar = 3.0;
+        gko::name_demangling::get_static_type(testVar);
+    }
+
+
+    // core/base/polymorphic_object.hpp
+    {
+        gko::PolymorphicObject *test;
+        (void)test;  // silence unused variable warning
+    }
+
+    // core/base/range.hpp
+    {
+        gko::span{12};
+    }
+
+    // core/base/range_accessors.hpp
+    {
+        auto testVar = 12;
+        gko::range<gko::accessor::row_major<decltype(testVar), 2>>(&testVar, 1u,
+                                                                   1u, 1u);
+    }
+
+    // core/base/perturbation.hpp
+    {
+        using type1 = int;
+        static_assert(
+            std::is_same<gko::Perturbation<type1>::value_type, type1>::value,
+            "perturbation.hpp not included properly");
+    }
+
+    // core/base/std_extensions.hpp
+    {
+        static_assert(std::is_same<gko::xstd::void_t<double>, void>::value,
+                      "std_extensions.hpp not included properly!");
+    }
+
+    // core/base/types.hpp
+    {
+        static_assert(gko::size_type{12} == 12,
+                      "types.hpp not included properly");
+    }
+
+    // core/base/utils.hpp
+    {
+        gko::null_deleter<double>{};
+    }
+
+    // core/base/version.hpp
+    {
+        gko::version_info::get().header_version;
+    }
+
+    // core/factorization/par_ilu.hpp
+    {
+        gko::factorization::ParIlu<>::build().on(cudaExec);
+    }
+
+    // core/log/convergence.hpp
+    {
+        gko::log::Convergence<>::create(cudaExec);
+    }
+
+    // core/log/record.hpp
+    {
+        gko::log::executor_data{};
+    }
+
+    // core/log/stream.hpp
+    {
+        gko::log::Stream<>::create(cudaExec);
+    }
+
+#if GKO_HAVE_PAPI_SDE
+    // core/log/papi.hpp
+    {
+        gko::log::Papi<>::create(cudaExec);
+    }
+#endif  // GKO_HAVE_PAPI_SDE
+
+    // core/matrix/coo.hpp
+    {
+        using Mtx = gko::matrix::Coo<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2);
+    }
+
+    // core/matrix/csr.hpp
+    {
+        using Mtx = gko::matrix::Csr<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2,
+                    std::make_shared<Mtx::load_balance>(2));
+    }
+
+    // core/matrix/dense.hpp
+    {
+        using Mtx = gko::matrix::Dense<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2});
+    }
+
+    // core/matrix/ell.hpp
+    {
+        using Mtx = gko::matrix::Ell<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2);
+    }
+
+    // core/matrix/hybrid.hpp
+    {
+        using Mtx = gko::matrix::Hybrid<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2, 2, 1);
+    }
+
+    // core/matrix/identity.hpp
+    {
+        using Mtx = gko::matrix::Identity<>;
+        Mtx::create(cudaExec);
+    }
+
+    // core/matrix/permutation.hpp
+    {
+        using Mtx = gko::matrix::Permutation<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2});
+    }
+
+    // core/matrix/sellp.hpp
+    {
+        using Mtx = gko::matrix::Sellp<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2);
+    }
+
+    // core/matrix/sparsity_csr.hpp
+    {
+        using Mtx = gko::matrix::SparsityCsr<>;
+        Mtx::create(cudaExec, gko::dim<2>{2, 2});
+    }
+
+    // core/preconditioner/ilu.hpp
+    {
+        gko::preconditioner::Ilu<>::build().on(cudaExec);
+    }
+
+    // core/preconditioner/jacobi.hpp
+    {
+        using Bj = gko::preconditioner::Jacobi<>;
+        Bj::build().with_max_block_size(1u).on(cudaExec);
+    }
+
+    // core/solver/bicgstab.hpp
+    {
+        using Solver = gko::solver::Bicgstab<>;
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec))
+            .on(cudaExec);
+    }
+
+    // core/solver/cg.hpp
+    {
+        using Solver = gko::solver::Cg<>;
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec))
+            .on(cudaExec);
+    }
+
+    // core/solver/cgs.hpp
+    {
+        using Solver = gko::solver::Cgs<>;
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec))
+            .on(cudaExec);
+    }
+
+    // core/solver/fcg.hpp
+    {
+        using Solver = gko::solver::Fcg<>;
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec))
+            .on(cudaExec);
+    }
+
+    // core/solver/gmres.hpp
+    {
+        using Solver = gko::solver::Gmres<>;
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec))
+            .on(cudaExec);
+    }
+
+    // core/solver/ir.hpp
+    {
+        using Solver = gko::solver::Ir<>;
+        Solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec))
+            .on(cudaExec);
+    }
+
+    // core/solver/lower_trs.hpp
+    {
+        using Solver = gko::solver::LowerTrs<>;
+        Solver::build().on(cudaExec);
+    }
+
+    // core/stop/
+    {
+        // iteration.hpp
+        auto iteration =
+            gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec);
+
+        // time.hpp
+        auto time = gko::stop::Time::build()
+                        .with_time_limit(std::chrono::milliseconds(10))
+                        .on(cudaExec);
+
+        // residual_norm.hpp
+        gko::stop::ResidualNormReduction<>::build()
+            .with_reduction_factor(1e-10)
+            .on(cudaExec);
+
+        gko::stop::RelativeResidualNorm<>::build()
+            .with_tolerance(1e-10)
+            .on(cudaExec);
+
+        gko::stop::AbsoluteResidualNorm<>::build()
+            .with_tolerance(1e-10)
+            .on(cudaExec);
+
+        // stopping_status.hpp
+        gko::stopping_status{};
+
+        // combined.hpp
+        auto combined =
+            gko::stop::Combined::build()
+                .with_criteria(std::move(time), std::move(iteration))
+                .on(cudaExec);
+    }
+
+    std::cout
+        << "test_install_cuda: the Ginkgo installation was correctly detected "
+           "and is complete."
+        << std::endl;
+
+    return 0;
+}
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 26eb4d1377a..884e50bf699 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -16,6 +16,8 @@ endif()
 if(GINKGO_DEVEL_TOOLS)
     set(GCF_IGNORE_LIST "third_party" CACHE STRING "Ignore directories for GCF")
     add_subdirectory(git-cmake-format)
+else()
+    add_subdirectory(dummy-hook)
 endif()
 
 if(GINKGO_BUILD_BENCHMARKS)
diff --git a/third_party/CudaArchitectureSelector/CMakeLists.txt b/third_party/CudaArchitectureSelector/CMakeLists.txt
index e4ed043c539..feccda26a92 100644
--- a/third_party/CudaArchitectureSelector/CMakeLists.txt
+++ b/third_party/CudaArchitectureSelector/CMakeLists.txt
@@ -1,6 +1,6 @@
 ginkgo_load_git_package(CudaArchitectureSelector
     "https://github.com/ginkgo-project/CudaArchitectureSelector.git"
-    "0b46fb7d653404db312cbc1fc702cb528fd1c1b0")
+    "f6e024cc2000eb870dc52166d4cdce9fe7f9a7a4")
 add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/src
     ${CMAKE_CURRENT_BINARY_DIR}/build EXCLUDE_FROM_ALL)
 set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE)
diff --git a/third_party/dummy-hook/CMakeLists.txt b/third_party/dummy-hook/CMakeLists.txt
new file mode 100644
index 00000000000..043ae8da917
--- /dev/null
+++ b/third_party/dummy-hook/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Install a commit-rejecting pre-commit hook unless a hook mentioning
+# git-cmake-format.py exists. In worktrees `.git` is a file, so test for a dir.
+if(IS_DIRECTORY "${Ginkgo_SOURCE_DIR}/.git")
+    set(ADD_HOOK FALSE)
+    set(HOOK_LOCATION "${Ginkgo_SOURCE_DIR}/.git/hooks/pre-commit")
+    if(NOT EXISTS "${HOOK_LOCATION}")
+        set(ADD_HOOK TRUE)
+    else()
+        # check if the correct hook is installed (pure CMake, no external grep)
+        file(STRINGS "${HOOK_LOCATION}" hook_match REGEX "git-cmake-format[.]py")
+        if("${hook_match}" STREQUAL "")
+            set(ADD_HOOK TRUE)
+        endif()
+    endif()
+    if(ADD_HOOK)
+        configure_file(dummy_hook "${HOOK_LOCATION}" COPYONLY)
+    endif()
+endif()
diff --git a/third_party/dummy-hook/dummy_hook b/third_party/dummy-hook/dummy_hook
new file mode 100755
index 00000000000..4274cb3e418
--- /dev/null
+++ b/third_party/dummy-hook/dummy_hook
@@ -0,0 +1,5 @@
+#!/bin/bash
+printf '%s\n' "Please only commit to Ginkgo when GINKGO_DEVEL_TOOLS is enabled in CMake." \
+    "This can be set in your initial invocation of CMake by using" \
+    " -DGINKGO_DEVEL_TOOLS=ON or by editing the CMakeCache.txt file."
+exit 1  # a non-zero status from a pre-commit hook makes git abort the commit
diff --git a/third_party/gflags/CMakeLists.txt b/third_party/gflags/CMakeLists.txt
index 5581483cf0d..048e3343004 100644
--- a/third_party/gflags/CMakeLists.txt
+++ b/third_party/gflags/CMakeLists.txt
@@ -3,23 +3,33 @@ if(MSVC)
     # use the ginkgo's flags to use the same runtime libraries as ginkgo
     ginkgo_load_git_package(gflags_external
         "https://github.com/gflags/gflags.git"
-        "0b7f8db2c6b1b0b2451da0923a9ab09cc610e8d1"
+        "f7388c6655e699f777a5a74a3c9880b9cfaabe59"
         "-DGFLAGS_BUILD_TESTING=OFF" "-DGFLAGS_BUILD_gflags_LIB=OFF"
         "-DGFLAGS_BUILD_gflags_nothreads_LIB=ON" "-DGFLAGS_BUILD_STATIC_LIBS=ON"
         "-DGFLAGS_BUILD_PACKAGING=OFF" "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
         "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
         "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}")
 else()
+    # There is a weird issue with Intel 19 and c++17 causing a linking error.
+    # Use c++11 instead. The regex is anchored so only major version 19 matches.
+    set(INTEL19_STD_FIX "")
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^19([.]|$)")
+        set(INTEL19_STD_FIX "-DCMAKE_CXX_FLAGS=-std=c++11")
+    endif()
     ginkgo_load_git_package(gflags_external
         "https://github.com/gflags/gflags.git"
-        "0b7f8db2c6b1b0b2451da0923a9ab09cc610e8d1"
+        "f7388c6655e699f777a5a74a3c9880b9cfaabe59"
         "-DGFLAGS_BUILD_TESTING=OFF" "-DGFLAGS_BUILD_gflags_LIB=OFF"
         "-DGFLAGS_BUILD_gflags_nothreads_LIB=ON" "-DGFLAGS_BUILD_STATIC_LIBS=ON"
-        "-DGFLAGS_BUILD_PACKAGING=OFF")
+        "-DGFLAGS_BUILD_PACKAGING=OFF"
+        "${INTEL19_STD_FIX}"
+        )
 endif()
 if(WIN32)
     # gflags uses gflags_nothreads_static not gflags_nothreads_static in Windows.
-    ginkgo_add_external_target(gflags gflags_nothreads_static build/include build/lib STATIC "_debug" gflags_external FALSE)
+    ginkgo_add_external_target(gflags gflags_nothreads_static build/include build/lib
+        STATIC "_debug" gflags_external FALSE)
 else()
-    ginkgo_add_external_target(gflags gflags_nothreads build/include build/lib STATIC "_debug" gflags_external FALSE)
+    ginkgo_add_external_target(gflags gflags_nothreads build/include build/lib
+        STATIC "_debug" gflags_external FALSE)
 endif()
diff --git a/third_party/git-cmake-format/CMakeLists.txt b/third_party/git-cmake-format/CMakeLists.txt
index c05253a738c..b8e3d623050 100644
--- a/third_party/git-cmake-format/CMakeLists.txt
+++ b/third_party/git-cmake-format/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_load_git_package(git-cmake-format
     "https://github.com/ginkgo-project/git-cmake-format.git"
-    "e19ab13e640d58abd3bfdbff5f77b499b2ec4169")
+    "29c23665d624e1cae1308bec651706fdaa8fe38b"
+    "-DGCF_CLANGFORMAT_MINIMAL_VERSION=5.0.0")
 add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/src
     ${CMAKE_CURRENT_BINARY_DIR}/build EXCLUDE_FROM_ALL)
diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt
index ac9da49d307..48976e93f69 100644
--- a/third_party/gtest/CMakeLists.txt
+++ b/third_party/gtest/CMakeLists.txt
@@ -10,7 +10,8 @@ else()
         "https://github.com/google/googletest.git"
         "df428ec11891f12c81e2872c0432e342b5403a34"
         # Work around the linking errors when compiling gtest with CUDA
-        "-Dgtest_disable_pthreads=ON")
+        "-Dgtest_disable_pthreads=ON"
+        "-DCMAKE_CXX_FLAGS=-fPIC")
 endif()
 
 ginkgo_add_external_target(GTest::GTest gtest src/googletest/include build/googlemock/gtest
diff --git a/third_party/rapidjson/CMakeLists.txt b/third_party/rapidjson/CMakeLists.txt
index a3d8a20af36..bd2ae7899b1 100644
--- a/third_party/rapidjson/CMakeLists.txt
+++ b/third_party/rapidjson/CMakeLists.txt
@@ -1,6 +1,6 @@
 ginkgo_load_git_package(rapidjson_external
     "https://github.com/Tencent/rapidjson.git"
-    "6a6bed2759d42891f9e29a37b21315d3192890ed"
+    "88bd956d66d348f478bceebfdadb8e26c6844695"
     "-DRAPIDJSON_BUILD_DOC=OFF" "-DRAPIDJSON_BUILD_EXAMPLES=OFF"
     "-DRAPIDJSON_BUILD_TESTS=OFF" "-DRAPIDJSON_BUILD_THIRDPARTY_GTEST=OFF"
     "-DRAPIDJSON_BUILD_CXX11=ON")