From 4622fe0501df074f8233a3a423daf86ec1a060bf Mon Sep 17 00:00:00 2001 From: IndeedMiners <32953696+IndeedMiners@users.noreply.github.com> Date: Sat, 9 Dec 2017 13:48:05 +0100 Subject: [PATCH] XMR-STAK 2.1 --- .appveyor.yml | 2 +- CMakeLists.txt | 43 ++- Dockerfile | 26 ++ README.md | 42 +-- doc/FAQ.md | 8 +- doc/Linux_deployment.md | 2 + doc/compile.md | 7 +- doc/compile_MacOS.md | 31 ++ doc/compile_Windows.md | 12 +- doc/pgp_keys.md | 69 ++++ doc/tuning.md | 28 +- doc/usage.md | 39 ++- .../build_xmr-stak_docker.sh | 135 ++++---- xmrstak/backend/amd/amd_gpu/gpu.cpp | 126 ++++--- xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + .../backend/amd/amd_gpu/opencl/cryptonight.cl | 19 +- .../backend/amd/amd_gpu/opencl/wolf-aes.cl | 30 +- xmrstak/backend/amd/autoAdjust.hpp | 2 +- xmrstak/backend/amd/config.tpl | 5 +- xmrstak/backend/amd/jconf.cpp | 9 +- xmrstak/backend/amd/jconf.hpp | 1 + xmrstak/backend/amd/minethd.cpp | 1 + xmrstak/backend/cpu/config.tpl | 6 +- .../backend/cpu/crypto/cryptonight_aesni.h | 322 +++++++++++++++++- xmrstak/backend/cpu/jconf.cpp | 11 +- xmrstak/backend/cpu/jconf.hpp | 2 +- xmrstak/backend/cpu/minethd.cpp | 251 +++++++++----- xmrstak/backend/cpu/minethd.hpp | 16 +- xmrstak/backend/nvidia/autoAdjust.hpp | 2 +- xmrstak/backend/nvidia/config.tpl | 10 +- xmrstak/backend/nvidia/jconf.cpp | 11 +- xmrstak/backend/nvidia/jconf.hpp | 1 + xmrstak/backend/nvidia/minethd.cpp | 1 + .../backend/nvidia/nvcc_code/cryptonight.hpp | 3 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 30 +- .../backend/nvidia/nvcc_code/cuda_device.hpp | 37 +- .../backend/nvidia/nvcc_code/cuda_extra.cu | 32 +- xmrstak/cli/cli-miner.cpp | 159 ++++++++- xmrstak/jconf.cpp | 4 +- xmrstak/misc/console.cpp | 9 +- xmrstak/misc/executor.cpp | 64 +++- xmrstak/params.hpp | 3 + xmrstak/version.cpp | 4 +- 43 files changed, 1224 insertions(+), 392 deletions(-) create mode 100644 Dockerfile create mode 100644 doc/compile_MacOS.md create mode 100644 doc/pgp_keys.md diff --git a/.appveyor.yml b/.appveyor.yml index efa65bf70..b463d0dd9 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -22,7 +22,7 @@ build_script: - mkdir build - cd build - set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl; - - cmake -G "Visual Studio 14 2015 Win64" -T v140,host=x64 .. -DWIN_UAC=OFF -DCUDA_ARCH=30 + - cmake -G "Visual Studio 14 2015 Win64" -T v140,host=x64 .. -DCUDA_ARCH=30 - cmake --build . --config Release --target install test_script: diff --git a/CMakeLists.txt b/CMakeLists.txt index 87c0e8aae..b8f1eefbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ project(xmr-stak) -cmake_minimum_required(VERSION 3.0.1) +cmake_minimum_required(VERSION 3.1.0) # enforce C++11 set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -42,23 +42,23 @@ set_property(CACHE XMR-STAK_CURRENCY PROPERTY STRINGS "all;monero;aeon") set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture") set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic") -if("${XMR-STAK_COMPILE}" STREQUAL "native") +if(XMR-STAK_COMPILE STREQUAL "native") if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC") set(CMAKE_CXX_FLAGS "-march=native -mtune=native ${CMAKE_CXX_FLAGS}") set(CMAKE_C_FLAGS "-march=native -mtune=native ${CMAKE_C_FLAGS}") endif() -elseif("${XMR-STAK_COMPILE}" STREQUAL "generic") +elseif(XMR-STAK_COMPILE STREQUAL "generic") add_definitions("-DCONF_ENFORCE_OpenCL_1_2=1") else() message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'") endif() -if("${XMR-STAK_CURRENCY}" STREQUAL "all") +if(XMR-STAK_CURRENCY STREQUAL "all") message(STATUS "Set miner currency to 'monero' and 'aeon'") -elseif("${XMR-STAK_CURRENCY}" STREQUAL "aeon") +elseif(XMR-STAK_CURRENCY STREQUAL "aeon") message(STATUS "Set miner currency to 'aeon'") add_definitions("-DCONF_NO_MONERO=1") -elseif("${XMR-STAK_CURRENCY}" STREQUAL "monero") +elseif(XMR-STAK_CURRENCY STREQUAL "monero") message(STATUS "Set miner currency to 'monero'") add_definitions("-DCONF_NO_AEON=1") endif() @@ -134,7 +134,7 @@ if(CUDA_ENABLE) option(CUDA_SHOW_REGISTER "Show registers used for each kernel and compute architecture" OFF) option(CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps" OFF) - if("${CUDA_COMPILER}" STREQUAL "clang") + if(CUDA_COMPILER STREQUAL "clang") set(CLANG_BUILD_FLAGS "-O3 -x cuda --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") # activation usage of FMA set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} -ffp-contract=fast") @@ -152,9 +152,9 @@ if(CUDA_ENABLE) set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} --cuda-gpu-arch=sm_${CUDA_ARCH_ELEM}") endforeach() - elseif("${CUDA_COMPILER}" STREQUAL "nvcc") + elseif(CUDA_COMPILER STREQUAL "nvcc") # add c++11 for cuda - if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11") + if(NOT CMAKE_CXX_FLAGS MATCHES "-std=c\\+\\+11") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") endif() @@ -307,7 +307,7 @@ if(MICROHTTPD_ENABLE) ENV "MICROHTTPD_ROOT" PATH_SUFFIXES lib) - if("${MHTD}" STREQUAL "MHTD-NOTFOUND") + if(MHTD STREQUAL "MHTD-NOTFOUND") message(FATAL_ERROR "microhttpd NOT found: use `-DMICROHTTPD_ENABLE=OFF` to build without http deamon support") else() set(LIBS ${LIBS} ${MHTD}) @@ -348,7 +348,7 @@ if(HWLOC_ENABLE) /usr/local /usr ENV "PROGRAMFILES(X86)" - ENV "MICROHTTPD_ROOT" + ENV "HWLOC_ROOT" PATH_SUFFIXES include) @@ -361,7 +361,7 @@ if(HWLOC_ENABLE) PATH_SUFFIXES lib) - if("${HWLOC}" STREQUAL "MHTD-NOTFOUND" OR ${HWLOC_INCLUDE_DIR} STREQUAL "HWLOC_INCLUDE_DIR-NOTFOUND") + if(HWLOC STREQUAL "HWLOC-NOTFOUND" OR ${HWLOC_INCLUDE_DIR} STREQUAL "HWLOC_INCLUDE_DIR-NOTFOUND") message(FATAL_ERROR "hwloc NOT found: use `-DHWLOC_ENABLE=OFF` to build without hwloc support") else() set(LIBS ${LIBS} ${HWLOC}) @@ -399,10 +399,10 @@ execute_process( OUTPUT_STRIP_TRAILING_WHITESPACE ) -if(NOT "${GIT_COMMIT_HASH}" STREQUAL "") +if(NOT GIT_COMMIT_HASH STREQUAL "") add_definitions("-DGIT_COMMIT_HASH=${GIT_COMMIT_HASH}") endif() -if(NOT "${GIT_BRANCH}" STREQUAL "") +if(NOT GIT_BRANCH STREQUAL "") add_definitions("-DGIT_BRANCH=${GIT_BRANCH}") endif() @@ -446,7 +446,10 @@ add_library(xmr-stak-c ${SRCFILES_C} ) set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99) -target_link_libraries(xmr-stak-c ${MHTD} ${LIBS}) +if(MICROHTTPD_ENABLE) + target_link_libraries(xmr-stak-c ${MHTD}) +endif() +target_link_libraries(xmr-stak-c ${LIBS}) # compile generic backend files file(GLOB BACKEND_CPP @@ -470,7 +473,7 @@ if(CUDA_FOUND) "xmrstak/backend/nvidia/nvcc_code/*.cu" "xmrstak/backend/nvidia/*.cpp") - if("${CUDA_COMPILER}" STREQUAL "clang") + if(CUDA_COMPILER STREQUAL "clang") # build device code with clang add_library( xmrstak_cuda_backend @@ -518,19 +521,13 @@ set(LIBRARY_OUTPUT_PATH "bin") target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend) -option(WIN_UAC "Add UAC manifest on Windows" ON) - -if(WIN_UAC AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - set_property(TARGET xmr-stak PROPERTY LINK_FLAGS "/level='requireAdministrator' /uiAccess='false'") -endif() - ################################################################################ # Install ################################################################################ # do not install the binary if the project and install are equal -if( NOT "${CMAKE_INSTALL_PREFIX}" STREQUAL "${PROJECT_BINARY_DIR}" ) +if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) install(TARGETS xmr-stak RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin") if(CUDA_FOUND) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..3e996efca --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +# Latest version of ubuntu +FROM nvidia/cuda:9.0-base + +# Default git repository +ENV GIT_REPOSITORY https://github.com/fireice-uk/xmr-stak.git +ENV XMRSTAK_CMAKE_FLAGS -DXMR-STAK_COMPILE=generic -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF + +# Innstall packages +RUN apt-get update \ + && set -x \ + && apt-get install -qq --no-install-recommends -y ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \ + && git clone $GIT_REPOSITORY \ + && cd /xmr-stak \ + && cmake ${XMRSTAK_CMAKE_FLAGS} . \ + && make \ + && cd - \ + && mv /xmr-stak/bin/* /usr/local/bin/ \ + && rm -rf /xmr-stak \ + && apt-get purge -y -qq cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \ + && apt-get clean -qq + +VOLUME /mnt + +WORKDIR /mnt + +ENTRYPOINT ["/usr/local/bin/xmr-stak"] diff --git a/README.md b/README.md index 5609d9dd4..4bebbc8a9 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV * [HowTo Compile](doc/compile.md) * [FAQ](doc/FAQ.md) * [Developer Donation](#default-developer-donation) -* [Cheksums](#checksums) -* [PGP Key](#pgp-key) +* [Release Cheksums](#release-checksums) +* [Developer PGP Key's](doc/pgp_keys.md) ## Features @@ -32,7 +32,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV ## Download -You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/xmr-stak/xmr-stak/releases). +You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases). If you are running on Linux (especially Linux VMs), checkout [Linux Portable Binary](doc/Linux_deployment.md). ## Default Developer Donation @@ -51,7 +51,7 @@ psychocrypt: 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU ``` -## Checksums +## Release Checksums ``` -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA256 @@ -87,37 +87,3 @@ cvX6gTCsFYfLw/p+sz+DN7kh7zJlCvIFga3HaFByxCSuyMY08qerXS/0862ZMdo= -----END PGP SIGNATURE----- ``` - -## PGP Key -``` ------BEGIN PGP PUBLIC KEY BLOCK----- -Version: GnuPG v2 - -mQENBFhYUmUBCAC6493W5y1MMs38ApRbI11jWUqNdFm686XLkZWGDfYImzL6pEYk -RdWkyt9ziCyA6NUeWFQYniv/z10RxYKq8ulVVJaKb9qPGMU0ESfdxlFNJkU/pf28 -sEVBagGvGw8uFxjQONnBJ7y7iNRWMN7qSRS636wN5ryTHNsmqI4ClXPHkXkDCDUX -QvhXZpG9RRM6jsE3jBGz/LJi3FyZLo/vB60OZBODJ2IA0wSR41RRiOq01OqDueva -9jPoAokNglJfn/CniQ+lqUEXj1vjAZ1D5Mn9fISzA/UPen5Z7Sipaa9aAtsDBOfP -K9iPKOsWa2uTafoyXgiwEVXCCeMMUjCGaoFBABEBAAG0ImZpcmVpY2VfdWsgPGZp -cmVpY2UueG1yQGdtYWlsLmNvbT6JATcEEwEIACEFAlhYUmUCGwMFCwkIBwIGFQgJ -CgsCBBYCAwECHgECF4AACgkQ+yT3mn7UHDTEcQf8CMhqaZ0IOBxeBnsq5HZr2X6z -E5bODp5cPs6ha1tjH3CWpk1AFeykNtXH7kPW9hcDt/e4UQtcHs+lu6YU59X7xLJQ -udOkpWdmooJMXRWS/zeeon4ivT9d69jNnwubh8EJOyw8xm/se6n48BcewfHekW/6 -mVrbhLbF1dnuUGXzRN1WxsUZx3uJd2UvrkJhAtHtX92/qIVhT0+3PXV0bmpHURlK -YKhhm8dPLV9jPX8QVRHQXCOHSMqy/KoWEe6CnT0Isbkq3JtS3K4VBVeTX9gkySRc -IFxrNJdXsI9BxKv4O8yajP8DohpoGLMDKZKSO0yq0BRMgMh0cw6Lk22uyulGALkB -DQRYWFJlAQgAqikfViOmIccCZKVMZfNHjnigKtQqNrbJpYZCOImql4FqbZu9F7TD -9HIXA43SPcwziWlyazSy8Pa9nCpc6PuPPO1wxAaNIc5nt+w/x2EGGTIFGjRoubmP -3i5jZzOFYsvR2W3PgVa3/ujeYYJYo1oeVeuGmmJRejs0rp1mbvBSKw1Cq6C4cI0x -GTY1yXFGLIgdfYNMmiLsTy1Qwq8YStbFKeUYAMMG3128SAIaT3Eet911f5Jx4tC8 -6kWUr6PX1rQ0LQJqyIsLq9U53XybUksRfJC9IEfgvgBxRBHSD8WfqEhHjhW1VsZG -dcYgr7A1PIneWsCEY+5VUnqTlt2HPaKweQARAQABiQEfBBgBCAAJBQJYWFJlAhsM -AAoJEPsk95p+1Bw0Pr8H/0vZ6U2zaih03jOHOvsrYxRfDXSmgudOp1VS45aHIREd -2nrJ+drleeFVyb14UQqO/6iX9GuDX2yBEHdCg2aljeP98AaMU//RiEtebE6CUWsL -HPVXHIkxwBCBe0YkJINHUQqLz/5f6qLsNUp1uTH2++zhdBWvg+gErTYbx8aFMFYH -0GoOtqE5rtlAh5MTvDZm+UcDwKJCxhrLaN3R3dDoyrDNRTgHQQuX5/opJBiUnVNK -d+vugnxzpMIJQP11yCZkz/KxV8zQ2QPMuZdAoh3znd/vGCJcp0rWphn4pqxA4vDp -c4hC0Yg9Dha1OoE5CJCqVL+ic4vAyB1urAwBlsd/wH8= -=B5I+ ------END PGP PUBLIC KEY BLOCK----- -``` diff --git a/doc/FAQ.md b/doc/FAQ.md index 215048d3d..23507f264 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -29,7 +29,7 @@ Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid ## VirtualAlloc failed -If you set up the user rights properly (see above), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. +If you set up the user rights properly ([see above](https://github.com/fireice-uk/xmr-stak/blob/master/doc/FAQ.md#selockmemoryprivilege-failed)), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. If that happens, disable all auto-staring applications and run the miner after a reboot. @@ -56,10 +56,8 @@ This typically means you are trying to run it on a CPU that does not have [AES]( ## Virus Protection Alert -Some Virus protection software flag the miner binary as *Male Ware*. -In this case the binary is moved to the quarantine area of the protection software. -This is a wrong alert and not avoid by use. -Add the binary to to protection software white list to solve this issue.s +Some virus protection software flags the miner binary as *malware*. This is a false positive — the software does not contain any malware (and since it is open source, you can verify that yourself!) +If your antivirus software flags **xmr-stak**, it will likely move it to its quarantine area. You may have to whitelist **xmr-stak** in your antivirus. ## Change Currency to Mine diff --git a/doc/Linux_deployment.md b/doc/Linux_deployment.md index caf762c65..323a97f5b 100644 --- a/doc/Linux_deployment.md +++ b/doc/Linux_deployment.md @@ -1,5 +1,7 @@ # Deploying portable **XMR-Stak** on Linux systems +**This is an experimental feature** we reserve the right to remove the binary if we get too many issues. + XMR-Stak releases include a pre-built portable version. If you are simply using it to avoid having to compile the application, you can simply download **xmr-stak-portbin-linux.tar.gz** from our [latest releases](https://github.com/fireice-uk/xmr-stak/releases/latest). Open up command line, and use the following commands: ``` diff --git a/doc/compile.md b/doc/compile.md index cf2513537..e97affa7d 100644 --- a/doc/compile.md +++ b/doc/compile.md @@ -9,6 +9,7 @@ * [Compile on Windows](compile_Windows.md) * [Compile on Linux](compile_Linux.md) * [Compile on FreeBSD](compile_FreeBSD.md) +* [Compile on MacOS](compile_MacOS.md) ## Build System @@ -30,6 +31,7 @@ After the configuration you need to compile the miner, follow the guide for your * [Compile in Windows](compile_Windows.md) * [Compile in Linux](compile_Linux.md) * [Compile in FreeBSD](compile_FreeBSD.md) +* [Compile in MacOS](compile_MacOS.md) ## Generic Build Options - `CMAKE_INSTALL_PREFIX` install miner to the home folder @@ -50,11 +52,6 @@ After the configuration you need to compile the miner, follow the guide for your - native means the miner binary can be used only on the system where it is compiled but will archive the highest hash rate - use `cmake .. -DXMR-STAK_COMPILE=generic` to run the miner on all CPU's with sse2 -### only available for Windows -- `WIN_UAC` will enable or disable the "Run As Administrator" prompt on Windows. - - UAC confirmation is needed to use large pages on Windows 7. - - On Windows 10 it is only needed once to set up the account to use them. - ## CPU Build Options - `CPU_ENABLE` allow to disable/enable the CPU backend of the miner diff --git a/doc/compile_MacOS.md b/doc/compile_MacOS.md new file mode 100644 index 000000000..1b0af91dc --- /dev/null +++ b/doc/compile_MacOS.md @@ -0,0 +1,31 @@ +# Compile **xmr-stak** for MacOS + +## Dependencies + +Assuming you already have [Homebrew](https://brew.sh) installed, the installation of dependencies is pretty straightforward and will generate the `xmr-stak` binary in the `bin/` directory. + +### For NVIDIA GPUs + +```shell +brew tap caskroom/drivers +brew cask install nvidia-cuda +brew install hwloc libmicrohttpd gcc openssl cmake +cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DOpenCL_ENABLE=OFF +make install +``` + +[All available CMake options](compile.md#nvidia-build-options) + +### For AMD GPUs + +> 🖐 We need help with AMD GPU compilation instructions. Please submit a PR if you managed to install [AMD APP SDK](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) and to compile `xmr-stak` on MacOS. + +### For CPU-only mining + +```shell +brew install hwloc libmicrohttpd gcc openssl cmake +cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF +make install +``` + +[All available CMake options](compile.md#cpu-build-options) diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index dd7af1e98..c9a8ff78e 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -12,7 +12,7 @@ - download VS2017 Community and install from [https://www.visualstudio.com/downloads/](https://www.visualstudio.com/downloads/) - during the install chose the components - `Desktop development with C++` (left side) - - `Toolset for Visual Studio C++ 2015.3 v140...` (right side) + - `VC++ 2015.3 v140 toolset for desktop` (right side) ### CMake for Win64 @@ -34,8 +34,12 @@ - download and install the latest version from [http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) ### Dependencies OpenSSL/Hwloc and Microhttpd -- with CUDA 8: download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip) -- with CUDA 9: download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip) +- for CUDA 8*: + - download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip) + - version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141 +- for CUDA 9 **and/or** AMD GPUs, CPU: + - download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip) + - version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140 - unzip all to `C:\xmr-stak-dep` ### Validate the Dependency Folder @@ -85,7 +89,7 @@ ``` cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 .. ``` - - for CUDA 9 and/or AMD GPUs, CPU + - for CUDA 9 **and/or** AMD GPUs, CPU ``` cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 .. ``` diff --git a/doc/pgp_keys.md b/doc/pgp_keys.md new file mode 100644 index 000000000..69ab41aed --- /dev/null +++ b/doc/pgp_keys.md @@ -0,0 +1,69 @@ +# Developer PGP Key's + +## Key @fireice-uk +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2 + +mQENBFhYUmUBCAC6493W5y1MMs38ApRbI11jWUqNdFm686XLkZWGDfYImzL6pEYk +RdWkyt9ziCyA6NUeWFQYniv/z10RxYKq8ulVVJaKb9qPGMU0ESfdxlFNJkU/pf28 +sEVBagGvGw8uFxjQONnBJ7y7iNRWMN7qSRS636wN5ryTHNsmqI4ClXPHkXkDCDUX +QvhXZpG9RRM6jsE3jBGz/LJi3FyZLo/vB60OZBODJ2IA0wSR41RRiOq01OqDueva +9jPoAokNglJfn/CniQ+lqUEXj1vjAZ1D5Mn9fISzA/UPen5Z7Sipaa9aAtsDBOfP +K9iPKOsWa2uTafoyXgiwEVXCCeMMUjCGaoFBABEBAAG0ImZpcmVpY2VfdWsgPGZp +cmVpY2UueG1yQGdtYWlsLmNvbT6JATcEEwEIACEFAlhYUmUCGwMFCwkIBwIGFQgJ +CgsCBBYCAwECHgECF4AACgkQ+yT3mn7UHDTEcQf8CMhqaZ0IOBxeBnsq5HZr2X6z +E5bODp5cPs6ha1tjH3CWpk1AFeykNtXH7kPW9hcDt/e4UQtcHs+lu6YU59X7xLJQ +udOkpWdmooJMXRWS/zeeon4ivT9d69jNnwubh8EJOyw8xm/se6n48BcewfHekW/6 +mVrbhLbF1dnuUGXzRN1WxsUZx3uJd2UvrkJhAtHtX92/qIVhT0+3PXV0bmpHURlK +YKhhm8dPLV9jPX8QVRHQXCOHSMqy/KoWEe6CnT0Isbkq3JtS3K4VBVeTX9gkySRc +IFxrNJdXsI9BxKv4O8yajP8DohpoGLMDKZKSO0yq0BRMgMh0cw6Lk22uyulGALkB +DQRYWFJlAQgAqikfViOmIccCZKVMZfNHjnigKtQqNrbJpYZCOImql4FqbZu9F7TD +9HIXA43SPcwziWlyazSy8Pa9nCpc6PuPPO1wxAaNIc5nt+w/x2EGGTIFGjRoubmP +3i5jZzOFYsvR2W3PgVa3/ujeYYJYo1oeVeuGmmJRejs0rp1mbvBSKw1Cq6C4cI0x +GTY1yXFGLIgdfYNMmiLsTy1Qwq8YStbFKeUYAMMG3128SAIaT3Eet911f5Jx4tC8 +6kWUr6PX1rQ0LQJqyIsLq9U53XybUksRfJC9IEfgvgBxRBHSD8WfqEhHjhW1VsZG +dcYgr7A1PIneWsCEY+5VUnqTlt2HPaKweQARAQABiQEfBBgBCAAJBQJYWFJlAhsM +AAoJEPsk95p+1Bw0Pr8H/0vZ6U2zaih03jOHOvsrYxRfDXSmgudOp1VS45aHIREd +2nrJ+drleeFVyb14UQqO/6iX9GuDX2yBEHdCg2aljeP98AaMU//RiEtebE6CUWsL +HPVXHIkxwBCBe0YkJINHUQqLz/5f6qLsNUp1uTH2++zhdBWvg+gErTYbx8aFMFYH +0GoOtqE5rtlAh5MTvDZm+UcDwKJCxhrLaN3R3dDoyrDNRTgHQQuX5/opJBiUnVNK +d+vugnxzpMIJQP11yCZkz/KxV8zQ2QPMuZdAoh3znd/vGCJcp0rWphn4pqxA4vDp +c4hC0Yg9Dha1OoE5CJCqVL+ic4vAyB1urAwBlsd/wH8= +=B5I+ +-----END PGP PUBLIC KEY BLOCK----- +``` + +## Key @psychocrypt +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2 + +mQENBFoq84cBCACm4moXhW5kLRByCh7q1lZz/RP88fINfX2jsxS3QyagjsAD7yxy +CZPlwF+NsZTX/jVbMTUTHDO1KEejWO4peYYeJT50BcTS7MOF+O3EVI3j/G45v5L5 +yI0MOgsQFVM4k5A9n0W+oULZK7ejiBSolzSG7PbwMpUjUSMEl4boUd5zFPNq/kpo +OMmJR/Q17LOqvnQsbkQDUprl+qvbD5q2xEFPNKt3KmUUEOF8a1dIDkUZmXEklLFp +cUjLTviscgP1+Mfyasz6cAjfaN+7IwYS+vGnFcwXx93sIq4J3wkpgpyMNdtoK1hY +ALJxzk8TF7NRFU68uIqDGrROEDa5asW9L75DABEBAAG0JnBzeWNob2NyeXB0IDxw +c3ljaG9jcnlwdGhwY0BnbWFpbC5jb20+iQE3BBMBCAAhBQJaKvOHAhsDBQsJCAcC +BhUICQoLAgQWAgMBAh4BAheAAAoJEAUWOMCIZelDeLoH/j+nZE3E636tKvHoP2Uv +7PG1dP9F+fTHhru53iVIxR+UXubobgAYH8lOo7yBuO+JGWDf4KjyNRrRf/To1xD5 +udpU6BrJ8iS3MTPk0jQ1delk+7jaFLXaJbQPdOVRR7dddi32j3Vw6wPaCWhK8xt/ +xDIEJJ6TlSwNBiYIriLa2uB5q0DhwdWBsZqna5xhp2jihxhtEUXs4IkkIETIVs8e +FIzqxNALUNWRit2Bm1Etm4KId9GV5N6eyjekYUk3zGLivsyTHbl6XhNFmQk8UzIP +N5OjcJb1UFr7Q43kRvUGDZEh08l+k5P2qQ1y3g1WypcPsfSh/+XMeCe27DaLeOwZ +SD25AQ0EWirzhwEIANUkGzShhAscwJt5L/huftn/TQYaBIcYtKLYyIyQsG7c3/bO +aNB4t5ZkYBPrVRyqRcnaJffIvi4oq+wSnCUn++jXZbH1OrSCZhcPBsdvgHz0KV9D +71KpJ2p9cdjqO6MWM7DrKy30QNSf5eiDzhqTB4NMKLidgGIDCz7ahFZDH2vONaOn +1A8WFXMy06lFWsYTe4TjpPOG5ZFHhSNsTthYp7sUgLwvThKhXRl0nM5C9mwShw8I +9r5/gbzEvyJJx0anJLHyka2AYFtrCcK9WGx4I6PsTfj318T1bVzWfjnq9FXZ1p3c +UeTYX8f92EqAs0dcvC0hrNWyhNr2D4G+YF6KAysAEQEAAYkBHwQYAQgACQUCWirz +hwIbDAAKCRAFFjjAiGXpQ5CPCACCuhM8SbtnG83oQJbxWA63DdXZPTyuFGbiMRS4 +9EJkx7hVu94NJEaJMFGETn+qKLjaV+QtsBK/ZtZBRbKoqBhFzbRt8NOfC26JHEx0 +tdrBb4Ct8SAPhEhZDZFJt3kac038E3mBeXDoDAqdoltqG8C24uk99QHJwAhjWNb9 +uOMTGcm/j7ieyGF87bMKCdnTDXWABTAUbeBTD+MCfyEJgeMa6G3LWSsoj2cOwj0K +Nla4ixBctXWPewAyobNaN+EGJj99TMuz/3EMtxSzh//u2czenic3IUzoG1jSWwi1 ++5AETDxKdVzpZYolUBYZGmnsStLvyh/+n6Xt19LM1+NBos1y +=JTYP +-----END PGP PUBLIC KEY BLOCK----- +``` diff --git a/doc/tuning.md b/doc/tuning.md index 474553b4c..53e682bb4 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -1,6 +1,7 @@ # Tuning Guide ## Content Overview +* [Windows](windows) * [NVIDIA Backend](#nvidia-backend) * [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks) * [Add more GPUs](#add-more-gpus) @@ -8,6 +9,14 @@ * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) * [Add more GPUs](#add-more-gpus) * [Increase Memory Pool](#increase-memory-pool) + * [Scratchpad Indexing](#scratchpad-indexing) +* [CPU Backend](#cpu-backend) + * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode) + +## Windows +"Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7. +On Windows 10 it is only needed once to set up the account to use them. +Disable the dialog with the command line option `--noUAC` ## NVIDIA Backend @@ -80,4 +89,21 @@ export GPU_MAX_ALLOC_PERCENT=99 export GPU_SINGLE_ALLOC_PERCENT=99 ``` -*Note:* Windows user must use `set` instead of `export` to define an environment variable. \ No newline at end of file +*Note:* Windows user must use `set` instead of `export` to define an environment variable. + +### Scratchpad Indexing + +The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`. +Try to change the value from the default `true` to `false`. + +## CPU Backend + +By default the CPU backend can be tuned in the config file `cpu.txt` + +### Choose Value for `low_power_mode` + +The optimal value for `low_power_mode` depends on the cache size of your CPU, and the number of threads. + +The `low_power_mode` can be set to a number between `1` to `5`. When set to a value `N` greater than `1`, this mode increases the single thread performance by `N` times, but also requires at least `2*N` MB of cache per thread. It can also be set to `false` or `true`. The value `false` is equivalent to `1`, and `true` is equivalent to `2`. + +This setting is particularly useful for CPUs with very large cache. For example the Intel Crystal Well Processors are equipped with 128MB L4 cache, enough to run 8 threads at an optimal `low_power_mode` value of `5`. diff --git a/doc/usage.md b/doc/usage.md index bb35769ee..60cf69b40 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -20,35 +20,34 @@ The number of files depends on the available backends. 1) Double click the `xmr-stak.exe` file 2) Fill in the pool url, username and password -## Usage on Linux +`set XMRSTAK_NOWAIT=1` disable the dialog `Press any key to exit.` for non UAC execution. + + +## Usage on Linux & MacOS 1) Open a terminal within the folder with the binary 2) Start the miner with `./xmr-stak` ## Command Line Options The miner allow to overwrite some of the settings via command line options. +Run `xmr-stak --help` to show all available command line options. + +## Docker image usage + +You can run the Docker image the following way: + +``` +docker run --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak +docker stop xmr-stak +docker run --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak --config config.txt +``` + +Debug the docker image by getting inside: ``` -Usage: xmr-stak [OPTION]... - - -h, --help show this help - -v, --version show version number - -V, --version-long show long version number - -c, --config FILE common miner configuration file - --currency NAME currency to mine: monero or aeon - --noCPU disable the CPU miner backend - --cpu FILE CPU backend miner config file - --noAMD disable the AMD miner backend - --amd FILE AMD backend miner config file - --noNVIDIA disable the NVIDIA miner backend - --nvidia FILE NVIDIA backend miner config file - -The Following options temporary overwrites the config file settings: - -o, --url URL pool url and port, e.g. pool.usxmrpool.com:3333 - -u, --user USERNAME pool user name or wallet address - -p, --pass PASSWD pool password, in the most cases x or empty "" +docker run --entrypoint=/bin/bash --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak ``` ## HTML and JSON API report configuraton -To configure the reports shown on the [README](README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json +To configure the reports shown on the [README](../README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json diff --git a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh index e046cb02e..bfee1b8d0 100755 --- a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh +++ b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh @@ -5,110 +5,131 @@ if [[ $EUID -ne 0 ]]; then exit 1 fi -if [ -d xmr-stak-cpu ]; then - git -C xmr-stak-cpu clean -fd +if [ -d xmr-stak ]; then + git -C xmr-stak clean -fd else - git clone https://github.com/fireice-uk/xmr-stak-cpu.git + git clone https://github.com/fireice-uk/xmr-stak.git fi +wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run +chmod a+x cuda_*_linux-run + ######################## -# Fedora 26 +# Fedora 27 ######################## -docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu fedora:26 /bin/bash -c " -set -ex ; -dnf install -y -q gcc gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel cmake ; -cd /xmr-stak-cpu ; -cmake -DCMAKE_LINK_STATIC=ON . ; -make install ; +# CUDA is not going to work on Fedora 27 beacuse it's only support these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html +docker run --rm -it -v $PWD:/mnt fedora:27 /bin/bash -c " +set -x ; +dnf install -y -q cmake gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel; +cd /mnt/xmr-stak ; +cmake -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ; +make ; " -mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_fedora_26 -git -C xmr-stak-cpu clean -fd + +test -d fedora_27 || mkdir fedora_27 +mv xmr-stak/bin/* fedora_27 +git -C xmr-stak clean -fd ######################## # Ubuntu (17.04) ######################## -docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:17.04 /bin/bash -c " -set -ex ; +docker run --rm -it -v $PWD:/mnt ubuntu:17.04 /bin/bash -c " +set -x ; apt update -qq ; apt install -y -qq libmicrohttpd-dev libssl-dev cmake build-essential libhwloc-dev ; -cd /xmr-stak-cpu ; -cmake -DCMAKE_LINK_STATIC=ON . ; -make install ; +cd /mnt/xmr-stak ; +/mnt/cuda_*_linux-run --silent --toolkit ; +cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ; +make ; " -mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_17.04 -git -C xmr-stak-cpu clean -fd + +test -d ubuntu_17.10 || mkdir ubuntu_17.10 +mv xmr-stak/bin/* ubuntu_17.10 +git -C xmr-stak clean -fd ######################## # Ubuntu 16.04 ######################## -docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:16.04 /bin/bash -c " -set -ex ; +docker run --rm -it -v $PWD:/mnt ubuntu:16.04 /bin/bash -c " +set -x ; apt update -qq ; -apt install -y -qq libmicrohttpd-dev libssl-dev cmake build-essential libhwloc-dev ; -cd /xmr-stak-cpu ; -cmake -DCMAKE_LINK_STATIC=ON . ; -make install ; +apt install -y -qq cmake g++ libmicrohttpd-dev libssl-dev libhwloc-dev ; +cd /mnt/xmr-stak ; +/mnt/cuda_*_linux-run --silent --toolkit ; +cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ; +make ; " -mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_16.04 -git -C xmr-stak-cpu clean -fd + +test -d ubuntu_16.04 || mkdir ubuntu_16.04 +mv xmr-stak/bin/* ubuntu_16.04 +git -C xmr-stak clean -fd ######################## # Ubuntu 14.04 ######################## -docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:14.04 /bin/bash -c " -set -ex ; +docker run --rm -it -v $PWD:/mnt ubuntu:14.04 /bin/bash -c " +set -x ; apt update -qq ; apt install -y -qq curl libmicrohttpd-dev libssl-dev libhwloc-dev software-properties-common ; add-apt-repository -y ppa:ubuntu-toolchain-r/test ; apt update -qq ; -apt install -y -qq gcc-7 g++-7 make ; -update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 1 --slave /usr/bin/g++ g++ /usr/bin/g++-7 ; -curl -L https://cmake.org/files/v3.9/cmake-3.9.0.tar.gz | tar -xzf - -C /tmp/ ; -( cd /tmp/cmake-3.9.0/ && ./configure && make && sudo make install && cd - ) > /dev/null +apt install -y -qq gcc-6 g++-6 make ; +update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 1 --slave /usr/bin/g++ g++ /usr/bin/g++-6 ; +curl -L https://cmake.org/files/LatestRelease/cmake-3.10.0.tar.gz | tar -xzf - -C /tmp/ ; +( cd /tmp/cmake-*/ && ./configure && make && sudo make install && cd - ) > /dev/null update-alternatives --install /usr/bin/cmake cmake /usr/local/bin/cmake 1 --force ; -cd /xmr-stak-cpu ; -cmake -DCMAKE_LINK_STATIC=ON . ; -make install ; +cd /mnt/xmr-stak ; +/mnt/cuda_*_linux-run --silent --toolkit ; +cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ; +make ; " -mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_14.04 -git -C xmr-stak-cpu clean -fd + +test -d ubuntu_14.04 || mkdir ubuntu_14.04 +mv xmr-stak/bin/* ubuntu_14.04 +git -C xmr-stak clean -fd ######################## # CentOS 7 ######################## -docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu centos:7 /bin/bash -c " -set -ex ; +# CUDA is not going to work on CentOS/RHEL beacuse it's only support gcc-4 in these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html +docker run --rm -it -v $PWD:/mnt centos:7 /bin/bash -c " +set -x ; yum install -y -q centos-release-scl epel-release ; -yum install -y -q cmake3 devtoolset-4-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ; -scl enable devtoolset-4 - << EOF -cd /xmr-stak-cpu ; -cmake3 -DCMAKE_LINK_STATIC=ON . ; -make install ; +yum install -y -q cmake3 devtoolset-7-gcc* hwloc-devel libmicrohttpd-devel make openssl-devel perl ; +scl enable devtoolset-7 - << EOF +cd /mnt/xmr-stak ; +cmake3 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ; +make ; EOF " -mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_centos_7 -git -C xmr-stak-cpu clean -fd + +test -d centos_7 || mkdir centos_7 +mv xmr-stak/bin/* centos_7 +git -C xmr-stak clean -fd ######################## # CentOS 6.x ######################## -docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu centos:6 /bin/bash -c " -set -ex ; +# CUDA is not going to work on CentOS/RHEL beacuse it's only support gcc-4 in these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html +docker run --rm -it -v $PWD:/mnt centos:6 /bin/bash -c " +set -x ; yum install -y -q centos-release-scl epel-release ; -yum install -y -q cmake3 devtoolset-4-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ; -scl enable devtoolset-4 - << EOF -cd /xmr-stak-cpu ; -cmake3 -DCMAKE_LINK_STATIC=ON . ; -make install ; +yum install -y -q cmake3 devtoolset-7-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ; +scl enable devtoolset-7 - << EOF +cd /mnt/xmr-stak ; +cmake3 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ; +make ; EOF " -mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_centos_6 -git -C xmr-stak-cpu clean -fd -rm -rf xmr-stak-cpu +test -d centos_6 || mkdir centos_6 +mv xmr-stak/bin/* centos_6 +git -C xmr-stak clean -fd + +rm -rf xmr-stak diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 15b845715..d9bc96235 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[256]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize)); + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", + hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0); ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); if(ret != CL_SUCCESS) { @@ -448,68 +449,85 @@ uint32_t getNumPlatforms() std::vector getAMDDevices(int index) { std::vector ctxVec; - cl_platform_id * platforms = NULL; + std::vector platforms; + std::vector device_list; + cl_int clStatus; cl_uint num_devices; - cl_device_id *device_list = NULL; - uint32_t numPlatforms = getNumPlatforms(); - if(numPlatforms) + if(numPlatforms == 0) + return ctxVec; + + platforms.resize(numPlatforms); + if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); + return ctxVec; + } + + if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); + return ctxVec; + } + + device_list.resize(num_devices); + if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); + return ctxVec; + } + + for (size_t k = 0; k < num_devices; k++) { - platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); - clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); - if(clStatus == CL_SUCCESS) + std::vector devVendorVec(1024); + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS) { - clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); - if(clStatus == CL_SUCCESS) + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); + continue; + } + + std::string devVendor(devVendorVec.data()); + if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos) + { + GpuContext ctx; + std::vector devNameVec(1024); + size_t maxMem; + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k); + continue; + } + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS) { - device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices); - clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL); - if(clStatus == CL_SUCCESS) - { - for (int k = 0; k < num_devices; k++) - { - cl_int clError; - std::vector devVendorVec(1024); - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL); - if(clStatus == CL_SUCCESS) - { - std::string devVendor(devVendorVec.data()); - if( devVendor.find("Advanced Micro Devices") != std::string::npos) - { - GpuContext ctx; - ctx.deviceIdx = k; - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL); - size_t maxMem; - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL); - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL); - // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory - ctx.freeMem = std::min(ctx.freeMem, maxMem); - std::vector devNameVec(1024); - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL); - ctx.name = std::string(devNameVec.data()); - printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); - ctx.DeviceID = device_list[k]; - ctxVec.push_back(ctx); - } - } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name.", err_to_str(clStatus)); - } - } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); - free(device_list); + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); + continue; } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k); + continue; + } + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); + continue; + } + printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); + + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory + ctx.deviceIdx = k; + ctx.freeMem = std::min(ctx.freeMem, maxMem); + ctx.name = std::string(devNameVec.data()); + ctx.DeviceID = device_list[k]; + ctxVec.push_back(ctx); } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); } - - free(platforms); return ctxVec; } @@ -541,7 +559,7 @@ int getAMDPlatformIdx() clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); std::string platformName(platformNameVec.data()); - if( platformName.find("Advanced Micro Devices") != std::string::npos) + if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos) { platformIndex = i; printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str()); diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index abbd08d2f..c17bac11b 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -24,6 +24,7 @@ struct GpuContext size_t deviceIdx; size_t rawIntensity; size_t workSize; + int stridedIndex; /*Output vars*/ cl_device_id DeviceID; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 966199bc4..255fcbbff 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf) } } -#define IDX(x) (x) +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#else +# define IDX(x) ((x) * (Threads)) +#endif __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) @@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul if(gIdx < Threads) { states += 25 * gIdx; + +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif ((ulong8 *)State)[0] = vload8(0, input); State[8] = input[8]; @@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre if(gIdx < Threads) { states += 25 * gIdx; +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif a[0] = states[0] ^ states[4]; b[0] = states[2] ^ states[6]; @@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u if(gIdx < Threads) { states += 25 * gIdx; +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif #if defined(__Tahiti__) || defined(__Pitcairn__) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl index 996944bc5..81e1644f1 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -74,15 +74,29 @@ static const __constant uint AES0_C[256] = #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) -uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key) +uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key) { - uint4 Y; - Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)]; - Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)]; - Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)]; - Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)]; - Y ^= key; - return(Y); + key.s0 ^= AES0[BYTE(X.s0, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; + + key.s0 ^= AES2[BYTE(X.s2, 2)]; + key.s1 ^= AES2[BYTE(X.s3, 2)]; + key.s2 ^= AES2[BYTE(X.s0, 2)]; + key.s3 ^= AES2[BYTE(X.s1, 2)]; + + key.s0 ^= AES1[BYTE(X.s1, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; + + key.s0 ^= AES3[BYTE(X.s3, 3)]; + key.s1 ^= AES3[BYTE(X.s0, 3)]; + key.s2 ^= AES3[BYTE(X.s1, 3)]; + key.s3 ^= AES3[BYTE(X.s2, 3)]; + + return key; } #endif diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 0b91212cb..0bc52395a 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -123,7 +123,7 @@ class autoAdjust // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \n" + " \"affine_to_cpu\" : false, \"strided_index\" : true\n" " },\n"; ++i; } diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index a93859cdf..af662f852 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -5,9 +5,12 @@ R"===( * intensity - Number of parallel GPU threads (nothing to do with CPU threads) * worksize - Number of local GPU threads (nothing to do with CPU threads) * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * strided_index - switch memory pattern used for the scratch pad memory + * true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks + * false = use a contiguous block of memory per thread * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true }, * ], */ diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 0617aeb2f..07afb1964 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + stridedIndex = GetObjectMember(oThdConf, "strided_index"); - if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr) + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!aff->IsUint64() && !aff->IsBool()) return false; + if(!stridedIndex->IsBool()) + return false; + cfg.index = idx->GetUint64(); cfg.intensity = intensity->GetUint64(); cfg.w_size = w_size->GetUint64(); + cfg.stridedIndex = stridedIndex->GetBool(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt64(); diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index da024a412..ee1882aad 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -26,6 +26,7 @@ class jconf size_t intensity; size_t w_size; long long cpu_aff; + bool stridedIndex; }; size_t GetThreadCount(); diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index c1399e0b8..103688f8a 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -96,6 +96,7 @@ bool minethd::init_gpus() vGpuData[i].deviceIdx = cfg.index; vGpuData[i].rawIntensity = cfg.intensity; vGpuData[i].workSize = cfg.w_size; + vGpuData[i].stridedIndex = cfg.stridedIndex; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 990a31d3f..b21a22d24 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -1,9 +1,11 @@ R"===( /* * Thread configuration for each thread. Make sure it matches the number above. - * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will + * low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true, + this mode will double the cache usage, and double the single thread performance. It will * consume much less power (as less cores are working), but will max out at around 80-85% of - * the maximum performance. + * the maximum performance. When set to a number N greater than 1, this mode will increase the + * cache usage and single thread performance by N times. * * no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 2a6a7695d..9b6e1dc28 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -353,19 +353,19 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) template -void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) +void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - keccak((const uint8_t *)input, len, ctx0->hash_state, 200); - keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200); + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - cn_explode_scratchpad((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state); - uint8_t* l0 = ctx0->long_state; - uint64_t* h0 = (uint64_t*)ctx0->hash_state; - uint8_t* l1 = ctx1->long_state; - uint64_t* h1 = (uint64_t*)ctx1->hash_state; + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; uint64_t axl0 = h0[0] ^ h0[4]; uint64_t axh0 = h0[1] ^ h0[5]; @@ -444,13 +444,305 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto } // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - cn_implode_scratchpad((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state); // Optim - 99% time boundary - keccakf((uint64_t*)ctx0->hash_state, 24); - extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); - keccakf((uint64_t*)ctx1->hash_state, 24); - extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); + keccakf((uint64_t*)ctx[1]->hash_state, 24); + extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); +} + +#define CN_STEP1(a, b, c, l, ptr, idx) \ + a = _mm_xor_si128(a, c); \ + idx = _mm_cvtsi128_si64(a); \ + ptr = (__m128i *)&l[idx & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ + c = _mm_load_si128(ptr) + +#define CN_STEP2(a, b, c, l, ptr, idx) \ + if(SOFT_AES) \ + c = soft_aesenc(c, a); \ + else \ + c = _mm_aesenc_si128(c, a); \ + b = _mm_xor_si128(b, c); \ + _mm_store_si128(ptr, b) + +#define CN_STEP3(a, b, c, l, ptr, idx) \ + idx = _mm_cvtsi128_si64(c); \ + ptr = (__m128i *)&l[idx & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ + b = _mm_load_si128(ptr) + +#define CN_STEP4(a, b, c, l, ptr, idx) \ + lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \ + a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \ + _mm_store_si128(ptr, a) + +// This lovelier creation will do 3 cn hashes at a time. +template +void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 3; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, hi, lo; + __m128i *ptr0, *ptr1, *ptr2; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + } + + for (size_t i = 0; i < 3; i++) + { + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This even lovelier creation will do 4 cn hashes at a time. +template +void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 4; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + __m128i cx3 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, idx3, hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3); + } + + for (size_t i = 0; i < 4; i++) + { + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This most lovely creation will do 5 cn hashes at a time. +template +void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 5; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + uint8_t* l4 = ctx[4]->long_state; + uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]); + __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + __m128i cx3 = _mm_set_epi64x(0, 0); + __m128i cx4 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4); + } + + for (size_t i = 0; i < 5; i++) + { + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } } diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 2ded8c05c..6e709bd35 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -116,7 +116,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) return false; - if(!mode->IsBool() || !no_prefetch->IsBool()) + if(!mode->IsBool() && !mode->IsNumber()) + return false; + + if(!no_prefetch->IsBool()) return false; if(!aff->IsNumber() && !aff->IsBool()) @@ -125,7 +128,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(aff->IsNumber() && aff->GetInt64() < 0) return false; - cfg.bDoubleMode = mode->GetBool(); + if(mode->IsNumber()) + cfg.iMultiway = (int)mode->GetInt64(); + else + cfg.iMultiway = mode->GetBool() ? 2 : 1; + cfg.bNoPrefetch = no_prefetch->GetBool(); if(aff->IsNumber()) diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index f843ed4d1..e98ed160a 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -22,7 +22,7 @@ class jconf bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str()); struct thd_cfg { - bool bDoubleMode; + int iMultiway; bool bNoPrefetch; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index cbb01f955..1c0e49176 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -92,7 +92,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) { this->backendType = iBackend::CPU; oWork = pWork; @@ -105,10 +105,25 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefet std::unique_lock lck(thd_aff_set); std::future order_guard = order_fix.get_future(); - if(double_work) + switch (iMultiway) + { + case 5: + oWorkThd = std::thread(&minethd::penta_work_main, this); + break; + case 4: + oWorkThd = std::thread(&minethd::quad_work_main, this); + break; + case 3: + oWorkThd = std::thread(&minethd::triple_work_main, this); + break; + case 2: oWorkThd = std::thread(&minethd::double_work_main, this); - else + break; + case 1: + default: oWorkThd = std::thread(&minethd::work_main, this); + break; + } order_guard.wait(); @@ -154,6 +169,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() return nullptr; //Should never happen } +static constexpr size_t MAX_N = 5; bool minethd::self_test() { alloc_msg msg = { 0 }; @@ -191,14 +207,15 @@ bool minethd::self_test() if(res == 0 && fatal) return false; - cryptonight_ctx *ctx0, *ctx1; - if((ctx0 = minethd_alloc_ctx()) == nullptr) - return false; - - if((ctx1 = minethd_alloc_ctx()) == nullptr) + cryptonight_ctx *ctx[MAX_N] = {0}; + for (int i = 0; i < MAX_N; i++) { - cryptonight_free_ctx(ctx0); - return false; + if ((ctx[i] = minethd_alloc_ctx()) == nullptr) + { + for (int j = 0; j < i; j++) + cryptonight_free_ctx(ctx[j]); + return false; + } } bool bResult = true; @@ -206,31 +223,52 @@ bool minethd::self_test() bool mineMonero = ::jconf::inst()->IsCurrencyMonero(); if(mineMonero) { - unsigned char out[64]; + unsigned char out[32 * MAX_N]; cn_hash_fun hashf; - cn_hash_fun_dbl hashdf; - + cn_hash_fun_multi hashf_multi; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero); - hashf("This is a test", 14, out, ctx0); + hashf("This is a test", 14, out, ctx[0]); bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero); - hashf("This is a test", 14, out, ctx0); + hashf("This is a test", 14, out, ctx[0]); bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero); - hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero); - hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, mineMonero); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; + + hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; + + hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; } - cryptonight_free_ctx(ctx0); - cryptonight_free_ctx(ctx1); + + for (int i = 0; i < MAX_N; i++) + cryptonight_free_ctx(ctx[i]); if(!bResult) printer::inst()->print_msg(L0, @@ -272,12 +310,12 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); #endif - printer::inst()->print_msg(L1, "Starting %s thread, affinity: %d.", cfg.bDoubleMode ? "double" : "single", (int)cfg.iCpuAff); + printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff); } else - printer::inst()->print_msg(L1, "Starting %s thread, no affinity.", cfg.bDoubleMode ? "double" : "single"); + printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); pvThreads.push_back(thd); } @@ -326,7 +364,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, boo // define aeon settings #if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO) - // ignore 3rd bit if only on currency is active + // ignore 3rd bit if only one currency is active digit.set(2, 0); #else digit.set(2, !mineMonero); @@ -416,22 +454,34 @@ void minethd::work_main() cryptonight_free_ctx(ctx); } -minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero) +minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero) { // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary - // Digit order SOFT_AES, NO_PREFETCH, MINER_ALGO + // Digit order SOFT_AES, NO_PREFETCH - static const cn_hash_fun_dbl func_table[] = { - /* there will be 8 function entries if `CONF_NO_MONERO` and `CONF_NO_AEON` - * is not defined. If one is defined there will be 4 entries. + static const cn_hash_fun_multi func_table[] = { + /* there will be 8*(MAX_N-1) function entries if `CONF_NO_MONERO` and `CONF_NO_AEON` + * is not defined. If one is defined there will be 4*(MAX_N-1) entries. */ #ifndef CONF_NO_MONERO cryptonight_double_hash, cryptonight_double_hash, cryptonight_double_hash, - cryptonight_double_hash + cryptonight_double_hash, + cryptonight_triple_hash, + cryptonight_triple_hash, + cryptonight_triple_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_quad_hash, + cryptonight_quad_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + cryptonight_penta_hash, + cryptonight_penta_hash, + cryptonight_penta_hash #endif #if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO)) // comma will be added only if Monero and Aeon is build @@ -441,33 +491,71 @@ minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefe cryptonight_double_hash, cryptonight_double_hash, cryptonight_double_hash, - cryptonight_double_hash + cryptonight_double_hash, + cryptonight_triple_hash, + cryptonight_triple_hash, + cryptonight_triple_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_quad_hash, + cryptonight_quad_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + cryptonight_penta_hash, + cryptonight_penta_hash, + cryptonight_penta_hash #endif }; - std::bitset<3> digit; + std::bitset<2> digit; digit.set(0, !bNoPrefetch); digit.set(1, !bHaveAes); // define aeon settings #if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO) - // ignore 3rd bit if only on currency is active - digit.set(2, 0); + // ignore miner algo if only one currency is active + size_t miner_algo_base = 0; #else - digit.set(2, !mineMonero); + size_t miner_algo_base = mineMonero ? 0 : 4*(MAX_N-1); #endif - return func_table[digit.to_ulong()]; + N = (N<2) ? 2 : (N>MAX_N) ? MAX_N : N; + return func_table[miner_algo_base + 4*(N-2) + digit.to_ulong()]; } -uint32_t* minethd::prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]) +void minethd::double_work_main() { - memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); - memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize); - return (uint32_t*)(bDoubleWorkBlob + oWork.iWorkSize + 39); + multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); } -void minethd::double_work_main() +void minethd::triple_work_main() +{ + multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); +} + +void minethd::quad_work_main() +{ + multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); +} + +void minethd::penta_work_main() +{ + multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); +} + +template +void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce) +{ + for (size_t i = 0; i < N; i++) + { + memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize); + if (i > 0) + piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39); + } +} + +template +void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi) { if(affinity >= 0) //-1 means no affinity bindMemoryToNUMANode(affinity); @@ -477,31 +565,26 @@ void minethd::double_work_main() lck.release(); std::this_thread::yield(); - cn_hash_fun_dbl hash_fun; - cryptonight_ctx* ctx0; - cryptonight_ctx* ctx1; + cryptonight_ctx *ctx[MAX_N]; uint64_t iCount = 0; - uint64_t *piHashVal0, *piHashVal1; - uint32_t *piNonce0, *piNonce1; - uint8_t bDoubleHashOut[64]; - uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]; + uint64_t *piHashVal[MAX_N]; + uint32_t *piNonce[MAX_N]; + uint8_t bHashOut[MAX_N * 32]; + uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N]; uint32_t iNonce; job_result res; - hash_fun = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()); - ctx0 = minethd_alloc_ctx(); - ctx1 = minethd_alloc_ctx(); - - piHashVal0 = (uint64_t*)(bDoubleHashOut + 24); - piHashVal1 = (uint64_t*)(bDoubleHashOut + 32 + 24); - piNonce0 = (uint32_t*)(bDoubleWorkBlob + 39); + for (size_t i = 0; i < N; i++) + { + ctx[i] = minethd_alloc_ctx(); + piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24); + piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr; + } if(!oWork.bStall) - piNonce1 = prep_double_work(bDoubleWorkBlob); - else - piNonce1 = nullptr; + prep_multiway_work(bWorkBlob, piNonce); - globalStates::inst().inst().iConsumeCnt++; + globalStates::inst().iConsumeCnt++; while (bQuit == 0) { @@ -515,55 +598,57 @@ void minethd::double_work_main() std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); - piNonce1 = prep_double_work(bDoubleWorkBlob); + prep_multiway_work(bWorkBlob, piNonce); continue; } - size_t nonce_ctr = 0; - constexpr size_t nonce_chunk = 4096; //Needs to be a power of 2 + constexpr uint32_t nonce_chunk = 4096; + int64_t nonce_ctr = 0; assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); if(oWork.bNiceHash) - iNonce = *piNonce0; + iNonce = *piNonce[0]; while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { - if ((iCount & 0x7) == 0) //Store stats every 16 hashes + if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes { using namespace std::chrono; uint64_t iStamp = time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); - iHashCount.store(iCount, std::memory_order_relaxed); + iHashCount.store(iCount * N, std::memory_order_relaxed); iTimestamp.store(iStamp, std::memory_order_relaxed); } - iCount += 2; - - - if((nonce_ctr++ & (nonce_chunk/2 - 1)) == 0) + + nonce_ctr -= N; + if(nonce_ctr <= 0) { globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk); + nonce_ctr = nonce_chunk; } - *piNonce0 = ++iNonce; - *piNonce1 = ++iNonce; + for (size_t i = 0; i < N; i++) + *piNonce[i] = ++iNonce; - hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1); + hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx); - if (*piHashVal0 < oWork.iTarget) - executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut, iThreadNo), oWork.iPoolId)); - - if (*piHashVal1 < oWork.iTarget) - executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce, bDoubleHashOut + 32, iThreadNo), oWork.iPoolId)); + for (size_t i = 0; i < N; i++) + { + if (*piHashVal[i] < oWork.iTarget) + { + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId)); + } + } std::this_thread::yield(); } consume_work(); - piNonce1 = prep_double_work(bDoubleWorkBlob); + prep_multiway_work(bWorkBlob, piNonce); } - cryptonight_free_ctx(ctx0); - cryptonight_free_ctx(ctx1); + for (int i = 0; i < N; i++) + cryptonight_free_ctx(ctx[i]); } } // namespace cpu diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 5520d9e5c..0433d0d36 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -29,16 +29,24 @@ class minethd : public iBackend static cryptonight_ctx* minethd_alloc_ctx(); private: + typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**); + static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero); - typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict); - static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero); + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); - minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity); + template + void multiway_work_main(cn_hash_fun_multi hash_fun_multi); + + template + void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce); void work_main(); void double_work_main(); + void triple_work_main(); + void quad_work_main(); + void penta_work_main(); + void consume_work(); - uint32_t* prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]); uint64_t iJobNo; diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index c6a7dca7e..be7d1ce43 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -95,7 +95,7 @@ class autoAdjust conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" + " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + - " \"affine_to_cpu\" : false,\n" + + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + " },\n"; } } diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 99dc0235a..54791729e 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -9,6 +9,12 @@ R"===( * bsleep - Insert a delay of X microseconds between kernel launches. * Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 100 * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * sync_mode - method used to synchronize the device + * documentation: http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g69e73c7dda3fc05306ae7c811a690fac + * 0 = cudaDeviceScheduleAuto + * 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu + * 2 = cudaDeviceScheduleYield + * 3 = cudaDeviceScheduleBlockingSync (default) * * On the first run the miner will look at your system and suggest a basic configuration that will work, * you can try to tweak it from there to get the best performance. @@ -16,7 +22,9 @@ R"===( * A filled out configuration should look like this: * "gpu_threads_conf" : * [ - * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false}, + * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, + * "affine_to_cpu" : false, "sync_mode" : 3, + * }, * ], */ diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index 4208145df..46c572673 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -123,16 +123,17 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff; + const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode; gid = GetObjectMember(oThdConf, "index"); blocks = GetObjectMember(oThdConf, "blocks"); threads = GetObjectMember(oThdConf, "threads"); bfactor = GetObjectMember(oThdConf, "bfactor"); bsleep = GetObjectMember(oThdConf, "bsleep"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + syncMode = GetObjectMember(oThdConf, "sync_mode"); if(gid == nullptr || blocks == nullptr || threads == nullptr || - bfactor == nullptr || bsleep == nullptr || aff == nullptr) + bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr) { return false; } @@ -155,11 +156,17 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!aff->IsUint64() && !aff->IsBool()) return false; + if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3) + { + printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)"); + return false; + } cfg.id = gid->GetInt(); cfg.blocks = blocks->GetInt(); cfg.threads = threads->GetInt(); cfg.bfactor = bfactor->GetInt(); cfg.bsleep = bsleep->GetInt(); + cfg.syncMode = syncMode->GetInt(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index b09a162fd..7f60f1d90 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -28,6 +28,7 @@ class jconf bool bDoubleMode; bool bNoPrefetch; int32_t cpu_aff; + int syncMode; long long iCpuAff; }; diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 9eab1c0ab..6e628fdaf 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -77,6 +77,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.device_threads = (int)cfg.threads; ctx.device_bfactor = (int)cfg.bfactor; ctx.device_bsleep = (int)cfg.bsleep; + ctx.syncMode = cfg.syncMode; this->affinity = cfg.cpu_aff; std::unique_lock lck(thd_aff_set); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 1b6337995..afbdbaf88 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -11,7 +11,8 @@ typedef struct { int device_blocks; int device_threads; int device_bfactor; - int device_bsleep; + int device_bsleep; + int syncMode; uint32_t *d_input; uint32_t inputlen; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index a92fa8c31..0b175b5f3 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -167,10 +167,10 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_ #endif } +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) #endif -template __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b ) { __shared__ uint32_t sharedMemory[1024]; @@ -327,18 +327,22 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx) for ( int i = 0; i < partcount; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<<< - grid, - block4, - block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ctx->device_blocks*ctx->device_threads, - ctx->device_bfactor, - i, - ctx->d_long_state, - ctx->d_ctx_a, - ctx->d_ctx_b - )); + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", + cryptonight_core_gpu_phase2<<< + grid, + block4, + block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b + ) + ); if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp index 078c16537..563bb3b9e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -9,22 +9,41 @@ /** execute and check a CUDA api command * * @param id gpu id (thread id) + * @param msg message string which should be added to the error message * @param ... CUDA api command */ -#define CUDA_CHECK(id, ...) { \ - cudaError_t error = __VA_ARGS__; \ - if(error!=cudaSuccess){ \ - std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__ << std::endl; \ - throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ - } \ -} \ +#define CUDA_CHECK_MSG(id, msg, ...) { \ + cudaError_t error = __VA_ARGS__; \ + if(error!=cudaSuccess){ \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \ + std::cerr << msg << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ + } \ +} \ ( (void) 0 ) +/** execute and check a CUDA api command + * + * @param id gpu id (thread id) + * @param ... CUDA api command + */ +#define CUDA_CHECK(id, ...) CUDA_CHECK_MSG(id, "", __VA_ARGS__) + /** execute and check a CUDA kernel * * @param id gpu id (thread id) * @param ... CUDA kernel call */ -#define CUDA_CHECK_KERNEL(id, ...) \ - __VA_ARGS__; \ +#define CUDA_CHECK_KERNEL(id, ...) \ + __VA_ARGS__; \ CUDA_CHECK(id, cudaGetLastError()) + +/** execute and check a CUDA kernel + * + * @param id gpu id (thread id) + * @param msg message string which should be added to the error message + * @param ... CUDA kernel call + */ +#define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \ + __VA_ARGS__; \ + CUDA_CHECK_MSG(id, msg, cudaGetLastError()) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 5501d8da3..492201d24 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -189,7 +189,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) } cudaDeviceReset(); - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + switch(ctx->syncMode) + { + case 0: + cudaSetDeviceFlags(cudaDeviceScheduleAuto); + break; + case 1: + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + break; + case 2: + cudaSetDeviceFlags(cudaDeviceScheduleYield); + break; + case 3: + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + break; + + }; cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); size_t hashMemSize; @@ -203,7 +218,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) } size_t wsize = ctx->device_blocks * ctx->device_threads; - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, hashMemSize * wsize)); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize)); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize)); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize)); @@ -213,6 +227,10 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) )); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) )); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) )); + CUDA_CHECK_MSG( + ctx->device_id, + "\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**", + cudaMalloc(&ctx->d_long_state, hashMemSize * wsize)); return 1; } @@ -239,7 +257,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) )); CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) )); - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )); + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", + cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state ) + ); CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )); @@ -380,6 +402,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) */ ctx->device_blocks = props.multiProcessorCount * ( props.major < 3 ? 2 : 3 ); + + // increase bfactor for low end devices to avoid that the miner is killed by the OS + if(props.multiProcessorCount < 6) + ctx->device_bfactor += 2; } if(ctx->device_threads == -1) { diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index 8859075e7..22c8330e3 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -66,6 +66,9 @@ void help() cout<<" -v, --version show version number"<> userName; } auto& passwd = params::inst().poolPasswd; - if(passwd.empty() && (!userSetPasswd)) + if(passwd.empty() && !params::inst().userSetPwd) { + prompt_once(prompted); + // clear everything from stdin to allow an empty password std::cin.clear(); std::cin.ignore(INT_MAX,'\n'); std::cout<<"- Password (mostly empty or x):"<print_msg(L0, + "This window has been opened because xmr-stak needed to run as administrator. It can be safely closed now."); + WaitForSingleObject(shExInfo.hProcess, INFINITE); + CloseHandle(shExInfo.hProcess); + // do not start the miner twice + std::exit(0); + } +} +#endif + int main(int argc, char *argv[]) { #ifndef CONF_NO_TLS @@ -295,8 +375,16 @@ int main(int argc, char *argv[]) params::inst().executablePrefix += seperator; } - bool userSetPasswd = false; - for(int i = 1; i < argc; ++i) + bool uacDialog = true; + bool pool_url_set = false; + for(size_t i = 1; i < argc-1; i++) + { + std::string opName(argv[i]); + if(opName == "-o" || opName == "-O" || opName == "--url" || opName == "--tls-url") + pool_url_set = true; + } + + for(size_t i = 1; i < argc; ++i) { std::string opName(argv[i]); if(opName.compare("-h") == 0 || opName.compare("--help") == 0) @@ -383,9 +471,29 @@ int main(int argc, char *argv[]) return 1; } params::inst().poolURL = argv[i]; + params::inst().poolUseTls = false; + } + else if(opName.compare("-O") == 0 || opName.compare("--tls-url") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '-O/--tls-url' given"); + win_exit(); + return 1; + } + params::inst().poolURL = argv[i]; + params::inst().poolUseTls = true; } else if(opName.compare("-u") == 0 || opName.compare("--user") == 0) { + if(!pool_url_set) + { + printer::inst()->print_msg(L0, "Pool address has to be set if you want to specify username and password."); + win_exit(); + return 1; + } + ++i; if( i >=argc ) { @@ -397,6 +505,13 @@ int main(int argc, char *argv[]) } else if(opName.compare("-p") == 0 || opName.compare("--pass") == 0) { + if(!pool_url_set) + { + printer::inst()->print_msg(L0, "Pool address has to be set if you want to specify username and password."); + win_exit(); + return 1; + } + ++i; if( i >=argc ) { @@ -404,9 +519,13 @@ int main(int argc, char *argv[]) win_exit(); return 1; } - userSetPasswd = true; + params::inst().userSetPwd = true; params::inst().poolPasswd = argv[i]; } + else if(opName.compare("--use-nicehash") == 0) + { + params::inst().nicehashMode = true; + } else if(opName.compare("-c") == 0 || opName.compare("--config") == 0) { ++i; @@ -418,6 +537,10 @@ int main(int argc, char *argv[]) } params::inst().configFile = argv[i]; } + else if(opName.compare("--noUAC") == 0) + { + uacDialog = false; + } else { printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]); @@ -426,9 +549,23 @@ int main(int argc, char *argv[]) } } +#ifdef _WIN32 + if(uacDialog) + { + std::string minerArgs; + for(int i = 1; i < argc; i++) + { + minerArgs += " "; + minerArgs += argv[i]; + } + + UACDialog(argv[0], minerArgs); + } +#endif + // check if we need a guided start if(!configEditor::file_exist(params::inst().configFile)) - do_guided_config(userSetPasswd); + do_guided_config(); if(!jconf::inst()->parse_config(params::inst().configFile.c_str())) { diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 34bde6ce5..f279f5242 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -152,8 +152,8 @@ bool jconf::GetPoolConfig(size_t id, pool_cfg& cfg) size_t dlt = wt_max - wt_min; if(dlt != 0) { - /* Normalise weights between 0 and 9.9 */ - cfg.weight = double(cfg.raw_weight - wt_min) * 9.9; + /* Normalise weights between 0 and 9.8 */ + cfg.weight = double(cfg.raw_weight - wt_min) * 9.8; cfg.weight /= dlt; } else /* Special case - user selected same weights for everything */ diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp index 8de5948bc..980760ecf 100644 --- a/xmrstak/misc/console.cpp +++ b/xmrstak/misc/console.cpp @@ -222,8 +222,13 @@ void printer::print_str(const char* str) #ifdef _WIN32 void win_exit(size_t code) { - printer::inst()->print_str("Press any key to exit."); - get_key(); + size_t envSize = 0; + getenv_s(&envSize, nullptr, 0, "XMRSTAK_NOWAIT"); + if(envSize == 0) + { + printer::inst()->print_str("Press any key to exit."); + get_key(); + } std::exit(code); } diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 74666c329..af4004186 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -388,10 +388,16 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob) if(dat.pool_id != pool_id) { - if(dat.pool_id == invalid_pool_id) - printer::inst()->print_msg(L2, "Pool logged in."); + jpsock* prev_pool; + if(dat.pool_id != invalid_pool_id && (prev_pool = pick_pool_by_id(dat.pool_id)) != nullptr) + { + if(prev_pool->is_dev_pool()) + printer::inst()->print_msg(L2, "Switching back to user pool."); + else + printer::inst()->print_msg(L2, "Pool switched."); + } else - printer::inst()->print_msg(L2, "Pool switched."); + printer::inst()->print_msg(L2, "Pool logged in."); } else printer::inst()->print_msg(L3, "New block detected."); @@ -491,8 +497,10 @@ void executor::ex_main() set_timestamp(); size_t pc = jconf::inst()->GetPoolCount(); - bool tls = true; - for(size_t i=0; i < pc; i++) + bool dev_tls = true; + bool already_have_cli_pool = false; + size_t i=0; + for(; i < pc; i++) { jconf::pool_cfg cfg; jconf::inst()->GetPoolConfig(i, cfg); @@ -503,20 +511,45 @@ void executor::ex_main() win_exit(); } #endif - if(!cfg.tls) tls = false; - pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash); + if(!cfg.tls) dev_tls = false; + + if(!xmrstak::params::inst().poolURL.empty() && xmrstak::params::inst().poolURL == cfg.sPoolAddr) + { + auto& params = xmrstak::params::inst(); + already_have_cli_pool = true; + + const char* wallet = params.poolUsername.empty() ? cfg.sWalletAddr : params.poolUsername.c_str(); + const char* pwd = params.userSetPwd ? params.poolPasswd.c_str() : cfg.sPasswd; + bool nicehash = cfg.nicehash || params.nicehashMode; + + pools.emplace_back(i+1, cfg.sPoolAddr, wallet, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash); + } + else + pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash); + } + + if(!xmrstak::params::inst().poolURL.empty() && !already_have_cli_pool) + { + auto& params = xmrstak::params::inst(); + if(params.poolUsername.empty()) + { + printer::inst()->print_msg(L1, "ERROR: You didn't specify the username / wallet address for %s", xmrstak::params::inst().poolURL.c_str()); + win_exit(); + } + + pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode); } if(jconf::inst()->IsCurrencyMonero()) { - if(tls) + if(dev_tls) pools.emplace_front(0, "indeedminers.eu:1111", "", "", 0.0, true, false, "", true); else pools.emplace_front(0, "indeedminers.eu:1111", "", "", 0.0, true, false, "", true); } else { - if(tls) + if(dev_tls) pools.emplace_front(0, "indeedminers.eu:2222", "", "", 0.0, true, false, "", true); else pools.emplace_front(0, "indeedminers.eu:2222", "", "", 0.0, true, false, "", true); @@ -535,7 +568,7 @@ void executor::ex_main() if(jconf::inst()->GetVerboseLevel() >= 4) push_timed_event(ex_event(EV_HASHRATE_LOOP), jconf::inst()->GetAutohashTime()); - size_t cnt = 0, i; + size_t cnt = 0; while (true) { ev = oEventQ.pop(); @@ -625,14 +658,11 @@ inline const char* hps_format(double h, char* buf, size_t l) { if(std::isnormal(h) || h == 0.0) { - if(h < 10.0) - snprintf(buf, l, " %03.1f", h); - else - snprintf(buf, l, " %04.1f", h); + snprintf(buf, l, " %6.1f", h); return buf; } else - return " (na)"; + return " (na)"; } bool executor::motd_filter_console(std::string& motd) @@ -726,9 +756,9 @@ void executor::hashrate_report(std::string& out) std::transform(name.begin(), name.end(), name.begin(), ::toupper); out.append("HASHRATE REPORT - ").append(name).append("\n"); - out.append("| ID | 10s | 60s | 15m |"); + out.append("| ID | 10s | 60s | 15m |"); if(nthd != 1) - out.append(" ID | 10s | 60s | 15m |\n"); + out.append(" ID | 10s | 60s | 15m |\n"); else out.append(1, '\n'); diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp index 2aedc38c4..bc3261274 100644 --- a/xmrstak/params.hpp +++ b/xmrstak/params.hpp @@ -24,9 +24,12 @@ struct params bool useNVIDIA; bool useCPU; + bool poolUseTls = false; std::string poolURL; + bool userSetPwd = false; std::string poolPasswd; std::string poolUsername; + bool nicehashMode = false; std::string currency; diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index c268a9b05..c8b9df2c9 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -3,7 +3,7 @@ //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. #define GIT_ARCHIVE 1 #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH) -#define GIT_COMMIT_HASH 0c67b32 +#define GIT_COMMIT_HASH a29e7d5 #endif #ifndef GIT_COMMIT_HASH @@ -19,7 +19,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.0.0" +#define XMR_STAK_VERSION "2.1.0" #if defined(_WIN32) #define OS_TYPE "win"