Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing optimization level of host compiler causes NVRTC_ERROR_COMPILATION #195

Open
ShatrovOA opened this issue Dec 25, 2024 · 0 comments

Comments

@ShatrovOA
Copy link

Hello!

I am using the NVIDIA HPC SDK compiler, version 24.5.
When building the following sample program with the flags "-O2" or "-O3 -fast", VkFFT generates invalid device code that fails to compile.

The error is
VkFFT.cu(453): error: identifier "f" is undefined

loc_1.x = fma(temp_2.x, 8.85456025653209896e-01f, loc_1.x);
loc_1.y = fma(temp_7.y, 8.85456025653209896e-01f, loc_1.y);
loc_12.x = fma(temp_2.y, 4.64723172043768546e-01f, loc_12.x);
loc_12.y = fma(temp_7.x, 4.64723172043768546e-01f, loc_12.y);
loc_2.x = fma(temp_2.x, -9.70941817426052027e-01f, loc_2.x);
loc_2.y = fma(temp_7.y, -9.70941817426052027e-01f, loc_2.y);
loc_11.x = fma(temp_2.y, f, loc_11.x); // HERE and also a lot of times later
loc_11.y = fma(temp_7.x, f, loc_11.y);

Passing the flag "-O1" seems to work just fine.
VkFFT is the latest version from the develop branch.

Here is the sample:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define VKFFT_BACKEND 1
#include "vkFFT.h"

/*
 * VKFFT_CALL(call): evaluate a VkFFT call exactly once and stop on failure.
 *
 * When the result differs from VKFFT_SUCCESS, the text returned by
 * getVkFFTErrorString() is written to stderr together with the file and line
 * of the call site, and the process exits using the error code as its status.
 *
 * The argument is parenthesized so that any expression can be passed, and the
 * temporary carries a trailing underscore to make a clash with an identifier
 * used inside `call` unlikely. The do { } while (0) wrapper lets the macro be
 * used as a single statement.
 */
#define VKFFT_CALL(call)                                                \
  do {                                                                  \
    VkFFTResult vkfft_call_status_ = (call);                            \
    if (vkfft_call_status_ != VKFFT_SUCCESS) {                          \
      fprintf(stderr, "Fatal error in vkFFT: %s at %s:%d\n",            \
          getVkFFTErrorString(vkfft_call_status_), __FILE__, __LINE__); \
      exit(vkfft_call_status_);                                         \
    }                                                                   \
  } while (0)

/*
 * CUDA_CALL(call): evaluate a CUDA call exactly once and stop on failure.
 *
 * The result is stored in a cudaError_t and compared against cudaSuccess; on
 * mismatch the text returned by cudaGetErrorString() is written to stderr
 * together with the file and line of the call site, and the process exits
 * using the error code as its status.
 *
 * The argument is parenthesized so that any expression can be passed, and the
 * temporary carries a trailing underscore to make a clash with an identifier
 * used inside `call` unlikely. The do { } while (0) wrapper lets the macro be
 * used as a single statement.
 */
#define CUDA_CALL(call)                                                 \
  do {                                                                  \
    cudaError_t cuda_call_status_ = (call);                             \
    if (cuda_call_status_ != cudaSuccess) {                             \
      fprintf(stderr, "Fatal error in CUDA: %s at %s:%d\n",             \
          cudaGetErrorString(cuda_call_status_), __FILE__, __LINE__);   \
      exit(cuda_call_status_);                                          \
    }                                                                   \
  } while (0)


void vkfft_create(const int8_t rank, const int *dims, const int how_many,
                  const int8_t r2c, const int8_t c2r, cudaStream_t stream, VkFFTApplication **app_handle) {
  VkFFTConfiguration config = {};
  VkFFTApplication* app = (VkFFTApplication*)calloc(1, sizeof(VkFFTApplication));

  config.FFTdim = rank;
  int dim;
  for (dim = 0; dim < rank; dim++)
  {
    config.size[dim] = dims[dim];
  }
  config.doublePrecision = 0;
  config.numberBatches = how_many;

  CUdevice device;
  int device_num;

  CUDA_CALL( cudaGetDevice(&device_num) );
  CUDA_CALL( cuDeviceGet(&device, device_num) );
  config.device = &device;
  config.stream = (cudaStream_t*)malloc(sizeof(cudaStream_t));
  config.stream[0] = stream;
  config.num_streams = 1;

  config.isInputFormatted = 1;
  config.isOutputFormatted = 1;
  config.performDCT = 0;
  config.performDST = 0;

  if ( r2c || c2r ) {
    config.performR2C = 1;
    if ( r2c ) {
      config.inputBufferStride[0] = dims[0];
      config.outputBufferStride[0] = (dims[0] / 2) + 1;
      config.makeForwardPlanOnly = 1;
    } else {
      config.inputBufferStride[0] = (dims[0] / 2) + 1;
      config.outputBufferStride[0] = dims[0];
      config.makeInversePlanOnly = 1;
    }
    for ( dim = 1; dim < rank; dim++ ) {
      config.inputBufferStride[dim] = config.inputBufferStride[dim - 1] * dims[dim];
      config.outputBufferStride[dim] = config.outputBufferStride[dim - 1] * dims[dim];
    }
  }

  VKFFT_CALL( initializeVkFFT(app, config) );
  *app_handle = app;
}

int main() {

  CUDA_CALL( cudaSetDevice(0));
  CUDA_CALL( cudaFree(0) );
  cudaStream_t stream;

  CUDA_CALL( cudaStreamCreate(&stream) );

  int dims[2] = {513, 711};

  VkFFTApplication *plan_forward;
  VkFFTApplication *plan_backward;

  vkfft_create(2, dims,  33, 1, 0, stream, &plan_forward);
  printf("Forward created\n");

  vkfft_create(2, dims,  33, 0, 1, stream, &plan_backward);
  printf("Backward created\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant