diff --git a/warp/build.py b/warp/build.py index 34f99680d..0c630d3fd 100644 --- a/warp/build.py +++ b/warp/build.py @@ -18,12 +18,13 @@ def build_cuda( ): with open(cu_path, "rb") as src_file: src = src_file.read() - cu_path = cu_path.encode("utf-8") + cu_path_bytes = cu_path.encode("utf-8") + program_name_bytes = os.path.basename(cu_path).encode("utf-8") inc_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "native").encode("utf-8") output_path = output_path.encode("utf-8") if warp.config.llvm_cuda: - warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False) + warp.context.runtime.llvm.compile_cuda(src, cu_path_bytes, inc_path, output_path, False) else: if ltoirs is None: @@ -34,6 +35,7 @@ def build_cuda( arr_lroir_sizes = (ctypes.c_size_t * num_ltoirs)(*[len(l) for l in ltoirs]) err = warp.context.runtime.core.cuda_compile_program( src, + program_name_bytes, arch, inc_path, 0, diff --git a/warp/context.py b/warp/context.py index 3334f21d3..620e96172 100644 --- a/warp/context.py +++ b/warp/context.py @@ -3473,6 +3473,7 @@ def __init__(self): self.core.cuda_compile_program.argtypes = [ ctypes.c_char_p, # cuda_src + ctypes.c_char_p, # program name ctypes.c_int, # arch ctypes.c_char_p, # include_dir ctypes.c_int, # num_cuda_include_dirs diff --git a/warp/native/warp.cpp b/warp/native/warp.cpp index 7f20b17cd..182a39ab5 100644 --- a/warp/native/warp.cpp +++ b/warp/native/warp.cpp @@ -1038,7 +1038,7 @@ WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; } WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; } -WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; } +WP_API size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; } WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; } WP_API void cuda_unload_module(void* context, void* module) {} diff --git a/warp/native/warp.cu b/warp/native/warp.cu index 2a6b06614..ac7141fa2 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -2654,7 +2654,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char* } #endif -size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) +size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { // use file extension to determine whether to output PTX or CUBIN const char* output_ext = strrchr(output_path, '.'); @@ -2757,7 +2757,7 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ res = nvrtcCreateProgram( &prog, // prog cuda_src, // buffer - NULL, // name + program_name, // name 0, // numHeaders NULL, // headers NULL); // includeNames diff --git a/warp/native/warp.h b/warp/native/warp.h index 7ceaf5958..371bb3ef1 100644 --- a/warp/native/warp.h +++ b/warp/native/warp.h @@ -320,7 +320,7 @@ extern "C" WP_API bool cuda_graph_launch(void* graph, void* stream); WP_API bool cuda_graph_destroy(void* context, void* graph); - WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); + WP_API size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size); WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads);