diff --git a/CHANGELOG.md b/CHANGELOG.md
index 466ebea18..773c93578 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,9 +9,13 @@
 
 - Add optimization example for soft-body properties ([GH-419](https://github.com/NVIDIA/warp/pull/419)).
 - Add per-module option to disable fused floating point operations, use `wp.set_module_options({"fuse_fp": False})` ([GH-379](https://github.com/NVIDIA/warp/issues/379)).
+- Add per-module option to add CUDA-C line information for profiling, use `wp.set_module_options({"lineinfo": True})`.
 
 ### Changed
 
+- Files in the kernel cache will be named according to their directory. Previously, all files began with
+  `module_codegen` ([GH-431](https://github.com/NVIDIA/warp/issues/431)).
+
 ### Fixed
 
 - Fix errors during graph capture caused by module unloading ([GH-401](https://github.com/NVIDIA/warp/issues/401)).
diff --git a/docs/configuration.rst b/docs/configuration.rst
index 4a0e1f721..774f51bc7 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -148,6 +148,13 @@ The options for a module can also be queried using ``wp.get_module_options()``.
 |                    |         |             | that functionally equivalent kernels will produce identical results        |
 |                    |         |             | unaffected by the presence or absence of fused operations.                 |
 +--------------------+---------+-------------+----------------------------------------------------------------------------+
+|``lineinfo``        | Boolean | ``False``   | If ``True``, CUDA kernels will be compiled with the                        |
+|                    |         |             | ``--generate-line-info`` compiler option, which generates line-number      |
+|                    |         |             | information for device code, e.g. to allow NVIDIA Nsight Compute to        |
+|                    |         |             | correlate CUDA-C source and SASS. Line-number information is always        |
+|                    |         |             | included when compiling kernels in ``"debug"`` mode regardless of this     |
+|                    |         |             | setting.                                                                    |
++--------------------+---------+-------------+----------------------------------------------------------------------------+
 |``cuda_output``     | String  | ``None``    | The preferred CUDA output format for kernels. Valid choices are ``None``,  |
 |                    |         |             | ``"ptx"``, and ``"cubin"``. If ``None``, a format will be determined       |
 |                    |         |             | automatically. The module-level setting takes precedence over the global   |
diff --git a/warp/build.py b/warp/build.py
index 0c630d3fd..aefd81597 100644
--- a/warp/build.py
+++ b/warp/build.py
@@ -14,8 +14,16 @@
 
 # builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension)
 def build_cuda(
-    cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, fuse_fp=True, ltoirs=None
-):
+    cu_path,
+    arch,
+    output_path,
+    config="release",
+    verify_fp=False,
+    fast_math=False,
+    fuse_fp=True,
+    lineinfo=False,
+    ltoirs=None,
+) -> None:
     with open(cu_path, "rb") as src_file:
         src = src_file.read()
     cu_path_bytes = cu_path.encode("utf-8")
@@ -45,6 +53,7 @@ def build_cuda(
         verify_fp,
         fast_math,
         fuse_fp,
+        lineinfo,
         output_path,
         num_ltoirs,
         arr_lroirs,
diff --git a/warp/context.py b/warp/context.py
index 620e96172..955032f98 100644
--- a/warp/context.py
+++ b/warp/context.py
@@ -1823,6 +1823,7 @@ def __init__(self, name, loader):
             "enable_backward": warp.config.enable_backward,
             "fast_math": False,
             "fuse_fp": True,
+            "lineinfo": False,
             "cuda_output": None,  # supported values: "ptx", "cubin", or None (automatic)
             "mode": warp.config.mode,
             "block_dim": 256,
@@ -1998,7 +1999,8 @@ def load(self, device, block_dim=None) -> ModuleExec:
         module_hash = self.hasher.module_hash
 
         # use a unique module path using the module short hash
-        module_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}")
+        module_name_short = f"{module_name}_{module_hash.hex()[:7]}"
+        module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)
 
         with warp.ScopedTimer(
             f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
@@ -2006,7 +2008,7 @@ def load(self, device, block_dim=None) -> ModuleExec:
             # -----------------------------------------------------------
             # determine output paths
             if device.is_cpu:
-                output_name = "module_codegen.o"
+                output_name = f"{module_name_short}.o"
                 output_arch = None
 
             elif device.is_cuda:
@@ -2026,10 +2028,10 @@ def load(self, device, block_dim=None) -> ModuleExec:
 
                 if use_ptx:
                     output_arch = min(device.arch, warp.config.ptx_target_arch)
-                    output_name = f"module_codegen.sm{output_arch}.ptx"
+                    output_name = f"{module_name_short}.sm{output_arch}.ptx"
                 else:
                     output_arch = device.arch
-                    output_name = f"module_codegen.sm{output_arch}.cubin"
+                    output_name = f"{module_name_short}.sm{output_arch}.cubin"
 
             # final object binary path
             binary_path = os.path.join(module_dir, output_name)
@@ -2067,7 +2069,7 @@ def load(self, device, block_dim=None) -> ModuleExec:
                 if device.is_cpu:
                     # build
                     try:
-                        source_code_path = os.path.join(build_dir, "module_codegen.cpp")
+                        source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")
 
                         # write cpp sources
                         cpp_source = builder.codegen("cpu")
@@ -2096,7 +2098,7 @@ def load(self, device, block_dim=None) -> ModuleExec:
                 elif device.is_cuda:
                     # build
                     try:
-                        source_code_path = os.path.join(build_dir, "module_codegen.cu")
+                        source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")
 
                         # write cuda sources
                         cu_source = builder.codegen("cuda")
@@ -2113,9 +2115,10 @@ def load(self, device, block_dim=None) -> ModuleExec:
                             output_arch,
                             output_path,
                             config=self.options["mode"],
+                            verify_fp=warp.config.verify_fp,
                             fast_math=self.options["fast_math"],
                             fuse_fp=self.options["fuse_fp"],
-                            verify_fp=warp.config.verify_fp,
+                            lineinfo=self.options["lineinfo"],
                             ltoirs=builder.ltoirs.values(),
                         )
 
@@ -2128,7 +2131,7 @@ def load(self, device, block_dim=None) -> ModuleExec:
 
                 # build meta data
                 meta = builder.build_meta()
-                meta_path = os.path.join(build_dir, "module_codegen.meta")
+                meta_path = os.path.join(build_dir, f"{module_name_short}.meta")
 
                 with open(meta_path, "w") as meta_file:
                     json.dump(meta, meta_file)
@@ -2192,7 +2195,7 @@ def safe_rename(src, dst, attempts=5, delay=0.1):
 
             # -----------------------------------------------------------
             # Load CPU or CUDA binary
-            meta_path = os.path.join(module_dir, "module_codegen.meta")
+            meta_path = os.path.join(module_dir, f"{module_name_short}.meta")
 
             with open(meta_path, "r") as meta_file:
                 meta = json.load(meta_file)
@@ -3483,6 +3486,7 @@ def __init__(self):
             ctypes.c_bool,  # verify_fp
             ctypes.c_bool,  # fast_math
             ctypes.c_bool,  # fuse_fp
+            ctypes.c_bool,  # lineinfo
             ctypes.c_char_p,  # output_path
             ctypes.c_size_t,  # num_ltoirs
             ctypes.POINTER(ctypes.c_char_p),  # ltoirs
diff --git a/warp/native/warp.cpp b/warp/native/warp.cpp
index 182a39ab5..afcae954a 100644
--- a/warp/native/warp.cpp
+++ b/warp/native/warp.cpp
@@ -1038,7 +1038,7 @@ WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret
 WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
 WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }
 
-WP_API size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; }
+WP_API size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; }
 
 WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; }
 WP_API void cuda_unload_module(void* context, void* module) {}
diff --git a/warp/native/warp.cu b/warp/native/warp.cu
index ac7141fa2..14f78ab37 100644
--- a/warp/native/warp.cu
+++ b/warp/native/warp.cu
@@ -2654,7 +2654,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
 }
 #endif
 
-size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes)
+size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes)
 {
     // use file extension to determine whether to output PTX or CUBIN
     const char* output_ext = strrchr(output_path, '.');
@@ -2715,8 +2715,13 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
         //opts.push_back("--device-debug");
     }
     else
+    {
         opts.push_back("--define-macro=NDEBUG");
 
+        if (lineinfo)
+            opts.push_back("--generate-line-info");
+    }
+
     if (verify_fp)
         opts.push_back("--define-macro=WP_VERIFY_FP");
     else
diff --git a/warp/native/warp.h b/warp/native/warp.h
index 371bb3ef1..404d8f594 100644
--- a/warp/native/warp.h
+++ b/warp/native/warp.h
@@ -320,7 +320,7 @@ extern "C"
     WP_API bool cuda_graph_launch(void* graph, void* stream);
     WP_API bool cuda_graph_destroy(void* context, void* graph);
 
-    WP_API size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes);
+    WP_API size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes);
 
     WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size);
     WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads);
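
A minimal usage sketch of the new per-module option. It assumes a CUDA-capable device; the `scale` kernel, the device string, and the `ncu` command are illustrative examples, not part of this patch.

    import warp as wp

    wp.init()

    # Compile this module's CUDA kernels with line information in "release" builds;
    # "debug" mode always includes line information regardless of this option.
    wp.set_module_options({"lineinfo": True})

    @wp.kernel
    def scale(a: wp.array(dtype=float), s: float):
        i = wp.tid()
        a[i] = a[i] * s

    a = wp.zeros(1024, dtype=float, device="cuda:0")
    wp.launch(scale, dim=1024, inputs=[a, 2.0], device="cuda:0")
    wp.synchronize()

    # With the kernel cache renaming, the generated .cu/.ptx/.meta files are prefixed with
    # the module name and short hash (e.g. "__main___<hash>.sm86.ptx") rather than
    # "module_codegen.*", which makes them easier to pick out when profiling.
    print(wp.config.kernel_cache_dir)

The resulting kernels can then be profiled with NVIDIA Nsight Compute (for example `ncu python my_script.py`, where `my_script.py` is a placeholder), which uses the embedded line information to correlate SASS with the generated CUDA-C source.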