Linux support for CUDA IPC memory and events
shi-eric authored and mmacklin committed Jan 16, 2025
1 parent 7298be5 commit 5553c43
Showing 12 changed files with 487 additions and 19 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,9 @@
- Support `assert` statements in kernels ([docs](https://nvidia.github.io/warp/debugging.html#assertions)).
Assertions can only be triggered in `"debug"` mode ([GH-366](https://github.com/NVIDIA/warp/issues/336)).
- Add optimization example for soft-body properties ([GH-419](https://github.com/NVIDIA/warp/pull/419)).
- CUDA IPC support on Linux. Call the `ipc_handle()` method to get an IPC handle for a `wp.Event` or a `wp.array`,
and call `wp.from_ipc_handle()` or `wp.event_from_ipc_handle()` in another process to open the handle
([docs](https://nvidia.github.io/warp/modules/runtime.html#interprocess-communication-ipc)).
- Add per-module option to disable fused floating point operations, use `wp.set_module_options({"fuse_fp": False})`
([GH-379](https://github.com/NVIDIA/warp/issues/379)).
- Add per-module option to add CUDA-C line information for profiling, use `wp.set_module_options({"lineinfo": True})`.
40 changes: 39 additions & 1 deletion docs/modules/runtime.rst
@@ -115,7 +115,6 @@ Additionally, data can be copied between arrays in different memory spaces using
:undoc-members:
:exclude-members: vars


Multi-dimensional Arrays
########################

@@ -1200,3 +1199,42 @@ See :doc:`../profiling` documentation for more information.

.. autoclass:: warp.ScopedTimer
:noindex:

Interprocess Communication (IPC)
--------------------------------

Interprocess communication can be used to share Warp arrays and events across
processes without creating copies of the underlying data.

The basic requirements for using IPC are:

* A Linux operating system
* Arrays must be allocated on a GPU device using the default memory allocator (see :doc:`allocators`)

The ``wp.ScopedMempool`` context manager is useful for temporarily disabling
memory pools so that arrays intended for IPC sharing are allocated with the
default allocator.
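
For example, a minimal sketch of allocating an IPC-capable array, assuming a
``cuda:0`` device is available::

    import warp as wp

    # Temporarily disable memory pools so that the allocation below
    # uses the default allocator, which supports IPC
    with wp.ScopedMempool("cuda:0", False):
        a = wp.zeros(1024, dtype=wp.float32, device="cuda:0")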

Support for IPC on a device is indicated by the :attr:`is_ipc_supported <warp.context.Device.is_ipc_supported>`
attribute of the :class:`Device <warp.context.Device>`.
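
For example, a capability check before attempting to share anything::

    device = wp.get_device("cuda:0")
    if not device.is_ipc_supported:
        raise RuntimeError(f"IPC is not supported on {device}")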

To share a Warp array between processes, use :meth:`array.ipc_handle` in the
originating process to obtain an IPC handle for the array's memory allocation.
The handle is a ``bytes`` object with a length of 64.
The IPC handle, along with information about the array (data type, shape, and
optionally strides), should be shared with another process, e.g. via shared
memory or files.
Another process can use this information to import the original array by
calling :func:`from_ipc_handle`.
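
The following sketch shows both sides of the exchange, assuming the array ``a``
from the snippet above, a ``multiprocessing`` connection ``conn`` as the
transport, and a data type agreed on out of band::

    # Producer process: send the handle and the metadata needed to rebuild the array
    conn.send((a.ipc_handle(), a.shape))

    # Consumer process: import the original allocation (no copy is made)
    handle, shape = conn.recv()
    b = wp.from_ipc_handle(handle, wp.float32, shape, device="cuda:0")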

Events can be shared in a similar manner, but they must be constructed with
``interprocess=True``. Additionally, events cannot be created with both
``interprocess=True`` and ``enable_timing=True``. Use :meth:`Event.ipc_handle`
in the originating process to obtain an IPC handle for the event. Another
process can then import the original event by passing the handle to
:func:`event_from_ipc_handle`.
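
A sketch of the equivalent flow for events, using the same assumed ``conn``
transport as above::

    # Producer process: the event must be created with interprocess=True
    e = wp.Event(device="cuda:0", interprocess=True)
    conn.send(e.ipc_handle())

    # Consumer process: import the original event
    e = wp.event_from_ipc_handle(conn.recv(), device="cuda:0")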

.. autofunction:: from_ipc_handle

.. autofunction:: event_from_ipc_handle
3 changes: 3 additions & 0 deletions warp/__init__.py
@@ -43,6 +43,8 @@
# numpy interop
from warp.types import dtype_from_numpy, dtype_to_numpy

from warp.types import from_ipc_handle

from warp.context import init, func, func_grad, func_replay, func_native, kernel, struct, overload
from warp.context import is_cpu_available, is_cuda_available, is_device_available
from warp.context import get_devices, get_preferred_device
@@ -65,6 +67,7 @@
synchronize,
force_load,
load_module,
event_from_ipc_handle,
)
from warp.context import set_module_options, get_module_options, get_module
from warp.context import capture_begin, capture_end, capture_launch
137 changes: 120 additions & 17 deletions warp/context.py
@@ -5,6 +5,8 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

from __future__ import annotations

import ast
import ctypes
import errno
@@ -2358,14 +2360,17 @@ class Flags:
DEFAULT = 0x0
BLOCKING_SYNC = 0x1
DISABLE_TIMING = 0x2
INTERPROCESS = 0x4

def __new__(cls, *args, **kwargs):
"""Creates a new event instance."""
instance = super(Event, cls).__new__(cls)
instance.owner = False
return instance

def __init__(self, device: "Devicelike" = None, cuda_event=None, enable_timing: bool = False):
def __init__(
self, device: "Devicelike" = None, cuda_event=None, enable_timing: bool = False, interprocess: bool = False
):
"""Initializes the event on a CUDA device.
Args:
@@ -2377,6 +2382,12 @@ def __init__(self, device: "Devicelike" = None, cuda_event=None, enable_timing:
:func:`~warp.get_event_elapsed_time` can be used to measure the
time between two events created with ``enable_timing=True`` and
recorded onto streams.
interprocess: If ``True`` this event may be used as an interprocess event.
Raises:
RuntimeError: The event could not be created.
ValueError: The combination of ``enable_timing=True`` and
``interprocess=True`` is not allowed.
"""

device = get_device(device)
@@ -2391,11 +2402,48 @@ def __init__(self, device: "Devicelike" = None, cuda_event=None, enable_timing:
flags = Event.Flags.DEFAULT
if not enable_timing:
flags |= Event.Flags.DISABLE_TIMING
if interprocess:
if enable_timing:
raise ValueError("The combination of 'enable_timing=True' and 'interprocess=True' is not allowed.")
flags |= Event.Flags.INTERPROCESS

self.cuda_event = runtime.core.cuda_event_create(device.context, flags)
if not self.cuda_event:
raise RuntimeError(f"Failed to create event on device {device}")
self.owner = True

def ipc_handle(self) -> bytes:
"""Return a CUDA IPC handle of the event as a 64-byte ``bytes`` object.
The event must have been created with ``interprocess=True`` in order to
obtain a valid interprocess handle.
IPC is currently only supported on Linux.
Example:
Create an event and get its IPC handle::
e1 = wp.Event(interprocess=True)
event_handle = e1.ipc_handle()
Raises:
RuntimeError: Device does not support IPC.
"""

if self.device.is_ipc_supported:
# Allocate a buffer for the data (64-element char array)
ipc_handle_buffer = (ctypes.c_char * 64)()

warp.context.runtime.core.cuda_ipc_get_event_handle(self.device.context, self.cuda_event, ipc_handle_buffer)

if ipc_handle_buffer.raw == bytes(64):
warp.utils.warn("IPC event handle appears to be invalid. Was interprocess=True used?")

return ipc_handle_buffer.raw

else:
raise RuntimeError(f"Device {self.device} does not support IPC.")

def __del__(self):
if not self.owner:
return
@@ -2543,23 +2591,22 @@ class Device:
"""A device to allocate Warp arrays and to launch kernels on.
Attributes:
ordinal: A Warp-specific integer label for the device. ``-1`` for CPU devices.
name: A string label for the device. By default, CPU devices will be named according to the processor name,
ordinal (int): A Warp-specific label for the device. ``-1`` for CPU devices.
name (str): A label for the device. By default, CPU devices will be named according to the processor name,
or ``"CPU"`` if the processor name cannot be determined.
arch: An integer representing the compute capability version number calculated as
``10 * major + minor``. ``0`` for CPU devices.
is_uva: A boolean indicating whether the device supports unified addressing.
arch (int): The compute capability version number calculated as ``10 * major + minor``.
``0`` for CPU devices.
is_uva (bool): Indicates whether the device supports unified addressing.
``False`` for CPU devices.
is_cubin_supported: A boolean indicating whether Warp's version of NVRTC can directly
is_cubin_supported (bool): Indicates whether Warp's version of NVRTC can directly
generate CUDA binary files (cubin) for this device's architecture. ``False`` for CPU devices.
is_mempool_supported: A boolean indicating whether the device supports using the
``cuMemAllocAsync`` and ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for
CPU devices.
is_primary: A boolean indicating whether this device's CUDA context is also the
device's primary context.
uuid: A string representing the UUID of the CUDA device. The UUID is in the same format used by
``nvidia-smi -L``. ``None`` for CPU devices.
pci_bus_id: A string identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
is_mempool_supported (bool): Indicates whether the device supports using the ``cuMemAllocAsync`` and
``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for CPU devices.
is_ipc_supported (bool): Indicates whether the device supports IPC.
is_primary (bool): Indicates whether this device's CUDA context is also the device's primary context.
uuid (str): The UUID of the CUDA device. The UUID is in the same format used by ``nvidia-smi -L``.
``None`` for CPU devices.
pci_bus_id (str): An identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
``domain``, ``bus``, and ``device`` are all hexadecimal values. ``None`` for CPU devices.
"""

@@ -2592,6 +2639,7 @@ def __init__(self, runtime, alias, ordinal=-1, is_primary=False, context=None):
self.is_uva = False
self.is_mempool_supported = False
self.is_mempool_enabled = False
self.is_ipc_supported = False # TODO: Support IPC for CPU arrays
self.is_cubin_supported = False
self.uuid = None
self.pci_bus_id = None
@@ -2607,8 +2655,11 @@ def __init__(self, runtime, alias, ordinal=-1, is_primary=False, context=None):
# CUDA device
self.name = runtime.core.cuda_device_get_name(ordinal).decode()
self.arch = runtime.core.cuda_device_get_arch(ordinal)
self.is_uva = runtime.core.cuda_device_is_uva(ordinal)
self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal)
self.is_uva = runtime.core.cuda_device_is_uva(ordinal) > 0
self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal) > 0
self.is_ipc_supported = (
runtime.core.cuda_device_is_ipc_supported(ordinal) > 0 and platform.system() == "Linux"
)
if warp.config.enable_mempools_at_init:
# enable if supported
self.is_mempool_enabled = self.is_mempool_supported
@@ -3375,6 +3426,8 @@ def __init__(self):
self.core.cuda_device_is_uva.restype = ctypes.c_int
self.core.cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
self.core.cuda_device_is_mempool_supported.restype = ctypes.c_int
self.core.cuda_device_is_ipc_supported.argtypes = [ctypes.c_int]
self.core.cuda_device_is_ipc_supported.restype = ctypes.c_int
self.core.cuda_device_set_mempool_release_threshold.argtypes = [ctypes.c_int, ctypes.c_uint64]
self.core.cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
self.core.cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
@@ -3428,6 +3481,22 @@ def __init__(self):
self.core.cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
self.core.cuda_set_mempool_access_enabled.restype = ctypes.c_int

# inter-process communication
self.core.cuda_ipc_get_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
self.core.cuda_ipc_get_mem_handle.restype = None
self.core.cuda_ipc_open_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
self.core.cuda_ipc_open_mem_handle.restype = ctypes.c_void_p
self.core.cuda_ipc_close_mem_handle.argtypes = [ctypes.c_void_p]
self.core.cuda_ipc_close_mem_handle.restype = None
self.core.cuda_ipc_get_event_handle.argtypes = [
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.POINTER(ctypes.c_char),
]
self.core.cuda_ipc_get_event_handle.restype = None
self.core.cuda_ipc_open_event_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
self.core.cuda_ipc_open_event_handle.restype = ctypes.c_void_p

self.core.cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
self.core.cuda_stream_create.restype = ctypes.c_void_p
self.core.cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
@@ -4894,6 +4963,40 @@ def from_numpy(
)


def event_from_ipc_handle(handle, device: "Devicelike" = None) -> Event:
"""Create an event from an IPC handle.
Args:
handle: The interprocess event handle for an existing CUDA event.
device (Devicelike): Device to associate with the event.
Returns:
An event created from the interprocess event handle ``handle``.
Raises:
RuntimeError: IPC is not supported on ``device``.
"""

try:
# Performance note: try first, ask questions later
device = warp.context.runtime.get_device(device)
except Exception:
# Fallback to using the public API for retrieving the device,
# which takes care of initializing Warp if needed.
device = warp.context.get_device(device)

if not device.is_ipc_supported:
raise RuntimeError(f"IPC is not supported on device {device}.")

event = Event(
device=device, cuda_event=warp.context.runtime.core.cuda_ipc_open_event_handle(device.context, handle)
)
# Events created from IPC handles must be freed with cuEventDestroy
event.owner = True

return event


# given a kernel destination argument type and a value convert
# to a c-type that can be passed to a kernel
def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
35 changes: 35 additions & 0 deletions warp/native/cuda_util.cpp
@@ -102,6 +102,11 @@ static PFN_cuGraphicsGLRegisterBuffer_v3000 pfn_cuGraphicsGLRegisterBuffer;
static PFN_cuGraphicsUnregisterResource_v3000 pfn_cuGraphicsUnregisterResource;
static PFN_cuModuleGetGlobal_v3020 pfn_cuModuleGetGlobal;
static PFN_cuFuncSetAttribute_v9000 pfn_cuFuncSetAttribute;
static PFN_cuIpcGetEventHandle_v4010 pfn_cuIpcGetEventHandle;
static PFN_cuIpcOpenEventHandle_v4010 pfn_cuIpcOpenEventHandle;
static PFN_cuIpcGetMemHandle_v4010 pfn_cuIpcGetMemHandle;
static PFN_cuIpcOpenMemHandle_v11000 pfn_cuIpcOpenMemHandle;
static PFN_cuIpcCloseMemHandle_v4010 pfn_cuIpcCloseMemHandle;

static bool cuda_driver_initialized = false;

@@ -238,6 +243,11 @@ bool init_cuda_driver()
get_driver_entry_point("cuGraphicsUnregisterResource", 3000, &(void*&)pfn_cuGraphicsUnregisterResource);
get_driver_entry_point("cuModuleGetGlobal", 3020, &(void*&)pfn_cuModuleGetGlobal);
get_driver_entry_point("cuFuncSetAttribute", 9000, &(void*&)pfn_cuFuncSetAttribute);
get_driver_entry_point("cuIpcGetEventHandle", 4010, &(void*&)pfn_cuIpcGetEventHandle);
get_driver_entry_point("cuIpcOpenEventHandle", 4010, &(void*&)pfn_cuIpcOpenEventHandle);
get_driver_entry_point("cuIpcGetMemHandle", 4010, &(void*&)pfn_cuIpcGetMemHandle);
get_driver_entry_point("cuIpcOpenMemHandle", 11000, &(void*&)pfn_cuIpcOpenMemHandle);
get_driver_entry_point("cuIpcCloseMemHandle", 4010, &(void*&)pfn_cuIpcCloseMemHandle);

if (pfn_cuInit)
cuda_driver_initialized = check_cu(pfn_cuInit(0));
@@ -585,4 +595,29 @@ CUresult cuFuncSetAttribute_f(CUfunction hfunc, CUfunction_attribute attrib, int
return pfn_cuFuncSetAttribute ? pfn_cuFuncSetAttribute(hfunc, attrib, value) : DRIVER_ENTRY_POINT_ERROR;
}

CUresult cuIpcGetEventHandle_f(CUipcEventHandle *pHandle, CUevent event)
{
return pfn_cuIpcGetEventHandle ? pfn_cuIpcGetEventHandle(pHandle, event) : DRIVER_ENTRY_POINT_ERROR;
}

CUresult cuIpcOpenEventHandle_f(CUevent *phEvent, CUipcEventHandle handle)
{
return pfn_cuIpcOpenEventHandle ? pfn_cuIpcOpenEventHandle(phEvent, handle) : DRIVER_ENTRY_POINT_ERROR;
}

CUresult cuIpcGetMemHandle_f(CUipcMemHandle *pHandle, CUdeviceptr dptr)
{
return pfn_cuIpcGetMemHandle ? pfn_cuIpcGetMemHandle(pHandle, dptr) : DRIVER_ENTRY_POINT_ERROR;
}

CUresult cuIpcOpenMemHandle_f(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int flags)
{
return pfn_cuIpcOpenMemHandle ? pfn_cuIpcOpenMemHandle(pdptr, handle, flags) : DRIVER_ENTRY_POINT_ERROR;
}

CUresult cuIpcCloseMemHandle_f(CUdeviceptr dptr)
{
return pfn_cuIpcCloseMemHandle ? pfn_cuIpcCloseMemHandle(dptr) : DRIVER_ENTRY_POINT_ERROR;
}

#endif // WP_ENABLE_CUDA
5 changes: 5 additions & 0 deletions warp/native/cuda_util.h
@@ -101,6 +101,11 @@ CUresult cuGraphicsGLRegisterBuffer_f(CUgraphicsResource *pCudaResource, unsigne
CUresult cuGraphicsUnregisterResource_f(CUgraphicsResource resource);
CUresult cuModuleGetGlobal_f(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name );
CUresult cuFuncSetAttribute_f(CUfunction hfunc, CUfunction_attribute attrib, int value);
CUresult cuIpcGetEventHandle_f(CUipcEventHandle *pHandle, CUevent event);
CUresult cuIpcOpenEventHandle_f(CUevent *phEvent, CUipcEventHandle handle);
CUresult cuIpcGetMemHandle_f(CUipcMemHandle *pHandle, CUdeviceptr dptr);
CUresult cuIpcOpenMemHandle_f(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int flags);
CUresult cuIpcCloseMemHandle_f(CUdeviceptr dptr);

bool init_cuda_driver();
bool is_cuda_driver_initialized();
7 changes: 7 additions & 0 deletions warp/native/warp.cpp
@@ -992,6 +992,7 @@ WP_API int cuda_device_get_pci_bus_id(int ordinal) { return -1; }
WP_API int cuda_device_get_pci_device_id(int ordinal) { return -1; }
WP_API int cuda_device_is_uva(int ordinal) { return 0; }
WP_API int cuda_device_is_mempool_supported(int ordinal) { return 0; }
WP_API int cuda_device_is_ipc_supported(int ordinal) { return 0; }
WP_API int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold) { return 0; }
WP_API uint64_t cuda_device_get_mempool_release_threshold(int ordinal) { return 0; }
WP_API void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem) {}
@@ -1015,6 +1016,12 @@ WP_API int cuda_set_peer_access_enabled(void* target_context, void* peer_context
WP_API int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal) { return 0; }
WP_API int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable) { return 0; }

WP_API void cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {}
WP_API void* cuda_ipc_open_mem_handle(void* context, char* handle) { return NULL; }
WP_API void cuda_ipc_close_mem_handle(void* ptr) {}
WP_API void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {}
WP_API void* cuda_ipc_open_event_handle(void* context, char* handle) { return NULL; }

WP_API void* cuda_stream_create(void* context, int priority) { return NULL; }
WP_API void cuda_stream_destroy(void* context, void* stream) {}
WP_API void cuda_stream_register(void* context, void* stream) {}