diff --git a/WORKSPACE b/WORKSPACE
index 65ad3b377..45f8667db 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -13,8 +13,8 @@ git_repository(
 git_repository(
     name = "build_bazel_rules_cuda",
     remote = "https://github.com/liuliu/rules_cuda.git",
-    commit = "be346d4d12883469878edd693097f87723400c5b",
-    shallow_since = "1681409802 -0400"
+    commit = "7dcc4673fa487ad12fe3abe84d01edd9fe588e85",
+    shallow_since = "1732063237 -0500"
 )
 
 http_archive(
diff --git a/config/ccv.bzl b/config/ccv.bzl
index fbc63a203..4ef7f703d 100644
--- a/config/ccv.bzl
+++ b/config/ccv.bzl
@@ -98,8 +98,8 @@ def ccv_deps():
         git_repository,
         name = "build_bazel_rules_cuda",
         remote = "https://github.com/liuliu/rules_cuda.git",
-        commit = "be346d4d12883469878edd693097f87723400c5b",
-        shallow_since = "1681409802 -0400"
+        commit = "7dcc4673fa487ad12fe3abe84d01edd9fe588e85",
+        shallow_since = "1732063237 -0500"
     )
     _maybe(
         http_archive,
diff --git a/lib/configure b/lib/configure
index 8329af1b5..81ae2a304 100755
--- a/lib/configure
+++ b/lib/configure
@@ -4506,7 +4506,7 @@ if [ -d "$cuda_prefix" ]; then
     DEFINE_MACROS="$DEFINE_MACROS-D HAVE_CUDA "
     # CUB requires stdc++, unfortunately.
-    MKLDFLAGS="$MKLDFLAGS-lcudart -lcublas -lcurand -lstdc++ "
+    MKLDFLAGS="$MKLDFLAGS-lcudart -lcublas -lcurand -lcufile -lstdc++ "
     CPPFLAGS="$CPPFLAGS-I$cuda_prefix/include "
@@ -4760,7 +4760,7 @@ if test "$mps_support" = yes; then
 printf "%s\n" "yes" >&6; }
     DEFINE_MACROS="$DEFINE_MACROS-D HAVE_MPS "
-    MKLDFLAGS="$MKLDFLAGS-framework MetalPerformanceShaders -framework MetalPerformanceShadersGraph -framework Foundation -framework Metal -framework OpenCL -lc++ "
+    MKLDFLAGS="$MKLDFLAGS-framework MetalPerformanceShaders -framework MetalPerformanceShadersGraph -framework Foundation -framework Metal -lc++ "
     CUDA_SRCS=""
diff --git a/lib/configure.ac b/lib/configure.ac
index ba49af529..544e059e3 100644
--- a/lib/configure.ac
+++ b/lib/configure.ac
@@ -159,7 +159,7 @@ AC_ARG_WITH(cuda, [AS_HELP_STRING([--with-cuda], [CUDA installation [ARG=/usr/lo
 if [[ -d "$cuda_prefix" ]]; then
     AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_CUDA "])
     # CUB requires stdc++, unfortunately.
-    AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lcudart -lcublas -lcurand -lstdc++ "])
+    AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lcudart -lcublas -lcurand -lcufile -lstdc++ "])
     AC_SUBST(CPPFLAGS, ["$CPPFLAGS-I$cuda_prefix/include "])
     if [[ -d "$cuda_prefix/lib64" ]]; then
         AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-L$cuda_prefix/lib64 "])
diff --git a/lib/nnc/ccv_nnc_tensor.c b/lib/nnc/ccv_nnc_tensor.c
index 497d55036..5869644b0 100644
--- a/lib/nnc/ccv_nnc_tensor.c
+++ b/lib/nnc/ccv_nnc_tensor.c
@@ -132,11 +132,8 @@ ccv_nnc_tensor_t* ccv_nnc_tensor_new_from_file(const ccv_nnc_tensor_param_t para
         // This is not supported yet on CUDA.
         tensor->data.u8 = (uint8_t*)cumalloc(CCV_TENSOR_GET_DEVICE_ID(params.type), size);
         int fd = open(filename, O_RDONLY, 0);
-        void* bufptr = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, offset);
+        cufileread(fd, offset, tensor->data.u8, size);
         close(fd);
-        madvise(bufptr, size, MADV_SEQUENTIAL | MADV_WILLNEED);
-        cumemcpy(tensor->data.u8, CCV_TENSOR_GPU_MEMORY, bufptr, CCV_TENSOR_CPU_MEMORY, size);
-        munmap(bufptr, size);
     } else
         tensor->data.u8 = 0;
 } else {
diff --git a/lib/nnc/gpu/ccv_nnc_compat.cu b/lib/nnc/gpu/ccv_nnc_compat.cu
index e6d8c7a29..620eba428 100644
--- a/lib/nnc/gpu/ccv_nnc_compat.cu
+++ b/lib/nnc/gpu/ccv_nnc_compat.cu
@@ -1,4 +1,5 @@
 #include "ccv_nnc_compat.h"
+#include <cufile.h> // For GPUDirect Storage
 extern "C" {
 #include
 #include
@@ -305,6 +306,21 @@ void cuunregister(void* ptr)
     CUDA_ENFORCE(cudaHostUnregister(ptr));
 }
 
+void cufileread(const int fd, const off_t file_offset, void* const buf, const size_t size)
+{
+    CUfileDescr_t file_descr = {
+        .type = CU_FILE_HANDLE_TYPE_OPAQUE_FD,
+        .handle = {
+            .fd = fd,
+        },
+        .fs_ops = 0,
+    };
+    CUfileHandle_t file_handle;
+    CUFILE_ENFORCE(cuFileHandleRegister(&file_handle, &file_descr));
+    cuFileRead(file_handle, buf, size, file_offset, 0);
+    cuFileHandleDeregister(file_handle);
+}
+
 typedef struct {
     cudaStream_t stream;
     cublasHandle_t cublas;
diff --git a/lib/nnc/gpu/ccv_nnc_compat.h b/lib/nnc/gpu/ccv_nnc_compat.h
index ecf6b94a3..e88f81eb1 100644
--- a/lib/nnc/gpu/ccv_nnc_compat.h
+++ b/lib/nnc/gpu/ccv_nnc_compat.h
@@ -34,6 +34,7 @@ int curegmp(int device_id, cump_f func, void* const context); // register memory
 void cuunregmp(const int id); // un-register memory pressure handler.
 void cusetprofiler(int state);
 void cusetdevicemap(const int* const device_map, const int size);
+void cufileread(const int fd, const off_t file_offset, void* const buf, const size_t size);
 
 // Stream context
 CCV_WARN_UNUSED(ccv_nnc_stream_context_t*) ccv_nnc_init_stream_context(ccv_nnc_stream_context_t* const stream_context);
@@ -120,6 +121,20 @@ CCV_WARN_UNUSED(size_t) ccv_nnc_cublas_workspace_size_in_bytes(const ccv_nnc_ten
 } while (0)
 #endif
 
+#ifdef NDEBUG
+#define CUFILE_ENFORCE(status) status
+#else
+#define CUFILE_ENFORCE(status) do { \
+    const CUfileError_t __status = status; \
+    if (__status.err != CU_FILE_SUCCESS) { \
+        printf("[%s:%d]:CUDA - Error: %s\n", \
+            __FILE__, __LINE__, CUFILE_ERRSTR(__status.err)); \
+        assert(0); \
+        exit(EXIT_FAILURE); \
+    } \
+} while (0)
+#endif
+
 #ifdef NDEBUG
 #define CUBLAS_ENFORCE(status) status
 #else
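For context on what the patch wires in: the new `cufileread()` helper replaces the old mmap/madvise/cumemcpy bounce through host memory with a single cuFile (GPUDirect Storage) read straight into the device buffer, which is why the configure scripts now link `-lcufile`. Below is a minimal standalone sketch of that same cuFile flow, not part of the patch: the file name, buffer size, the `O_DIRECT` flag, and the explicit `cuFileDriverOpen()`/`cuFileDriverClose()` calls are illustrative assumptions (the patch itself keeps the plain `O_RDONLY` open and relies on implicit driver initialization), and error handling is abbreviated. Build with something like `nvcc gds_read_sketch.cu -o gds_read_sketch -lcufile`.

```cuda
// Standalone sketch of the GPUDirect Storage read path used by cufileread().
// Hypothetical file name and size; error handling abbreviated for clarity.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for O_DIRECT on Linux
#endif
#include <cufile.h>
#include <cuda_runtime.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char* const filename = "weights.bin"; // hypothetical input file
	const size_t size = 1 << 20;                // read 1 MiB from offset 0
	// O_DIRECT is what cuFile expects for the direct DMA path; a plain
	// O_RDONLY open (as in the patch) generally goes through cuFile's
	// compatibility mode instead.
	const int fd = open(filename, O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	cuFileDriverOpen(); // optional on recent drivers; explicit here for clarity
	void* devptr = 0;
	cudaMalloc(&devptr, size); // destination lives in GPU memory
	CUfileDescr_t descr = {};
	descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
	descr.handle.fd = fd;
	CUfileHandle_t handle;
	const CUfileError_t status = cuFileHandleRegister(&handle, &descr);
	if (status.err != CU_FILE_SUCCESS) {
		fprintf(stderr, "cuFileHandleRegister failed: %d\n", (int)status.err);
		return EXIT_FAILURE;
	}
	// Same call the patch makes: read `size` bytes at file offset 0 straight
	// into the device buffer (last argument is the offset into that buffer).
	const ssize_t nread = cuFileRead(handle, devptr, size, 0, 0);
	if (nread < 0)
		fprintf(stderr, "cuFileRead failed: %zd\n", nread);
	cuFileHandleDeregister(handle);
	close(fd);
	cudaFree(devptr);
	cuFileDriverClose();
	return 0;
}
```

Compared with the removed path in `ccv_nnc_tensor_new_from_file`, this avoids staging the weights in a host-visible mapping before copying them to the GPU; the read lands in device memory in one synchronous call.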