From 7b25b158542ba0bf8d2287b9107ec642895960e8 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Sat, 9 Nov 2024 13:25:44 -0800 Subject: [PATCH] Improve memory leak handling --- .../include/mlir-executor/Runtime/API/API.h | 6 ---- .../executor/lib/CAPI/Runtime/Runtime.cpp | 7 +++-- .../executor/lib/Runtime/API/API.cpp | 14 --------- .../Backend/Lua/Modules/CUDA/CUDAModule.cpp | 30 ++++++------------- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 1 - .../python/bindings/Runtime/RuntimePyBind.cpp | 2 -- 6 files changed, 13 insertions(+), 47 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index c750f3188..6bd406435 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -795,12 +795,6 @@ class AllocTracker { /// Returns true if the ptr is released internally. bool isReleasedInternally(uintptr_t ptr) const; - /// Set the pointer is allocated by TensorRT. - void setTensorRTAllocated(uintptr_t ptr); - - /// Get that pointer is allocated by TensorRT. 
- bool getTensorRTAllocated(uintptr_t ptr); - private: struct Metadata { std::atomic externalReferenceCount = {0}; diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 70046bb21..2939e060d 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -27,6 +27,7 @@ #include "mlir-executor/Runtime/API/API.h" #include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h" #include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h" +#include "mlir-executor/Runtime/Support/Support.h" #include "mlir-executor/Support/Status.h" #include "mlir/Support/FileUtilities.h" #include "llvm/Support/Debug.h" @@ -324,9 +325,9 @@ MTRT_Status mtrtMemRefCreateExternal( MTRT_Status mtrtMemRefValueDestroyAsync(MTRT_MemRefValue buffer, MTRT_Stream stream) { - MemRefValue *memref = unwrap(buffer); - llvm::dbgs() << "[MLIR-TRT] Deallocating memref pointer " << memref->getMemory() << "\n"; + MTRT_DBGF("destroying memref pointer 0x%lx asynchronously", + memref->getMemory()); Status s = memref->getClient()->deallocate( std::unique_ptr(memref), mtrtStreamIsNull(stream) ? 
std::nullopt @@ -338,7 +339,7 @@ MTRT_Status mtrtMemRefValueDestroyAsync(MTRT_MemRefValue buffer, MTRT_Status mtrtMemRefValueDestroy(MTRT_MemRefValue buffer) { MemRefValue *memref = unwrap(buffer); - llvm::dbgs() << "[MLIR-TRT] Deallocating memref pointer " << memref->getMemory() << "\n"; + MTRT_DBGF("destroying memref pointer 0x%lx", memref->getMemory()); Status s = memref->getClient()->deallocate(std::unique_ptr(memref)); if (!s.isOk()) diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index 76f55a50c..7f0f9b4a9 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -396,20 +396,6 @@ AllocTracker::~AllocTracker() { MTRT_DBGF("freed %zu bytes of unfreed memory", totalSize); } -void AllocTracker::setTensorRTAllocated(uintptr_t ptr) { - assert(llvm::is_contained(map, ptr) && - llvm::formatv("Untracked pointer {0}", ptr).str().c_str()); - std::unique_ptr const &metadata = map.at(ptr); - metadata->tensorrtAllocated = true; -} - -bool AllocTracker::getTensorRTAllocated(uintptr_t ptr) { - assert(llvm::is_contained(map, ptr) && - llvm::formatv("Untracked pointer {0}", ptr).str().c_str()); - std::unique_ptr const &metadata = map.at(ptr); - return metadata->tensorrtAllocated; -} - void AllocTracker::markReleasedInternally(uintptr_t ptr) { assert(llvm::is_contained(map, ptr) && llvm::formatv("Untracked pointer {0}", ptr).str().c_str()); diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/CUDA/CUDAModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/CUDA/CUDAModule.cpp index 3095b1703..1341a9b91 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/CUDA/CUDAModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/CUDA/CUDAModule.cpp @@ -432,13 +432,9 @@ registerCudaMemoryManagementOps(sol::state_view &lua, cudaMemcpyDeviceToHost, stream), state); - if (allocTracker->getTensorRTAllocated( - 
reinterpret_cast(srcPtr))) { - // Free tensorrt allocate source pointer, since there it won't be - // released by external memref. - SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state); - allocTracker->untrack(reinterpret_cast(srcPtr)); - } + if (allocTracker->get(src).isInternallyManaged() && + allocTracker->getExternalReferenceCount(src)) + allocTracker->markReleasedInternally(src); }; lua["__cuda_memcpy_host_pinned2device"] = @@ -487,13 +483,9 @@ registerCudaMemoryManagementOps(sol::state_view &lua, cudaMemcpyDeviceToHost, stream), state); - if (allocTracker->getTensorRTAllocated( - reinterpret_cast(srcPtr))) { - // Free tensorrt allocate source pointer, since there it won't be - // released by external memref. - SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state); - allocTracker->untrack(reinterpret_cast(srcPtr)); - } + if (allocTracker->get(src).isInternallyManaged() && + allocTracker->getExternalReferenceCount(src)) + allocTracker->markReleasedInternally(src); }; lua["__cuda_memcpy_device2device"] = [allocTracker]( sol::this_state state, @@ -518,13 +510,9 @@ registerCudaMemoryManagementOps(sol::state_view &lua, cudaMemcpyDeviceToDevice, stream), state); - if (allocTracker->getTensorRTAllocated( - reinterpret_cast(srcPtr))) { - // Free tensorrt allocate source pointer, since there it won't be - // released by external memref. 
- SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state); - allocTracker->untrack(reinterpret_cast(srcPtr)); - } + if (allocTracker->get(src).isInternallyManaged() && + allocTracker->getExternalReferenceCount(src)) + allocTracker->markReleasedInternally(src); return; }; } diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index ea4f08ef8..943493f99 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -155,7 +155,6 @@ class OutputAllocatorImpl : public nvinfer1::IOutputAllocator { if (memory.isOk()) { mOutputPtr = (*memory).ptr; mOutputSize = memory->size; - mTracker->setTensorRTAllocated(memory->ptr); MTRT_DBGF( "tensorrt module output allocator allocating %lu bytes at 0x%lx", mOutputSize, mOutputPtr); diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index 760bca574..24e75bf19 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -346,8 +346,6 @@ static std::unique_ptr createMemRefViewFromDLPack(PyRuntimeClient &client, py::capsule capsule, std::optional assertCanonicalStrides) { - llvm::dbgs() << "Creating a memref view from DL pack tensors\n"; - DLManagedTensor *managedTensor = static_cast( PyCapsule_GetPointer(capsule.ptr(), "dltensor"));