diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cpp b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cpp
index 3f4d09bf76..5f4b2f4801 100644
--- a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cpp
+++ b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cpp
@@ -216,7 +216,8 @@ class CuStateVecCircuitSimulator
     void *newDeviceStateVector;
     HANDLE_CUDA_ERROR(cudaMalloc((void **)&newDeviceStateVector,
                                  stateDimension * sizeof(CudaDataType)));
-
+    HANDLE_CUDA_ERROR(cudaMemset(newDeviceStateVector, 0,
+                                 stateDimension * sizeof(CudaDataType)));
     // Place the state data on device. Could be that
     // we just need the zero state, or the user could have provided one
     void *otherState;
@@ -283,6 +284,8 @@ class CuStateVecCircuitSimulator
     void *newDeviceStateVector;
     HANDLE_CUDA_ERROR(cudaMalloc((void **)&newDeviceStateVector,
                                  stateDimension * sizeof(CudaDataType)));
+    HANDLE_CUDA_ERROR(cudaMemset(newDeviceStateVector, 0,
+                                 stateDimension * sizeof(CudaDataType)));
     constexpr int32_t threads_per_block = 256;
     uint32_t n_blocks =
         (stateDimension + threads_per_block - 1) / threads_per_block;