Add support for conv3d on call to cudnn side.
liuliu committed Jan 21, 2025
1 parent e3ccc41 commit d157996
Showing 3 changed files with 146 additions and 4 deletions.
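In short, the change: the cudnn convolution forward path now builds a 5-dimensional bias view when the weight tensor is 5-dimensional (every dimension set to 1 except the channel slot, which takes the output channel count from the weight's leading dimension), the filter descriptor accepts CCV_NNC_MAX_DIM + 3 dimensions in addition to CCV_NNC_MAX_DIM + 2, the Nd convolution descriptor is configured from the actual spatial dimension count (size_nd) rather than the hard-coded CCV_NNC_MAX_DIM, and two forward conv3d tests (NHWC and NCHW) are added, with BATCH_SIZE reduced from 64 to 16.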
22 changes: 21 additions & 1 deletion lib/nnc/cmd/convolution/gpu/ccv_nnc_conv_gpu_cudnn.cu
@@ -102,7 +102,27 @@ static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint
 	CUDNN_ENFORCE(cudnnConvolutionForward(cudnn, &one, a.descriptor, a.data.u8, w.descriptor, weight_data, conv.descriptor, algo, workspace, workspace_size, &zero, b.descriptor, b.data.u8));
 	if (input_size > 2 && inputs[2])
 	{
-		const ccv_nnc_cudnn_tensor_view_descriptor_t bias = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[2]);
+		const int w_nd = ccv_nnc_tensor_nd(inputs[1]->info.dim);
+		ccv_nnc_cudnn_tensor_view_descriptor_t bias;
+		if (w_nd <= 4)
+			bias = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[2]);
+		else if (w_nd == 5) {
+			ccv_nnc_tensor_view_t biast = ccv_nnc_get_tensor_view(inputs[2]);
+			const int b_nd = ccv_nnc_tensor_nd(outputs[0]->info.dim);
+			if (outputs[0]->info.format == CCV_TENSOR_FORMAT_NHWC)
+			{
+				biast.info.format = CCV_TENSOR_FORMAT_NHWC;
+				biast.info.dim[0] = biast.info.dim[1] = biast.info.dim[2] = biast.info.dim[3] = 1;
+				biast.info.dim[b_nd == 4 ? 3 : 4] = inputs[1]->info.dim[0];
+			} else if (outputs[0]->info.format == CCV_TENSOR_FORMAT_NCHW) {
+				biast.info.format = CCV_TENSOR_FORMAT_NCHW;
+				biast.info.dim[0] = biast.info.dim[1] = biast.info.dim[2] = biast.info.dim[3] = biast.info.dim[4] = 1;
+				biast.info.dim[b_nd == 4 ? 0 : 1] = inputs[1]->info.dim[0];
+			}
+			bias = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, &biast);
+		} else {
+			assert(0 && "w should be either 4-dimension or 5-dimension");
+		}
 		CUDNN_ENFORCE(cudnnAddTensor(cudnn, &one, bias.descriptor, bias.data.u8, &one, b.descriptor, b.data.u8));
 		ccv_nnc_cudnn_deinit_tensor_view_descriptor(bias);
 	}
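To spell out the dimension bookkeeping in the new branch above: cudnnAddTensor broadcasts the bias over the convolution output, so for a 5-d (conv3d) weight the bias has to be described as a 5-d view with every dimension set to 1 except the channel slot, which takes the output channel count from the weight's leading dimension (inputs[1]->info.dim[0]). A minimal standalone sketch of that layout follows; the helper name is hypothetical and it assumes a batched, 5-d output (b_nd == 5), which is the case the new tests exercise.

/* Illustration only, not library code: the shape of the 5-d bias view built above. */
static void conv3d_bias_view_dims(const int is_nhwc, const int c_out, int dims[5])
{
	int i;
	for (i = 0; i < 5; i++)
		dims[i] = 1; /* broadcast over the batch and the three spatial axes */
	/* NDHWC keeps channels in the last slot; NCDHW keeps them right after the batch. */
	dims[is_nhwc ? 4 : 1] = c_out;
}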
4 changes: 2 additions & 2 deletions lib/nnc/gpu/ccv_nnc_compat.cu
@@ -1431,7 +1431,7 @@ ccv_nnc_cudnn_filter_descriptor_t ccv_nnc_cudnn_get_filter_descriptor(const ccv_
 	};
 	assert(CCV_IS_TENSOR_CONTIGUOUS(tensor));
 	const int nd = ccv_nnc_tensor_nd(tensor->info.dim);
-	assert(nd == CCV_NNC_MAX_DIM + 2);
+	assert(nd == CCV_NNC_MAX_DIM + 2 || nd == CCV_NNC_MAX_DIM + 3);
 	int dim[CCV_NNC_MAX_DIM_ALLOC] = {};
 	int i;
 	if (tensor->info.format == CCV_TENSOR_FORMAT_NCHW)
@@ -1486,7 +1486,7 @@ ccv_nnc_cudnn_convolution_descriptor_t ccv_nnc_cudnn_get_convolution_descriptor(
 	{
 		CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(convolution_desc.descriptor, p[0], p[1], v[0], v[1], u[0], u[1], CUDNN_CROSS_CORRELATION, ccv_nnc_cudnn_datatype(datatype)));
 	} else {
-		CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(convolution_desc.descriptor, CCV_NNC_MAX_DIM, p, v, u, CUDNN_CROSS_CORRELATION, ccv_nnc_cudnn_datatype(datatype)));
+		CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(convolution_desc.descriptor, size_nd, p, v, u, CUDNN_CROSS_CORRELATION, ccv_nnc_cudnn_datatype(datatype)));
 	}
 	CUDNN_ENFORCE(cudnnSetConvolutionMathType(convolution_desc.descriptor, CUDNN_TENSOR_OP_MATH));
 	return convolution_desc;
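The size_nd fix above matters because cudnnSetConvolutionNdDescriptor takes the number of spatial dimensions as its arrayLength argument; with the hard-coded CCV_NNC_MAX_DIM (2), a 3-d convolution would have been described as 2-d. A minimal standalone sketch of a 3-d descriptor, with illustrative pad/stride/dilation values (assumptions, not taken from the commit):

#include <assert.h>
#include <cudnn.h>

/* Sketch only: build a cuDNN descriptor for a 3-d cross-correlation in float32. */
static cudnnConvolutionDescriptor_t make_conv3d_descriptor(void)
{
	cudnnConvolutionDescriptor_t desc;
	cudnnStatus_t status = cudnnCreateConvolutionDescriptor(&desc);
	assert(status == CUDNN_STATUS_SUCCESS);
	const int pad[3] = { 1, 3, 3 };      /* depth, height, width padding (illustrative) */
	const int stride[3] = { 2, 1, 1 };   /* depth stride of 2, unit strides elsewhere (illustrative) */
	const int dilation[3] = { 1, 1, 1 };
	/* arrayLength is the spatial dimension count (3 here); this is the value size_nd supplies above. */
	status = cudnnSetConvolutionNdDescriptor(desc, 3, pad, stride, dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
	assert(status == CUDNN_STATUS_SUCCESS);
	return desc;
}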
124 changes: 123 additions & 1 deletion test/int/nnc/cudnn.tests.c
@@ -19,7 +19,7 @@ TEST_SETUP()
 
 #define KERNEL_SIZE (7)
 
-#define BATCH_SIZE (64)
+#define BATCH_SIZE (16)
 
 TEST_CASE("cudnn forward convolution")
 {
@@ -345,6 +345,128 @@ TEST_CASE("cudnn forward convolution with dilation 2, 3")
 	ccv_nnc_tensor_free(ga);
 }
 
+TEST_CASE("cudnn forward convolution 3d")
+{
+	GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
+	ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
+	ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
+	hint.stride.dim[0] = 2;
+	hint.border.begin[0] = 1;
+	hint.border.end[0] = 1;
+	assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
+	ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
+	// configure the inlets.
+	dsfmt_t dsfmt;
+	dsfmt_init_gen_rand(&dsfmt, 0);
+	int i;
+	for (i = 0; i < INPUT_DIM * 3 * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
+		w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
+	for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
+		a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
+	for (i = 0; i < OUTPUT_DIM; i++)
+		bias->data.f32[i] = (float)i / OUTPUT_DIM;
+	// Copy generated matrix values over to GPU.
+	ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
+	ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
+	move.backend = CCV_NNC_BACKEND_GPU_REF;
+	assert(move.backend >= 0);
+	ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
+	ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
+	transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
+	assert(transform.backend >= 0);
+	ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
+	ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
+	ccv_nnc_stream_context_wait(stream_context);
+	ccv_nnc_tensor_free(gw);
+	cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
+	assert(cmd.backend >= 0);
+	cmd.algorithm = -1;
+	cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
+	assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
+	ccv_nnc_stream_context_wait(stream_context);
+	ccv_nnc_stream_context_free(stream_context);
+	ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
+	cmd.backend = CCV_NNC_BACKEND_CPU_REF;
+	assert(cmd.backend >= 0);
+	ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
+	REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
+	ccv_nnc_tensor_free(c);
+	ccv_nnc_tensor_free(gc);
+	ccv_nnc_tensor_free(bias);
+	ccv_nnc_tensor_free(w);
+	ccv_nnc_tensor_free(b);
+	ccv_nnc_tensor_free(a);
+	ccv_nnc_tensor_free(gbias);
+	ccv_nnc_tensor_free(gwo);
+	ccv_nnc_tensor_free(ga);
+}
+
+TEST_CASE("cudnn forward convolution 3d in nchw format")
+{
+	GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
+	ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
+	ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
+	ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
+	ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
+	hint.stride.dim[0] = 2;
+	hint.border.begin[0] = 1;
+	hint.border.end[0] = 1;
+	assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
+	ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
+	ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM), 0);
+	// configure the inlets.
+	dsfmt_t dsfmt;
+	dsfmt_init_gen_rand(&dsfmt, 0);
+	int i;
+	for (i = 0; i < 3 * INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
+		w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
+	for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
+		a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
+	for (i = 0; i < OUTPUT_DIM; i++)
+		bias->data.f32[i] = (float)i / OUTPUT_DIM;
+	// Copy generated matrix values over to GPU.
+	ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
+	ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
+	ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
+	move.backend = CCV_NNC_BACKEND_GPU_REF;
+	assert(move.backend >= 0);
+	ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
+	ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
+
+	ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
+	transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
+	assert(transform.backend >= 0);
+	cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
+	assert(cmd.backend >= 0);
+	cmd.algorithm = -1;
+	cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
+	assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
+	ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
+	ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
+	cmd.backend = CCV_NNC_BACKEND_CPU_REF;
+	assert(cmd.backend >= 0);
+	ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
+	REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
+	ccv_nnc_tensor_free(c);
+	ccv_nnc_tensor_free(gc);
+	ccv_nnc_tensor_free(bias);
+	ccv_nnc_tensor_free(w);
+	ccv_nnc_tensor_free(b);
+	ccv_nnc_tensor_free(a);
+	ccv_nnc_tensor_free(gbias);
+	ccv_nnc_tensor_free(gw);
+	ccv_nnc_tensor_free(ga);
+}
+
 TEST_CASE("cudnn backward convolution")
 {
 	GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
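A note on the depth arithmetic the new tests rely on (derived from the test parameters, not stated in the commit; this reads the hint's stride.dim[0] and border[0] overrides as applying to the depth axis): with an input depth of 5, a kernel depth of 3, a depth stride of 2, and padding of 1 on each side, the output depth is floor((5 + 1 + 1 - 3) / 2) + 1 = 3, which is why the output tensors above carry a depth of 3 alongside OUTPUT_SIZE and OUTPUT_DIM.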