-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
99 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#include <ccv.h> | ||
#include <nnc/ccv_nnc.h> | ||
#include <nnc/ccv_nnc_easy.h> | ||
#include <3rdparty/dsfmt/dSFMT.h> | ||
#include <sys/time.h> | ||
#include <ctype.h> | ||
|
||
static double get_current_time(void) | ||
{ | ||
struct timeval tv; | ||
gettimeofday(&tv, NULL); | ||
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; | ||
} | ||
|
||
|
||
int main(int argc, char** argv) | ||
{ | ||
ccv_nnc_init(); | ||
// Bypass error: variable-sized object may not be initialized | ||
#define num_trials 14 | ||
int B_candidates[num_trials] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; | ||
int R_candidates[num_trials] = { 32, 64, 128, 256, 512, 1024, 2048, 4096, 1024, 2048, 4096, 1024, 2048, 4096 }; | ||
int C_candidates[num_trials] = { 32, 64, 128, 256, 512, 1024, 2048, 4096, 1024, 2048, 4096, 1024, 2048, 4096 }; | ||
int Hq_candidates[num_trials] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }; | ||
int Hk_candidates[num_trials] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }; | ||
int D_candidates[num_trials] = { 64, 64, 64, 64, 64, 64, 64, 64, 80, 80, 80, 128, 128, 128 }; | ||
int is_causal_candidates[num_trials] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; | ||
|
||
for (int trial = 0; trial < num_trials; ++trial) { | ||
int B = B_candidates[trial]; | ||
int R = R_candidates[trial]; | ||
int C = C_candidates[trial]; | ||
int Hq = Hq_candidates[trial]; | ||
int Hk = Hk_candidates[trial]; | ||
int D = D_candidates[trial]; | ||
int is_causal = is_causal_candidates[trial]; | ||
float scale = 1.0 / sqrt((float)D); | ||
|
||
ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0); | ||
ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0); | ||
ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0); | ||
|
||
for (int i = 0; i < B * R * Hq * D; ++i) { | ||
q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D); | ||
} | ||
for (int i = 0; i < B * C * Hk * D; ++i) { | ||
k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D); | ||
} | ||
for (int i = 0; i < B * C * Hk * D; ++i) { | ||
v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D); | ||
} | ||
|
||
ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0); | ||
ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, NULL, NULL, NULL), TENSOR_LIST(o_tensor, NULL), 0); | ||
ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0); | ||
ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0); | ||
ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0); | ||
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0); | ||
|
||
// Why it there 000 in the beginning of the argument list for GPU_TENSOR_NHWC? | ||
ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0); | ||
ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0); | ||
ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0); | ||
ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0); | ||
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0); | ||
|
||
for (int i = 0; i < 5; i++) | ||
ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, NULL), 0); | ||
double elapsed_time = get_current_time(); | ||
for (int i = 0; i < 40; i++) | ||
ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, NULL), 0); | ||
elapsed_time = get_current_time() - elapsed_time; | ||
printf("%d, %d, %d, %d, %d, %d, %2.3f\n", B, R, C, Hq, Hk, D, elapsed_time); | ||
|
||
ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0); | ||
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0); | ||
ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0); | ||
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0); | ||
|
||
// REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 3e-3, "GPU computed output should be the same as CPU computed ones"); | ||
|
||
ccv_nnc_tensor_free(o_tensor); | ||
ccv_nnc_tensor_free(gpu_o_tensor); | ||
ccv_nnc_tensor_free(copy_of_gpu_o_tensor); | ||
ccv_nnc_tensor_free(copy_of_gpu_o_tensor_f16); | ||
ccv_nnc_tensor_free(q_tensor); | ||
ccv_nnc_tensor_free(k_tensor); | ||
ccv_nnc_tensor_free(v_tensor); | ||
ccv_nnc_tensor_free(q_tensor_f16); | ||
ccv_nnc_tensor_free(k_tensor_f16); | ||
ccv_nnc_tensor_free(v_tensor_f16); | ||
ccv_nnc_tensor_free(gpu_q_tensor); | ||
ccv_nnc_tensor_free(gpu_k_tensor); | ||
ccv_nnc_tensor_free(gpu_v_tensor); | ||
} | ||
#undef num_trials | ||
return 0; | ||
} |