diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp
index b69d10e137010e..0eff057741f4b1 100644
--- a/src/plugins/intel_gpu/src/graph/debug_helper.cpp
+++ b/src/plugins/intel_gpu/src/graph/debug_helper.cpp
@@ -52,12 +52,14 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump
         file_stream << "shape: " << size.to_string() << " ";
         file_stream << "(count: " << size.count()
                     << ", addr: " << mem->buffer_ptr()
+                    << ", original dt: " << mem->get_layout().data_type
                     << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")"
                     << (dump_raw ? " raw data" : "") << std::endl;
     } else {
         file_stream << "shape: " << tmp_size.to_string() << " ";
         file_stream << "(count: " << tmp_size.count()
                     << ", addr: " << mem->buffer_ptr()
+                    << ", original dt: " << mem->get_layout().data_type
                     << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
                     << ", original shape: " << size.to_string() << ")"
                     << (dump_raw ? " raw data" : "") << std::endl;
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
index 1ffbfbbfbade37..4153e1311e66f1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -431,6 +431,8 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
         auto comp_scale_past_layout = impl_param.input_layouts[input_idx];
         auto comp_scale_present_layout = impl_param.output_layouts[output_idx];

+        GPU_DEBUG_TRACE_DETAIL << "Update params, input: " << comp_scale_past_layout.to_short_string() << ", output: " << comp_scale_present_layout.to_short_string() << "\n";
+
         params.inputs.resize(inputs_count);
         params.inputs[0] = convert_data_tensor(comp_scale_past_layout);
         params.outputs[0] = convert_data_tensor(comp_scale_present_layout);
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
index 86b49484282238..63ddc25701b2c9 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
@@ -313,7 +313,11 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_pro
         if (desc->is_kv_compressed) {
+            GPU_DEBUG_TRACE_DETAIL << "Update scale layout: " << impl_param.get_input_layout(data_inputs_num) << "\n";
             params.key_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num));
+            GPU_DEBUG_TRACE_DETAIL << "Updated scale tensor pad: " << params.key_cache_comp_scale.Y().pad.before << " "
+                                   << params.key_cache_comp_scale.Y().pad.after << "\n";
+
             params.value_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 1));

             if (has_zp_input_buffers) {
diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp
index 808a593c601ad0..8d0396c61c03dd 100644
--- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp
@@ -104,7 +104,21 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
     // iteration.
     // - Therefore, to avoid this situation where the allocation and copying occur simultaneously for all the kv_cache_insts,
     //   we assign a different prealloc size to each kv cache so that we can prevent a memory peak
-    return 128 + kv_cache_id % 64;
+    int KV_STRIDE = 0;
+    if (const auto env_var = std::getenv("KV_STRIDE")) {
+        std::istringstream ss(env_var);
+        ss >> KV_STRIDE;
+        static bool print_once = true;
+        if (print_once) {
+            std::cout << ">>> KV_STRIDE = " << KV_STRIDE << "\n";
+            print_once = false;
+        }
+    }
+    if (KV_STRIDE != 0) {
+        return KV_STRIDE;
+    }
+
+    return 128 + kv_cache_id % 64;
 }

 void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
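Side note on the kv_cache.cpp hunk above: the staggered `128 + kv_cache_id % 64` spreads reallocation points across kv_cache instances, while the `KV_STRIDE` environment variable forces one shared period (useful when reproducing the memory peak). A minimal standalone sketch of that behavior, with illustrative names only, not part of the patch:

```cpp
// Standalone sketch (not part of the patch): shows why the staggered
// prealloc size avoids simultaneous reallocation across caches, and how a
// KV_STRIDE-style override collapses all caches onto one schedule.
#include <cstdlib>
#include <iostream>
#include <sstream>

// Mirrors the patch's logic: a fixed env override, else a per-cache stagger.
int prealloc_iter_num(int kv_cache_id) {
    int stride = 0;
    if (const char* env = std::getenv("KV_STRIDE")) {
        std::istringstream(env) >> stride;
    }
    return stride != 0 ? stride : 128 + kv_cache_id % 64;
}

int main() {
    // With the stagger, cache i reallocates every 128+i iterations, so the
    // reallocation (and copy of the old buffer) rarely lines up across caches.
    for (int id = 0; id < 4; ++id) {
        std::cout << "kv_cache " << id << " preallocates every "
                  << prealloc_iter_num(id) << " iterations\n";
    }
}
```

Running this with `KV_STRIDE=256` in the environment makes every cache report the same period, which is exactly the peak-inducing case the modulo term is there to avoid.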
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl
index b50410a3cf761f..a50afd78bec92f 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl
@@ -151,10 +151,10 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG
 #endif
         int d, int k, int q
 #ifdef KV_COMPRESSED
-        , const global KEY_ATTR_SCALES_DATA_T *K_scales
-        , const global KEY_ATTR_ZP_DATA_T *K_zp
-        , const global VAL_ATTR_SCALES_DATA_T *V_scales
-        , const global VAL_ATTR_ZP_DATA_T *V_zp
+        , global KEY_ATTR_SCALES_DATA_T *K_scales
+        , global KEY_ATTR_ZP_DATA_T *K_zp
+        , global VAL_ATTR_SCALES_DATA_T *V_scales
+        , global VAL_ATTR_ZP_DATA_T *V_zp
 #endif
 ) {
     uint sg_ij = sub_group_broadcast(get_local_id(1), 0);
@@ -164,6 +164,87 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG
     uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n;

+
+
+
+
+// if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) {
+//     for (int j = 0; j < 32; j++) {
+// #ifdef KV_COMPRESSED
+//         for (int i = k; i < k + KEY_SCALE_PAD_AFTER_SIZE_Y; i++) {
+//             K_scales[(j * (k + KEY_SCALE_PAD_AFTER_SIZE_Y)) + i] = 0;
+//             K_zp[(j * (k + KEY_SCALE_PAD_AFTER_SIZE_Y)) + i] = 0;
+//         }
+
+//         for (int i = k; i < k + VAL_SCALE_PAD_AFTER_SIZE_Y; i++) {
+//             V_scales[(j * (k + VAL_SCALE_PAD_AFTER_SIZE_Y)) + i] = 0;
+//             V_zp[(j * (k + VAL_SCALE_PAD_AFTER_SIZE_Y)) + i] = 0;
+//         }
+// #endif
+//     }
+
+//     printf("h=0 key scales[%p]={", K_scales);
+//     for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%f, ", K_scales[i]);
+//     printf("}, total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=0 key zp[%p]={", K_zp);
+//     for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%d, ", K_zp[i]);
+//     printf("}, total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=0 val scales[%p]={", V_scales);
+//     for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%f, ", V_scales[i]);
+//     printf("}, total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=0 val zp[%p]={", V_zp);
+//     for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%d, ", V_zp[i]);
+//     printf("}, total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=1 key scales[%p]={", K_scales);
+//     for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%f, ", K_scales[k + KEY_SCALE_PAD_AFTER_SIZE_Y + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=1 key zp[%p]={", K_zp);
+//     for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%d, ", K_zp[k + KEY_SCALE_PAD_AFTER_SIZE_Y + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=1 val scales[%p]={", V_scales);
+//     for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%f, ", V_scales[k + VAL_SCALE_PAD_AFTER_SIZE_Y + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=1 val zp[%p]={", V_zp);
+//     for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%d, ", V_zp[k + VAL_SCALE_PAD_AFTER_SIZE_Y + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=31 key scales[%p]={", K_scales);
+//     for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%f, ", K_scales[(31 * (k + KEY_SCALE_PAD_AFTER_SIZE_Y)) + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=31 key zp[%p]={", K_zp);
+//     for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%d, ", K_zp[(31 * (k + KEY_SCALE_PAD_AFTER_SIZE_Y)) + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=31 val scales[%p]={", V_scales);
+//     for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%f, ", V_scales[(31 * (k + VAL_SCALE_PAD_AFTER_SIZE_Y)) + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y);
+
+//     printf("h=31 val zp[%p]={", V_zp);
+//     for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++)
+//         printf("%d, ", V_zp[(31 * (k + VAL_SCALE_PAD_AFTER_SIZE_Y)) + i]);
+//     printf("}, total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y);
+// }

     /* Leading dimension for matrices */
     uint ldk = TRANSPOSE_K ? KEY_S3 : KEY_S2;
     uint ldq = QRY_S2;
@@ -307,6 +388,7 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG

 #if WITH_ATTN_MASK
     mask_tile_type mask_tile;
+    // tile_load_t(&mask_tile, msk, q, k, sg_j0_kq + wg_j0, k0 + sg_i0_kq);
     tile_load_t(&mask_tile, msk, q, k, q, sg_j0_kq + wg_j0, k0 + sg_i0_kq);
 #endif
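For reference on the commented-out kernel dump above: it assumes the compressed-KV scale and zero-point buffers are laid out as one padded row per head, indexed as `head * (k + PAD_AFTER) + i`, with entries at `i >= k` being padding. A host-side C++ sketch of that addressing with toy sizes (all names illustrative, not part of the patch):

```cpp
// Host-side sketch of the addressing the commented-out kernel dump assumes:
// per-head rows of k scale values, each padded out to (k + pad_after)
// elements, laid out contiguously head after head.
#include <cstdio>
#include <vector>

int main() {
    const int heads = 2, k = 3, pad_after = 5;   // toy sizes
    const int row_pitch = k + pad_after;         // padded leading dimension

    std::vector<float> scales(heads * row_pitch, 0.f);
    for (int h = 0; h < heads; ++h)
        for (int i = 0; i < k; ++i)
            scales[h * row_pitch + i] = 1.f + h;  // fill valid region only

    // Same index expression as the kernel-side dump:
    // scales[(h * (k + PAD_AFTER)) + i]; elements with i >= k are padding.
    for (int h = 0; h < heads; ++h) {
        std::printf("h=%d scales={", h);
        for (int i = 0; i < row_pitch; ++i)
            std::printf("%g%s", scales[h * row_pitch + i],
                        i + 1 < row_pitch ? ", " : "");
        std::printf("} (valid=%d, pad=%d)\n", k, pad_after);
    }
}
```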
+ +// printf("h=1 val scales[%p]={", V_scales); +// for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++) +// printf("%f, ", V_scales[k + VAL_SCALE_PAD_AFTER_SIZE_Y + i]); +// printf("total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y); + +// printf("h=1 key zp[%p]={", V_zp); +// for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++) +// printf("%d, ", V_zp[k + VAL_SCALE_PAD_AFTER_SIZE_Y + i]); +// printf("total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y); + + +// printf("h=31 key scales[%p]={", K_scales); +// for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++) +// printf("%f, ", K_scales[(31 * (k + KEY_SCALE_PAD_AFTER_SIZE_Y)) + i]); +// printf("total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y); + +// printf("h=31 key zp[%p]={", K_zp); +// for (int i = 0; i < max(k, k + KEY_SCALE_PAD_AFTER_SIZE_Y); i++) +// printf("%d, ", K_zp[(31 * (k + KEY_SCALE_PAD_AFTER_SIZE_Y)) + i]); +// printf("total_num=%d, pad=%d\n", k, KEY_SCALE_PAD_AFTER_SIZE_Y); + +// printf("h=31 val scales[%p]={", V_scales); +// for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++) +// printf("%f, ", V_scales[(31 * (k + VAL_SCALE_PAD_AFTER_SIZE_Y)) + i]); +// printf("total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y); + +// printf("h=31 key zp[%p]={", V_zp); +// for (int i = 0; i < max(k, k + VAL_SCALE_PAD_AFTER_SIZE_Y); i++) +// printf("%d, ", V_zp[(31 * (k + VAL_SCALE_PAD_AFTER_SIZE_Y)) + i]); +// printf("total_num=%d, pad=%d\n", k, VAL_SCALE_PAD_AFTER_SIZE_Y); +// } /* Leading dimension for matrices */ uint ldk = TRANSPOSE_K ? KEY_S3 : KEY_S2; uint ldq = QRY_S2; @@ -307,6 +388,7 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG #if WITH_ATTN_MASK mask_tile_type mask_tile; + // tile_load_t(&mask_tile, msk, q, k, sg_j0_kq + wg_j0, k0 + sg_i0_kq); tile_load_t(&mask_tile, msk, q, k, q, sg_j0_kq + wg_j0, k0 + sg_i0_kq); #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index f11d2fc91c5cec..3f4517e97f33ab 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -272,8 +272,10 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag opts_kq.scaleA = params.conf.is_kv_compressed && !kq_common_scales; opts_kq.offsetA = params.conf.is_kv_compressed && params.conf.use_asymmetric_quantization; + // auto key_dt_size = micro::data_type_size(convert_type(params.inputs[1].GetDType())); problem_kq.B.layout = micro::MatrixLayout::Pr; problem_kq.C.layout = micro::MatrixLayout::T; + // problem_kq.A.setAlignment(micro::alignment_for_ld(head_size * key_dt_size)); problem_kq.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); problem_kq.B.setAlignment(64); // Q is packed in VNNI format in SLM problem_kq.B.crosspack = 2; @@ -337,8 +339,10 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag opts_vs.scaleA = params.conf.is_kv_compressed && !vs_common_scales; opts_vs.offsetA = params.conf.is_kv_compressed && params.conf.use_asymmetric_quantization; + // auto val_dt_size = micro::data_type_size(convert_type(params.inputs[2].GetDType())); problem_vs.B.layout = micro::MatrixLayout::Pr; problem_vs.C.layout = micro::MatrixLayout::N; + // problem_vs.A.setAlignment(micro::alignment_for_ld(head_size * val_dt_size)); problem_vs.A.setAlignment(micro::alignment_for_ld(head_size * 
@@ -753,6 +763,12 @@ void SDPAKernelMicro::GetUpdateDispatchDataFunc(KernelData& kd) const {
         const auto n_queries = get_seq_length(Q, prim_params.input0_order);
         const auto n_keys = get_seq_length(K, prim_params.input1_order);

+        GPU_DEBUG_TRACE_DETAIL << "Key scale pad_before=" << prim_params.key_cache_comp_scale.Y().pad.before
+                               << ", pad_after=" << prim_params.key_cache_comp_scale.Y().pad.after << "\n";
+
+        GPU_DEBUG_TRACE_DETAIL << "Value scale pad_before=" << prim_params.value_cache_comp_scale.Y().pad.before
+                               << ", pad_after=" << prim_params.value_cache_comp_scale.Y().pad.after << "\n";
+
         auto head_size = prim_params.conf.head_size;

         ScalarDescriptor s_d;
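The two traces added above report dynamic padding on the Y axis of the compression-scale tensors. The allocated pitch along a padded axis is `pad.before + logical_size + pad.after`, which is what the micro kernel has to stride by even though only `logical_size` entries are valid. A small sketch with illustrative names and numbers (not the OpenVINO API):

```cpp
// Sketch (illustrative names): what the two traces above report. With dynamic
// padding, the allocated extent of the scale tensor's Y axis is
// pad.before + logical_size + pad.after, so a kernel must step by the padded
// pitch even though only logical_size elements are valid.
#include <iostream>

struct Pad { int before = 0; int after = 0; };

int padded_extent(int logical_size, const Pad& pad) {
    return pad.before + logical_size + pad.after;
}

int main() {
    Pad y_pad{0, 48};   // e.g. scales preallocated ahead of KV-cache growth
    int n_keys = 80;    // valid scale entries along Y
    std::cout << "valid=" << n_keys
              << " pitch=" << padded_extent(n_keys, y_pad) << "\n";  // 128
}
```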