[GPU] Updated GPU cache size retrieval and refined closest_pow_of_2 (openvinotoolkit#28059)

Details:
The existing method for cache size calculation was static and needed
continuous updates to the SKU table, which were already being missed for
the latest SKUs, e.g. DG2.
This update introduces a new member variable, max_global_cache_size, to
store the GPU's global cache size, obtained via the OpenCL property
CL_DEVICE_GLOBAL_MEM_CACHE_SIZE. The existing hard-coded cache
calculations are removed. Additionally, the closest_pow_of_2 function
has been enhanced to return the nearest power of 2, favoring the upper
value when the input falls within the top 30% of the range below that
upper bound. These changes improve memory management and ensure better
utilization of GPU resources in bottleneck situations.
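
A minimal standalone sketch of the new rounding rule (the sample inputs are
illustrative, not taken from the commit):

#include <cmath>
#include <iostream>

// Same rule as the updated closest_pow_of_2 lambda in plugin.cpp: round up
// only when x lies in the upper 30% of the gap between neighboring powers of 2.
double closest_pow_of_2(float x) {
    int lower_power = static_cast<int>(std::floor(std::log(x) / std::log(2)));
    double lower_value = std::pow(2, lower_power);      // current power of 2
    double upper_value = std::pow(2, lower_power + 1);  // next power of 2
    double threshold = 0.7 * (upper_value - lower_value);
    return (x - lower_value > threshold) ? upper_value : lower_value;
}

int main() {
    std::cout << closest_pow_of_2(90)  << std::endl;  // 64  (26 above 64, below the 44.8 threshold)
    std::cout << closest_pow_of_2(110) << std::endl;  // 128 (46 above 64, above the 44.8 threshold)
    std::cout << closest_pow_of_2(128) << std::endl;  // 128 (already a power of 2)
    return 0;
}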

Tickets:
     CVS-159076

Signed-off-by: Arshad Mehmood <[email protected]>
arshadlab authored and 11happy committed Dec 23, 2024
1 parent ae964c3 commit 25c8a28
Showing 3 changed files with 22 additions and 29 deletions.
@@ -56,6 +56,7 @@ struct device_info {
uint64_t max_local_mem_size; ///< Maximum size of local memory arena in bytes.
uint64_t max_global_mem_size; ///< Maximum size of global device memory in bytes.
uint64_t max_alloc_mem_size; ///< Maximum size of memory object allocation in bytes.
uint64_t max_global_cache_size; ///< Maximum size of cache memory in bytes.

uint64_t max_image2d_width; ///< Maximum image 2d width supported by the device.
uint64_t max_image2d_height; ///< Maximum image 2d height supported by the device.
49 changes: 20 additions & 29 deletions src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -797,12 +797,24 @@ uint32_t Plugin::get_optimal_batch_size(const ov::AnyMap& options) const {
auto device_id = get_property(ov::device::id.name(), options).as<std::string>();
auto context = get_default_contexts().at(device_id);
const auto& device_info = context->get_engine().get_device_info();
auto next_pow_of_2 = [] (float x) {
return pow(2, ceil(std::log(x)/std::log(2)));
};

auto closest_pow_of_2 = [] (float x) {
return pow(2, floor(std::log(x)/std::log(2)));
int lower_power = static_cast<int>(floor(std::log(x) / std::log(2)));
double lower_value = pow(2, lower_power); // Current power of 2
double upper_value = pow(2, lower_power + 1); // Next power of 2

// Determine the threshold (70% of the range between lower and upper values)
// If x is within the upper 30% of the range, return the upper power of 2.
double threshold = 0.7 * (upper_value - lower_value);

// Compare x with the threshold and return the appropriate power of 2
if (x - lower_value > threshold) {
return upper_value; // Return the next power of 2
} else {
return lower_value; // Return the current power of 2
}
};

auto model_param = options.find(ov::hint::model.name());
if (model_param == options.end()) {
GPU_DEBUG_INFO << "[OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
@@ -816,31 +828,10 @@ uint32_t Plugin::get_optimal_batch_size(const ov::AnyMap& options) const {
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
static std::map<cldnn::gfx_version, size_t> gen_kbytes_per_bank = {
{{12, 0, 0}, 480}, // TGL
{{12, 1, 0}, 2048}, // DG1
{{12, 5, 0}, 320},
{{12, 7, 0}, 512},
};
size_t L3_cache_size = device_info.gfx_ver.major && (device_info.gfx_ver.major <= 9)
? 768 * 1024 // Gen9
: 2 * 768 * 1024; //reasonable default when no arch has been detected (e.g. due to old driver ver)
cldnn::gfx_version gen = {device_info.gfx_ver.major, device_info.gfx_ver.minor, 0 /*ignore the revision*/};
auto val = gen_kbytes_per_bank.find(gen);
if (gen_kbytes_per_bank.end() != val) {
auto kbytes_per_bank = val->second;
auto num_banks_per_slice = device_info.num_sub_slices_per_slice > 4
? next_pow_of_2(device_info.num_sub_slices_per_slice)
: 2 * device_info.num_sub_slices_per_slice;
L3_cache_size = kbytes_per_bank * 1024 * num_banks_per_slice * device_info.num_slices;
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor)
<< "Cache size " << std::to_string(device_info.max_global_cache_size) << std::endl;

size_t L3_cache_size = device_info.max_global_cache_size;
auto config = m_configs_map.at(device_id);
auto cloned_model = clone_and_transform_model(model, config, context);
ov::MemBandwidthPressure memPressure = ov::mem_bandwidth_pressure_tolerance(cloned_model, L3_cache_size);
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -224,6 +224,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
info.max_local_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>());
info.max_global_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>());
info.max_alloc_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>());
info.max_global_cache_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>());

info.supports_image = static_cast<uint8_t>(device.getInfo<CL_DEVICE_IMAGE_SUPPORT>());
info.max_image2d_width = static_cast<uint64_t>(device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>());
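
For reference, a minimal standalone OpenCL C++ sketch (not part of this commit)
that queries the same CL_DEVICE_GLOBAL_MEM_CACHE_SIZE property the plugin now
stores in max_global_cache_size; the header name depends on the installed
OpenCL C++ bindings:

#include <CL/opencl.hpp>  // older bindings ship this as <CL/cl2.hpp>
#include <iostream>
#include <vector>

int main() {
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    for (const auto& platform : platforms) {
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);  // error handling omitted
        for (const auto& device : devices) {
            // Same property that init_device_info() reads into max_global_cache_size.
            cl_ulong cache_size = device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
            std::cout << device.getInfo<CL_DEVICE_NAME>() << ": global mem cache = "
                      << cache_size / 1024 << " KiB" << std::endl;
        }
    }
    return 0;
}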
