Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Allow unlimited allocations #25955

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ std::shared_ptr<KernelString> KernelBaseOpenCL::GetKernelString(const std::strin
kernel_string->undefs = jit.second;
if (engine_info.vendor_id == cldnn::INTEL_VENDOR_ID) {
kernel_string->options = exe_mode + " -cl-mad-enable";
kernel_string->options += " -cl-intel-greater-than-4GB-buffer-required";
if (engine_info.bOptHintsSupport)
kernel_string->options += " -DOPT_HINTS_SUPPORTED=1";
if (engine_info.bLocalBlockIOSupport)
Expand Down
26 changes: 15 additions & 11 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//

#include "ocl_engine.hpp"
#include "intel_gpu/runtime/device.hpp"
#include "ocl_common.hpp"
#include "ocl_memory.hpp"
#include "ocl_stream.hpp"
Expand Down Expand Up @@ -128,19 +129,22 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const
bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) {
OPENVINO_ASSERT(supports_allocation(type) || type == allocation_type::cl_mem, "[GPU] Unsupported allocation type: ", type);

bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);
// For intel GPU we use special internal option to allow unlimited USM allocations
if (_device->get_info().vendor_id != INTEL_VENDOR_ID && memory_capabilities::is_usm_type(type)) {
bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);

// When dynamic shape upper bound makes bigger buffer, then return false.
if (exceed_allocatable_mem_size && layout.is_dynamic()) {
OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
return false;
}
// When dynamic shape upper bound makes bigger buffer, then return false.
if (exceed_allocatable_mem_size && layout.is_dynamic()) {
OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
return false;
}

OPENVINO_ASSERT(!exceed_allocatable_mem_size,
"[GPU] Exceeded max size of memory object allocation: ",
"requested ", layout.bytes_count(), " bytes, "
"but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
"Please try to reduce batch size or use lower precision.");
OPENVINO_ASSERT(!exceed_allocatable_mem_size,
"[GPU] Exceeded max size of memory object allocation: ",
"requested ", layout.bytes_count(), " bytes, "
"but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
"Please try to reduce batch size or use lower precision.");
}

auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size());
Expand Down
14 changes: 8 additions & 6 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)
} // namespace detail
} // namespace cl

#define CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL (1 << 23)

#include <memory>

namespace {
Expand Down Expand Up @@ -925,23 +927,23 @@ class UsmMemory {
// Get methods returns original pointer allocated by openCL.
void* get() const { return _usm_pointer->ptr(); }

void allocateHost(size_t size) {
void allocateHost(size_t size, const cl_mem_properties_intel* properties = nullptr) {
cl_int error = CL_SUCCESS;
auto ptr = _usmHelper.allocate_host(nullptr, size, 0, &error);
auto ptr = _usmHelper.allocate_host(properties, size, 0, &error);
_check_error(size, ptr, error, "Host");
_allocate(ptr);
}

void allocateShared(size_t size) {
void allocateShared(size_t size, const cl_mem_properties_intel* properties = nullptr) {
cl_int error = CL_SUCCESS;
auto ptr = _usmHelper.allocate_shared(nullptr, size, 0, &error);
auto ptr = _usmHelper.allocate_shared(properties, size, 0, &error);
_check_error(size, ptr, error, "Shared");
_allocate(ptr);
}

void allocateDevice(size_t size) {
void allocateDevice(size_t size, const cl_mem_properties_intel* properties = nullptr) {
cl_int error = CL_SUCCESS;
auto ptr = _usmHelper.allocate_device(nullptr, size, 0, &error);
auto ptr = _usmHelper.allocate_device(properties, size, 0, &error);
_check_error(size, ptr, error, "Device");
_allocate(ptr);
}
Expand Down
18 changes: 13 additions & 5 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
//

#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/device.hpp"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[random spot]
Shouldn't we apply the same extended limit for OpenCL buffers?

cl_mem_flags flags = 0;
flags |= CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL;

cl_mem buffer = clCreateBuffer(context, flags, size, host_ptr, errcode_ret);

#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "ocl_memory.hpp"
#include "ocl_engine.hpp"
#include "ocl_stream.hpp"
#include "ocl_event.hpp"
#include "openvino/core/except.hpp"
#include <stdexcept>
#include <vector>

Expand Down Expand Up @@ -456,19 +458,25 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
, memory(engine, layout, type, nullptr)
, _buffer(engine->get_usm_helper())
, _host_buffer(engine->get_usm_helper()) {
std::vector<cl_mem_properties_intel> properties;
if (engine->get_device_info().vendor_id == INTEL_VENDOR_ID) {
properties = {CL_MEM_FLAGS, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
} else {
properties = {0};
}

switch (get_allocation_type()) {
case allocation_type::usm_host:
_buffer.allocateHost(_bytes_count);
_buffer.allocateHost(_bytes_count, &properties[0]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Accessing an empty vector at index 0 with operator[] (in the case where engine->get_device_info().vendor_id == INTEL_VENDOR_ID) would be undefined behavior.
Also, the spec says (regarding the properties argument): "The list is terminated with the special property 0." So the default should be either nullptr or a vector containing a single 0 value.

break;
case allocation_type::usm_shared:
_buffer.allocateShared(_bytes_count);
_buffer.allocateShared(_bytes_count, &properties[0]);
break;
case allocation_type::usm_device:
_buffer.allocateDevice(_bytes_count);
_buffer.allocateDevice(_bytes_count, &properties[0]);
break;
default:
CLDNN_ERROR_MESSAGE("gpu_usm allocation type",
"Unknown unified shared memory type!");
OPENVINO_THROW("[GPU] gpu_usm allocation type: unknown unified shared memory type!");
}

m_mem_tracker = std::make_shared<MemoryTracker>(engine, _buffer.get(), layout.bytes_count(), type);
Expand Down
16 changes: 16 additions & 0 deletions src/plugins/intel_gpu/tests/unit/module_tests/engine_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,19 @@ TEST(engine, memory_creation) {
ASSERT_FALSE(mem->is_allocated_by(engine));
ASSERT_EQ(std::dynamic_pointer_cast<simple_attached_memory>(mem)->lock(get_test_stream(), mem_lock_type::read), host_data.data());
}

// Verifies that an allocation larger than the classic 4GB cl_mem limit (8GB of u8 data)
// succeeds as a USM host allocation when the device's global memory can hold it.
TEST(engine, large_allocation) {
    auto& engine = get_test_engine();

    std::shared_ptr<memory> mem = nullptr;
    ov::Shape sz_8gb = {8, 1024, 1024, 1024};
    // u8 elements: element count == byte count, so shape_size() is the size in bytes.
    layout layout_to_allocate = {sz_8gb, data_types::u8, format::bfyx};

    // Gate on usm_host support, since that is the allocation type requested below
    // (the previous check used usm_device, which did not match the allocation).
    if (engine.supports_allocation(allocation_type::usm_host) && ov::shape_size(sz_8gb) < engine.get_device_info().max_global_mem_size) {
        OV_ASSERT_NO_THROW(mem = engine.allocate_memory(layout_to_allocate, allocation_type::usm_host));
        ASSERT_NE(mem, nullptr);
        ASSERT_EQ(mem->get_layout(), layout_to_allocate);
        ASSERT_NE(std::dynamic_pointer_cast<ocl::gpu_usm>(mem), nullptr);
        ASSERT_TRUE(mem->is_allocated_by(engine));
    }
}
55 changes: 55 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "intel_gpu/primitives/implementation_desc.hpp"
#include "intel_gpu/runtime/internal_properties.hpp"
#include "test_utils.h"
#include "random_generator.hpp"

Expand Down Expand Up @@ -2216,3 +2218,56 @@ INSTANTIATE_TEST_SUITE_P(reduce_scalar_output_f16_f16,
TestParamType_general_reduce_gpu(1, 1, 1, 1, 1024, 1, format::bfyx, reduce_mode::min, {3, 2, 1, 0}, "reduce_simple_to_scalar", false, data_types::f16, false, data_types::f16),
TestParamType_general_reduce_gpu(1, 1, 1, 1, 1025, 1, format::bfyx, reduce_mode::min, {3, 2, 1, 0}, "reduce_simple_to_scalar", false, data_types::f16, false, data_types::f16)
), general_reduce_gpu::PrintToStringParamName);

// End-to-end check that a reduce primitive can consume a single f32 buffer larger
// than the usual 4GB allocation limit (8GB input here), producing correct means.
TEST(reduce_f32_fw_gpu, large_buffer) {
auto engine = create_test_engine();

size_t s0 = 16384;
size_t s1 = 256 * 512;
ov::Shape sz_8gb = { 1, 1, s1, s0 }; // s1*s0 = 2^31 f32 elements -> *4 bytes = 8GB
// Peak usage in bytes: the full input tensor plus the s0-element reduced output.
size_t peak_mem_usage = (ov::shape_size(sz_8gb) + s0) * sizeof(float);
if (engine->get_device_info().max_global_mem_size < peak_mem_usage)
GTEST_SKIP();

layout in_l = { sz_8gb, data_types::f32, format::bfyx };

auto config = get_test_default_config(*engine);
// Force a concrete implementation (onednn on immad-capable devices, ocl otherwise)
// so the test exercises a known kernel path with the oversized buffer.
ov::intel_gpu::ImplementationDesc reduce_impl = {format::bfyx, "", impl_types::any};
if (engine->get_device_info().supports_immad) {
reduce_impl.impl_type = impl_types::onednn;
} else {
reduce_impl.impl_type = impl_types::ocl;
}
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"reduce", reduce_impl}}));
// Reduce (mean) over axis 2 (the s1 dimension), keeping dims.
topology topology(input_layout("input", in_l),
reduce("reduce", input_info("input"), reduce_mode::mean, {2}, true));
network network(*engine, topology, config);
auto input = network.get_output_memory("input");
{
// Fill so element i = row + col/s0 with i = row*s0 + col; the per-column mean
// over rows is then (s1-1)/2 + col/s0, within 1.0 of the expected mean below.
mem_lock<float, mem_lock_type::write> l(input, get_test_stream());
for (size_t i = 0; i < ov::shape_size(sz_8gb); i++) {
l[i] = static_cast<float>(i) / s0;
}
}

network.set_input_data("input", input);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "reduce");

auto output_memory = outputs.at("reduce").get_memory();
auto output_layout = output_memory->get_layout();
cldnn::mem_lock<float, mem_lock_type::read> output_ptr(output_memory, get_test_stream());

ASSERT_EQ(output_layout.format, format::bfyx);
ASSERT_EQ(output_layout.get_linear_size(), s0);

// ensure that single 8GB buffer is allocated (no extra staging copies)
ASSERT_EQ(engine->get_max_used_device_memory(), peak_mem_usage);

// Expected mean of row indices 0..s1-1: sum/s1 = (s1-1)/2.
size_t sum = s1 * (s1 - 1) / 2;
float mean = static_cast<float>(sum) / static_cast<float>(s1);
// 1.0 tolerance absorbs the col/s0 offset term and f32 accumulation error.
for (size_t i = 0; i < s0; i++) {
ASSERT_NEAR(mean, output_ptr[i], 1.0f);
}
}
Loading