Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Allow unlimited allocations #25955

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ std::shared_ptr<KernelString> KernelBaseOpenCL::GetKernelString(const std::strin
kernel_string->undefs = jit.second;
if (engine_info.vendor_id == cldnn::INTEL_VENDOR_ID) {
kernel_string->options = exe_mode + " -cl-mad-enable";
kernel_string->options += " -cl-intel-greater-than-4GB-buffer-required";
if (engine_info.bOptHintsSupport)
kernel_string->options += " -DOPT_HINTS_SUPPORTED=1";
if (engine_info.bLocalBlockIOSupport)
Expand Down
26 changes: 15 additions & 11 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//

#include "ocl_engine.hpp"
#include "intel_gpu/runtime/device.hpp"
#include "ocl_common.hpp"
#include "ocl_memory.hpp"
#include "ocl_stream.hpp"
Expand Down Expand Up @@ -128,19 +129,22 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const
bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) {
OPENVINO_ASSERT(supports_allocation(type) || type == allocation_type::cl_mem, "[GPU] Unsupported allocation type: ", type);

bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);
// For intel GPU we use special internal option to allow unlimited USM allocations
if (_device->get_info().vendor_id != INTEL_VENDOR_ID && memory_capabilities::is_usm_type(type)) {
bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);

// When dynamic shape upper bound makes bigger buffer, then return false.
if (exceed_allocatable_mem_size && layout.is_dynamic()) {
OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
return false;
}
// When dynamic shape upper bound makes bigger buffer, then return false.
if (exceed_allocatable_mem_size && layout.is_dynamic()) {
OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
return false;
}

OPENVINO_ASSERT(!exceed_allocatable_mem_size,
"[GPU] Exceeded max size of memory object allocation: ",
"requested ", layout.bytes_count(), " bytes, "
"but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
"Please try to reduce batch size or use lower precision.");
OPENVINO_ASSERT(!exceed_allocatable_mem_size,
"[GPU] Exceeded max size of memory object allocation: ",
"requested ", layout.bytes_count(), " bytes, "
"but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
"Please try to reduce batch size or use lower precision.");
}

auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size());
Expand Down
14 changes: 8 additions & 6 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)
} // namespace detail
} // namespace cl

#define CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL (1 << 23)

#include <memory>

namespace {
Expand Down Expand Up @@ -925,23 +927,23 @@ class UsmMemory {
// Get methods returns original pointer allocated by openCL.
void* get() const { return _usm_pointer->ptr(); }

void allocateHost(size_t size) {
void allocateHost(size_t size, const cl_mem_properties_intel* properties = nullptr) {
cl_int error = CL_SUCCESS;
auto ptr = _usmHelper.allocate_host(nullptr, size, 0, &error);
auto ptr = _usmHelper.allocate_host(properties, size, 0, &error);
_check_error(size, ptr, error, "Host");
_allocate(ptr);
}

void allocateShared(size_t size) {
void allocateShared(size_t size, const cl_mem_properties_intel* properties = nullptr) {
cl_int error = CL_SUCCESS;
auto ptr = _usmHelper.allocate_shared(nullptr, size, 0, &error);
auto ptr = _usmHelper.allocate_shared(properties, size, 0, &error);
_check_error(size, ptr, error, "Shared");
_allocate(ptr);
}

void allocateDevice(size_t size) {
void allocateDevice(size_t size, const cl_mem_properties_intel* properties = nullptr) {
cl_int error = CL_SUCCESS;
auto ptr = _usmHelper.allocate_device(nullptr, size, 0, &error);
auto ptr = _usmHelper.allocate_device(properties, size, 0, &error);
_check_error(size, ptr, error, "Device");
_allocate(ptr);
}
Expand Down
18 changes: 13 additions & 5 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
//

#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/device.hpp"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[random spot]
Shouldn't we apply the same extended limit for OpenCL buffers?

cl_mem_flags flags = 0;
flags |= CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL;

cl_mem buffer = clCreateBuffer(context, flags, size, host_ptr, errcode_ret);

#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "ocl_memory.hpp"
#include "ocl_engine.hpp"
#include "ocl_stream.hpp"
#include "ocl_event.hpp"
#include "openvino/core/except.hpp"
#include <stdexcept>
#include <vector>

Expand Down Expand Up @@ -456,19 +458,25 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
, memory(engine, layout, type, nullptr)
, _buffer(engine->get_usm_helper())
, _host_buffer(engine->get_usm_helper()) {
std::vector<cl_mem_properties_intel> properties;
if (engine->get_device_info().vendor_id == INTEL_VENDOR_ID) {
properties = {CL_MEM_FLAGS, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
} else {
properties = {0};
}

switch (get_allocation_type()) {
case allocation_type::usm_host:
_buffer.allocateHost(_bytes_count);
_buffer.allocateHost(_bytes_count, &properties[0]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Accessing an empty vector at index 0 with operator[] (in the case where engine->get_device_info().vendor_id == INTEL_VENDOR_ID) would be undefined behavior.
Also, the spec says (regarding the properties argument): "The list is terminated with the special property 0." So the default should be either nullptr or a vector containing a single 0 value.

break;
case allocation_type::usm_shared:
_buffer.allocateShared(_bytes_count);
_buffer.allocateShared(_bytes_count, &properties[0]);
break;
case allocation_type::usm_device:
_buffer.allocateDevice(_bytes_count);
_buffer.allocateDevice(_bytes_count, &properties[0]);
break;
default:
CLDNN_ERROR_MESSAGE("gpu_usm allocation type",
"Unknown unified shared memory type!");
OPENVINO_THROW("[GPU] gpu_usm allocation type: unknown unified shared memory type!");
}

m_mem_tracker = std::make_shared<MemoryTracker>(engine, _buffer.get(), layout.bytes_count(), type);
Expand Down
16 changes: 16 additions & 0 deletions src/plugins/intel_gpu/tests/unit/module_tests/engine_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,19 @@ TEST(engine, memory_creation) {
ASSERT_FALSE(mem->is_allocated_by(engine));
ASSERT_EQ(std::dynamic_pointer_cast<simple_attached_memory>(mem)->lock(get_test_stream(), mem_lock_type::read), host_data.data());
}

// Verifies that an allocation larger than the classic 4GB cl_mem limit (8GB of u8 data)
// succeeds as a USM host allocation when the device's global memory can hold it.
TEST(engine, large_allocation) {
    auto& engine = get_test_engine();

    std::shared_ptr<memory> mem = nullptr;
    ov::Shape sz_8gb = {8, 1024, 1024, 1024};
    // u8 elements: element count == byte count, so shape_size() is the size in bytes.
    layout layout_to_allocate = {sz_8gb, data_types::u8, format::bfyx};

    // Gate on usm_host support, since that is the allocation type requested below
    // (the previous check used usm_device, which did not match the allocation).
    if (engine.supports_allocation(allocation_type::usm_host) && ov::shape_size(sz_8gb) < engine.get_device_info().max_global_mem_size) {
        OV_ASSERT_NO_THROW(mem = engine.allocate_memory(layout_to_allocate, allocation_type::usm_host));
        ASSERT_NE(mem, nullptr);
        ASSERT_EQ(mem->get_layout(), layout_to_allocate);
        ASSERT_NE(std::dynamic_pointer_cast<ocl::gpu_usm>(mem), nullptr);
        ASSERT_TRUE(mem->is_allocated_by(engine));
    }
}
55 changes: 55 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "intel_gpu/primitives/implementation_desc.hpp"
#include "intel_gpu/runtime/internal_properties.hpp"
#include "test_utils.h"
#include "random_generator.hpp"

Expand Down Expand Up @@ -2216,3 +2218,56 @@ INSTANTIATE_TEST_SUITE_P(reduce_scalar_output_f16_f16,
TestParamType_general_reduce_gpu(1, 1, 1, 1, 1024, 1, format::bfyx, reduce_mode::min, {3, 2, 1, 0}, "reduce_simple_to_scalar", false, data_types::f16, false, data_types::f16),
TestParamType_general_reduce_gpu(1, 1, 1, 1, 1025, 1, format::bfyx, reduce_mode::min, {3, 2, 1, 0}, "reduce_simple_to_scalar", false, data_types::f16, false, data_types::f16)
), general_reduce_gpu::PrintToStringParamName);

// End-to-end check that a reduce primitive can consume a single f32 buffer larger
// than the usual 4GB allocation limit (8GB input here), producing correct means.
TEST(reduce_f32_fw_gpu, large_buffer) {
auto engine = create_test_engine();

size_t s0 = 16384;
size_t s1 = 256 * 512;
ov::Shape sz_8gb = { 1, 1, s1, s0 }; // s1*s0 = 2^31 f32 elements -> *4 bytes = 8GB
// Peak usage in bytes: the full input tensor plus the s0-element reduced output.
size_t peak_mem_usage = (ov::shape_size(sz_8gb) + s0) * sizeof(float);
if (engine->get_device_info().max_global_mem_size < peak_mem_usage)
GTEST_SKIP();

layout in_l = { sz_8gb, data_types::f32, format::bfyx };

auto config = get_test_default_config(*engine);
// Force a concrete implementation (onednn on immad-capable devices, ocl otherwise)
// so the test exercises a known kernel path with the oversized buffer.
ov::intel_gpu::ImplementationDesc reduce_impl = {format::bfyx, "", impl_types::any};
if (engine->get_device_info().supports_immad) {
reduce_impl.impl_type = impl_types::onednn;
} else {
reduce_impl.impl_type = impl_types::ocl;
}
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"reduce", reduce_impl}}));
// Reduce (mean) over axis 2 (the s1 dimension), keeping dims.
topology topology(input_layout("input", in_l),
reduce("reduce", input_info("input"), reduce_mode::mean, {2}, true));
network network(*engine, topology, config);
auto input = network.get_output_memory("input");
{
// Fill so element i = row + col/s0 with i = row*s0 + col; the per-column mean
// over rows is then (s1-1)/2 + col/s0, within 1.0 of the expected mean below.
mem_lock<float, mem_lock_type::write> l(input, get_test_stream());
for (size_t i = 0; i < ov::shape_size(sz_8gb); i++) {
l[i] = static_cast<float>(i) / s0;
}
}

network.set_input_data("input", input);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "reduce");

auto output_memory = outputs.at("reduce").get_memory();
auto output_layout = output_memory->get_layout();
cldnn::mem_lock<float, mem_lock_type::read> output_ptr(output_memory, get_test_stream());

ASSERT_EQ(output_layout.format, format::bfyx);
ASSERT_EQ(output_layout.get_linear_size(), s0);

// ensure that single 8GB buffer is allocated (no extra staging copies)
ASSERT_EQ(engine->get_max_used_device_memory(), peak_mem_usage);

// Expected mean of row indices 0..s1-1: sum/s1 = (s1-1)/2.
size_t sum = s1 * (s1 - 1) / 2;
float mean = static_cast<float>(sum) / static_cast<float>(s1);
// 1.0 tolerance absorbs the col/s0 offset term and f32 accumulation error.
for (size_t i = 0; i < s0; i++) {
ASSERT_NEAR(mean, output_ptr[i], 1.0f);
}
}
Loading