Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU][WIP] Initial L0 runtime support #26874

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/android_arm64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ jobs:
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/json
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gtest
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gflags
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero
popd

- name: Clone vcpkg
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/linux_riscv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ jobs:
git submodule update --init -- ${OPENVINO_REPO}/src/plugins/intel_cpu
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/open_model_zoo
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/flatbuffers/flatbuffers
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero
popd

#
Expand Down
9 changes: 9 additions & 0 deletions cmake/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ else()
set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON)
endif()

set(OV_GPU_DEFAULT_RT "L0")
if (ENABLE_INTEL_GPU)
ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL)
if (GPU_RT_TYPE STREQUAL "L0")
# There's no interop with native L0 in the onednn API. Temporarily disable onednn when the L0 runtime is selected
set(ENABLE_ONEDNN_FOR_GPU_DEFAULT OFF)
endif()
endif()

ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF)

ov_dependent_option (ENABLE_INTEL_NPU "NPU plugin for OpenVINO runtime" ON "X86_64;WIN32 OR LINUX" OFF)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ using gpu_handle_param = void*;
enum class ContextType {
OCL = 0, //!< Pure OpenCL context
VA_SHARED = 1, //!< Context shared with a video decoding device
ZE = 2, //!< Pure Level0 context
};

/** @cond INTERNAL */
Expand All @@ -33,6 +34,8 @@ inline std::ostream& operator<<(std::ostream& os, const ContextType& context_typ
return os << "OCL";
case ContextType::VA_SHARED:
return os << "VA_SHARED";
case ContextType::ZE:
return os << "ZE";
default:
OPENVINO_THROW("Unsupported context type");
}
Expand All @@ -43,6 +46,8 @@ inline std::istream& operator>>(std::istream& is, ContextType& context_type) {
is >> str;
if (str == "OCL") {
context_type = ContextType::OCL;
} else if (str == "ZE") {
context_type = ContextType::ZE;
} else if (str == "VA_SHARED") {
context_type = ContextType::VA_SHARED;
} else {
Expand Down
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ endif()

set (TARGET_NAME "openvino_intel_gpu_plugin")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

if(OV_COMPILER_IS_INTEL_LLVM)
# For windows we need to disable warning as error option to make FindSYCL.cmake work
if (WIN32)
Expand Down Expand Up @@ -88,6 +90,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/include/)

ov_set_threading_interface_for(${TARGET_NAME})
ov_gpu_set_runtime_interface_for(${TARGET_NAME})

set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})

Expand Down
15 changes: 15 additions & 0 deletions src/plugins/intel_gpu/cmake/utils.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Configures TARGET_NAME for the GPU runtime backend selected at configure time.
# Relies on the GPU_RT_TYPE cache option (presumably set via ov_option_enum in
# cmake/features.cmake - confirm) being either "L0" or "OCL".
# Definitions and link libraries are PUBLIC so they propagate to dependents.
function(ov_gpu_set_runtime_interface_for TARGET_NAME)
    if(GPU_RT_TYPE STREQUAL "L0")
        # Level Zero runtime: code checks OV_GPU_WITH_ZE_RT; link the L0 loader
        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1)
        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
    elseif(GPU_RT_TYPE STREQUAL "OCL")
        # OpenCL runtime: code checks OV_GPU_WITH_OCL_RT; link OpenCL ICD + new headers
        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
        target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL)
    else()
        # Should be unreachable when GPU_RT_TYPE comes from ov_option_enum validation
        message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported")
    endif()
endfunction()
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# define NOMINMAX
#endif

#include "intel_gpu/runtime/engine_configuration.hpp"
#include "openvino/runtime/intel_gpu/remote_properties.hpp"
#include "openvino/runtime/iremote_context.hpp"

Expand All @@ -19,11 +20,30 @@
#include <string>
#include <map>
#include <memory>
#include <atomic>

namespace ov {
namespace intel_gpu {

// Returns the {engine_type, runtime_type} pair matching the compile-time GPU
// runtime configuration (OV_GPU_WITH_ZE_RT / OV_GPU_WITH_SYCL macros), used
// when querying devices.
inline std::pair<cldnn::engine_types, cldnn::runtime_types> get_device_query_params() {
#ifdef OV_GPU_WITH_ZE_RT
    const auto runtime_type = cldnn::runtime_types::ze;
#else
    const auto runtime_type = cldnn::runtime_types::ocl;
#endif

// SYCL engine takes precedence when enabled; otherwise the engine matches the runtime.
#if defined(OV_GPU_WITH_SYCL)
    const auto engine_type = cldnn::engine_types::sycl;
#elif defined(OV_GPU_WITH_ZE_RT)
    const auto engine_type = cldnn::engine_types::ze;
#else
    const auto engine_type = cldnn::engine_types::ocl;
#endif

    return {engine_type, runtime_type};
}

class RemoteContextImpl : public ov::IRemoteContext {
public:
using Ptr = std::shared_ptr<RemoteContextImpl>;
Expand Down Expand Up @@ -59,7 +79,11 @@ class RemoteContextImpl : public ov::IRemoteContext {
ov::intel_gpu::gpu_handle_param m_va_display = nullptr;
ov::intel_gpu::gpu_handle_param m_external_queue = nullptr;

#ifdef OV_GPU_WITH_ZE_RT
ContextType m_type = ContextType::ZE;
#else
ContextType m_type = ContextType::OCL;
#endif
std::string m_device_name = "";
static const size_t cache_capacity = 100;
cldnn::LruCache<size_t, cldnn::memory::ptr> m_memory_cache = cldnn::LruCache<size_t, cldnn::memory::ptr>(cache_capacity);
Expand Down
24 changes: 24 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,28 @@ struct device {
virtual ~device() = default;
};

// The priority returned by this function impacts the order of devices reported by the GPU plugin and device enumeration
// A lower priority value means a lower device ID
// Current behavior is: Intel iGPU < Intel dGPU < any other GPU
// The order of Intel dGPUs is undefined and depends on the OCL impl
// The order of other vendors' GPUs is undefined and depends on the OCL impl
// Maps a device to its enumeration priority; a lower value means a lower device ID.
// Ordering: Intel iGPU (0) < Intel dGPU (1) < any other vendor's GPU (max).
inline size_t get_device_priority(const cldnn::device_info& info) {
    // Non-Intel devices are always enumerated last.
    if (info.vendor_id != cldnn::INTEL_VENDOR_ID) {
        return std::numeric_limits<size_t>::max();
    }
    // Intel: integrated GPUs come before discrete ones.
    return (info.dev_type == cldnn::device_type::integrated_gpu) ? 0 : 1;
}

inline std::vector<device::ptr> sort_devices(const std::vector<device::ptr>& devices_list) {
std::vector<device::ptr> sorted_list = devices_list;
std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1, device::ptr d2) {
return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info());
});

return sorted_list;
}

} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ struct device_info {
bool supports_imad; ///< Does engine support int8 mad.
bool supports_immad; ///< Does engine support int8 multi mad.

bool supports_mutable_command_list; ///< Does the target runtime/device support mutable command list feature

bool supports_usm; ///< Does engine support unified shared memory.
bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host

Expand All @@ -96,6 +98,9 @@ struct device_info {
uint32_t num_threads_per_eu; ///< Number of hardware threads per execution unit
uint32_t num_ccs; ///< Number of compute command streamers

uint64_t timer_resolution; ///< Resolution of device timer used for profiling in cycles/sec
uint32_t kernel_timestamp_valid_bits; ///< Number of valid bits in the kernel timestamp values

ov::device::UUID uuid; ///< UUID of the gpu device
ov::device::LUID luid; ///< LUID of the gpu device
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,22 @@

#pragma once

#include <string>
#include <ostream>

namespace cldnn {

/// @brief Defines available engine types
enum class engine_types : int32_t {
ocl,
sycl
sycl,
ze
};

inline std::ostream& operator<<(std::ostream& os, const engine_types& type) {
switch (type) {
case engine_types::ocl: os << "ocl"; break;
case engine_types::sycl: os << "sycl"; break;
case engine_types::ze: os << "ze"; break;
default: os << "unknown"; break;
}

Expand All @@ -27,11 +29,13 @@ inline std::ostream& operator<<(std::ostream& os, const engine_types& type) {
/// @brief Defines available runtime types
enum class runtime_types : int32_t {
ocl,
ze,
};

inline std::ostream& operator<<(std::ostream& os, const runtime_types& type) {
switch (type) {
case runtime_types::ocl: os << "ocl"; break;
case runtime_types::ze: os << "ze"; break;
default: os << "unknown"; break;
}

Expand Down
7 changes: 3 additions & 4 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

#pragma once

#include "kernel_args.hpp"
#include "event.hpp"

#include <memory>
#include <vector>

Expand All @@ -19,7 +16,9 @@ class kernel {
using ptr = std::shared_ptr<kernel>;
virtual std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const = 0;
virtual ~kernel() = default;
virtual std::string get_id() const { return ""; }

virtual std::string get_id() const = 0;
virtual std::vector<uint8_t> get_binary() const = 0;
};

} // namespace cldnn
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ if(COMMAND add_cpplint_target)
endif()

ov_set_threading_interface_for(${TARGET_NAME})
ov_gpu_set_runtime_interface_for(${TARGET_NAME})

set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "fully_connected_inst.h"
#include "assign_inst.h"
#include "mvn_inst.h"
#include "reorder_inst.h"

#include <algorithm>
#include <memory>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "pass_manager.h"
#include "program_helpers.h"
#include "reshape_inst.h"
#include "reorder_inst.h"
#include <vector>
#include <memory>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include "shape_of_inst.h"
#include "broadcast_inst.h"
#include "non_zero_inst.h"
#include "non_max_suppression_inst.h"
#include "reorder_inst.h"
#include "unique_inst.hpp"
#include "program_helpers.h"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "convolution_inst.h"
#include "deconvolution_inst.h"
#include "fully_connected_inst.h"
#include "reorder_inst.h"
#include "intel_gpu/runtime/format.hpp"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "graph/impls/onednn/utils.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "crop_inst.h"
#include "eltwise_inst.h"
#include "gemm_inst.h"
#include "reorder_inst.h"
#include "read_value_inst.h"
#include "reshape_inst.h"
#include "permute_inst.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "activation_inst.h"
#include "batch_to_space_inst.h"
#include "crop_inst.h"
#include "reorder_inst.h"
#include "eltwise_inst.h"
#include "gemm_inst.h"
#include "lrn_inst.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "data_inst.h"
#include "eltwise_inst.h"
#include "mutable_data_inst.h"
#include "reorder_inst.h"
#include <vector>
#include <memory>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "region_yolo_inst.h"
#include "fully_connected_inst.h"
#include "mvn_inst.h"
#include "reorder_inst.h"

#include <vector>
#include <list>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
this->can_share_kernels = kernels_cache.get_kernels_reuse();
}

void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0]));
void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0], e));
this->can_share_kernels = kernels_cache.get_kernels_reuse();
}

Expand Down
15 changes: 5 additions & 10 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,11 +452,11 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co
}
}

kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const {
kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const engine& e) const {
auto res = _cached_kernels.find(id);
OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!");

return res->second->clone(_reuse_kernels);
return e.prepare_kernel(res->second->clone(_reuse_kernels));
}

std::vector<kernel::ptr> kernels_cache::get_kernels(const kernel_impl_params& params) const {
Expand Down Expand Up @@ -599,15 +599,12 @@ void kernels_cache::add_kernels_source(const kernel_impl_params& params,
}

std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const {
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
cl::vector<unsigned char> program_binaries = getProgramBinaries(program);
auto program_binaries = kernel->get_binary();

auto iter = _cached_binaries.find(program_binaries);
OPENVINO_ASSERT(iter != _cached_binaries.end(), "[GPU] Not found cached kernel binaries");

return entry_point + "@" + std::to_string(iter->second);
return kernel->get_id() + "@" + std::to_string(iter->second);
}

std::vector<std::string> kernels_cache::get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const {
Expand All @@ -625,9 +622,7 @@ void kernels_cache::add_to_cached_kernels(const std::vector<kernel::ptr>& kernel
static std::atomic<uint32_t> id_gen{0};

for (auto& kernel : kernels) {
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
cl::vector<unsigned char> program_binaries = getProgramBinaries(program);
auto program_binaries = kernel->get_binary();

std::lock_guard<std::mutex> lock(_mutex);
auto iter = _cached_binaries.find(program_binaries);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class kernels_cache {
uint32_t prog_id,
std::shared_ptr<ov::threading::ITaskExecutor> task_executor = nullptr,
const std::map<std::string, std::string>& batch_headers = {});
kernel::ptr get_kernel_from_cached_kernels(std::string id) const;
kernel::ptr get_kernel_from_cached_kernels(std::string id, const engine& e) const;
std::vector<kernel::ptr> get_kernels(const kernel_impl_params& params) const;

void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; }
Expand Down
Loading
Loading