Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU][WIP] Initial L0 runtime support #26874

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/android_arm64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ jobs:
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/json
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gtest
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gflags
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero
popd

- name: Clone vcpkg
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/linux_riscv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ jobs:
git submodule update --init -- ${OPENVINO_REPO}/src/plugins/intel_cpu
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/open_model_zoo
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/flatbuffers/flatbuffers
git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero
popd

#
Expand Down
9 changes: 9 additions & 0 deletions cmake/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ else()
set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON)
endif()

set(OV_GPU_DEFAULT_RT "L0")
if (ENABLE_INTEL_GPU)
ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL)
if (GPU_RT_TYPE STREQUAL "L0")
# There's no interop with native L0 in the onednn API. Temporarily disable onednn when the L0 runtime is selected
set(ENABLE_ONEDNN_FOR_GPU_DEFAULT OFF)
endif()
endif()

ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF)

ov_dependent_option (ENABLE_INTEL_NPU "NPU plugin for OpenVINO runtime" ON "X86_64;WIN32 OR LINUX" OFF)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ using gpu_handle_param = void*;
enum class ContextType {
OCL = 0, //!< Pure OpenCL context
VA_SHARED = 1, //!< Context shared with a video decoding device
ZE = 2, //!< Pure Level0 context
};

/** @cond INTERNAL */
Expand All @@ -33,6 +34,8 @@ inline std::ostream& operator<<(std::ostream& os, const ContextType& context_typ
return os << "OCL";
case ContextType::VA_SHARED:
return os << "VA_SHARED";
case ContextType::ZE:
return os << "ZE";
default:
OPENVINO_THROW("Unsupported context type");
}
Expand All @@ -43,6 +46,8 @@ inline std::istream& operator>>(std::istream& is, ContextType& context_type) {
is >> str;
if (str == "OCL") {
context_type = ContextType::OCL;
} else if (str == "ZE") {
context_type = ContextType::ZE;
} else if (str == "VA_SHARED") {
context_type = ContextType::VA_SHARED;
} else {
Expand Down
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ endif()

set (TARGET_NAME "openvino_intel_gpu_plugin")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

if(OV_COMPILER_IS_INTEL_LLVM)
# For windows we need to disable warning as error option to make FindSYCL.cmake work
if (WIN32)
Expand Down Expand Up @@ -88,6 +90,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/include/)

ov_set_threading_interface_for(${TARGET_NAME})
ov_gpu_set_runtime_interface_for(${TARGET_NAME})

set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})

Expand Down
15 changes: 15 additions & 0 deletions src/plugins/intel_gpu/cmake/utils.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Configures TARGET_NAME for the GPU runtime backend selected at configure time.
# Relies on the GPU_RT_TYPE cache option (presumably set via ov_option_enum in
# cmake/features.cmake - confirm) being either "L0" or "OCL".
# Definitions and link libraries are PUBLIC so they propagate to dependents.
function(ov_gpu_set_runtime_interface_for TARGET_NAME)
    if(GPU_RT_TYPE STREQUAL "L0")
        # Level Zero runtime: code checks OV_GPU_WITH_ZE_RT; link the L0 loader
        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1)
        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
    elseif(GPU_RT_TYPE STREQUAL "OCL")
        # OpenCL runtime: code checks OV_GPU_WITH_OCL_RT; link OpenCL ICD + new headers
        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
        target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL)
    else()
        # Should be unreachable when GPU_RT_TYPE comes from ov_option_enum validation
        message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported")
    endif()
endfunction()
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# define NOMINMAX
#endif

#include "intel_gpu/runtime/engine_configuration.hpp"
#include "openvino/runtime/intel_gpu/remote_properties.hpp"
#include "openvino/runtime/iremote_context.hpp"

Expand All @@ -19,11 +20,30 @@
#include <string>
#include <map>
#include <memory>
#include <atomic>

namespace ov {
namespace intel_gpu {

// Returns the {engine_type, runtime_type} pair matching the compile-time GPU
// runtime configuration (OV_GPU_WITH_ZE_RT / OV_GPU_WITH_SYCL macros), used
// when querying devices.
inline std::pair<cldnn::engine_types, cldnn::runtime_types> get_device_query_params() {
#ifdef OV_GPU_WITH_ZE_RT
    const auto runtime_type = cldnn::runtime_types::ze;
#else
    const auto runtime_type = cldnn::runtime_types::ocl;
#endif

// SYCL engine takes precedence when enabled; otherwise the engine matches the runtime.
#if defined(OV_GPU_WITH_SYCL)
    const auto engine_type = cldnn::engine_types::sycl;
#elif defined(OV_GPU_WITH_ZE_RT)
    const auto engine_type = cldnn::engine_types::ze;
#else
    const auto engine_type = cldnn::engine_types::ocl;
#endif

    return {engine_type, runtime_type};
}

class RemoteContextImpl : public ov::IRemoteContext {
public:
using Ptr = std::shared_ptr<RemoteContextImpl>;
Expand Down Expand Up @@ -59,7 +79,11 @@ class RemoteContextImpl : public ov::IRemoteContext {
ov::intel_gpu::gpu_handle_param m_va_display = nullptr;
ov::intel_gpu::gpu_handle_param m_external_queue = nullptr;

#ifdef OV_GPU_WITH_ZE_RT
ContextType m_type = ContextType::ZE;
#else
ContextType m_type = ContextType::OCL;
#endif
std::string m_device_name = "";
static const size_t cache_capacity = 100;
cldnn::LruCache<size_t, cldnn::memory::ptr> m_memory_cache = cldnn::LruCache<size_t, cldnn::memory::ptr>(cache_capacity);
Expand Down
24 changes: 24 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,28 @@ struct device {
virtual ~device() = default;
};

// The priority returned by this function impacts the order of devices reported by the GPU plugin and device enumeration
// A lower priority value means a lower device ID
// Current behavior is: Intel iGPU < Intel dGPU < any other GPU
// The order of Intel dGPUs is undefined and depends on the OCL impl
// The order of other vendors' GPUs is undefined and depends on the OCL impl
// Maps a device to its enumeration priority; a lower value means a lower device ID.
// Ordering: Intel iGPU (0) < Intel dGPU (1) < any other vendor's GPU (max).
inline size_t get_device_priority(const cldnn::device_info& info) {
    // Non-Intel devices are always enumerated last.
    if (info.vendor_id != cldnn::INTEL_VENDOR_ID) {
        return std::numeric_limits<size_t>::max();
    }
    // Intel: integrated GPUs come before discrete ones.
    return (info.dev_type == cldnn::device_type::integrated_gpu) ? 0 : 1;
}

inline std::vector<device::ptr> sort_devices(const std::vector<device::ptr>& devices_list) {
std::vector<device::ptr> sorted_list = devices_list;
std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1, device::ptr d2) {
return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info());
});

return sorted_list;
}

} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ struct device_info {
bool supports_imad; ///< Does engine support int8 mad.
bool supports_immad; ///< Does engine support int8 multi mad.

bool supports_mutable_command_list; ///< Does the target runtime/device support mutable command list feature

bool supports_usm; ///< Does engine support unified shared memory.
bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host

Expand All @@ -96,6 +98,9 @@ struct device_info {
uint32_t num_threads_per_eu; ///< Number of hardware threads per execution unit
uint32_t num_ccs; ///< Number of compute command streamers

uint64_t timer_resolution; ///< Resolution of device timer used for profiling in cycles/sec
uint32_t kernel_timestamp_valid_bits; ///< Number of valid bits in the kernel timestamp values

ov::device::UUID uuid; ///< UUID of the gpu device
ov::device::LUID luid; ///< LUID of the gpu device
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,22 @@

#pragma once

#include <string>
#include <ostream>

namespace cldnn {

/// @brief Defines available engine types
enum class engine_types : int32_t {
ocl,
sycl
sycl,
ze
};

inline std::ostream& operator<<(std::ostream& os, const engine_types& type) {
switch (type) {
case engine_types::ocl: os << "ocl"; break;
case engine_types::sycl: os << "sycl"; break;
case engine_types::ze: os << "ze"; break;
default: os << "unknown"; break;
}

Expand All @@ -27,11 +29,13 @@ inline std::ostream& operator<<(std::ostream& os, const engine_types& type) {
/// @brief Defines available runtime types
enum class runtime_types : int32_t {
ocl,
ze,
};

inline std::ostream& operator<<(std::ostream& os, const runtime_types& type) {
switch (type) {
case runtime_types::ocl: os << "ocl"; break;
case runtime_types::ze: os << "ze"; break;
default: os << "unknown"; break;
}

Expand Down
7 changes: 3 additions & 4 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

#pragma once

#include "kernel_args.hpp"
#include "event.hpp"

#include <memory>
#include <vector>

Expand All @@ -19,7 +16,9 @@ class kernel {
using ptr = std::shared_ptr<kernel>;
virtual std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const = 0;
virtual ~kernel() = default;
virtual std::string get_id() const { return ""; }

virtual std::string get_id() const = 0;
virtual std::vector<uint8_t> get_binary() const = 0;
};

} // namespace cldnn
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ if(COMMAND add_cpplint_target)
endif()

ov_set_threading_interface_for(${TARGET_NAME})
ov_gpu_set_runtime_interface_for(${TARGET_NAME})

set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "fully_connected_inst.h"
#include "assign_inst.h"
#include "mvn_inst.h"
#include "reorder_inst.h"

#include <algorithm>
#include <memory>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "pass_manager.h"
#include "program_helpers.h"
#include "reshape_inst.h"
#include "reorder_inst.h"
#include <vector>
#include <memory>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include "shape_of_inst.h"
#include "broadcast_inst.h"
#include "non_zero_inst.h"
#include "non_max_suppression_inst.h"
#include "reorder_inst.h"
#include "unique_inst.hpp"
#include "program_helpers.h"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "convolution_inst.h"
#include "deconvolution_inst.h"
#include "fully_connected_inst.h"
#include "reorder_inst.h"
#include "intel_gpu/runtime/format.hpp"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "graph/impls/onednn/utils.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "crop_inst.h"
#include "eltwise_inst.h"
#include "gemm_inst.h"
#include "reorder_inst.h"
#include "read_value_inst.h"
#include "reshape_inst.h"
#include "permute_inst.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "activation_inst.h"
#include "batch_to_space_inst.h"
#include "crop_inst.h"
#include "reorder_inst.h"
#include "eltwise_inst.h"
#include "gemm_inst.h"
#include "lrn_inst.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "data_inst.h"
#include "eltwise_inst.h"
#include "mutable_data_inst.h"
#include "reorder_inst.h"
#include <vector>
#include <memory>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "region_yolo_inst.h"
#include "fully_connected_inst.h"
#include "mvn_inst.h"
#include "reorder_inst.h"

#include <vector>
#include <list>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
this->can_share_kernels = kernels_cache.get_kernels_reuse();
}

void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0]));
void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0], e));
this->can_share_kernels = kernels_cache.get_kernels_reuse();
}

Expand Down
15 changes: 5 additions & 10 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,11 +452,11 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co
}
}

kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const {
kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const engine& e) const {
auto res = _cached_kernels.find(id);
OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!");

return res->second->clone(_reuse_kernels);
return e.prepare_kernel(res->second->clone(_reuse_kernels));
}

std::vector<kernel::ptr> kernels_cache::get_kernels(const kernel_impl_params& params) const {
Expand Down Expand Up @@ -599,15 +599,12 @@ void kernels_cache::add_kernels_source(const kernel_impl_params& params,
}

std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const {
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
cl::vector<unsigned char> program_binaries = getProgramBinaries(program);
auto program_binaries = kernel->get_binary();

auto iter = _cached_binaries.find(program_binaries);
OPENVINO_ASSERT(iter != _cached_binaries.end(), "[GPU] Not found cached kernel binaries");

return entry_point + "@" + std::to_string(iter->second);
return kernel->get_id() + "@" + std::to_string(iter->second);
}

std::vector<std::string> kernels_cache::get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const {
Expand All @@ -625,9 +622,7 @@ void kernels_cache::add_to_cached_kernels(const std::vector<kernel::ptr>& kernel
static std::atomic<uint32_t> id_gen{0};

for (auto& kernel : kernels) {
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
cl::vector<unsigned char> program_binaries = getProgramBinaries(program);
auto program_binaries = kernel->get_binary();

std::lock_guard<std::mutex> lock(_mutex);
auto iter = _cached_binaries.find(program_binaries);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class kernels_cache {
uint32_t prog_id,
std::shared_ptr<ov::threading::ITaskExecutor> task_executor = nullptr,
const std::map<std::string, std::string>& batch_headers = {});
kernel::ptr get_kernel_from_cached_kernels(std::string id) const;
kernel::ptr get_kernel_from_cached_kernels(std::string id, const engine& e) const;
std::vector<kernel::ptr> get_kernels(const kernel_impl_params& params) const;

void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; }
Expand Down
Loading
Loading