Skip to content

Commit

Permalink
[CPU][Refactoring] Introduce VariableExecutor
Browse files Browse the repository at this point in the history
Depending on the parameters a FullyConnected node can
use one or multiple executors.
With the current approach, even when just a single executor
is used, every prepareParams() (executor::update())
call goes through executor selection routine.

The idea is to avoid such overhead for single-executor scenarios,
which are probably the most common ones.

Thus, split the pipeline into two branches:
- only single simple executor is used and updated
- a VariableExecutor is used and updated. VariableExecutor contains
  two or more simple executors
  • Loading branch information
EgorDuplensky committed Dec 3, 2024
1 parent 963b1be commit caa6e7a
Show file tree
Hide file tree
Showing 7 changed files with 233 additions and 175 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
#include <oneapi/dnnl/dnnl.hpp>

#include "cpu_memory.h"
#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp"
#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp"
#include "nodes/executors/dnnl/dnnl_aliases.hpp"
#include "nodes/executors/dnnl/dnnl_utils.hpp"
#include "nodes/executors/executor.hpp"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "nodes/executors/memory_arguments.hpp"
#include "post_ops.hpp"

namespace ov {
namespace intel_cpu {
Expand Down
194 changes: 36 additions & 158 deletions src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp
Original file line number Diff line number Diff line change
@@ -1,55 +1,27 @@
// Copyright (C) 2018-2022 Intel Corporation
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "executor.hpp"
#include "nodes/executors/implementations.hpp"
#include "nodes/executors/executor_config.hpp"
#include "nodes/executors/executor_implementation.hpp"
#include "nodes/executors/graph_emitter.hpp"
#include "nodes/executors/implementations.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "nodes/executors/printers.hpp"
#include "openvino/core/except.hpp"
#include "nodes/executors/variable_executor.hpp"
#include "post_ops.hpp"

namespace ov {
namespace intel_cpu {
using namespace executor;

// Fallback path: when the selected implementation cannot handle the given
// config, emit a reference subgraph executor built from the fallback config.
// @param config          the original (unsupported) executor configuration
// @param fallbackConfig  the adjusted configuration the graph is built from
// @param memory          memory arguments used to construct the graph
// @param context         executor context (engine, scratchpad, caches)
// @param name            node name, used for debug logging only
// @note Currently only verifies that a graph can be emitted and then throws,
//       since the actual fallback logic is not implemented yet.
template <typename Attrs, typename NodeT>
static ExecutorPtr fallback(const executor::Config<Attrs>& config,
const executor::Config<Attrs>& fallbackConfig,
const MemoryArgs& memory,
const ExecutorContext::CPtr context,
const std::string& name) {
DEBUG_LOG("Falling back to graph executor for ",
name,
". Original config: ",
config,
" new config:",
fallbackConfig);

GraphEmitter<Attrs> graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name);

// build the graph and validate it matches the fallback configuration;
// the result is intentionally unused until fallback is fully implemented
const auto& graphExecutor =
graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context)
.ensureAttrsMatch()
.ensureSrcDescsMatch()
.ensureDstDescsMatch()
.ensurePostOpsMatch()
.emit();
(void)graphExecutor;

OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor;
}

template <typename Attrs, typename NodeT>
template <typename Attrs>
class ExecutorFactory {
public:
using ExecutorImplementationRef = std::reference_wrapper<const ExecutorImplementation<Attrs>>;
Expand Down Expand Up @@ -95,104 +67,41 @@ class ExecutorFactory {
}

/**
* @brief Preconfigures an executor based on the provided memory arguments.
*
* Preconfigures an executor by selecting an appropriate implementation based on the provided
* memory arguments and by creating an executor using the implementation.
*
* @param memory The memory parameters used for selecting the appropriate executor implementation.
*
* @note The main use case is to offload executor data preparation (i.e. weights packing)
* From the make() call
* @todo Currently supports creating a single executor.
* For some nodes it can be worth to preconfigure all the executors.
*/
void preconfigure(const MemoryArgs& memory) {
executor::Config<Attrs> config{memoryDescsFromMemory(memory), m_attrs, m_postOps};

// remember which implementations require a fallback, so later calls
// do not have to re-evaluate requiresFallback() for every creation
cacheFallbackStatus(config);

const size_t implId = select(memory, 0);
const auto& impl = m_suitableImplementations[implId].get();
DEBUG_LOG("Preconfiguring executor: ", impl.name());

if (m_implementationRequiresFallback[implId]) {
if (auto fallbackConfig = impl.requiresFallback(config)) {
fallback<Attrs, NodeT>(config, *fallbackConfig, memory, m_context, impl.name());
}
}

// create (and cache) the executor now to offload expensive data
// preparation (i.e. weights packing) from the make() call
(void)create(implId, memory, m_context);
}

/**
* @brief Creates an Executor instance based on provided memory arguments.
*
* Creates an Executor instance using the provided MemoryArgs, selecting an appropriate implementation
* based on the characteristics of the memory. It handles fallback scenarios if necessary and updates the executor
* with the given memory information.
* @brief Creates an Executor instance based on the provided \memory arguments.
* Depending on the number of available implementations, returns:
* - a VariableExecutor, if the number of implementations is two or more
* - a simple Executor, if only one implementation is available
*
* @param memory memory arguments.
*
* @return A shared pointer to the created Executor.
*
* The function follows the steps below:
* - Selects an implementation based on the provided memory using the select() function.
* - Retrieves the selected implementation and checks if fallback is required.
* - If fallback is required, it creates a fallback configuration and returns a fallback executor.
* - Otherwise creates the executor using the selected implementation.
* - Updates the executor with the given memory information.
*
*/
ExecutorPtr make(MemoryArgs& memory) {
auto createExec = [this](MemoryArgs& memory, size_t implId) -> ExecutorPtr {
const auto& impl = m_suitableImplementations[implId].get();
if (m_implementationRequiresFallback[implId]) {
executor::Config<Attrs> config{memoryDescsFromMemory(memory), m_attrs, m_postOps};
if (auto fallbackConfig = impl.requiresFallback(config)) {
return fallback<Attrs, NodeT>(config, *fallbackConfig, memory, m_context, impl.name());
}
}
const auto executor = create(implId, memory, m_context);
if (!executor->update(memory)) {
return nullptr;
ExecutorPtr make(const MemoryArgs& memory) {
// only a single executor implementation is available; check whether it requires a fallback
if (m_suitableImplementations.size() == 1) {
auto config = GraphEmitter<Attrs>::createConfig(memory, m_attrs, m_postOps);

const auto& theOnlyImplementation = m_suitableImplementations.front().get();

if (const auto fallbackConfig = theOnlyImplementation.requiresFallback(config)) {
return GraphEmitter<Attrs>::fallback(config,
*fallbackConfig,
memory,
m_context,
theOnlyImplementation.name());
}
return executor;
};

auto implId = select(memory, 0);
auto executor = createExec(memory, implId);
while (!executor) {
implId = select(memory, ++implId);
executor = createExec(memory, implId);
}
return executor;
}

private:
static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) {
MemoryDescArgs memoryDescs;
memoryDescs.reserve(memory.size());

for (const auto& mem : memory) {
memoryDescs[mem.first] = mem.second->getDescPtr();
return theOnlyImplementation.create(m_attrs, m_postOps, memory, m_context);
}

return memoryDescs;
}

/**
* @brief Caches the fallback status for each suitable implementation.
*/
void cacheFallbackStatus(const executor::Config<Attrs>& config) {
std::transform(m_suitableImplementations.begin(),
m_suitableImplementations.end(),
m_implementationRequiresFallback.begin(),
[&config](const ExecutorImplementationRef& impl) {
return impl.get().requiresFallback(config);
});
return std::make_shared<VariableExecutor<Attrs>>(memory,
m_attrs,
m_postOps,
m_context,
m_suitableImplementations);
}

private:
/**
* @brief Filters and retrieves suitable implementations based on the provided executor configuration.
*
Expand All @@ -205,11 +114,10 @@ class ExecutorFactory {
* @note If an implementation is shape agnostic, no further implementations with lower
* priority are considered.
*/
static std::vector<ExecutorImplementationRef> filter(
const Attrs& attrs,
const PostOps& postOps,
const MemoryDescArgs& descs,
const std::string& implementationPriority = {}) {
static std::vector<ExecutorImplementationRef> filter(const Attrs& attrs,
const PostOps& postOps,
const MemoryDescArgs& descs,
const std::string& implementationPriority = {}) {
const auto& implementations = getImplementations<Attrs>();
std::vector<ExecutorImplementationRef> suitableImplementations;
const executor::Config<Attrs> config{descs, attrs, postOps};
Expand Down Expand Up @@ -244,36 +152,6 @@ class ExecutorFactory {
return suitableImplementations;
}

/**
 * @brief Selects a suitable implementation starting from the given index.
 *
 * Scans m_suitableImplementations beginning at @p startIdx and returns the
 * index of the first implementation which is either shape agnostic or
 * explicitly accepts the shapes of the provided @p memory arguments.
 *
 * @param memory    memory arguments used for the shape compatibility check.
 * @param startIdx  index to start the search from (allows skipping
 *                  implementations that previously failed to update).
 * @return index of the selected implementation within m_suitableImplementations.
 * @throws if startIdx is out of range or no suitable implementation is found.
 */
size_t select(const MemoryArgs& memory, const size_t startIdx) const {
    // fixed typos in the assert messages: "indx" -> "index", "implemetation" -> "implementation"
    OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(),
                    "Failed to find an implementation since start index: ", startIdx,
                    " is out of range of the suitable implementations array: ", m_suitableImplementations.size());
    auto startIt = m_suitableImplementations.begin();
    std::advance(startIt, startIdx);
    const auto selectedImplementation =
        std::find_if(startIt,
                     m_suitableImplementations.end(),
                     [&memory](const ExecutorImplementationRef& implementation) {
                         return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory);
                     });
    OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implementation");

    return std::distance(m_suitableImplementations.begin(), selectedImplementation);
}

/**
 * @brief Returns the executor for the given implementation id, creating it lazily.
 *
 * Executors are cached in m_executors: an executor is instantiated on the
 * first request for a given @p implId and reused on subsequent calls.
 */
ExecutorPtr create(const size_t implId,
const MemoryArgs& memory,
const ExecutorContext::CPtr context) {
// both vectors are expected to stay in sync (same size, same indexing)
assert(implId < m_executors.size() && implId < m_suitableImplementations.size());

if (!m_executors[implId]) {
const auto& impl = m_suitableImplementations[implId].get();
m_executors[implId] = impl.create(m_attrs, m_postOps, memory, context);
}

return m_executors[implId];
}

const Attrs& m_attrs;
const PostOps& m_postOps;
const ExecutorContext::CPtr m_context;
Expand All @@ -284,11 +162,11 @@ class ExecutorFactory {
std::vector<ExecutorPtr> m_executors;
};

template <typename Attrs, typename NodeT>
using ExecutorFactoryPtr = std::shared_ptr<ExecutorFactory<Attrs, NodeT>>;
template <typename Attrs>
using ExecutorFactoryPtr = std::shared_ptr<ExecutorFactory<Attrs>>;

template <typename Attrs, typename NodeT>
using ExecutorFactoryCPtr = std::shared_ptr<const ExecutorFactory<Attrs, NodeT>>;
template <typename Attrs>
using ExecutorFactoryCPtr = std::shared_ptr<const ExecutorFactory<Attrs>>;

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "memory_desc/cpu_memory_desc.h"
#include "nodes/executors/convolution_config.hpp"
#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp"
#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp"
#include "nodes/executors/dnnl/dnnl_fullyconnected.hpp"
#include "nodes/executors/dnnl/dnnl_matmul_primitive.hpp"
#include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp"
Expand Down
46 changes: 44 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
#pragma once

#include <functional>
#include <vector>

#include "graph.h"
#include "memory_desc/cpu_memory_desc.h"
#include "node.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/executor_config.hpp"
#include "post_ops.hpp"

namespace ov {
Expand Down Expand Up @@ -72,6 +71,49 @@ class GraphEmitter {
return graph;
}

// Extracts the memory descriptor of every memory argument,
// keeping the original argument ids as keys.
static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) {
    MemoryDescArgs descriptors;
    descriptors.reserve(memory.size());

    for (const auto& [argId, mem] : memory) {
        descriptors[argId] = mem->getDescPtr();
    }

    return descriptors;
}

// Bundles the descriptors extracted from the runtime memory arguments
// together with the node attributes and post-ops into an executor config.
static executor::Config<Attrs> createConfig(const MemoryArgs& memory,
                                            const Attrs& attrs,
                                            const PostOps& postOps) {
    auto descs = memoryDescsFromMemory(memory);
    return executor::Config<Attrs>{descs, attrs, postOps};
}

// Fallback path: when the selected implementation cannot handle the given
// config, emit a reference subgraph executor built from the fallback config.
// @param config          the original (unsupported) executor configuration
// @param fallbackConfig  the adjusted configuration the graph is built from
// @param memory          memory arguments used to construct the graph
// @param context         executor context (engine, scratchpad, caches)
// @param name            node name, used for debug logging only
// @note Currently only verifies that a graph can be emitted and then throws,
//       since the actual fallback logic is not implemented yet.
static ExecutorPtr fallback(const executor::Config<Attrs>& config,
const executor::Config<Attrs>& fallbackConfig,
const MemoryArgs& memory,
const ExecutorContext::CPtr context,
const std::string& name) {
DEBUG_LOG("Falling back to graph executor for ",
name,
". Original config: ",
config,
" new config:",
fallbackConfig);

GraphEmitter<Attrs> graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name);

// build the graph and validate it matches the fallback configuration;
// the result is intentionally unused until fallback is fully implemented
const auto& graphExecutor =
graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context)
.ensureAttrsMatch()
.ensureSrcDescsMatch()
.ensureDstDescsMatch()
.ensurePostOpsMatch()
.emit();
(void)graphExecutor;

OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor;
}

private:
const MemoryDescArgs& descs;
const Attrs& attrs;
Expand Down
Loading

0 comments on commit caa6e7a

Please sign in to comment.