[CPU] FullyConnected: fixed primitive caching for sparse decompression case

[CPU][oneDNN] sparsity: some fixes and removed unused code

[CPU][TESTS] FullyConnected: sparsity weights decompression tests

[CPU] FullyConnected: removed min sparse rate = 0.5 limitation

[CPU] fixed property CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE

[CPU][TESTS] added CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE test

[CPU][DOC] doc fixes
antonvor committed Jan 5, 2023
1 parent 3017c8d commit 47faadd
Showing 8 changed files with 371 additions and 15 deletions.
3 changes: 1 addition & 2 deletions docs/OV_Runtime_UG/supported_plugins/CPU.md
@@ -281,7 +281,7 @@ To enable denormals optimization in the application, the `denormals_optimization`

`Sparse weights decompression feature` allows packing weights for Matrix Multiplication operations directly in the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a consequence, improves inference performance.

- To use this feature, the user is provided with the property `sparse_weights_decompression_rate`, which can take values from the interval \[0.5, 1\] (values from \[0, 0.5\] are not supported in the current implementation; see limitations below). `sparse_weights_decompression_rate` defines the sparse rate threshold: only operations with a higher sparse rate will be executed using the `sparse weights decompression feature`. The default value is `1`, which means the option is disabled.
+ To use this feature, the user is provided with the property `sparse_weights_decompression_rate`, which can take values from the interval \[0, 1\]. `sparse_weights_decompression_rate` defines the sparse rate threshold: only operations with a higher sparse rate will be executed using the `sparse weights decompression feature`. The default value is `1`, which means the option is disabled.

> **NOTE**: `Sparse weights decompression feature` is disabled by default, since the overall speed-up highly depends on the particular workload, and in some cases the feature may introduce performance degradation.
@@ -315,7 +315,6 @@ Currently, the `sparse weights decompression feature` is supported with the following limitations:
2. The feature is only supported for Matrix Multiplication operations.
3. HW target must have Intel AMX extension support (e.g., Intel® 4th Generation Xeon® processors (code name Sapphire Rapids)).
4. The number of input and output channels of the weights must be a multiple of 64.
- 5. Current feature implementation supports only sparse rate higher than 0.5.

## Additional Resources
* [Supported Devices](Supported_Devices.md)
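For reference, a minimal sketch of how an application might enable the feature described in the CPU.md hunk above (the `0.8` threshold and the model path are illustrative, not values from this commit):

```cpp
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/intel_cpu/properties.hpp>

int main() {
    ov::Core core;
    // Treat weights with a sparse rate above 0.8 as candidates for sparse
    // decompression; the default of 1.0 keeps the feature disabled.
    core.set_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
    // Weights are analyzed and packed at model compilation time.
    auto compiled = core.compile_model("model.xml", "CPU");
    return 0;
}
```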
2 changes: 1 addition & 1 deletion src/bindings/python/tests/test_runtime/test_properties.py
@@ -195,7 +195,7 @@ def test_properties_ro(ov_property_ro, expected_value):
),
(
properties.intel_cpu.sparse_weights_decompression_rate,
"SPARSE_WEIGHTS_DECOMPRESSION_RATE",
"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE",
(
(0.1, np.float32(0.1)),
(2.0, 2.0),
16 changes: 15 additions & 1 deletion src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,7 +47,21 @@ namespace intel_cpu {
*/
static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};

- static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+ /**
+  * @brief This property defines the threshold for sparse weights decompression feature activation
+  * @ingroup ov_runtime_cpu_prop_cpp_api
+  *
+  * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the CPU plugin
+  * at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model,
+  * the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed
+  * format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
+  * The following code sets the sparse rate value:
+  *
+  * @code
+  * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+  * @endcode
+  */
+ static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};

} // namespace intel_cpu
} // namespace ov
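As a usage note on the renamed key: the public C++ name stays `ov::intel_cpu::sparse_weights_decompression_rate`, so application code is unchanged. A round-trip sketch, assuming the CPU plugin exposes the property as read-write (as the Python property test above suggests):

```cpp
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/intel_cpu/properties.hpp>

int main() {
    ov::Core core;
    core.set_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
    // Assumption: the plugin reports the configured threshold back as a float.
    float rate = core.get_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate);
    (void)rate; // expected to be 0.8f here
    return 0;
}
```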
10 changes: 2 additions & 8 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
dnnl::memory::desc wgh_candidate;
if (useSparseWeights) {
wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
- wdt, memory::desc::packed(nnzCount) };
+ wdt, memory::desc::packed() };
} else {
wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
wdt, dnnl::memory::format_tag::any };
@@ -930,18 +930,12 @@ bool FullyConnected::useSparseWeightsDecompression() {
zerosCounts++;
}
}
- nnzCount = elementsCount - zerosCounts;

DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
zerosCounts, ", nnzCount = ", nnzCount);
zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);

weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);

- // [av] WA: there is no point in using sparse decompression when the sparse rate is low
- // todo: add heuristic
- if (minSparseRate < 0.5)
-     minSparseRate = 0.5;

DEBUG_LOG(getName(), " | sparse rate = ", weiSparseRate * 100, "%, min sparse rate = ",
minSparseRate * 100, "%, use sparse weights = ", weiSparseRate >= minSparseRate);

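The hunk above reduces the sparsity decision to a zero-count scan compared against `minSparseRate`. A self-contained sketch of the same heuristic (function and parameter names are illustrative, not the node's actual members):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the check in the diff above: the sparse rate is the share of zero
// weights, and sparse decompression is used when it meets the threshold.
bool shouldUseSparseWeights(const std::vector<int8_t>& weights, float minSparseRate) {
    if (weights.empty())
        return false;
    size_t zeros = 0;
    for (int8_t w : weights) {
        if (w == 0)
            ++zeros;
    }
    const float sparseRate = static_cast<float>(zeros) / static_cast<float>(weights.size());
    return sparseRate >= minSparseRate;
}
```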
2 changes: 0 additions & 2 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -42,7 +42,6 @@ class FullyConnected : public Node {

void initSupportedPrimitiveDescriptors() override;
void initOptimalPrimitiveDescriptor() override;
- // void createPrimitive() override;
std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;

@@ -112,7 +111,6 @@

// sparse weights
bool useSparseWeights = false;
- int nnzCount = -1;
float minSparseRate = 1.f;
float weiSparseRate = 0.f;
bool useSparseWeightsDecompression();
@@ -9,6 +9,7 @@
#include "openvino/runtime/core.hpp"
#include "openvino/runtime/compiled_model.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/intel_cpu/properties.hpp"

#include <gtest/gtest.h>

@@ -113,6 +114,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanThrough
ASSERT_EQ(streams, value);
}

+ TEST_F(OVClassConfigTestCPU, smoke_CheckSparseWeigthsDecompressionRate) {
+     ov::Core core;
+
+     core.set_property(deviceName, ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+     ASSERT_NO_THROW(ov::CompiledModel compiledModel = core.compile_model(model, deviceName));
+ }

const std::vector<ov::AnyMap> multiDevicePriorityConfigs = {
{ov::device::priorities(CommonTestUtils::DEVICE_CPU)}};

