From f092cf2725704cc91641cb7c0e3d122e2f75fb5a Mon Sep 17 00:00:00 2001
From: Lyamin-Roman <roman.lyamin@intel.com>
Date: Fri, 13 Dec 2024 03:46:23 +0900
Subject: [PATCH] [GPU] LoRAHorizontalFusion: support swapped Multiply inputs, document pattern, add tests

---
 .../lora_horizontal_fusion.cpp                |  38 ++-
 .../lora_horizontal_fusion.hpp                |  69 +++++
 .../lora_horizontal_fusion.cpp                | 264 +++++++++++++++++-
 3 files changed, 351 insertions(+), 20 deletions(-)
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp
index 226eb155652c10..d9059e63338876 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp
@@ -21,15 +21,23 @@ LoRAHorizontalFusion::LoRAHorizontalFusion() {
         auto is_lora_pattern = [](const std::shared_ptr<Node>& node) {
             #define check(node) if (!node) return false;
 
-            const auto& add =            std::dynamic_pointer_cast<ov::op::v1::Add>(node);                                               check(add)
-            const auto& matmul2 =        std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) ?
-                                         std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) :
-                                         std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(1));               check(matmul2)
-            const auto& multiply =       std::dynamic_pointer_cast<ov::op::v1::Multiply>(matmul2->get_input_node_shared_ptr(0));         check(multiply)
-            const auto& variable_b =     std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(matmul2->get_input_node_shared_ptr(1));  check(variable_b)
-            const auto& matmul1 =        std::dynamic_pointer_cast<ov::op::v0::MatMul>(multiply->get_input_node_shared_ptr(0));          check(matmul1)
-            const auto& variable_alpha = std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(multiply->get_input_node_shared_ptr(1)); check(variable_alpha)
-            const auto& variable_a =     std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(matmul1->get_input_node_shared_ptr(1));  check(variable_a)
+            const auto& add = std::dynamic_pointer_cast<ov::op::v1::Add>(node);                                                         check(add)
+            // The second MatMul may sit on either Add input: use input 0 if it is a MatMul, otherwise input 1.
+            size_t matmul2_idx = ov::is_type<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) ? 0 : 1;
+            const auto& matmul2 = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(matmul2_idx));           check(matmul2)
+            // matmul2 must be fed by a Multiply on input 0 ...
+            const auto& multiply = std::dynamic_pointer_cast<ov::op::v1::Multiply>(matmul2->get_input_node_shared_ptr(0));              check(multiply)
+            // ... and by a ReadValueBase-derived node (variable_b) on input 1; any failed cast makes check() return false.
+            const auto& variable_b = std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(matmul2->get_input_node_shared_ptr(1));     check(variable_b)
+            // The first MatMul may likewise sit on either Multiply input.
+            size_t matmul1_idx = ov::is_type<ov::op::v0::MatMul>(multiply->get_input_node_shared_ptr(0)) ? 0 : 1;
+            const auto& matmul1 = std::dynamic_pointer_cast<ov::op::v0::MatMul>(multiply->get_input_node_shared_ptr(matmul1_idx));      check(matmul1)
+            // The other Multiply input must be a ReadValueBase-derived node (variable_alpha).
+            size_t alpha_idx = (matmul1_idx + 1) % 2;
+            const auto& variable_alpha =
+                std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(multiply->get_input_node_shared_ptr(alpha_idx));                 check(variable_alpha)
+            // matmul1 must take a ReadValueBase-derived node (variable_a) on input 1.
+            const auto& variable_a = std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(matmul1->get_input_node_shared_ptr(1));     check(variable_a)
 
             #undef check
             return true;
@@ -68,17 +76,19 @@ LoRAHorizontalFusion::LoRAHorizontalFusion() {
         for (const auto& add : split->get_users()) {
             add_nodes.emplace_back(add);
 
-            bool first_input_matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) != nullptr;
-            matmul2_nodes.emplace_back(first_input_matmul ? add->get_input_node_shared_ptr(0)
-                                                          : add->get_input_node_shared_ptr(1));
+            size_t matmul2_idx = ov::is_type<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) ? 0 : 1;
+            matmul2_nodes.emplace_back(add->get_input_node_shared_ptr(matmul2_idx));
         }
         for (const auto& matmul2 : matmul2_nodes) {
             multiply_nodes.emplace_back(matmul2->get_input_node_shared_ptr(0));
             variable_b_nodes.emplace_back(matmul2->get_input_node_shared_ptr(1));
         }
         for (const auto& multiply : multiply_nodes) {
-            matmul1_nodes.emplace_back(multiply->get_input_node_shared_ptr(0));
-            variable_alpha_nodes.emplace_back(multiply->get_input_node_shared_ptr(1));
+            size_t matmul1_idx = ov::is_type<ov::op::v0::MatMul>(multiply->get_input_node_shared_ptr(0)) ? 0 : 1;
+            matmul1_nodes.emplace_back(multiply->get_input_node_shared_ptr(matmul1_idx));
+            // variable_alpha is collected from the other Multiply input (the one not used for matmul1).
+            size_t alpha_idx = (matmul1_idx + 1) % 2;
+            variable_alpha_nodes.emplace_back(multiply->get_input_node_shared_ptr(alpha_idx));
         }
         for (const auto& matmul1 : matmul1_nodes) {
             variable_a_nodes.emplace_back(matmul1->get_input_node_shared_ptr(1));
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp
index 424f5a58bffb6b..631028d68baa7a 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp
@@ -9,6 +9,75 @@
 namespace ov {
 namespace intel_gpu {
 
+// Before:
+//          ┌─────────┐                                                                  ┌─────────┐
+//          │ReadValue│                                                                  │ReadValue│
+//          └────┬────┘                                                                  └────┬────┘
+//               │                                 ┌───────────┐                              │
+//               │         ┌───────────────────────┼ LoraInput ┼───────────────────┐          │
+//               │         │                       └─────┬─────┘                   │          │
+//               │    ┌────▼───┐                         │                    ┌────▼───┐      │
+//               └────►  Gemm  │                         │                    │  Gemm  ◄──────┘
+// ┌─────────┐        └────┬───┘                         │                    └────┬───┘       ┌─────────┐
+// │ReadValue│             │                             │                         │           │ReadValue│
+// └────┬────┘             │                 ┌───────────▼────────────┐            │           └────┬────┘
+//      │             ┌────▼───┐             │FullyConnectedCompressed│       ┌────▼───┐            │
+//      └─────────────►Multiply│             └───────────┬────────────┘       │Multiply◄────────────┘
+//                    └────┬───┘                         │                    └────┬───┘
+// ┌─────────┐             │                             │                         │               ┌─────────┐
+// │ReadValue│             │                             │                         │               │ReadValue│
+// └────┬────┘             │                             │                         │               └────┬────┘
+//      │             ┌────▼───┐                  ┌──────▼──────┐             ┌────▼───┐                │
+//      └─────────────►  Gemm  │      ┌───────────┼VariadicSplit┼──────────┐  │  Gemm  ◄────────────────┘
+//                    └────┬───┘      │           └──────┬──────┘          │  └────┬───┘
+//                         │          │                  │                 │       │
+//                         │          │                  │                 │       │
+//                         │          │                  │                 │       │
+//                         │       ┌──▼──┐               ▼              ┌──▼──┐    │
+//                         └───────► Add │              ...             │ Add ◄────┘
+//                                 └─────┘                              └─────┘
+// After:
+//                                                                                   ┌─────────┐
+//                                                                              ┌────┼ReadValue│
+//        ┌──────────┐                                             ┌──────┐     │    └─────────┘
+//        │LoRA_Input┼────────────────────────────┐  ┌─────────────┼Concat◄─────┤     ...
+//        └────┬─────┘                            │  │             └──────┘     │    ┌─────────┐
+//             │                                  │  │                          └────┼ReadValue│
+//             │                                  │  │                               └─────────┘
+//             │                             ┌────▼──▼───┐
+//             │                             │MatMulFused│
+//             │                             └─────┬─────┘
+//             │                                   │                                 ┌─────────┐
+//             │                                   │                            ┌────┼ReadValue│
+//             │                                   │               ┌──────┐     │    └─────────┘
+//             │                                   │      ┌────────┼Concat◄─────┤     ...
+//             │                                   │      │        └──────┘     │    ┌─────────┐
+//             │                                   │      │                     └────┼ReadValue│
+// ┌───────────▼────────────┐                  ┌───▼──────▼──┐                       └─────────┘
+// │FullyConnectedCompressed│                  │MultiplyFused│
+// └───────────┬────────────┘                  └──────┬──────┘
+//             │                                      │
+//             │                     ┌─────────┐      │      ┌─────────┐
+//             │                     │ReadValue│   ┌──▼──┐   │ReadValue│
+//             │                     └────┬────┘   │Split│   └────┬────┘
+//             │                          │        └──┬──┘        │
+//             │                          │           │           │
+//             │                          │  ┌────────┼────────┐  │
+//             │                          │  │                 │  │
+//             │                       ┌──▼──▼──┐           ┌──▼──▼──┐
+//             │                       │ MatMul │    ...    │ MatMul │
+//             │                       └────┬───┘           └────┬───┘
+//             │                            └──────┐    ┌────────┘
+//             │                                   │    │
+//             │             ┌─────┐             ┌─▼────▼─┐
+//             └─────────────► Add ◄─────────────┼ Concat │
+//                           └──┬──┘             └────────┘
+//                              │
+//                              │
+//                       ┌──────▼──────┐
+//                       │VariadicSplit│
+//                       └─────────────┘
+
 class LoRAHorizontalFusion: public ov::pass::MatcherPass {
 public:
     OPENVINO_RTTI("LoRAHorizontalFusion", "0");
diff --git a/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp b/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp
index 1692ba3a40a1e8..38d2365c9e0545 100644
--- a/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp
+++ b/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp
@@ -59,7 +59,7 @@ TEST_F(TransformationTestsF, LoRAHorizontalFusion_default) {
         auto matmul2_1 = std::make_shared<ov::op::v0::MatMul>(multiply_1, read_value_b_1, false, true);
         auto add_1 = std::make_shared<ov::op::v1::Add>(split->output(1), matmul2_1);
 
-       auto variable_a_2 = std::make_shared<ov::op::util::Variable>(
+        auto variable_a_2 = std::make_shared<ov::op::util::Variable>(
             ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_2"});
         auto variable_alpha_2 = std::make_shared<ov::op::util::Variable>(
             ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_2"});
@@ -165,7 +165,7 @@ TEST_F(TransformationTestsF, LoRAHorizontalFusion_default) {
     }
 }
 
-TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_inputs) {
+TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_and_multiply_inputs) {
     ov::element::Type model_dt = ov::element::f16;
     {
         auto lora_input = std::make_shared<ov::op::v0::Parameter>(model_dt, ov::PartialShape{-1, -1, 2048});
@@ -188,7 +188,7 @@ TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_inputs) {
         auto read_value_alpha_0 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_0);
         auto read_value_b_0 = std::make_shared<ov::op::v6::ReadValue>(variable_b_0);
         auto matmul1_0 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_0, false, true);
-        auto multiply_0 = std::make_shared<ov::op::v1::Multiply>(matmul1_0, read_value_alpha_0);
+        auto multiply_0 = std::make_shared<ov::op::v1::Multiply>(read_value_alpha_0, matmul1_0);
         auto matmul2_0 = std::make_shared<ov::op::v0::MatMul>(multiply_0, read_value_b_0, false, true);
         auto add_0 = std::make_shared<ov::op::v1::Add>(matmul2_0, split->output(0));
 
@@ -202,11 +202,11 @@ TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_inputs) {
         auto read_value_alpha_1 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_1);
         auto read_value_b_1 = std::make_shared<ov::op::v6::ReadValue>(variable_b_1);
         auto matmul1_1 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_1, false, true);
-        auto multiply_1 = std::make_shared<ov::op::v1::Multiply>(matmul1_1, read_value_alpha_1);
+        auto multiply_1 = std::make_shared<ov::op::v1::Multiply>(read_value_alpha_1, matmul1_1);
         auto matmul2_1 = std::make_shared<ov::op::v0::MatMul>(multiply_1, read_value_b_1, false, true);
         auto add_1 = std::make_shared<ov::op::v1::Add>(matmul2_1, split->output(1));
 
-       auto variable_a_2 = std::make_shared<ov::op::util::Variable>(
+        auto variable_a_2 = std::make_shared<ov::op::util::Variable>(
             ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_2"});
         auto variable_alpha_2 = std::make_shared<ov::op::util::Variable>(
             ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_2"});
@@ -216,7 +216,7 @@ TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_inputs) {
         auto read_value_alpha_2 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_2);
         auto read_value_b_2 = std::make_shared<ov::op::v6::ReadValue>(variable_b_2);
         auto matmul1_2 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_2, false, true);
-        auto multiply_2 = std::make_shared<ov::op::v1::Multiply>(matmul1_2, read_value_alpha_2);
+        auto multiply_2 = std::make_shared<ov::op::v1::Multiply>(read_value_alpha_2, matmul1_2);
         auto matmul2_2 = std::make_shared<ov::op::v0::MatMul>(multiply_2, read_value_b_2, false, true);
         auto add_2 = std::make_shared<ov::op::v1::Add>(matmul2_2, split->output(2));
 
@@ -312,6 +312,258 @@ TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_inputs) {
     }
 }
 
+TEST_F(TransformationTestsF, LoRAHorizontalFusion_split_two_outputs) {
+    ov::element::Type model_dt = ov::element::f16;
+    {
+        auto lora_input = std::make_shared<ov::op::v0::Parameter>(model_dt, ov::PartialShape{-1, -1, 2048});
+        auto weights = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{2304, 2048});
+        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto scale = std::make_shared<ov::op::v0::Constant>(model_dt, ov::Shape{2304, 1});
+        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(lora_input, weights, bias, scale);
+        // The FC output is split along axis 2 into two parts of sizes 2048 and 256.
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2});
+        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256});
+        auto split = std::make_shared<ov::op::v1::VariadicSplit>(fc_fused, axis_const, split_const);
+        // LoRA branch 0: MatMul(lora_input, A0) -> Multiply by alpha0 -> MatMul with B0, added to split output 0.
+        auto variable_a_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"});
+        auto variable_alpha_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"});
+        auto variable_b_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"});
+        auto read_value_a_0 = std::make_shared<ov::op::v6::ReadValue>(variable_a_0);
+        auto read_value_alpha_0 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_0);
+        auto read_value_b_0 = std::make_shared<ov::op::v6::ReadValue>(variable_b_0);
+        auto matmul1_0 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_0, false, true);
+        auto multiply_0 = std::make_shared<ov::op::v1::Multiply>(matmul1_0, read_value_alpha_0);
+        auto matmul2_0 = std::make_shared<ov::op::v0::MatMul>(multiply_0, read_value_b_0, false, true);
+        auto add_0 = std::make_shared<ov::op::v1::Add>(split->output(0), matmul2_0);
+        // LoRA branch 1: same structure with its own variables, added to split output 1.
+        auto variable_a_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"});
+        auto variable_alpha_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"});
+        auto variable_b_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"});
+        auto read_value_a_1 = std::make_shared<ov::op::v6::ReadValue>(variable_a_1);
+        auto read_value_alpha_1 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_1);
+        auto read_value_b_1 = std::make_shared<ov::op::v6::ReadValue>(variable_b_1);
+        auto matmul1_1 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_1, false, true);
+        auto multiply_1 = std::make_shared<ov::op::v1::Multiply>(matmul1_1, read_value_alpha_1);
+        auto matmul2_1 = std::make_shared<ov::op::v0::MatMul>(multiply_1, read_value_b_1, false, true);
+        auto add_1 = std::make_shared<ov::op::v1::Add>(split->output(1), matmul2_1);
+        // Each Add result is consumed by a Reshape that leads to a Result.
+        auto reshape_pattern0 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 32, 64});
+        auto reshape_pattern1 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 4, 64});
+        auto reshape0 = std::make_shared<ov::op::v1::Reshape>(add_0, reshape_pattern0, true);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(add_1, reshape_pattern1, true);
+
+        auto result0 = std::make_shared<ov::op::v0::Result>(reshape0);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+
+        model = std::make_shared<ov::Model>(ov::NodeVector{result0, result1}, ov::ParameterVector{lora_input});
+        manager.register_pass<LoRAHorizontalFusion>();
+    }
+    // Reference graph stored in model_ref; presumably compared with the transformed model by the TransformationTestsF fixture.
+    {
+        auto lora_input = std::make_shared<ov::op::v0::Parameter>(model_dt, ov::PartialShape{-1, -1, 2048});
+        auto weights = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{2304, 2048});
+        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto scale = std::make_shared<ov::op::v0::Constant>(model_dt, ov::Shape{2304, 1});
+        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(lora_input, weights, bias, scale);
+        // The var_a ReadValues are concatenated along axis 0 and feed a single first MatMul.
+        auto variable_a_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"});
+        auto variable_a_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"});
+
+        auto read_value_a_0 = std::make_shared<ov::op::v6::ReadValue>(variable_a_0);
+        auto read_value_a_1 = std::make_shared<ov::op::v6::ReadValue>(variable_a_1);
+        auto concat_variable_a = std::make_shared<ov::op::v0::Concat>(NodeVector{read_value_a_0, read_value_a_1}, 0);
+
+        auto fused_matmul1 = std::make_shared<ov::op::v0::MatMul>(lora_input, concat_variable_a, false, true);
+        // The var_alpha ReadValues are concatenated along axis 1 and applied by a single Multiply.
+        auto variable_alpha_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"});
+        auto variable_alpha_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"});
+
+        auto read_value_alpha_0 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_0);
+        auto read_value_alpha_1 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_1);
+        auto concat_variable_alpha = std::make_shared<ov::op::v0::Concat>(NodeVector{read_value_alpha_0, read_value_alpha_1}, 1);
+
+        auto multiply = std::make_shared<ov::op::v1::Multiply>(fused_matmul1, concat_variable_alpha);
+        // The Multiply result goes through a Split (axis 2, num_splits = 2); each output feeds its own second MatMul.
+        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2});
+        auto split = std::make_shared<ov::op::v1::Split>(multiply, split_axis, 2);
+
+        auto variable_b_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"});
+        auto variable_b_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"});
+
+        auto read_value_b_0 = std::make_shared<ov::op::v6::ReadValue>(variable_b_0);
+        auto read_value_b_1 = std::make_shared<ov::op::v6::ReadValue>(variable_b_1);
+
+        auto matmul2_0 = std::make_shared<ov::op::v0::MatMul>(split->output(0), read_value_b_0, false, true);
+        auto matmul2_1 = std::make_shared<ov::op::v0::MatMul>(split->output(1), read_value_b_1, false, true);
+        // The second MatMul outputs are concatenated along axis 2 and added to the FC output by a single Add.
+        auto concat_matmul2 = std::make_shared<ov::op::v0::Concat>(NodeVector{matmul2_0, matmul2_1}, 2);
+
+        auto add = std::make_shared<ov::op::v1::Add>(fc_fused, concat_matmul2);
+        // The VariadicSplit (sizes 2048 and 256 along axis 2) is applied to the Add result here.
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2});
+        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256});
+        auto var_split = std::make_shared<ov::op::v1::VariadicSplit>(add, axis_const, split_const);
+
+        auto reshape_pattern0 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 32, 64});
+        auto reshape_pattern1 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 4, 64});
+        auto reshape0 = std::make_shared<ov::op::v1::Reshape>(var_split->output(0), reshape_pattern0, true);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(var_split->output(1), reshape_pattern1, true);
+
+        auto result0 = std::make_shared<ov::op::v0::Result>(reshape0);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+
+        model_ref = std::make_shared<ov::Model>(ov::NodeVector{result0, result1}, ov::ParameterVector{lora_input});
+        comparator.enable(FunctionsComparator::ATTRIBUTES);
+    }
+}
+
+TEST_F(TransformationTestsF, LoRAHorizontalFusion_multiple_split_output_users) {
+    ov::element::Type model_dt = ov::element::f16;
+    {
+        auto lora_input = std::make_shared<ov::op::v0::Parameter>(model_dt, ov::PartialShape{-1, -1, 2048});
+        auto weights = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{2304, 2048});
+        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto scale = std::make_shared<ov::op::v0::Constant>(model_dt, ov::Shape{2304, 1});
+        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(lora_input, weights, bias, scale);
+        // The FC output is split along axis 2 into two parts of sizes 2048 and 256.
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2});
+        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256});
+        auto split = std::make_shared<ov::op::v1::VariadicSplit>(fc_fused, axis_const, split_const);
+        // LoRA branch 0: MatMul(lora_input, A0) -> Multiply by alpha0 -> MatMul with B0, added to split output 0.
+        auto variable_a_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"});
+        auto variable_alpha_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"});
+        auto variable_b_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"});
+        auto read_value_a_0 = std::make_shared<ov::op::v6::ReadValue>(variable_a_0);
+        auto read_value_alpha_0 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_0);
+        auto read_value_b_0 = std::make_shared<ov::op::v6::ReadValue>(variable_b_0);
+        auto matmul1_0 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_0, false, true);
+        auto multiply_0 = std::make_shared<ov::op::v1::Multiply>(matmul1_0, read_value_alpha_0);
+        auto matmul2_0 = std::make_shared<ov::op::v0::MatMul>(multiply_0, read_value_b_0, false, true);
+        auto add_0 = std::make_shared<ov::op::v1::Add>(split->output(0), matmul2_0);
+        // LoRA branch 1: same structure with its own variables, added to split output 1.
+        auto variable_a_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"});
+        auto variable_alpha_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"});
+        auto variable_b_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"});
+        auto read_value_a_1 = std::make_shared<ov::op::v6::ReadValue>(variable_a_1);
+        auto read_value_alpha_1 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_1);
+        auto read_value_b_1 = std::make_shared<ov::op::v6::ReadValue>(variable_b_1);
+        auto matmul1_1 = std::make_shared<ov::op::v0::MatMul>(lora_input, read_value_a_1, false, true);
+        auto multiply_1 = std::make_shared<ov::op::v1::Multiply>(matmul1_1, read_value_alpha_1);
+        auto matmul2_1 = std::make_shared<ov::op::v0::MatMul>(multiply_1, read_value_b_1, false, true);
+        auto add_1 = std::make_shared<ov::op::v1::Add>(split->output(1), matmul2_1);
+        // Each Add result is consumed by a Reshape ...
+        auto reshape_pattern0 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 32, 64});
+        auto reshape_pattern1 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 4, 64});
+        auto reshape0 = std::make_shared<ov::op::v1::Reshape>(add_0, reshape_pattern0, true);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(add_1, reshape_pattern1, true);
+        // ... and additionally by two ShapeOf nodes, so every Add has several users.
+        auto shape_of0 = std::make_shared<ov::op::v0::ShapeOf>(add_0);
+        auto shape_of1 = std::make_shared<ov::op::v0::ShapeOf>(add_0);
+        auto shape_of2 = std::make_shared<ov::op::v0::ShapeOf>(add_1);
+        auto shape_of3 = std::make_shared<ov::op::v0::ShapeOf>(add_1);
+
+        auto result0 = std::make_shared<ov::op::v0::Result>(reshape0);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+        auto result2 = std::make_shared<ov::op::v0::Result>(shape_of0);
+        auto result3 = std::make_shared<ov::op::v0::Result>(shape_of1);
+        auto result4 = std::make_shared<ov::op::v0::Result>(shape_of2);
+        auto result5 = std::make_shared<ov::op::v0::Result>(shape_of3);
+
+        model = std::make_shared<ov::Model>(ov::NodeVector{result0, result1, result2, result3, result4, result5}, ov::ParameterVector{lora_input});
+        manager.register_pass<LoRAHorizontalFusion>();
+    }
+    // Reference graph stored in model_ref; presumably compared with the transformed model by the TransformationTestsF fixture.
+    {
+        auto lora_input = std::make_shared<ov::op::v0::Parameter>(model_dt, ov::PartialShape{-1, -1, 2048});
+        auto weights = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{2304, 2048});
+        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto scale = std::make_shared<ov::op::v0::Constant>(model_dt, ov::Shape{2304, 1});
+        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(lora_input, weights, bias, scale);
+        // The var_a ReadValues are concatenated along axis 0 and feed a single first MatMul.
+        auto variable_a_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"});
+        auto variable_a_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"});
+
+        auto read_value_a_0 = std::make_shared<ov::op::v6::ReadValue>(variable_a_0);
+        auto read_value_a_1 = std::make_shared<ov::op::v6::ReadValue>(variable_a_1);
+        auto concat_variable_a = std::make_shared<ov::op::v0::Concat>(NodeVector{read_value_a_0, read_value_a_1}, 0);
+
+        auto fused_matmul1 = std::make_shared<ov::op::v0::MatMul>(lora_input, concat_variable_a, false, true);
+        // The var_alpha ReadValues are concatenated along axis 1 and applied by a single Multiply.
+        auto variable_alpha_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"});
+        auto variable_alpha_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"});
+
+        auto read_value_alpha_0 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_0);
+        auto read_value_alpha_1 = std::make_shared<ov::op::v6::ReadValue>(variable_alpha_1);
+        auto concat_variable_alpha = std::make_shared<ov::op::v0::Concat>(NodeVector{read_value_alpha_0, read_value_alpha_1}, 1);
+
+        auto multiply = std::make_shared<ov::op::v1::Multiply>(fused_matmul1, concat_variable_alpha);
+        // The Multiply result goes through a Split (axis 2, num_splits = 2); each output feeds its own second MatMul.
+        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2});
+        auto split = std::make_shared<ov::op::v1::Split>(multiply, split_axis, 2);
+
+        auto variable_b_0 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"});
+        auto variable_b_1 = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"});
+
+        auto read_value_b_0 = std::make_shared<ov::op::v6::ReadValue>(variable_b_0);
+        auto read_value_b_1 = std::make_shared<ov::op::v6::ReadValue>(variable_b_1);
+
+        auto matmul2_0 = std::make_shared<ov::op::v0::MatMul>(split->output(0), read_value_b_0, false, true);
+        auto matmul2_1 = std::make_shared<ov::op::v0::MatMul>(split->output(1), read_value_b_1, false, true);
+        // The second MatMul outputs are concatenated along axis 2 and added to the FC output by a single Add.
+        auto concat_matmul2 = std::make_shared<ov::op::v0::Concat>(NodeVector{matmul2_0, matmul2_1}, 2);
+
+        auto add = std::make_shared<ov::op::v1::Add>(fc_fused, concat_matmul2);
+        // The VariadicSplit (sizes 2048 and 256 along axis 2) is applied to the Add result here.
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2});
+        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256});
+        auto var_split = std::make_shared<ov::op::v1::VariadicSplit>(add, axis_const, split_const);
+
+        auto reshape_pattern0 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 32, 64});
+        auto reshape_pattern1 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 4, 64});
+        auto reshape0 = std::make_shared<ov::op::v1::Reshape>(var_split->output(0), reshape_pattern0, true);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(var_split->output(1), reshape_pattern1, true);
+        // The ShapeOf consumers are attached to the corresponding VariadicSplit outputs.
+        auto shape_of0 = std::make_shared<ov::op::v0::ShapeOf>(var_split->output(0));
+        auto shape_of1 = std::make_shared<ov::op::v0::ShapeOf>(var_split->output(0));
+        auto shape_of2 = std::make_shared<ov::op::v0::ShapeOf>(var_split->output(1));
+        auto shape_of3 = std::make_shared<ov::op::v0::ShapeOf>(var_split->output(1));
+
+        auto result0 = std::make_shared<ov::op::v0::Result>(reshape0);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+        auto result2 = std::make_shared<ov::op::v0::Result>(shape_of0);
+        auto result3 = std::make_shared<ov::op::v0::Result>(shape_of1);
+        auto result4 = std::make_shared<ov::op::v0::Result>(shape_of2);
+        auto result5 = std::make_shared<ov::op::v0::Result>(shape_of3);
+
+        model_ref = std::make_shared<ov::Model>(ov::NodeVector{result0, result1, result2, result3, result4, result5}, ov::ParameterVector{lora_input});
+        comparator.enable(FunctionsComparator::ATTRIBUTES);
+    }
+}
+
 }  // namespace intel_gpu
 }  // namespace test
 }  // namespace ov