From 3b6adcd65168f9cd26d139ebe1089b0548e7f961 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 15 Oct 2024 14:30:13 +0800
Subject: [PATCH 01/14] add experimental retry loop and debug tracing.

---
 src/plugins/auto/src/schedule.cpp | 47 ++++++++++++++++++++++---------
 src/plugins/auto/src/schedule.hpp |  3 +-
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 96a815cc21c8c6..92ee3a84375187 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -9,6 +9,7 @@ namespace auto_plugin {
 thread_local WorkerInferRequest* Schedule::m_this_worker_infer_request = nullptr;
 // TODO: revert to the plain variable (see header file), when we moved to the next CentOS 8.x in our support matrix
 thread_local const char* Schedule::m_this_preferred_device_name = "";
+int32_t Schedule::m_need_retry_times = 0;
 
 void Schedule::launch(const ScheduleContext::Ptr& context) {
     m_context = context;
@@ -55,18 +56,27 @@ bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
     NotBusyPriorityWorkerRequests& idle_workerrequests,
     const DeviceName& preferred_device) {
     WorkerInferRequest* worker_request_ptr = nullptr;
+    static int index = 0;
     std::pair<int, WorkerInferRequest*> worker;
-    if (idle_workerrequests.try_pop(worker)) {
-        worker_request_ptr = worker.second;
-        IdleGuard<NotBusyPriorityWorkerRequests> idle_guard{worker_request_ptr, idle_workerrequests};
-        m_this_worker_infer_request = worker_request_ptr;
-        {
-            auto captured_task = std::move(pipeline_task);
-            captured_task();
+    std::cout << "------- start try pop -------\n";
+    do {
+        std::cout << "------- [Need retry: " << m_need_retry_times << "] try pop index: " << index << std::endl;
+        if (idle_workerrequests.try_pop(worker)) {
+            std::cout << "------- [Need retry: " << m_need_retry_times << "] popped index: " << index++ << std::endl;
+            worker_request_ptr = worker.second;
+            IdleGuard<NotBusyPriorityWorkerRequests> idle_guard{worker_request_ptr, idle_workerrequests};
+            m_this_worker_infer_request = worker_request_ptr;
+            {
+                auto captured_task = std::move(pipeline_task);
+                captured_task();
+            }
+            idle_guard.release();
+            return true;
+        } else {
+            std::cout << "1234567 [Need retry: " << m_need_retry_times << "] Failed to pop index: " << index
+                      << std::endl;
         }
-        idle_guard.release();
-        return true;
-    }
+    } while (m_need_retry_times-- > 0);
     return false;
 }
 
@@ -85,8 +95,11 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         OPENVINO_THROW("Every device used with AUTO should support query optimal_number_of_infer_requests property from compiled model ",
                     iie.what());
     }
-    const auto num_requests = (m_context->m_device_priorities.end() == it_numrequests ||
-                              it_numrequests->num_requests_per_devices == -1) ? optimal_num : it_numrequests->num_requests_per_devices;
+    auto num_requests =
+        (m_context->m_device_priorities.end() == it_numrequests || it_numrequests->num_requests_per_devices == -1)
+            ? optimal_num
+            : it_numrequests->num_requests_per_devices;
+    num_requests = num_requests < 2 ? 2 : num_requests;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
     worker_requests.resize(num_requests);
@@ -128,10 +141,13 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                     } else {
                         stop_retry_and_continue();
                     }
+                    static int index = 0;
+                    std::cout << "------- try push index: " << ++index << std::endl;
                     // try to return the request to the idle list (fails if the overall object destruction has began)
-                    if (idleGuard.release()->try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
+                    if (index % 2 && idleGuard.release()->try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
                         // let's try to pop a task, as we know there is at least one idle request, schedule if succeeded
                         // if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded
+                        std::cout << "------- pushed index: " << index << std::endl;
                         ov::threading::Task t;
                         do {
                             m_infer_pipeline_tasks.try_pop(t);
@@ -139,6 +155,11 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                         do {
                             m_infer_pipeline_tasks_device_specific[device]->try_pop(t);
                         } while (t && schedule_to_worker_infer_request(std::move(t), device));
+                    } else {
+                        m_need_retry_times++;
+                        std::cout << "-------[Need to retry: " << m_need_retry_times
+                                  << " ] Failed to try push index: " << index << std::endl;
+                        std::this_thread::sleep_for(std::chrono::seconds(1));
                     }
                 }
             });
diff --git a/src/plugins/auto/src/schedule.hpp b/src/plugins/auto/src/schedule.hpp
index 99efa3138cef00..0568e88b220118 100644
--- a/src/plugins/auto/src/schedule.hpp
+++ b/src/plugins/auto/src/schedule.hpp
@@ -25,7 +25,8 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     // have to use the const char* ptr rather than std::string due to a bug in old gcc versions,
     // the bug is e.g. manifesting on the old CentOS (and it's 4.8.x gcc) used in our testing
     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81880
-    static thread_local const char*         m_this_preferred_device_name;
+    static thread_local const char*             m_this_preferred_device_name;
+    static std::int32_t            m_need_retry_times;
 
 protected:
     virtual void init() = 0;

From 15f9def04cadf8d45c628f5eee0dce760345197b Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Mon, 28 Oct 2024 16:17:46 +0800
Subject: [PATCH 02/14] enable sync for handling worker infer requests.

---
 src/plugins/auto/src/auto_schedule.cpp       |  8 ++-
 src/plugins/auto/src/cumulative_schedule.cpp |  6 +-
 src/plugins/auto/src/schedule.cpp            | 73 +++++++++++---------
 src/plugins/auto/src/schedule.hpp            | 10 ++-
 4 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp
index ea5f2159179824..f7389fa2fc7bce 100644
--- a/src/plugins/auto/src/auto_schedule.cpp
+++ b/src/plugins/auto/src/auto_schedule.cpp
@@ -212,10 +212,12 @@ void AutoSchedule::init() {
             // initialize containers before run async task
             m_idle_worker_requests[device.device_name];
             m_worker_requests[device.device_name];
+            m_worker_requests_cvs[device.device_name];
             m_infer_pipeline_tasks_device_specific[device.device_name] = nullptr;
         }
         m_idle_worker_requests["CPU_HELP"];
         m_worker_requests["CPU_HELP"];
+        m_worker_requests_cvs["CPU_HELP"];
         m_infer_pipeline_tasks_device_specific["CPU_HELP"] = nullptr;
         m_executor->run(m_compile_context[CPU].m_task);
         m_executor->run(m_compile_context[ACTUALDEVICE].m_task);
@@ -486,7 +488,11 @@ bool AutoSchedule::schedule_to_worker_infer_request(ov::threading::Task pipeline
         if (!preferred_device.empty() && (device.device_name != preferred_device)) {
             continue;
         }
-        if (run_pipeline_task(pipeline_task, m_idle_worker_requests[device.device_name], preferred_device)) {
+        if (run_pipeline_task(pipeline_task,
+                              m_idle_worker_requests[device.device_name],
+                              preferred_device,
+                              m_worker_requests_cvs[device.device_name],
+                              m_worker_infer_mutex)) {
             return true;
         }
     }
diff --git a/src/plugins/auto/src/cumulative_schedule.cpp b/src/plugins/auto/src/cumulative_schedule.cpp
index a607205e17d1e5..adabfb9826c670 100644
--- a/src/plugins/auto/src/cumulative_schedule.cpp
+++ b/src/plugins/auto/src/cumulative_schedule.cpp
@@ -247,7 +247,11 @@ bool CumuSchedule::schedule_to_worker_infer_request(ov::threading::Task pipeline
         }
         auto selected_device_name =
             preferred_device.empty() ? schedule_to_next_device(devices, current_device_index) : preferred_device;
-        if (run_pipeline_task(pipeline_task, m_idle_worker_requests[selected_device_name], preferred_device)) {
+        if (run_pipeline_task(pipeline_task,
+                              m_idle_worker_requests[selected_device_name],
+                              preferred_device,
+                              m_worker_requests_cvs[selected_device_name],
+                              m_worker_infer_mutex)) {
             return true;
         } else {
             current_device_index++;
diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 92ee3a84375187..ddfa2fd3dc1506 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -9,7 +9,6 @@ namespace auto_plugin {
 thread_local WorkerInferRequest* Schedule::m_this_worker_infer_request = nullptr;
 // TODO: revert to the plain variable (see header file), when we moved to the next CentOS 8.x in our support matrix
 thread_local const char* Schedule::m_this_preferred_device_name = "";
-int32_t Schedule::m_need_retry_times = 0;
 
 void Schedule::launch(const ScheduleContext::Ptr& context) {
     m_context = context;
@@ -53,30 +52,36 @@ void Schedule::run(ov::threading::Task pipeline_task) {
 }
 
 bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
-    NotBusyPriorityWorkerRequests& idle_workerrequests,
-    const DeviceName& preferred_device) {
+                                 NotBusyPriorityWorkerRequests& idle_workerrequests,
+                                 const DeviceName& preferred_device,
+                                 std::condition_variable& idle_workerrequests_cv,
+                                 std::mutex& worker_infer_mutex) {
     WorkerInferRequest* worker_request_ptr = nullptr;
     static int index = 0;
     std::pair<int, WorkerInferRequest*> worker;
+    std::unique_lock<std::mutex> lck(worker_infer_mutex);
     std::cout << "------- start try pop -------\n";
-    do {
-        std::cout << "------- [Need retry: " << m_need_retry_times << "] try pop index: " << index << std::endl;
-        if (idle_workerrequests.try_pop(worker)) {
-            std::cout << "------- [Need retry: " << m_need_retry_times << "] popped index: " << index++ << std::endl;
-            worker_request_ptr = worker.second;
-            IdleGuard<NotBusyPriorityWorkerRequests> idle_guard{worker_request_ptr, idle_workerrequests};
-            m_this_worker_infer_request = worker_request_ptr;
-            {
-                auto captured_task = std::move(pipeline_task);
-                captured_task();
-            }
-            idle_guard.release();
-            return true;
-        } else {
-            std::cout << "1234567 [Need retry: " << m_need_retry_times << "] Failed to pop index: " << index
-                      << std::endl;
+    std::cout << "------- try pop index: " << index << std::endl;
+    if (!idle_workerrequests.try_pop(worker)) {
+        std::cout << "------- pop failed and will wait......" << std::endl;
+        idle_workerrequests_cv.wait(lck, [&idle_workerrequests, &worker] {
+            return idle_workerrequests.try_pop(worker);
+        });
+    }
+    if (worker.second) {
+        std::cout << "------- popped index: " << index++ << std::endl;
+        worker_request_ptr = worker.second;
+        IdleGuard<NotBusyPriorityWorkerRequests> idle_guard{worker_request_ptr, idle_workerrequests};
+        m_this_worker_infer_request = worker_request_ptr;
+        {
+            auto captured_task = std::move(pipeline_task);
+            captured_task();
         }
-    } while (m_need_retry_times-- > 0);
+        idle_guard.release();
+        return true;
+    } else {
+        std::cout << "------- Failed to pop index: " << index << std::endl;
+    }
     return false;
 }
 
@@ -99,9 +104,9 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         (m_context->m_device_priorities.end() == it_numrequests || it_numrequests->num_requests_per_devices == -1)
             ? optimal_num
             : it_numrequests->num_requests_per_devices;
-    num_requests = num_requests < 2 ? 2 : num_requests;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
+    auto& worker_requests_cv = m_worker_requests_cvs[device];
     worker_requests.resize(num_requests);
     m_infer_pipeline_tasks_device_specific[device] = std::unique_ptr<TaskQueue>(new TaskQueue);
     auto* idle_workerrequests_ptr = &(idle_worker_requests);
@@ -111,9 +116,11 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         worker_request.m_inferrequest = {compiled_model->create_infer_request(), compiled_model._so};
         auto* worker_request_ptr = &worker_request;
         worker_request_ptr->m_index = num++;
-        OPENVINO_ASSERT(idle_worker_requests.try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr)) == true);
+        OPENVINO_ASSERT(
+            idle_worker_requests.try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr)) == true);
         worker_request.m_inferrequest->set_callback(
-            [worker_request_ptr, this, device, idle_workerrequests_ptr](std::exception_ptr exception_ptr) mutable {
+            [worker_request_ptr, this, device, idle_workerrequests_ptr, &worker_requests_cv](
+                std::exception_ptr exception_ptr) mutable {
                 IdleGuard<NotBusyPriorityWorkerRequests> idleGuard{worker_request_ptr, *idle_workerrequests_ptr};
                 worker_request_ptr->m_exception_ptr = std::move(exception_ptr);
                 {
@@ -142,11 +149,15 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                         stop_retry_and_continue();
                     }
                     static int index = 0;
-                    std::cout << "------- try push index: " << ++index << std::endl;
-                    // try to return the request to the idle list (fails if the overall object destruction has began)
-                    if (index % 2 && idleGuard.release()->try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
-                        // let's try to pop a task, as we know there is at least one idle request, schedule if succeeded
-                        // if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded
+                    std::cout << "------- try push index: " << index++ << std::endl;
+                    // try to return the request to the idle list (fails if the overall object destruction has
+                    // began)
+                    //std::this_thread::sleep_for(std::chrono::seconds(5));
+                    if (idleGuard.release()->try_push(
+                            std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
+                        // let's try to pop a task, as we know there is at least one idle request, schedule if
+                        // succeeded if no device-agnostic tasks, let's try pop the device specific task, schedule
+                        // if succeeded
                         std::cout << "------- pushed index: " << index << std::endl;
                         ov::threading::Task t;
                         do {
@@ -155,11 +166,7 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                         do {
                             m_infer_pipeline_tasks_device_specific[device]->try_pop(t);
                         } while (t && schedule_to_worker_infer_request(std::move(t), device));
-                    } else {
-                        m_need_retry_times++;
-                        std::cout << "-------[Need to retry: " << m_need_retry_times
-                                  << " ] Failed to try push index: " << index << std::endl;
-                        std::this_thread::sleep_for(std::chrono::seconds(1));
+                        worker_requests_cv.notify_all();
                     }
                 }
             });
diff --git a/src/plugins/auto/src/schedule.hpp b/src/plugins/auto/src/schedule.hpp
index 0568e88b220118..f7ae523ea986b8 100644
--- a/src/plugins/auto/src/schedule.hpp
+++ b/src/plugins/auto/src/schedule.hpp
@@ -26,12 +26,14 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     // the bug is e.g. manifesting on the old CentOS (and it's 4.8.x gcc) used in our testing
     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81880
     static thread_local const char*             m_this_preferred_device_name;
-    static std::int32_t            m_need_retry_times;
 
 protected:
     virtual void init() = 0;
-    static bool run_pipeline_task(ov::threading::Task& pipeline_task, NotBusyPriorityWorkerRequests& idle_worker_request,
-                                  const DeviceName& preferred_device);
+    static bool run_pipeline_task(ov::threading::Task& pipeline_task,
+                                  NotBusyPriorityWorkerRequests& idle_worker_request,
+                                  const DeviceName& preferred_device,
+                                  std::condition_variable& idle_worker_request_cv,
+                                  std::mutex& mutex);
     virtual void generate_workers(const std::string& device, const SoCompiledModel& compiled_model);
     virtual void try_to_compile_model(AutoCompileContext& context, const std::shared_ptr<ov::Model>& model) = 0;
     virtual bool schedule_to_worker_infer_request(ov::threading::Task, DeviceName preferred_device = "") = 0;
@@ -41,6 +43,7 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     std::shared_ptr<ov::threading::IStreamsExecutor>                     m_executor;
     DeviceMap<NotBusyPriorityWorkerRequests>                             m_idle_worker_requests;
     DeviceMap<std::vector<WorkerInferRequest>>                           m_worker_requests;
+    DeviceMap<std::condition_variable>                                   m_worker_requests_cvs;
     TaskQueue                                                            m_infer_pipeline_tasks;
     DeviceMap<std::unique_ptr<TaskQueue>>                                m_infer_pipeline_tasks_device_specific;
     SoCompiledModel                                                      m_passthrough_compiled_model;
@@ -51,6 +54,7 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     mutable std::atomic<std::size_t>                                     m_request_id = {0};
     std::mutex                                                           m_dev_infer_mutex;
     std::unordered_map<IASyncInferPtr, WorkerInferRequest*>              m_dev_infer;
+    std::mutex                                                           m_worker_infer_mutex;
 };
 
 }  // namespace auto_plugin

From e9c7d043522b8a1297c12d4ce27dc8efac6297cf Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 29 Oct 2024 14:49:14 +0800
Subject: [PATCH 03/14] remove debug tracing from schedule.cpp.

---
 src/plugins/auto/src/schedule.cpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index ddfa2fd3dc1506..133347c4527d05 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -57,19 +57,14 @@ bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
                                  std::condition_variable& idle_workerrequests_cv,
                                  std::mutex& worker_infer_mutex) {
     WorkerInferRequest* worker_request_ptr = nullptr;
-    static int index = 0;
     std::pair<int, WorkerInferRequest*> worker;
     std::unique_lock<std::mutex> lck(worker_infer_mutex);
-    std::cout << "------- start try pop -------\n";
-    std::cout << "------- try pop index: " << index << std::endl;
     if (!idle_workerrequests.try_pop(worker)) {
-        std::cout << "------- pop failed and will wait......" << std::endl;
         idle_workerrequests_cv.wait(lck, [&idle_workerrequests, &worker] {
             return idle_workerrequests.try_pop(worker);
         });
     }
     if (worker.second) {
-        std::cout << "------- popped index: " << index++ << std::endl;
         worker_request_ptr = worker.second;
         IdleGuard<NotBusyPriorityWorkerRequests> idle_guard{worker_request_ptr, idle_workerrequests};
         m_this_worker_infer_request = worker_request_ptr;
@@ -79,8 +74,6 @@ bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
         }
         idle_guard.release();
         return true;
-    } else {
-        std::cout << "------- Failed to pop index: " << index << std::endl;
     }
     return false;
 }
@@ -148,17 +141,11 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                     } else {
                         stop_retry_and_continue();
                     }
-                    static int index = 0;
-                    std::cout << "------- try push index: " << index++ << std::endl;
-                    // try to return the request to the idle list (fails if the overall object destruction has
-                    // began)
-                    //std::this_thread::sleep_for(std::chrono::seconds(5));
                     if (idleGuard.release()->try_push(
                             std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
                         // let's try to pop a task, as we know there is at least one idle request, schedule if
                         // succeeded if no device-agnostic tasks, let's try pop the device specific task, schedule
                         // if succeeded
-                        std::cout << "------- pushed index: " << index << std::endl;
                         ov::threading::Task t;
                         do {
                             m_infer_pipeline_tasks.try_pop(t);

From eb727a475b1340e73306324233374f2cb6e5c70a Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 12 Nov 2024 16:35:52 +0800
Subject: [PATCH 04/14] lock m_worker_infer_mutex before re-queuing worker.

---
 src/plugins/auto/src/schedule.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index c5e839948e1476..d29e824de234a7 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -141,6 +141,7 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                     } else {
                         stop_retry_and_continue();
                     }
+                    std::unique_lock<std::mutex> lck(m_worker_infer_mutex);
                     if (idleGuard.release()->try_push(
                             std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
                         // let's try to pop a task, as we know there is at least one idle request, schedule if

From 3507dfebf1593f0def4bd60529099cc06a6a6868 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 12 Nov 2024 16:39:29 +0800
Subject: [PATCH 05/14] rename m_worker_requests_cvs to m_worker_requests_conds.

---
 src/plugins/auto/src/auto_schedule.cpp       | 6 +++---
 src/plugins/auto/src/cumulative_schedule.cpp | 2 +-
 src/plugins/auto/src/schedule.cpp            | 2 +-
 src/plugins/auto/src/schedule.hpp            | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp
index d87e4b4f418212..a9ef0df8f17307 100644
--- a/src/plugins/auto/src/auto_schedule.cpp
+++ b/src/plugins/auto/src/auto_schedule.cpp
@@ -212,12 +212,12 @@ void AutoSchedule::init() {
             // initialize containers before run async task
             m_idle_worker_requests[device.device_name];
             m_worker_requests[device.device_name];
-            m_worker_requests_cvs[device.device_name];
+            m_worker_requests_conds[device.device_name];
             m_infer_pipeline_tasks_device_specific[device.device_name] = nullptr;
         }
         m_idle_worker_requests["CPU_HELP"];
         m_worker_requests["CPU_HELP"];
-        m_worker_requests_cvs["CPU_HELP"];
+        m_worker_requests_conds["CPU_HELP"];
         m_infer_pipeline_tasks_device_specific["CPU_HELP"] = nullptr;
         m_executor->run(m_compile_context[CPU].m_task);
         m_executor->run(m_compile_context[ACTUALDEVICE].m_task);
@@ -493,7 +493,7 @@ bool AutoSchedule::schedule_to_worker_infer_request(ov::threading::Task pipeline
         if (run_pipeline_task(pipeline_task,
                               m_idle_worker_requests[device.device_name],
                               preferred_device,
-                              m_worker_requests_cvs[device.device_name],
+                              m_worker_requests_conds[device.device_name],
                               m_worker_infer_mutex)) {
             return true;
         }
diff --git a/src/plugins/auto/src/cumulative_schedule.cpp b/src/plugins/auto/src/cumulative_schedule.cpp
index adabfb9826c670..89672acd8a9073 100644
--- a/src/plugins/auto/src/cumulative_schedule.cpp
+++ b/src/plugins/auto/src/cumulative_schedule.cpp
@@ -250,7 +250,7 @@ bool CumuSchedule::schedule_to_worker_infer_request(ov::threading::Task pipeline
         if (run_pipeline_task(pipeline_task,
                               m_idle_worker_requests[selected_device_name],
                               preferred_device,
-                              m_worker_requests_cvs[selected_device_name],
+                              m_worker_requests_conds[selected_device_name],
                               m_worker_infer_mutex)) {
             return true;
         } else {
diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index d29e824de234a7..4457ae55383ecb 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -99,7 +99,7 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
             : it_numrequests->num_requests_per_devices;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
-    auto& worker_requests_cv = m_worker_requests_cvs[device];
+    auto& worker_requests_cv = m_worker_requests_conds[device];
     worker_requests.resize(num_requests);
     m_infer_pipeline_tasks_device_specific[device] = std::unique_ptr<TaskQueue>(new TaskQueue);
     auto* idle_workerrequests_ptr = &(idle_worker_requests);
diff --git a/src/plugins/auto/src/schedule.hpp b/src/plugins/auto/src/schedule.hpp
index f7ae523ea986b8..eaa2915e27a45b 100644
--- a/src/plugins/auto/src/schedule.hpp
+++ b/src/plugins/auto/src/schedule.hpp
@@ -43,7 +43,7 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     std::shared_ptr<ov::threading::IStreamsExecutor>                     m_executor;
     DeviceMap<NotBusyPriorityWorkerRequests>                             m_idle_worker_requests;
     DeviceMap<std::vector<WorkerInferRequest>>                           m_worker_requests;
-    DeviceMap<std::condition_variable>                                   m_worker_requests_cvs;
+    DeviceMap<std::condition_variable>                                   m_worker_requests_conds;
     TaskQueue                                                            m_infer_pipeline_tasks;
     DeviceMap<std::unique_ptr<TaskQueue>>                                m_infer_pipeline_tasks_device_specific;
     SoCompiledModel                                                      m_passthrough_compiled_model;

From fdc866b441f3951b0f7d0264569abbfd766d29e9 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 12 Nov 2024 16:41:25 +0800
Subject: [PATCH 06/14] restore alignment of m_this_preferred_device_name.

---
 src/plugins/auto/src/schedule.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/auto/src/schedule.hpp b/src/plugins/auto/src/schedule.hpp
index eaa2915e27a45b..794accc9f0e1df 100644
--- a/src/plugins/auto/src/schedule.hpp
+++ b/src/plugins/auto/src/schedule.hpp
@@ -25,7 +25,7 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     // have to use the const char* ptr rather than std::string due to a bug in old gcc versions,
     // the bug is e.g. manifesting on the old CentOS (and it's 4.8.x gcc) used in our testing
     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81880
-    static thread_local const char*             m_this_preferred_device_name;
+    static thread_local const char*         m_this_preferred_device_name;
 
 protected:
     virtual void init() = 0;

From 47d40e14fce8bfd9574beb6a76c2fd1ec1120309 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Thu, 14 Nov 2024 09:54:40 +0800
Subject: [PATCH 07/14] init conds in CumuSchedule and narrow lock scope.

---
 src/plugins/auto/src/cumulative_schedule.cpp |  1 +
 src/plugins/auto/src/schedule.cpp            | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/plugins/auto/src/cumulative_schedule.cpp b/src/plugins/auto/src/cumulative_schedule.cpp
index 89672acd8a9073..153e2c09e52d75 100644
--- a/src/plugins/auto/src/cumulative_schedule.cpp
+++ b/src/plugins/auto/src/cumulative_schedule.cpp
@@ -148,6 +148,7 @@ void CumuSchedule::init() {
         // initialize containers before run async task, if not initialized, it will hang during infer
         m_idle_worker_requests[device.device_name];
         m_worker_requests[device.device_name];
+        m_worker_requests_conds[device.device_name];
         m_infer_pipeline_tasks_device_specific[device.device_name] = nullptr;
     }
     // load devices other than CPU first
diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 4457ae55383ecb..ae12e45c810488 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -58,11 +58,13 @@ bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
                                  std::mutex& worker_infer_mutex) {
     WorkerInferRequest* worker_request_ptr = nullptr;
     std::pair<int, WorkerInferRequest*> worker;
-    std::unique_lock<std::mutex> lck(worker_infer_mutex);
-    if (!idle_workerrequests.try_pop(worker)) {
-        idle_workerrequests_cv.wait(lck, [&idle_workerrequests, &worker] {
-            return idle_workerrequests.try_pop(worker);
-        });
+    {
+        std::unique_lock<std::mutex> lck(worker_infer_mutex);
+        if (!idle_workerrequests.try_pop(worker)) {
+            idle_workerrequests_cv.wait(lck, [&idle_workerrequests, &worker] {
+                return idle_workerrequests.try_pop(worker);
+            });
+        }
     }
     if (worker.second) {
         worker_request_ptr = worker.second;

From 81f346cccf08b0425b7ec5d274ca1b2f8904fde6 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Wed, 4 Dec 2024 15:20:15 +0800
Subject: [PATCH 08/14] remove the condition variable control and increase the
 number of worker requests to at least 2 per device to avoid deadlock.

---
 src/plugins/auto/src/auto_schedule.cpp       |  8 +---
 src/plugins/auto/src/cumulative_schedule.cpp |  7 +---
 src/plugins/auto/src/schedule.cpp            | 43 +++++++-------------
 src/plugins/auto/src/schedule.hpp            |  9 +---
 4 files changed, 19 insertions(+), 48 deletions(-)

diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp
index a9ef0df8f17307..c504e8e4457870 100644
--- a/src/plugins/auto/src/auto_schedule.cpp
+++ b/src/plugins/auto/src/auto_schedule.cpp
@@ -212,12 +212,10 @@ void AutoSchedule::init() {
             // initialize containers before run async task
             m_idle_worker_requests[device.device_name];
             m_worker_requests[device.device_name];
-            m_worker_requests_conds[device.device_name];
             m_infer_pipeline_tasks_device_specific[device.device_name] = nullptr;
         }
         m_idle_worker_requests["CPU_HELP"];
         m_worker_requests["CPU_HELP"];
-        m_worker_requests_conds["CPU_HELP"];
         m_infer_pipeline_tasks_device_specific["CPU_HELP"] = nullptr;
         m_executor->run(m_compile_context[CPU].m_task);
         m_executor->run(m_compile_context[ACTUALDEVICE].m_task);
@@ -490,11 +488,7 @@ bool AutoSchedule::schedule_to_worker_infer_request(ov::threading::Task pipeline
         if (!preferred_device.empty() && (device.device_name != preferred_device)) {
             continue;
         }
-        if (run_pipeline_task(pipeline_task,
-                              m_idle_worker_requests[device.device_name],
-                              preferred_device,
-                              m_worker_requests_conds[device.device_name],
-                              m_worker_infer_mutex)) {
+        if (run_pipeline_task(pipeline_task, m_idle_worker_requests[device.device_name], preferred_device)) {
             return true;
         }
     }
diff --git a/src/plugins/auto/src/cumulative_schedule.cpp b/src/plugins/auto/src/cumulative_schedule.cpp
index 153e2c09e52d75..a607205e17d1e5 100644
--- a/src/plugins/auto/src/cumulative_schedule.cpp
+++ b/src/plugins/auto/src/cumulative_schedule.cpp
@@ -148,7 +148,6 @@ void CumuSchedule::init() {
         // initialize containers before run async task, if not initialized, it will hang during infer
         m_idle_worker_requests[device.device_name];
         m_worker_requests[device.device_name];
-        m_worker_requests_conds[device.device_name];
         m_infer_pipeline_tasks_device_specific[device.device_name] = nullptr;
     }
     // load devices other than CPU first
@@ -248,11 +247,7 @@ bool CumuSchedule::schedule_to_worker_infer_request(ov::threading::Task pipeline
         }
         auto selected_device_name =
             preferred_device.empty() ? schedule_to_next_device(devices, current_device_index) : preferred_device;
-        if (run_pipeline_task(pipeline_task,
-                              m_idle_worker_requests[selected_device_name],
-                              preferred_device,
-                              m_worker_requests_conds[selected_device_name],
-                              m_worker_infer_mutex)) {
+        if (run_pipeline_task(pipeline_task, m_idle_worker_requests[selected_device_name], preferred_device)) {
             return true;
         } else {
             current_device_index++;
diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index ae12e45c810488..198b115fdb5c79 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -53,20 +53,10 @@ void Schedule::run(ov::threading::Task pipeline_task) {
 
 bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
                                  NotBusyPriorityWorkerRequests& idle_workerrequests,
-                                 const DeviceName& preferred_device,
-                                 std::condition_variable& idle_workerrequests_cv,
-                                 std::mutex& worker_infer_mutex) {
+    const DeviceName& preferred_device) {
     WorkerInferRequest* worker_request_ptr = nullptr;
     std::pair<int, WorkerInferRequest*> worker;
-    {
-        std::unique_lock<std::mutex> lck(worker_infer_mutex);
-        if (!idle_workerrequests.try_pop(worker)) {
-            idle_workerrequests_cv.wait(lck, [&idle_workerrequests, &worker] {
-                return idle_workerrequests.try_pop(worker);
-            });
-        }
-    }
-    if (worker.second) {
+    if (idle_workerrequests.try_pop(worker)) {
         worker_request_ptr = worker.second;
         IdleGuard<NotBusyPriorityWorkerRequests> idle_guard{worker_request_ptr, idle_workerrequests};
         m_this_worker_infer_request = worker_request_ptr;
@@ -95,13 +85,15 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         OPENVINO_THROW("Every device used with AUTO should support query optimal_number_of_infer_requests property from compiled model ",
                     iie.what());
     }
-    auto num_requests =
-        (m_context->m_device_priorities.end() == it_numrequests || it_numrequests->num_requests_per_devices == -1)
-            ? optimal_num
-            : it_numrequests->num_requests_per_devices;
+    auto num_requests = (m_context->m_device_priorities.end() == it_numrequests ||
+                              it_numrequests->num_requests_per_devices == -1) ? optimal_num : it_numrequests->num_requests_per_devices;
+    // If the user creates only one infer request, we need to ensure at least 2 requests per device.
+    // This is necessary to handle the case where a request worker is popped from the idle queue before being pushed back.
+    // Without at least 2 requests, there could be a situation where no requests are available for inference,
+    // leading to potential deadlocks.
+    num_requests = num_requests <= 1 ? 2 : num_requests;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
-    auto& worker_requests_cv = m_worker_requests_conds[device];
     worker_requests.resize(num_requests);
     m_infer_pipeline_tasks_device_specific[device] = std::unique_ptr<TaskQueue>(new TaskQueue);
     auto* idle_workerrequests_ptr = &(idle_worker_requests);
@@ -111,11 +103,9 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         worker_request.m_inferrequest = {compiled_model->create_infer_request(), compiled_model._so};
         auto* worker_request_ptr = &worker_request;
         worker_request_ptr->m_index = num++;
-        OPENVINO_ASSERT(
-            idle_worker_requests.try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr)) == true);
+        OPENVINO_ASSERT(idle_worker_requests.try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr)) == true);
         worker_request.m_inferrequest->set_callback(
-            [worker_request_ptr, this, device, idle_workerrequests_ptr, &worker_requests_cv](
-                std::exception_ptr exception_ptr) mutable {
+            [worker_request_ptr, this, device, idle_workerrequests_ptr](std::exception_ptr exception_ptr) mutable {
                 IdleGuard<NotBusyPriorityWorkerRequests> idleGuard{worker_request_ptr, *idle_workerrequests_ptr};
                 worker_request_ptr->m_exception_ptr = std::move(exception_ptr);
                 {
@@ -143,12 +133,10 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                     } else {
                         stop_retry_and_continue();
                     }
-                    std::unique_lock<std::mutex> lck(m_worker_infer_mutex);
-                    if (idleGuard.release()->try_push(
-                            std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
-                        // let's try to pop a task, as we know there is at least one idle request, schedule if
-                        // succeeded if no device-agnostic tasks, let's try pop the device specific task, schedule
-                        // if succeeded
+                    // try to return the request to the idle list (fails if the overall object destruction has began)
+                    if (idleGuard.release()->try_push(std::make_pair(worker_request_ptr->m_index, worker_request_ptr))) {
+                        // let's try to pop a task, as we know there is at least one idle request, schedule if succeeded
+                        // if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded
                         ov::threading::Task t;
                         do {
                             m_infer_pipeline_tasks.try_pop(t);
@@ -156,7 +144,6 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
                         do {
                             m_infer_pipeline_tasks_device_specific[device]->try_pop(t);
                         } while (t && schedule_to_worker_infer_request(std::move(t), device));
-                        worker_requests_cv.notify_all();
                     }
                 }
             });
diff --git a/src/plugins/auto/src/schedule.hpp b/src/plugins/auto/src/schedule.hpp
index 794accc9f0e1df..99efa3138cef00 100644
--- a/src/plugins/auto/src/schedule.hpp
+++ b/src/plugins/auto/src/schedule.hpp
@@ -29,11 +29,8 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
 
 protected:
     virtual void init() = 0;
-    static bool run_pipeline_task(ov::threading::Task& pipeline_task,
-                                  NotBusyPriorityWorkerRequests& idle_worker_request,
-                                  const DeviceName& preferred_device,
-                                  std::condition_variable& idle_worker_request_cv,
-                                  std::mutex& mutex);
+    static bool run_pipeline_task(ov::threading::Task& pipeline_task, NotBusyPriorityWorkerRequests& idle_worker_request,
+                                  const DeviceName& preferred_device);
     virtual void generate_workers(const std::string& device, const SoCompiledModel& compiled_model);
     virtual void try_to_compile_model(AutoCompileContext& context, const std::shared_ptr<ov::Model>& model) = 0;
     virtual bool schedule_to_worker_infer_request(ov::threading::Task, DeviceName preferred_device = "") = 0;
@@ -43,7 +40,6 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     std::shared_ptr<ov::threading::IStreamsExecutor>                     m_executor;
     DeviceMap<NotBusyPriorityWorkerRequests>                             m_idle_worker_requests;
     DeviceMap<std::vector<WorkerInferRequest>>                           m_worker_requests;
-    DeviceMap<std::condition_variable>                                   m_worker_requests_conds;
     TaskQueue                                                            m_infer_pipeline_tasks;
     DeviceMap<std::unique_ptr<TaskQueue>>                                m_infer_pipeline_tasks_device_specific;
     SoCompiledModel                                                      m_passthrough_compiled_model;
@@ -54,7 +50,6 @@ class Schedule : public std::enable_shared_from_this<Schedule>, public ov::threa
     mutable std::atomic<std::size_t>                                     m_request_id = {0};
     std::mutex                                                           m_dev_infer_mutex;
     std::unordered_map<IASyncInferPtr, WorkerInferRequest*>              m_dev_infer;
-    std::mutex                                                           m_worker_infer_mutex;
 };
 
 }  // namespace auto_plugin

From 976aa4792c660e51b30da6c1aa92c01050d89450 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Wed, 4 Dec 2024 15:40:33 +0800
Subject: [PATCH 09/14] align run_pipeline_task parameter indentation.

---
 src/plugins/auto/src/schedule.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 198b115fdb5c79..9f59d00311f152 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -53,7 +53,7 @@ void Schedule::run(ov::threading::Task pipeline_task) {
 
 bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
                                  NotBusyPriorityWorkerRequests& idle_workerrequests,
-    const DeviceName& preferred_device) {
+                                 const DeviceName& preferred_device) {
     WorkerInferRequest* worker_request_ptr = nullptr;
     std::pair<int, WorkerInferRequest*> worker;
     if (idle_workerrequests.try_pop(worker)) {

From 2ef471a26bf1a8dc2b32314e404acf60f6476cea Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Wed, 4 Dec 2024 15:46:04 +0800
Subject: [PATCH 10/14] re-indent run_pipeline_task parameters.

---
 src/plugins/auto/src/schedule.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 9f59d00311f152..fd2b112e707a36 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -52,8 +52,8 @@ void Schedule::run(ov::threading::Task pipeline_task) {
 }
 
 bool Schedule::run_pipeline_task(ov::threading::Task& pipeline_task,
-                                 NotBusyPriorityWorkerRequests& idle_workerrequests,
-                                 const DeviceName& preferred_device) {
+    NotBusyPriorityWorkerRequests& idle_workerrequests,
+    const DeviceName& preferred_device) {
     WorkerInferRequest* worker_request_ptr = nullptr;
     std::pair<int, WorkerInferRequest*> worker;
     if (idle_workerrequests.try_pop(worker)) {

From 10f6a9f2ac536dc21b08bc3bd6fd3b4ea158ffe0 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 10 Dec 2024 15:48:18 +0800
Subject: [PATCH 11/14] update number of infer requests for throughput mode.

---
 src/plugins/auto/src/schedule.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index fd2b112e707a36..0d7bd4b0ecda3c 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -85,13 +85,13 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         OPENVINO_THROW("Every device used with AUTO should support query optimal_number_of_infer_requests property from compiled model ",
                     iie.what());
     }
-    auto num_requests = (m_context->m_device_priorities.end() == it_numrequests ||
-                              it_numrequests->num_requests_per_devices == -1) ? optimal_num : it_numrequests->num_requests_per_devices;
-    // If the user creates only one infer request, we need to ensure at least 2 requests per device.
-    // This is necessary to handle the case where a request worker is popped from the idle queue before being pushed back.
-    // Without at least 2 requests, there could be a situation where no requests are available for inference,
-    // leading to potential deadlocks.
-    num_requests = num_requests <= 1 ? 2 : num_requests;
+    auto num_requests =
+        (m_context->m_device_priorities.end() == it_numrequests || it_numrequests->num_requests_per_devices == -1)
+            ? optimal_num
+            : it_numrequests->num_requests_per_devices;
+    num_requests = num_requests <= 1 && m_context->m_performance_hint == ov::hint::PerformanceMode::THROUGHPUT
+                       ? 2
+                       : num_requests;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
     worker_requests.resize(num_requests);

From 306ac481fd84d48b1149b38735f435c45381c6d6 Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Tue, 17 Dec 2024 11:12:34 +0800
Subject: [PATCH 12/14] update test cases.

---
 src/plugins/auto/src/schedule.cpp             |  7 ++-
 .../auto/tests/unit/release_helper_test.cpp   | 14 +++--
 .../auto/tests/unit/runtime_fallback_test.cpp | 58 ++++++++++---------
 3 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 0d7bd4b0ecda3c..2fbb81e34647fb 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -89,9 +89,10 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         (m_context->m_device_priorities.end() == it_numrequests || it_numrequests->num_requests_per_devices == -1)
             ? optimal_num
             : it_numrequests->num_requests_per_devices;
-    num_requests = num_requests <= 1 && m_context->m_performance_hint == ov::hint::PerformanceMode::THROUGHPUT
-                       ? 2
-                       : num_requests;
+    num_requests =
+        (num_requests == 1 && m_context->m_performance_hint != ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT)
+            ? 2
+            : num_requests;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
     worker_requests.resize(num_requests);
diff --git a/src/plugins/auto/tests/unit/release_helper_test.cpp b/src/plugins/auto/tests/unit/release_helper_test.cpp
index 89c4654f58bc78..2485cb6ee758a2 100644
--- a/src/plugins/auto/tests/unit/release_helper_test.cpp
+++ b/src/plugins/auto/tests/unit/release_helper_test.cpp
@@ -157,7 +157,8 @@ TEST_P(AutoReleaseHelperTest, releaseResource) {
     bool cpuSuccess;
     bool accSuccess;
     std::tie(cpuSuccess, accSuccess) = this->GetParam();
-    size_t decreaseCount = 0;
+    size_t decreaseExeNetworkCount = 0;
+    size_t decreaseInferReqCount = 0;
     // test auto plugin
     plugin->set_device_name("AUTO");
     const std::string strDevices = ov::test::utils::DEVICE_GPU + std::string(",") + ov::test::utils::DEVICE_CPU;
@@ -188,8 +189,11 @@ TEST_P(AutoReleaseHelperTest, releaseResource) {
                               ::testing::Matcher<const std::string&>(StrEq(ov::test::utils::DEVICE_CPU)),
                               _))
             .WillByDefault(Return(mockExeNetwork));
-        if (accSuccess)
-            decreaseCount++;
+        if (accSuccess) {
+            decreaseExeNetworkCount++;
+            // will be at least 2 infer requests for mocked CPU/GPU
+            decreaseInferReqCount += 2;
+        }
     } else {
         ON_CALL(*core,
                 compile_model(::testing::Matcher<const std::shared_ptr<const ov::Model>&>(_),
@@ -224,8 +228,8 @@ TEST_P(AutoReleaseHelperTest, releaseResource) {
     auto sharedcount = mockExeNetwork._ptr.use_count();
     auto requestsharedcount = inferReqInternal.use_count();
     std::this_thread::sleep_for(std::chrono::milliseconds(500));
-    EXPECT_EQ(mockExeNetwork._ptr.use_count(), sharedcount - decreaseCount);
-    EXPECT_EQ(inferReqInternal.use_count(), requestsharedcount - decreaseCount);
+    EXPECT_EQ(mockExeNetwork._ptr.use_count(), sharedcount - decreaseExeNetworkCount);
+    EXPECT_EQ(inferReqInternal.use_count(), requestsharedcount - decreaseInferReqCount);
     if (cpuSuccess || accSuccess) {
         if (accSuccess)
             EXPECT_EQ(exeNetwork->get_property(ov::execution_devices.name()).as<std::string>(),
diff --git a/src/plugins/auto/tests/unit/runtime_fallback_test.cpp b/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
index 88eca787740e96..6b97802f872ee4 100644
--- a/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
+++ b/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
@@ -171,23 +171,24 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
         targetDev += deviceName;
         targetDev += ((deviceInfo == targetDevices.back()) ? "" : ",");
         if (deviceName == "CPU") {
-            mockInferrequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal,
-                                                                                             mockExecutor,
-                                                                                             nullptr,
-                                                                                             ifThrow);
-            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this]() {
-                return mockInferrequest;
+            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this, ifThrow]() {
+                auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal,
+                                                                                                  mockExecutor,
+                                                                                                  nullptr,
+                                                                                                  ifThrow);
+                return inferRequest;
             });
         } else if (deviceName == "GPU.0") {
-            mockInferrequestGPU_0 =
-                std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalActual,
-                                                                              mockExecutorGPU_0,
-                                                                              nullptr,
-                                                                              ifThrow);
-            ON_CALL(*mockIExeNetActual.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this]() {
-                std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                return mockInferrequestGPU_0;
-            }));
+            ON_CALL(*mockIExeNetActual.get(), create_infer_request())
+                .WillByDefault(InvokeWithoutArgs([this, ifThrow]() {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(0));
+                    auto inferRequest =
+                        std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalActual,
+                                                                                      mockExecutorGPU_0,
+                                                                                      nullptr,
+                                                                                      ifThrow);
+                    return inferRequest;
+                }));
         } else if (deviceName == "GPU.1") {
             if (generateWorkersFail) {
                 mockInferrequestGPU_1 =
@@ -197,24 +198,25 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
                                                                                   ifThrow);
                 ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request()).WillByDefault(ov::Throw("error"));
             } else {
-                mockInferrequestGPU_1 =
-                    std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalGPU_1,
-                                                                                  mockExecutorGPU_1,
-                                                                                  nullptr,
-                                                                                  ifThrow);
-                ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this]() {
-                    std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                    return mockInferrequestGPU_1;
-                }));
+                ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request())
+                    .WillByDefault(InvokeWithoutArgs([this, ifThrow]() {
+                        std::this_thread::sleep_for(std::chrono::milliseconds(0));
+                        auto inferRequest =
+                            std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalGPU_1,
+                                                                                          mockExecutorGPU_1,
+                                                                                          nullptr,
+                                                                                          ifThrow);
+                        return inferRequest;
+                    }));
             }
         } else if (deviceName == "OTHER") {
-            mockInferrequestOTHER = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalOTHER,
+            ON_CALL(*mockIExeNetOTHER.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this, ifThrow]() {
+                std::this_thread::sleep_for(std::chrono::milliseconds(0));
+                auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalOTHER,
                                                                                                   mockExecutorOTHER,
                                                                                                   nullptr,
                                                                                                   ifThrow);
-            ON_CALL(*mockIExeNetOTHER.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this]() {
-                std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                return mockInferrequestOTHER;
+                return inferRequest;
             }));
         } else {
             return;

From 4e27715cb6fb69cab8d9b82c5533f7b29042586b Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Fri, 20 Dec 2024 12:21:01 +0800
Subject: [PATCH 13/14] bump a single worker request to 2 regardless of performance hint and update unit tests.

---
 src/plugins/auto/src/schedule.cpp             |   5 +-
 .../auto/tests/unit/dynamic_output_test.cpp   |  28 ++-
 .../auto/tests/unit/runtime_fallback_test.cpp | 176 ++++++++++++++----
 3 files changed, 161 insertions(+), 48 deletions(-)

diff --git a/src/plugins/auto/src/schedule.cpp b/src/plugins/auto/src/schedule.cpp
index 2fbb81e34647fb..dc9961752f18c3 100644
--- a/src/plugins/auto/src/schedule.cpp
+++ b/src/plugins/auto/src/schedule.cpp
@@ -89,10 +89,7 @@ void Schedule::generate_workers(const std::string& device, const SoCompiledModel
         (m_context->m_device_priorities.end() == it_numrequests || it_numrequests->num_requests_per_devices == -1)
             ? optimal_num
             : it_numrequests->num_requests_per_devices;
-    num_requests =
-        (num_requests == 1 && m_context->m_performance_hint != ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT)
-            ? 2
-            : num_requests;
+    num_requests = (num_requests == 1) ? 2 : num_requests;
     auto& worker_requests = m_worker_requests[device];
     auto& idle_worker_requests = m_idle_worker_requests[device];
     worker_requests.resize(num_requests);
diff --git a/src/plugins/auto/tests/unit/dynamic_output_test.cpp b/src/plugins/auto/tests/unit/dynamic_output_test.cpp
index c0902b38ce5d46..ba7febe07d049e 100644
--- a/src/plugins/auto/tests/unit/dynamic_output_test.cpp
+++ b/src/plugins/auto/tests/unit/dynamic_output_test.cpp
@@ -7,7 +7,6 @@
 
 #include "include/auto_unit_test.hpp"
 #include "openvino/runtime/threading/immediate_executor.hpp"
-
 using DynamicOutputConfigParams = std::tuple<ov::Any,  // priority device list
                                              ov::Any   // expected device to run inference on
                                              >;
@@ -21,14 +20,18 @@ class DynamicOutputInferenceTest : public tests::AutoTest, public ::testing::Tes
         mockExecutor.reset();
         mockExecutorActual.reset();
         mockInferrequest.reset();
+        mockInferrequest_2.reset();
         mockInferrequestActual.reset();
+        mockInferrequestActual_2.reset();
     }
 
 protected:
     ov::Any priorityList;
     ov::Any targetList;
     std::shared_ptr<ov::mock_auto_plugin::MockAsyncInferRequest> mockInferrequest;
+    std::shared_ptr<ov::mock_auto_plugin::MockAsyncInferRequest> mockInferrequest_2;
     std::shared_ptr<ov::mock_auto_plugin::MockAsyncInferRequest> mockInferrequestActual;
+    std::shared_ptr<ov::mock_auto_plugin::MockAsyncInferRequest> mockInferrequestActual_2;
     std::shared_ptr<ov::threading::ImmediateExecutor> mockExecutor;
     std::shared_ptr<ov::threading::ImmediateExecutor> mockExecutorActual;
 };
@@ -53,10 +56,22 @@ void DynamicOutputInferenceTest::SetUp() {
     mockExecutorActual = std::make_shared<ov::threading::ImmediateExecutor>();
     mockInferrequest =
         std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal, mockExecutor, nullptr, false);
+    // will be at least 2 infer requests for mocked CPU/GPU
+    auto inferReqInternal_2 = std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNet);
+    mockInferrequest_2 =
+        std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal_2, mockExecutor, nullptr, false);
+
+    auto inferReqInternalActual_2 = std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNetActual);
+
     mockInferrequestActual = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalActual,
                                                                                            mockExecutorActual,
                                                                                            nullptr,
                                                                                            false);
+    mockInferrequestActual_2 = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalActual_2,
+                                                                                             mockExecutorActual,
+                                                                                             nullptr,
+                                                                                             false);
+
     std::tie(priorityList, targetList) = GetParam();
     auto targets = targetList.as<std::vector<std::string>>();
     ON_CALL(*core, get_available_devices()).WillByDefault(Return(targets));
@@ -103,11 +118,12 @@ TEST_P(DynamicOutputInferenceTest, CanInferWithOutputChangedFromDynamicOnAutoToS
         auto tensor = inferReqInternal->get_tensor(it);
         tensor->set_shape(ov::Shape{2, 3});
     }
-    ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault(Return(mockInferrequest));
-    ON_CALL(*mockIExeNetActual.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this]() {
-        std::this_thread::sleep_for(std::chrono::milliseconds(0));
-        return mockInferrequestActual;
-    }));
+    EXPECT_CALL(*mockIExeNet.get(), create_infer_request())
+        .WillOnce(Return(mockInferrequest))
+        .WillOnce(Return(mockInferrequest_2));
+    EXPECT_CALL(*mockIExeNetActual.get(), create_infer_request())
+        .WillOnce(Return(mockInferrequestActual))
+        .WillOnce(Return(mockInferrequestActual_2));
     config.insert(ov::device::priorities(priorityList.as<std::string>()));
     config.insert(ov::hint::performance_mode(ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT));
     std::shared_ptr<ov::ICompiledModel> exeNetwork;
diff --git a/src/plugins/auto/tests/unit/runtime_fallback_test.cpp b/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
index 6b97802f872ee4..12903b7dfed5e5 100644
--- a/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
+++ b/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
@@ -164,6 +164,11 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
                               _))
             .WillByDefault(ov::Throw("compile model error"));
     }
+    std::map<std::string, std::vector<std::shared_ptr<ov::mock_auto_plugin::MockAsyncInferRequest>>> inferRequests;
+    inferRequests["CPU"] = {};
+    inferRequests["GPU.0"] = {};
+    inferRequests["GPU.1"] = {};
+    inferRequests["OTHER"] = {};
     for (auto& deviceInfo : targetDevices) {
         std::string deviceName;
         bool ifThrow;
@@ -171,23 +176,47 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
         targetDev += deviceName;
         targetDev += ((deviceInfo == targetDevices.back()) ? "" : ",");
         if (deviceName == "CPU") {
-            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this, ifThrow]() {
-                auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal,
-                                                                                                  mockExecutor,
-                                                                                                  nullptr,
-                                                                                                  ifThrow);
-                return inferRequest;
+            auto inferReqInternal_CPU_2 = std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNet);
+            auto inferRequest_2 = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal_CPU_2,
+                                                                                                mockExecutor,
+                                                                                                nullptr,
+                                                                                                ifThrow);
+            auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal,
+                                                                                              mockExecutor,
+                                                                                              nullptr,
+                                                                                              ifThrow);
+            inferRequests[deviceName].push_back(inferRequest);
+            inferRequests[deviceName].push_back(inferRequest_2);
+            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this, &inferRequests, deviceName]() {
+                auto infer = inferRequests.at(deviceName).back();
+                if (inferRequests.at(deviceName).size() > 1) {
+                    // in case of passthrough model, we need to keep the infer request
+                    inferRequests.at(deviceName).pop_back();
+                }
+                return infer;
             });
         } else if (deviceName == "GPU.0") {
+            auto inferReqInternal_GPU_0_2 =
+                std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNetActual);
+            auto inferRequest_2 =
+                std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal_GPU_0_2,
+                                                                              mockExecutorGPU_0,
+                                                                              nullptr,
+                                                                              ifThrow);
+            auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalActual,
+                                                                                              mockExecutorGPU_0,
+                                                                                              nullptr,
+                                                                                              ifThrow);
+            inferRequests[deviceName].push_back(inferRequest);
+            inferRequests[deviceName].push_back(inferRequest_2);
             ON_CALL(*mockIExeNetActual.get(), create_infer_request())
-                .WillByDefault(InvokeWithoutArgs([this, ifThrow]() {
+                .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
                     std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                    auto inferRequest =
-                        std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalActual,
-                                                                                      mockExecutorGPU_0,
-                                                                                      nullptr,
-                                                                                      ifThrow);
-                    return inferRequest;
+                    auto infer = inferRequests.at(deviceName).back();
+                    if (inferRequests.at(deviceName).size() > 1) {
+                        inferRequests.at(deviceName).pop_back();
+                    }
+                    return infer;
                 }));
         } else if (deviceName == "GPU.1") {
             if (generateWorkersFail) {
@@ -198,26 +227,52 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
                                                                                   ifThrow);
                 ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request()).WillByDefault(ov::Throw("error"));
             } else {
+                auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalGPU_1,
+                                                                                                  mockExecutorGPU_1,
+                                                                                                  nullptr,
+                                                                                                  ifThrow);
+                auto inferReqInternalGPU_1_2 =
+                    std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNetGPU_1);
+                auto inferRequest_2 =
+                    std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalGPU_1_2,
+                                                                                  mockExecutorGPU_1,
+                                                                                  nullptr,
+                                                                                  ifThrow);
+                inferRequests[deviceName].push_back(inferRequest);
+                inferRequests[deviceName].push_back(inferRequest_2);
                 ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request())
-                    .WillByDefault(InvokeWithoutArgs([this, ifThrow]() {
+                    .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
                         std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                        auto inferRequest =
-                            std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalGPU_1,
-                                                                                          mockExecutorGPU_1,
-                                                                                          nullptr,
-                                                                                          ifThrow);
-                        return inferRequest;
+                        auto infer = inferRequests.at(deviceName).back();
+                        if (inferRequests.at(deviceName).size() > 1) {
+                            inferRequests.at(deviceName).pop_back();
+                        }
+                        return infer;
                     }));
             }
         } else if (deviceName == "OTHER") {
-            ON_CALL(*mockIExeNetOTHER.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this, ifThrow]() {
-                std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalOTHER,
-                                                                                                  mockExecutorOTHER,
-                                                                                                  nullptr,
-                                                                                                  ifThrow);
-                return inferRequest;
-            }));
+            auto inferRequest = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalOTHER,
+                                                                                              mockExecutorOTHER,
+                                                                                              nullptr,
+                                                                                              ifThrow);
+            auto inferReqInternalOTHER_2 =
+                std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNetOTHER);
+            std::this_thread::sleep_for(std::chrono::milliseconds(0));
+            auto inferRequest_2 = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalOTHER_2,
+                                                                                                mockExecutorOTHER,
+                                                                                                nullptr,
+                                                                                                ifThrow);
+            inferRequests[deviceName].push_back(inferRequest);
+            inferRequests[deviceName].push_back(inferRequest_2);
+            ON_CALL(*mockIExeNetOTHER.get(), create_infer_request())
+                .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(0));
+                    auto infer = inferRequests.at(deviceName).back();
+                    if (inferRequests.at(deviceName).size() > 1) {
+                        inferRequests.at(deviceName).pop_back();
+                    }
+                    return infer;
+                }));
         } else {
             return;
         }
@@ -321,6 +376,11 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
                               _))
             .WillByDefault(ov::Throw("compile model error"));
     }
+    std::map<std::string, std::vector<std::shared_ptr<ov::mock_auto_plugin::MockAsyncInferRequest>>> inferRequests;
+    inferRequests["CPU"] = {};
+    inferRequests["GPU.0"] = {};
+    inferRequests["GPU.1"] = {};
+    inferRequests["OTHER"] = {};
     for (auto& deviceInfo : targetDevices) {
         std::string deviceName;
         bool ifThrow;
@@ -332,8 +392,20 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
                                                                                              mockExecutor,
                                                                                              nullptr,
                                                                                              ifThrow);
-            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this]() {
-                return mockInferrequest;
+            auto inferReqInternal_CPU_2 = std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNet);
+            auto inferRequest_2 = std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal_CPU_2,
+                                                                                                mockExecutor,
+                                                                                                nullptr,
+                                                                                                ifThrow);
+            inferRequests[deviceName].push_back(mockInferrequest);
+            inferRequests[deviceName].push_back(inferRequest_2);
+            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this, &inferRequests, deviceName]() {
+                auto infer = inferRequests.at(deviceName).back();
+                if (inferRequests.at(deviceName).size() > 1) {
+                    // in case of passthrough model, we need to keep the infer request
+                    inferRequests.at(deviceName).pop_back();
+                }
+                return infer;
             });
         } else if (deviceName == "GPU.0") {
             mockInferrequestGPU_0 =
@@ -341,10 +413,24 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
                                                                               mockExecutorGPU_0,
                                                                               nullptr,
                                                                               ifThrow);
-            ON_CALL(*mockIExeNetActual.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this]() {
-                std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                return mockInferrequestGPU_0;
-            }));
+            auto inferReqInternal_GPU_0_2 =
+                std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNetActual);
+            auto inferRequest_2 =
+                std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternal_GPU_0_2,
+                                                                              mockExecutorGPU_0,
+                                                                              nullptr,
+                                                                              ifThrow);
+            inferRequests[deviceName].push_back(mockInferrequestGPU_0);
+            inferRequests[deviceName].push_back(inferRequest_2);
+            ON_CALL(*mockIExeNetActual.get(), create_infer_request())
+                .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(0));
+                    auto infer = inferRequests.at(deviceName).back();
+                    if (inferRequests.at(deviceName).size() > 1) {
+                        inferRequests.at(deviceName).pop_back();
+                    }
+                    return infer;
+                }));
         } else if (deviceName == "GPU.1") {
             if (generateWorkersFail) {
                 mockInferrequestGPU_1 =
@@ -359,10 +445,24 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
                                                                                   mockExecutorGPU_1,
                                                                                   nullptr,
                                                                                   ifThrow);
-                ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request()).WillByDefault(InvokeWithoutArgs([this]() {
-                    std::this_thread::sleep_for(std::chrono::milliseconds(0));
-                    return mockInferrequestGPU_1;
-                }));
+                auto inferReqInternalGPU_1_2 =
+                    std::make_shared<ov::mock_auto_plugin::MockISyncInferRequest>(mockIExeNetGPU_1);
+                auto inferRequest_2 =
+                    std::make_shared<ov::mock_auto_plugin::MockAsyncInferRequest>(inferReqInternalGPU_1_2,
+                                                                                  mockExecutorGPU_1,
+                                                                                  nullptr,
+                                                                                  ifThrow);
+                inferRequests[deviceName].push_back(mockInferrequestGPU_1);
+                inferRequests[deviceName].push_back(inferRequest_2);
+                ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request())
+                    .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                        std::this_thread::sleep_for(std::chrono::milliseconds(0));
+                        auto infer = inferRequests.at(deviceName).back();
+                        if (inferRequests.at(deviceName).size() > 1) {
+                            inferRequests.at(deviceName).pop_back();
+                        }
+                        return infer;
+                    }));
             }
         }
     }

From ee751636b6c7d75279f8e546bc8a224601ebbcaa Mon Sep 17 00:00:00 2001
From: "Wang, Yang" <yang4.wang@intel.com>
Date: Fri, 20 Dec 2024 13:42:00 +0800
Subject: [PATCH 14/14] Drop unused `this` capture from create_infer_request mock lambdas in runtime_fallback_test

---
 .../auto/tests/unit/runtime_fallback_test.cpp      | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/plugins/auto/tests/unit/runtime_fallback_test.cpp b/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
index 12903b7dfed5e5..50a1d96a8251b8 100644
--- a/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
+++ b/src/plugins/auto/tests/unit/runtime_fallback_test.cpp
@@ -187,7 +187,7 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
                                                                                               ifThrow);
             inferRequests[deviceName].push_back(inferRequest);
             inferRequests[deviceName].push_back(inferRequest_2);
-            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this, &inferRequests, deviceName]() {
+            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([&inferRequests, deviceName]() {
                 auto infer = inferRequests.at(deviceName).back();
                 if (inferRequests.at(deviceName).size() > 1) {
                     // in case of passthrough model, we need to keep the infer request
@@ -210,7 +210,7 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
             inferRequests[deviceName].push_back(inferRequest);
             inferRequests[deviceName].push_back(inferRequest_2);
             ON_CALL(*mockIExeNetActual.get(), create_infer_request())
-                .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                .WillByDefault(InvokeWithoutArgs([&inferRequests, deviceName]() {
                     std::this_thread::sleep_for(std::chrono::milliseconds(0));
                     auto infer = inferRequests.at(deviceName).back();
                     if (inferRequests.at(deviceName).size() > 1) {
@@ -241,7 +241,7 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
                 inferRequests[deviceName].push_back(inferRequest);
                 inferRequests[deviceName].push_back(inferRequest_2);
                 ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request())
-                    .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                    .WillByDefault(InvokeWithoutArgs([&inferRequests, deviceName]() {
                         std::this_thread::sleep_for(std::chrono::milliseconds(0));
                         auto infer = inferRequests.at(deviceName).back();
                         if (inferRequests.at(deviceName).size() > 1) {
@@ -265,7 +265,7 @@ TEST_P(AutoRuntimeFallback, releaseResource) {
             inferRequests[deviceName].push_back(inferRequest);
             inferRequests[deviceName].push_back(inferRequest_2);
             ON_CALL(*mockIExeNetOTHER.get(), create_infer_request())
-                .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                .WillByDefault(InvokeWithoutArgs([&inferRequests, deviceName]() {
                     std::this_thread::sleep_for(std::chrono::milliseconds(0));
                     auto infer = inferRequests.at(deviceName).back();
                     if (inferRequests.at(deviceName).size() > 1) {
@@ -399,7 +399,7 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
                                                                                                 ifThrow);
             inferRequests[deviceName].push_back(mockInferrequest);
             inferRequests[deviceName].push_back(inferRequest_2);
-            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([this, &inferRequests, deviceName]() {
+            ON_CALL(*mockIExeNet.get(), create_infer_request()).WillByDefault([&inferRequests, deviceName]() {
                 auto infer = inferRequests.at(deviceName).back();
                 if (inferRequests.at(deviceName).size() > 1) {
                     // in case of passthrough model, we need to keep the infer request
@@ -423,7 +423,7 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
             inferRequests[deviceName].push_back(mockInferrequestGPU_0);
             inferRequests[deviceName].push_back(inferRequest_2);
             ON_CALL(*mockIExeNetActual.get(), create_infer_request())
-                .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                .WillByDefault(InvokeWithoutArgs([&inferRequests, deviceName]() {
                     std::this_thread::sleep_for(std::chrono::milliseconds(0));
                     auto infer = inferRequests.at(deviceName).back();
                     if (inferRequests.at(deviceName).size() > 1) {
@@ -455,7 +455,7 @@ TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
                 inferRequests[deviceName].push_back(mockInferrequestGPU_1);
                 inferRequests[deviceName].push_back(inferRequest_2);
                 ON_CALL(*mockIExeNetGPU_1.get(), create_infer_request())
-                    .WillByDefault(InvokeWithoutArgs([this, &inferRequests, deviceName]() {
+                    .WillByDefault(InvokeWithoutArgs([&inferRequests, deviceName]() {
                         std::this_thread::sleep_for(std::chrono::milliseconds(0));
                         auto infer = inferRequests.at(deviceName).back();
                         if (inferRequests.at(deviceName).size() > 1) {