Commit

WIP weightless
smirnov-alexey committed Jan 15, 2025
1 parent 2ee2da4 commit ae1f79d
Showing 10 changed files with 668 additions and 63 deletions.
242 changes: 196 additions & 46 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -481,6 +481,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
}
}

// Store constants' offset for serialization purposes
store_const_offsets(model);

// Finalize memory in closures and weight banks
finalize_weights_bank();
detach_memory();
@@ -509,7 +512,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow...");
}

-void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const {
+void ov::npuw::CompiledModel::CompiledModelDesc::serialize(
+    std::ostream& stream,
+    bool is_weightless,
+    const std::unordered_map<const void*, std::size_t>& const_to_offset) const {
using namespace ov::npuw::s11n;

LOG_DEBUG("Serializing CompiledModelDesc...");
@@ -526,37 +532,64 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream)

write(stream, spatial);

-    write(stream, scales);
-    write(stream, zerops);
-    write(stream, is_remote);
-
-    // NOTE: for closure only serialize uids - full flow
-    write(stream, closure_uid);
-
-    // Some tensors might be present in CPU closure already - need to serialize as is
-    // FIXME: When weightless serialization is introduced, this should be handled differently
-    write(stream, closure.size());
-    std::vector<ov::Tensor> cpu_closures;
-    std::vector<std::size_t> cpu_closure_ids;
-    for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-        if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
-            cpu_closure_ids.push_back(cidx);
-            cpu_closures.push_back(closure[cidx]);
-        }
-    }
-
-    write(stream, cpu_closure_ids);
-
-    for (const auto& tensor : cpu_closures) {
-        write(stream, tensor);
-    }
-
-    // FIXME: support weightless flow!
+    if (!is_weightless) {
+        write(stream, scales);
+        write(stream, zerops);
+        write(stream, is_remote);
+        write(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to serialize as is
+        write(stream, closure.size());
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<std::size_t> cpu_closure_ids;
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
+                cpu_closure_ids.push_back(cidx);
+                cpu_closures.push_back(closure[cidx]);
+            }
+        }
+
+        write(stream, cpu_closure_ids);
+
+        for (const auto& tensor : cpu_closures) {
+            write(stream, tensor);
+        }
+    } else { // weightless
+        // FIXME: almost a copy of the above
+        write_weightless(stream, scales, const_to_offset);
+        write_weightless(stream, zerops, const_to_offset);
+
+        write(stream, is_remote);
+        write(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to serialize as is
+        write(stream, closure.size());
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<std::size_t> cpu_closure_ids;
+        std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
+        std::vector<std::size_t> non_cpu_tensors_ids;
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] == -1) { // CPU closure
+                cpu_closure_ids.push_back(cidx);
+                cpu_closures.push_back(closure[cidx]);
+            } else {
+                non_cpu_tensors_ids.push_back(cidx);
+                non_cpu_tensors.push_back(lazy_closure[cidx]); // must be there
+            }
+        }
+
+        write(stream, cpu_closure_ids);
+        write(stream, non_cpu_tensors_ids);
+        write_weightless(stream, cpu_closures, const_to_offset);
+        write(stream, non_cpu_tensors);
+    }

LOG_DEBUG("DONE.");
}
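The write_weightless helper is referenced above but not defined in this diff. As a rough sketch of the intended record format — assuming the s11n read/write primitives for scalars, strings, and vectors used elsewhere in this file, and assuming a tensor aliases an original constant exactly when its data() pointer appears in const_to_offset — it could look like this:

// Hypothetical sketch only -- the actual helper lives outside this diff.
void write_weightless(std::ostream& stream,
                      const std::vector<ov::Tensor>& tensors,
                      const std::unordered_map<const void*, std::size_t>& const_to_offset) {
    using namespace ov::npuw::s11n;
    write(stream, tensors.size());
    for (const auto& t : tensors) {
        auto iter = t ? const_to_offset.find(t.data()) : const_to_offset.end();
        const bool weightless = iter != const_to_offset.end();
        write(stream, weightless);
        if (weightless) {
            // The tensor aliases an original constant: store only the metadata
            // needed to re-read its bytes from the weights file on import
            write(stream, iter->second);        // byte offset in the weights file
            write(stream, t.get_byte_size());   // number of bytes to read back
            write(stream, t.get_element_type().get_type_name());
            const auto& shape = t.get_shape();
            write(stream, std::vector<std::size_t>(shape.begin(), shape.end()));
        } else {
            write(stream, t);                   // fall back to serializing the bytes inline
        }
    }
}

Storing (offset, size, type, shape) instead of the raw bytes is what keeps the exported blob small: the weight data stays in the original file and is read back in on import.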

-void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) {
+void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream,
+                                                             bool is_weightless,
+                                                             const std::string& weights_path) {
using namespace ov::npuw::s11n;

LOG_DEBUG("Deserializing CompiledModelDesc...");
@@ -573,30 +606,64 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea

read(stream, spatial);

-    read(stream, scales);
-    read(stream, zerops);
-    read(stream, is_remote);
-
-    // NOTE: for closure only deserialize uids - full flow
-    read(stream, closure_uid);
-
-    // Some tensors might be present in CPU closure already - need to deserialize as is
-    // FIXME: When weightless serialization is introduced, this should be handled differently
-    std::size_t closure_size = 0;
-    read(stream, closure_size);
-    std::vector<std::size_t> cpu_closure_ids;
-    read(stream, cpu_closure_ids);
-    closure.resize(closure_size);
-    for (const auto& cidx : cpu_closure_ids) {
-        read(stream, closure[cidx]);
-    }
-
-    // FIXME: support weightless flow!
+    if (!is_weightless) {
+        read(stream, scales);
+        read(stream, zerops);
+        read(stream, is_remote);
+        read(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to deserialize as is
+        std::size_t closure_size = 0;
+        read(stream, closure_size);
+        std::vector<std::size_t> cpu_closure_ids;
+        read(stream, cpu_closure_ids);
+        closure.resize(closure_size);
+        for (const auto& cidx : cpu_closure_ids) {
+            read(stream, closure[cidx]);
+        }
+    } else { // weightless
+        // FIXME: almost a copy of the above
+        std::ifstream weights_stream(weights_path, std::ios::in | std::ios::binary);
+        NPUW_ASSERT(weights_stream);
+        read_weightless(stream, scales, weights_stream);
+        read_weightless(stream, zerops, weights_stream);
+
+        read(stream, is_remote);
+        read(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to deserialize as is
+        std::size_t closure_size = 0;
+        read(stream, closure_size);
+        closure.resize(closure_size);
+        lazy_closure.resize(closure_size);
+
+        std::vector<std::size_t> cpu_closure_ids;
+        std::vector<std::size_t> non_cpu_tensors_ids;
+        read(stream, cpu_closure_ids);
+        read(stream, non_cpu_tensors_ids);
+
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
+        read_weightless(stream, cpu_closures, weights_stream);
+        std::size_t tidx = 0;
+        for (const auto& idx : cpu_closure_ids) {
+            ov::Tensor t = cpu_closures[tidx];
+            closure[idx] = ov::Tensor(t.get_element_type(), t.get_shape());
+            // FIXME: get rid of this copy
+            t.copy_to(closure[idx]);
+            tidx++;
+        }
+        read(stream, non_cpu_tensors);
+        std::size_t ltidx = 0;
+        for (const auto& idx : non_cpu_tensors_ids) {
+            lazy_closure[idx] = non_cpu_tensors[ltidx++];
+        }
+    }
LOG_DEBUG("DONE.");
}
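Correspondingly, a read_weightless counterpart (also absent from this diff) would rebuild each tensor from the record sketched above, pulling the actual bytes from the original weights file. A sketch under the same assumptions, plus the assumption that ov::element::Type can be constructed from its string name:

// Hypothetical counterpart to the write_weightless sketch above.
void read_weightless(std::istream& stream,
                     std::vector<ov::Tensor>& tensors,
                     std::ifstream& weights_stream) {
    using namespace ov::npuw::s11n;
    std::size_t size = 0;
    read(stream, size);
    tensors.resize(size);
    for (std::size_t i = 0; i < size; ++i) {
        bool weightless = false;
        read(stream, weightless);
        if (weightless) {
            std::size_t offset = 0, byte_size = 0;
            std::string type_str;
            std::vector<std::size_t> shape_vec;
            read(stream, offset);
            read(stream, byte_size);
            read(stream, type_str);
            read(stream, shape_vec);
            ov::Tensor t(ov::element::Type(type_str), ov::Shape(shape_vec.begin(), shape_vec.end()));
            NPUW_ASSERT(t.get_byte_size() == byte_size);
            // Pull the actual bytes back from the original weights file
            weights_stream.seekg(offset, std::ios::beg);
            weights_stream.read(reinterpret_cast<char*>(t.data()), byte_size);
            tensors[i] = std::move(t);
        } else {
            read(stream, tensors[i]);
        }
    }
}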

-void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
+void ov::npuw::CompiledModel::serialize(std::ostream& stream, bool is_weightless) const {
LOG_INFO("Serializing CompiledModel...");
LOG_BLOCK();

@@ -644,15 +711,17 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
write(stream, false);
}
// Write the rest of the submodel desc
-    subm.serialize(stream);
+    subm.serialize(stream, is_weightless, m_const_to_offset);
}

LOG_INFO("Done.");
}
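For reference, a hypothetical export-side call (the actual call site is outside this commit, and the variable names here are assumed) would drive the new flag as follows:

// Hypothetical export flow matching the serialize() signature above
std::ofstream blob_stream("model.blob", std::ios::out | std::ios::binary);
compiled_model->serialize(blob_stream, /*is_weightless=*/true);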

std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
std::istream& stream,
-    const std::shared_ptr<const ov::IPlugin>& plugin) {
+    const std::shared_ptr<const ov::IPlugin>& plugin,
+    bool is_weightless,
+    const std::string& weights_path) {
LOG_INFO("Deserializing CompiledModel...");
LOG_BLOCK();

@@ -719,7 +788,7 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
}
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
-        compiled->m_compiled_submodels[i].deserialize(stream);
+        compiled->m_compiled_submodels[i].deserialize(stream, is_weightless, weights_path);
}

compiled->implement_properties();
@@ -812,6 +881,87 @@ void ov::npuw::CompiledModel::reconstruct_closure() {
}
}

// Almost a copy of finalize_weights_bank()
void ov::npuw::CompiledModel::reconstruct_closure_weightless(const std::string& weights_path) {
LOG_INFO("Reconstructing weights bank...");
std::ifstream weights_stream(weights_path, std::ios::in | std::ios::binary);
NPUW_ASSERT(weights_stream);
// Register lazy tensors
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
auto& func_desc = m_compiled_submodels[real_idx];

for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
if (comp_model_desc.closure[tidx]) {
continue; // host-side closure
}
comp_model_desc.closure_uid[tidx] =
m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
}
}

// Evaluate and allocate all LazyTensors inside the bank
m_weights_bank->evaluate_and_allocate(weights_stream);

// Set evaluated and allocated ov::Tensors to closures
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
auto& func_desc = m_compiled_submodels[real_idx];

for (std::size_t tidx = 0; tidx < comp_model_desc.closure.size(); ++tidx) {
if (comp_model_desc.closure[tidx]) {
// host-side closure - already set, do nothing
comp_model_desc.is_remote[tidx] = false;
continue;
}
const auto& uid = comp_model_desc.closure_uid[tidx];
NPUW_ASSERT(uid != -1); // All tensors should be registered at this point
comp_model_desc.closure[tidx] = m_weights_bank->get(uid, *func_desc.device_it);
// FIXME: find a more reliable way to do so
comp_model_desc.is_remote[tidx] = m_weights_bank->is_remote(uid);
}
}

LOG_INFO("Done.");
}
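The call site of this method is not visible in the diff; presumably the import path dispatches on the weightless flag once all submodel descriptors are read, along these lines:

// Hypothetical dispatch in the import path, based on the two
// reconstruction methods this commit provides
if (is_weightless) {
    compiled->reconstruct_closure_weightless(weights_path);
} else {
    compiled->reconstruct_closure();
}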

void ov::npuw::CompiledModel::store_const_offsets(const std::shared_ptr<ov::Model>& model) {
for (auto&& node_ptr : model->get_ordered_ops()) {
if (ov::op::util::is_constant(node_ptr)) {
const auto& c = std::static_pointer_cast<ov::op::v0::Constant>(node_ptr);
const auto& rt_info = c->get_rt_info();
auto data_ptr = c->get_data_ptr();
auto offset_iter = rt_info.find("offset");
if (offset_iter == rt_info.end()) {
continue;
}
std::size_t offset = offset_iter->second.as<std::size_t>();
auto map_iter = m_const_to_offset.find(data_ptr);
if (map_iter != m_const_to_offset.end()) {
// Already there - check that offset is the same
NPUW_ASSERT(map_iter->second == offset &&
"Model contains two constants with same pointer and different offset!");
} else {
m_const_to_offset[data_ptr] = offset;
}
}
}
}
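For illustration, a constant this method would pick up carries an "offset" entry in its rt_info, keyed exactly as the lookup above expects. Here the entry is set by hand; in practice it is expected to be populated by the model reader when the weights come from a file:

// Illustration only: a hand-crafted constant with the rt_info entry
// that store_const_offsets() looks for
auto c = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
                                                ov::Shape{2},
                                                std::vector<float>{1.0f, 2.0f});
c->get_rt_info()["offset"] = std::size_t{4096};  // byte offset within the weights file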

void ov::npuw::CompiledModel::detach_memory() {
LOG_INFO("Detaching model & weight memory...");
LOG_BLOCK();
16 changes: 12 additions & 4 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -70,10 +70,11 @@ class CompiledModel : public ov::npuw::ICompiledModel {

void report_io() const;

-    void serialize(std::ostream& stream, bool is_weightless, const std::string& weights_path) const;
+    void serialize(std::ostream& stream, bool is_weightless) const;
static std::shared_ptr<CompiledModel> deserialize(std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
-                                                      bool is_weightless, const std::string& weights_path);
+                                                      bool is_weightless,
+                                                      const std::string& weights_path);

// This is used for removing too long output tensor names to fix some compilation issues
// NB: These two methods have nothing to do with this particular class and should be
@@ -94,6 +95,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {

// For full deserialization flow with weights
void reconstruct_closure();
// For weightless serialization flow
void reconstruct_closure_weightless(const std::string& weights_path);
void store_const_offsets(const std::shared_ptr<ov::Model>& model);

void finalize_weights_bank();
void detach_memory();
@@ -167,8 +171,10 @@ class CompiledModel : public ov::npuw::ICompiledModel {
// Metrics
execution_stats stat;

-        void serialize(std::ostream& stream) const;
-        void deserialize(std::istream& stream);
+        void serialize(std::ostream& stream,
+                       bool is_weightless,
+                       const std::unordered_map<const void*, std::size_t>& const_to_offset) const;
+        void deserialize(std::istream& stream, bool is_weightless, const std::string& weights_path);
};
std::vector<CompiledModelDesc> m_compiled_submodels;

Expand All @@ -178,6 +184,8 @@ class CompiledModel : public ov::npuw::ICompiledModel {
execution_stats m_total_stat;

std::shared_ptr<weights::Bank> m_weights_bank = nullptr;

std::unordered_map<const void*, std::size_t> m_const_to_offset;
};
} // namespace npuw
} // namespace ov
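Putting the two declarations together, a hypothetical import-side call would look like this, where weights_path must point at the same weights file whose offsets were recorded by store_const_offsets() at compile time (the variable names are assumed):

// Hypothetical import flow matching the deserialize() declaration above
std::ifstream blob_stream("model.blob", std::ios::in | std::ios::binary);
auto compiled = ov::npuw::CompiledModel::deserialize(blob_stream,
                                                     plugin,
                                                     /*is_weightless=*/true,
                                                     weights_path);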