Commit

WIP weightless
smirnov-alexey committed Jan 15, 2025
1 parent 2ee2da4 commit ae1f79d
Showing 10 changed files with 668 additions and 63 deletions.
242 changes: 196 additions & 46 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -481,6 +481,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
}
}

// Store constants' offset for serialization purposes
store_const_offsets(model);

// Finalize memory in closures and weight banks
finalize_weights_bank();
detach_memory();
@@ -509,7 +512,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow...");
}

-void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const {
+void ov::npuw::CompiledModel::CompiledModelDesc::serialize(
+    std::ostream& stream,
+    bool is_weightless,
+    const std::unordered_map<const void*, std::size_t>& const_to_offset) const {
using namespace ov::npuw::s11n;

LOG_DEBUG("Serializing CompiledModelDesc...");
@@ -526,37 +532,64 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream)

write(stream, spatial);

-    write(stream, scales);
-    write(stream, zerops);
-    write(stream, is_remote);
-
-    // NOTE: for closure only serialize uids - full flow
-    write(stream, closure_uid);
-
-    // Some tensors might be present in CPU closure already - need to serialize as is
-    // FIXME: When weightless serialization is introduced, this should be handled differently
-    write(stream, closure.size());
-    std::vector<ov::Tensor> cpu_closures;
-    std::vector<std::size_t> cpu_closure_ids;
-    for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-        if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
-            cpu_closure_ids.push_back(cidx);
-            cpu_closures.push_back(closure[cidx]);
-        }
-    }
-
-    write(stream, cpu_closure_ids);
-
-    for (const auto& tensor : cpu_closures) {
-        write(stream, tensor);
-    }
-
-    // FIXME: support weightless flow!
+    if (!is_weightless) {
+        write(stream, scales);
+        write(stream, zerops);
+        write(stream, is_remote);
+        write(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to serialize as is
+        write(stream, closure.size());
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<std::size_t> cpu_closure_ids;
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
+                cpu_closure_ids.push_back(cidx);
+                cpu_closures.push_back(closure[cidx]);
+            }
+        }
+
+        write(stream, cpu_closure_ids);
+
+        for (const auto& tensor : cpu_closures) {
+            write(stream, tensor);
+        }
+    } else { // weightless
+        // FIXME: almost a copy of the above
+        write_weightless(stream, scales, const_to_offset);
+        write_weightless(stream, zerops, const_to_offset);
+
+        write(stream, is_remote);
+        write(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to serialize as is
+        write(stream, closure.size());
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<std::size_t> cpu_closure_ids;
+        std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
+        std::vector<std::size_t> non_cpu_tensors_ids;
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] == -1) { // CPU closure
+                cpu_closure_ids.push_back(cidx);
+                cpu_closures.push_back(closure[cidx]);
+            } else {
+                non_cpu_tensors_ids.push_back(cidx);
+                non_cpu_tensors.push_back(lazy_closure[cidx]); // must be there
+            }
+        }
+
+        write(stream, cpu_closure_ids);
+        write(stream, non_cpu_tensors_ids);
+        write_weightless(stream, cpu_closures, const_to_offset);
+        write(stream, non_cpu_tensors);
+    }

LOG_DEBUG("DONE.");
}
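The write_weightless helper is referenced above but not defined in this diff. As a rough sketch of the intended record format — assuming the s11n read/write primitives for scalars, strings, and vectors used elsewhere in this file, and assuming a tensor aliases an original constant exactly when its data() pointer appears in const_to_offset — it could look like this:

// Hypothetical sketch only -- the actual helper lives outside this diff.
void write_weightless(std::ostream& stream,
                      const std::vector<ov::Tensor>& tensors,
                      const std::unordered_map<const void*, std::size_t>& const_to_offset) {
    using namespace ov::npuw::s11n;
    write(stream, tensors.size());
    for (const auto& t : tensors) {
        auto iter = t ? const_to_offset.find(t.data()) : const_to_offset.end();
        const bool weightless = iter != const_to_offset.end();
        write(stream, weightless);
        if (weightless) {
            // The tensor aliases an original constant: store only the metadata
            // needed to re-read its bytes from the weights file on import
            write(stream, iter->second);        // byte offset in the weights file
            write(stream, t.get_byte_size());   // number of bytes to read back
            write(stream, t.get_element_type().get_type_name());
            const auto& shape = t.get_shape();
            write(stream, std::vector<std::size_t>(shape.begin(), shape.end()));
        } else {
            write(stream, t);                   // fall back to serializing the bytes inline
        }
    }
}

Storing (offset, size, type, shape) instead of the raw bytes is what keeps the exported blob small: the weight data stays in the original file and is read back in on import.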

-void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) {
+void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream,
+                                                             bool is_weightless,
+                                                             const std::string& weights_path) {
using namespace ov::npuw::s11n;

LOG_DEBUG("Deserializing CompiledModelDesc...");
@@ -573,30 +606,64 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea

read(stream, spatial);

-    read(stream, scales);
-    read(stream, zerops);
-    read(stream, is_remote);
-
-    // NOTE: for closure only deserialize uids - full flow
-    read(stream, closure_uid);
-
-    // Some tensors might be present in CPU closure already - need to deserialize as is
-    // FIXME: When weightless serialization is introduced, this should be handled differently
-    std::size_t closure_size = 0;
-    read(stream, closure_size);
-    std::vector<std::size_t> cpu_closure_ids;
-    read(stream, cpu_closure_ids);
-    closure.resize(closure_size);
-    for (const auto& cidx : cpu_closure_ids) {
-        read(stream, closure[cidx]);
-    }
-
-    // FIXME: support weightless flow!
+    if (!is_weightless) {
+        read(stream, scales);
+        read(stream, zerops);
+        read(stream, is_remote);
+        read(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to deserialize as is
+        std::size_t closure_size = 0;
+        read(stream, closure_size);
+        std::vector<std::size_t> cpu_closure_ids;
+        read(stream, cpu_closure_ids);
+        closure.resize(closure_size);
+        for (const auto& cidx : cpu_closure_ids) {
+            read(stream, closure[cidx]);
+        }
+    } else { // weightless
+        // FIXME: almost a copy of the above
+        std::ifstream weights_stream(weights_path, std::ios::in | std::ios::binary);
+        NPUW_ASSERT(weights_stream);
+        read_weightless(stream, scales, weights_stream);
+        read_weightless(stream, zerops, weights_stream);
+
+        read(stream, is_remote);
+        read(stream, closure_uid);
+
+        // Some tensors might be present in CPU closure already - need to deserialize as is
+        std::size_t closure_size = 0;
+        read(stream, closure_size);
+        closure.resize(closure_size);
+        lazy_closure.resize(closure_size);
+
+        std::vector<std::size_t> cpu_closure_ids;
+        std::vector<std::size_t> non_cpu_tensors_ids;
+        read(stream, cpu_closure_ids);
+        read(stream, non_cpu_tensors_ids);
+
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
+        read_weightless(stream, cpu_closures, weights_stream);
+        std::size_t tidx = 0;
+        for (const auto& idx : cpu_closure_ids) {
+            ov::Tensor t = cpu_closures[tidx];
+            closure[idx] = ov::Tensor(t.get_element_type(), t.get_shape());
+            // FIXME: get rid of this copy
+            t.copy_to(closure[idx]);
+            tidx++;
+        }
+        read(stream, non_cpu_tensors);
+        std::size_t ltidx = 0;
+        for (const auto& idx : non_cpu_tensors_ids) {
+            lazy_closure[idx] = non_cpu_tensors[ltidx++];
+        }
+    }
LOG_DEBUG("DONE.");
}
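Correspondingly, a read_weightless counterpart (also absent from this diff) would rebuild each tensor from the record sketched above, pulling the actual bytes from the original weights file. A sketch under the same assumptions, plus the assumption that ov::element::Type can be constructed from its string name:

// Hypothetical counterpart to the write_weightless sketch above.
void read_weightless(std::istream& stream,
                     std::vector<ov::Tensor>& tensors,
                     std::ifstream& weights_stream) {
    using namespace ov::npuw::s11n;
    std::size_t size = 0;
    read(stream, size);
    tensors.resize(size);
    for (std::size_t i = 0; i < size; ++i) {
        bool weightless = false;
        read(stream, weightless);
        if (weightless) {
            std::size_t offset = 0, byte_size = 0;
            std::string type_str;
            std::vector<std::size_t> shape_vec;
            read(stream, offset);
            read(stream, byte_size);
            read(stream, type_str);
            read(stream, shape_vec);
            ov::Tensor t(ov::element::Type(type_str), ov::Shape(shape_vec.begin(), shape_vec.end()));
            NPUW_ASSERT(t.get_byte_size() == byte_size);
            // Pull the actual bytes back from the original weights file
            weights_stream.seekg(offset, std::ios::beg);
            weights_stream.read(reinterpret_cast<char*>(t.data()), byte_size);
            tensors[i] = std::move(t);
        } else {
            read(stream, tensors[i]);
        }
    }
}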

-void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
+void ov::npuw::CompiledModel::serialize(std::ostream& stream, bool is_weightless) const {
LOG_INFO("Serializing CompiledModel...");
LOG_BLOCK();

@@ -644,15 +711,17 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
write(stream, false);
}
// Write the rest of the submodel desc
-    subm.serialize(stream);
+    subm.serialize(stream, is_weightless, m_const_to_offset);
}

LOG_INFO("Done.");
}
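For reference, a hypothetical export-side call (the actual call site is outside this commit, and the variable names here are assumed) would drive the new flag as follows:

// Hypothetical export flow matching the serialize() signature above
std::ofstream blob_stream("model.blob", std::ios::out | std::ios::binary);
compiled_model->serialize(blob_stream, /*is_weightless=*/true);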

std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
std::istream& stream,
-    const std::shared_ptr<const ov::IPlugin>& plugin) {
+    const std::shared_ptr<const ov::IPlugin>& plugin,
+    bool is_weightless,
+    const std::string& weights_path) {
LOG_INFO("Deserializing CompiledModel...");
LOG_BLOCK();

@@ -719,7 +788,7 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
}
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
-        compiled->m_compiled_submodels[i].deserialize(stream);
+        compiled->m_compiled_submodels[i].deserialize(stream, is_weightless, weights_path);
}

compiled->implement_properties();
@@ -812,6 +881,87 @@ void ov::npuw::CompiledModel::reconstruct_closure() {
}
}

// Almost a copy of finalize_weights_bank()
void ov::npuw::CompiledModel::reconstruct_closure_weightless(const std::string& weights_path) {
LOG_INFO("Reconstructing weights bank...");
std::ifstream weights_stream(weights_path, std::ios::in | std::ios::binary);
NPUW_ASSERT(weights_stream);
// Register lazy tensors
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
auto& func_desc = m_compiled_submodels[real_idx];

for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
if (comp_model_desc.closure[tidx]) {
continue; // host-side closure
}
comp_model_desc.closure_uid[tidx] =
m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
}
}

// Evaluate and allocate all LazyTensors inside the bank
m_weights_bank->evaluate_and_allocate(weights_stream);

// Set evaluated and allocated ov::Tensors to closures
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
auto& func_desc = m_compiled_submodels[real_idx];

for (std::size_t tidx = 0; tidx < comp_model_desc.closure.size(); ++tidx) {
if (comp_model_desc.closure[tidx]) {
// host-side closure - already set, do nothing
comp_model_desc.is_remote[tidx] = false;
continue;
}
const auto& uid = comp_model_desc.closure_uid[tidx];
NPUW_ASSERT(uid != -1); // All tensors should be registered at this point
comp_model_desc.closure[tidx] = m_weights_bank->get(uid, *func_desc.device_it);
// FIXME: find a more reliable way to do so
comp_model_desc.is_remote[tidx] = m_weights_bank->is_remote(uid);
}
}

LOG_INFO("Done.");
}
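The call site of this method is not visible in the diff; presumably the import path dispatches on the weightless flag once all submodel descriptors are read, along these lines:

// Hypothetical dispatch in the import path, based on the two
// reconstruction methods this commit provides
if (is_weightless) {
    compiled->reconstruct_closure_weightless(weights_path);
} else {
    compiled->reconstruct_closure();
}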

void ov::npuw::CompiledModel::store_const_offsets(const std::shared_ptr<ov::Model>& model) {
for (auto&& node_ptr : model->get_ordered_ops()) {
if (ov::op::util::is_constant(node_ptr)) {
const auto& c = std::static_pointer_cast<ov::op::v0::Constant>(node_ptr);
const auto& rt_info = c->get_rt_info();
auto data_ptr = c->get_data_ptr();
auto offset_iter = rt_info.find("offset");
if (offset_iter == rt_info.end()) {
continue;
}
std::size_t offset = offset_iter->second.as<std::size_t>();
auto map_iter = m_const_to_offset.find(data_ptr);
if (map_iter != m_const_to_offset.end()) {
// Already there - check that offset is the same
NPUW_ASSERT(map_iter->second == offset &&
"Model contains two constants with same pointer and different offset!");
} else {
m_const_to_offset[data_ptr] = offset;
}
}
}
}
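For illustration, a constant this method would pick up carries an "offset" entry in its rt_info, keyed exactly as the lookup above expects. Here the entry is set by hand; in practice it is expected to be populated by the model reader when the weights come from a file:

// Illustration only: a hand-crafted constant with the rt_info entry
// that store_const_offsets() looks for
auto c = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
                                                ov::Shape{2},
                                                std::vector<float>{1.0f, 2.0f});
c->get_rt_info()["offset"] = std::size_t{4096};  // byte offset within the weights file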

void ov::npuw::CompiledModel::detach_memory() {
LOG_INFO("Detaching model & weight memory...");
LOG_BLOCK();
16 changes: 12 additions & 4 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -70,10 +70,11 @@ class CompiledModel : public ov::npuw::ICompiledModel {

void report_io() const;

-    void serialize(std::ostream& stream, bool is_weightless, const std::string& weights_path) const;
+    void serialize(std::ostream& stream, bool is_weightless) const;
static std::shared_ptr<CompiledModel> deserialize(std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
-                                                      bool is_weightless, const std::string& weights_path);
+                                                      bool is_weightless,
+                                                      const std::string& weights_path);

// This is used for removing too long output tensor names to fix some compilation issues
// NB: These two methods have nothing to do with this particular class and should be
@@ -94,6 +95,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {

// For full deserialization flow with weights
void reconstruct_closure();
// For weightless serialization flow
void reconstruct_closure_weightless(const std::string& weights_path);
void store_const_offsets(const std::shared_ptr<ov::Model>& model);

void finalize_weights_bank();
void detach_memory();
@@ -167,8 +171,10 @@ class CompiledModel : public ov::npuw::ICompiledModel {
// Metrics
execution_stats stat;

-        void serialize(std::ostream& stream) const;
-        void deserialize(std::istream& stream);
+        void serialize(std::ostream& stream,
+                       bool is_weightless,
+                       const std::unordered_map<const void*, std::size_t>& const_to_offset) const;
+        void deserialize(std::istream& stream, bool is_weightless, const std::string& weights_path);
};
std::vector<CompiledModelDesc> m_compiled_submodels;

Expand All @@ -178,6 +184,8 @@ class CompiledModel : public ov::npuw::ICompiledModel {
execution_stats m_total_stat;

std::shared_ptr<weights::Bank> m_weights_bank = nullptr;

std::unordered_map<const void*, std::size_t> m_const_to_offset;
};
} // namespace npuw
} // namespace ov
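Putting the two declarations together, a hypothetical import-side call would look like this, where weights_path must point at the same weights file whose offsets were recorded by store_const_offsets() at compile time (the variable names are assumed):

// Hypothetical import flow matching the deserialize() declaration above
std::ifstream blob_stream("model.blob", std::ios::in | std::ios::binary);
auto compiled = ov::npuw::CompiledModel::deserialize(blob_stream,
                                                     plugin,
                                                     /*is_weightless=*/true,
                                                     weights_path);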