Skip to content

Commit

Permalink
[CPU]Check runtime_options from IR model (#27765)
Browse files Browse the repository at this point in the history
### Details:
 - *Check `runtime_options` from IR model*
- *Set `KV_CACHE_PRECISION` & `DYNAMIC_QUANTIZATION_GROUP_SIZE`* from
`runtime_options` of IR model
 - Example IR model with `runtime_options`
- #27778 to
releases/2024/5
```
        <rt_info>
                <runtime_options>
                        <KV_CACHE_PRECISION value="f16" />
                </runtime_options>
        </rt_info>
```

### Tickets:
 - *CVS-157571*
  • Loading branch information
zhangYiIntel authored Nov 30, 2024
1 parent f89b8de commit 09d1e50
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 2 deletions.
10 changes: 10 additions & 0 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,5 +460,15 @@ void Config::updateProperties() {
_config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)});
}

// Copies options stored in the model's rt_info under "runtime_options" into this Config.
// Only two entries are consulted, each looked up by its property name:
//   - ov::hint::kv_cache_precision              -> kvCachePrecision
//   - ov::hint::dynamic_quantization_group_size -> fcDynamicQuantizationGroupSize
// An entry that is absent from rt_info leaves the corresponding field untouched.
void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
    const auto kvPrecisionKey = ov::hint::kv_cache_precision.name();
    const auto groupSizeKey = ov::hint::dynamic_quantization_group_size.name();

    if (model->has_rt_info({"runtime_options", kvPrecisionKey})) {
        kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", kvPrecisionKey});
    }

    if (model->has_rt_info({"runtime_options", groupSizeKey})) {
        fcDynamicQuantizationGroupSize = model->get_rt_info<uint64_t>({"runtime_options", groupSizeKey});
    }
}

} // namespace intel_cpu
} // namespace ov
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ struct Config {

void updateProperties();

// Reads the "runtime_options" section of the model's rt_info and, for each entry that is
// present, overwrites the matching field of this Config (kvCachePrecision for
// ov::hint::kv_cache_precision, fcDynamicQuantizationGroupSize for
// ov::hint::dynamic_quantization_group_size). Missing entries leave the fields unchanged.
void applyRtInfo(const std::shared_ptr<const ov::Model>& model);

std::map<std::string, std::string> _config;

int modelPreferThreads = -1;
Expand Down
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/src/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
// update the props after the perf mode translated to configs
// TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
Config conf = engConfig;
conf.applyRtInfo(cloned_model);
conf.readProperties(config, modelType);

Transformations transformations(cloned_model, conf);
Expand Down Expand Up @@ -520,6 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&

Config conf = engConfig;
Config::ModelType modelType = getModelType(model);
conf.applyRtInfo(model);
conf.readProperties(config, modelType);

auto context = std::make_shared<GraphContext>(conf, fake_w_cache, false);
Expand Down Expand Up @@ -575,7 +577,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_str

Config conf = engConfig;
Config::ModelType modelType = getModelType(model);

conf.applyRtInfo(model);
// check ov::loaded_from_cache property and erase it to avoid exception in readProperties.
auto _config = config;
const auto& it = _config.find(ov::loaded_from_cache.name());
Expand Down
1 change: 0 additions & 1 deletion src/plugins/intel_cpu/src/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ class Plugin : public ov::IPlugin {

void get_performance_streams(Config& config, const std::shared_ptr<ov::Model>& model) const;
void calculate_streams(Config& conf, const std::shared_ptr<ov::Model>& model, bool imported = false) const;

Config engConfig;
/* Explicitly configured streams have higher priority than performance hints.
So track if streams is set explicitly (not auto-configured) */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,4 +327,35 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPUExecutionDevice) {
ASSERT_EQ(value.as<std::string>(), "CPU");
}

// Options placed in the model's rt_info under "runtime_options" must be reported back
// as properties of the compiled model when no compile-time config is supplied.
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) {
    ov::Core core;

    // Embed the runtime options directly in the model.
    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());

    ov::CompiledModel compiled;
    OV_ASSERT_NO_THROW(compiled = core.compile_model(model, deviceName));

    ov::Any kvPrecision;
    ov::Any groupSize;
    OV_ASSERT_NO_THROW(kvPrecision = compiled.get_property(ov::hint::kv_cache_precision));
    OV_ASSERT_NO_THROW(groupSize = compiled.get_property(ov::hint::dynamic_quantization_group_size));

    // The compiled model must expose exactly the values taken from rt_info.
    ASSERT_EQ(kvPrecision.as<ov::element::Type>(), ov::element::f16);
    ASSERT_EQ(groupSize.as<uint64_t>(), 0);
}

// When the same options appear both in the model's rt_info ("runtime_options") and in the
// config passed to compile_model, the compiled model must report the compile_model values.
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) {
    ov::Core core;

    // Values embedded in the model.
    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());

    // Conflicting values supplied explicitly at compile time.
    ov::AnyMap compileConfig;
    compileConfig[ov::hint::kv_cache_precision.name()] = "u8";
    compileConfig[ov::hint::dynamic_quantization_group_size.name()] = "16";

    ov::CompiledModel compiled;
    OV_ASSERT_NO_THROW(compiled = core.compile_model(model, deviceName, compileConfig));

    ov::Any kvPrecision;
    ov::Any groupSize;
    OV_ASSERT_NO_THROW(kvPrecision = compiled.get_property(ov::hint::kv_cache_precision));
    OV_ASSERT_NO_THROW(groupSize = compiled.get_property(ov::hint::dynamic_quantization_group_size));

    // The explicit compile-time config is what the compiled model reports.
    ASSERT_EQ(kvPrecision.as<ov::element::Type>(), ov::element::u8);
    ASSERT_EQ(groupSize.as<uint64_t>(), 16);
}

} // namespace

0 comments on commit 09d1e50

Please sign in to comment.