Add Tokenizers #687
Original problem is described here: https://github.com/openvinotoolkit/openvino.genai/blob/4c6c6cb9cf2b64584c797f2ff4ca0b7b658dabca/llm/llm.cpp#L85 Here's my attempt to simplify the reproducer, but it requires a Debug build:

#include <algorithm>
#include <iostream>
#include <limits>
#include <map>
#include <stdexcept>
#include <string>

#include <openvino/openvino.hpp>

int main(int argc, char* argv[]) try {
if (argc != 4) {
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <openvino_model.xml> <tokenizer.xml> '<prompt>'");
}
ov::Core core;
core.add_extension(USER_OV_EXTENSIONS_PATH); // USER_OV_EXTENSIONS_PATH is defined in CMakeLists.txt
    ov::InferRequest tokenizer = core.compile_model(argv[2], "CPU").create_infer_request();
std::shared_ptr<ov::Model> model = core.read_model(argv[1]);
constexpr size_t BATCH_SIZE = 1;
    // Relax input_ids and attention_mask to a dynamic sequence length with a fixed batch size.
    std::map<std::string, ov::PartialShape> shapes = {
{"input_ids", ov::PartialShape{
BATCH_SIZE, {1, std::numeric_limits<ov::Dimension::value_type>::max()}
}},
{"attention_mask", ov::PartialShape{
BATCH_SIZE, {1, std::numeric_limits<ov::Dimension::value_type>::max()}
}}
};
for (const ov::Output<ov::Node>& input : model->inputs()) {
for (const std::string& name : input.get_names()) {
if (name.rfind("past_key_values", 0) == 0) {
ov::PartialShape shape = input.get_partial_shape();
shape[0] = BATCH_SIZE;
shapes.emplace(name, shape);
break;
}
}
}
model->reshape(shapes);
ov::preprocess::PrePostProcessor p3(model);
p3.input("input_ids").tensor().set_element_type(ov::element::i32); // cast to the type of tokenyzer's output
p3.input("attention_mask").tensor().set_element_type(ov::element::i32);
p3.input("input_ids").preprocess().convert_element_type(ov::element::i64);
p3.input("attention_mask").preprocess().convert_element_type(ov::element::i64);
model = p3.build();
ov::InferRequest ireq = core.compile_model(model, "CPU", {ov::cache_dir("llm-cache")}).create_infer_request();
    // Initialize every past_key_values input tensor with its minimal shape.
    for (const ov::Output<ov::Node>& input : model->inputs()) {
for (const std::string& name : input.get_names()) {
if (name.rfind("past_key_values", 0) == 0) {
ireq.get_tensor(input).set_shape(input.get_partial_shape().get_min_shape());
break;
}
}
}
float* logits;
size_t n_vocab;
int32_t out_token;
    // Prefill: run a single hard-coded token and take the argmax of the last logits row.
    {
ov::Tensor inp{ov::element::i32, {1, 1}};
inp.data<int32_t>()[0] = 29528;
ireq.get_tensor("input_ids").set_shape({1, 1});
ireq.set_tensor("input_ids", inp);
ireq.get_tensor("attention_mask").set_shape({1, 1});
std::fill_n(ireq.get_tensor("attention_mask").data<int32_t>(), inp.get_size(), 1);
ireq.infer();
        n_vocab = ireq.get_tensor("logits").get_shape().back();
logits = ireq.get_tensor("logits").data<float>() + (inp.get_size() - 1) * n_vocab;
out_token = int32_t(std::max_element(logits, logits + n_vocab) - logits);
}
ireq.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
ireq.get_tensor("attention_mask").set_shape({BATCH_SIZE, 1});
ireq.get_tensor("attention_mask").data<int32_t>()[0] = 1;
constexpr int32_t SPECIAL_EOS_TOKEN = 2;
while (out_token != SPECIAL_EOS_TOKEN) {
std::cout << out_token << ' ' << std::flush;
        // Feed the present KV-cache outputs back in as past_key_values inputs.
        for (const ov::Output<ov::Node>& input : model->inputs()) {
for (const std::string& name : input.get_names()) {
if (name.rfind("past_key_values", 0) == 0) {
ireq.set_tensor(input, ireq.get_tensor("present" + name.substr(15)));
break;
}
}
}
ireq.get_tensor("input_ids").data<int32_t>()[0] = 29528;
ireq.infer();
logits = ireq.get_tensor("logits").data<float>();
out_token = int32_t(std::max_element(logits, logits + n_vocab) - logits);
}
std::cout << '\n';
} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return 1;
} catch (...) {
std::cerr << "Non-exception object thrown\n";
return 1;
}
Using a different Core object for llama fixes the problem, removing ov::preprocess::PrePostProcessor usage fixes the problem, and storing the CompiledModel instead of the tokenizer's ireq fixes the problem. Moving
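For illustration, here is a minimal sketch of two of these workarounds (compiling the LLM with a separate ov::Core, and keeping the tokenizer's CompiledModel in a variable instead of only its InferRequest). The model and extension paths are placeholders, not part of this PR, and the sketch only shows how the pieces fit together, not a fix:

#include <openvino/openvino.hpp>

int main() {
    ov::Core tokenizer_core;
    // Placeholder path to the tokenizer extension library.
    tokenizer_core.add_extension("libuser_ov_extensions.so");
    // Workaround: keep the CompiledModel alive instead of creating the InferRequest from a temporary.
    ov::CompiledModel tokenizer_compiled = tokenizer_core.compile_model("tokenizer.xml", "CPU");
    ov::InferRequest tokenizer = tokenizer_compiled.create_infer_request();

    // Workaround: compile the LLM with a separate ov::Core object.
    ov::Core llm_core;
    ov::InferRequest llm = llm_core.compile_model("openvino_model.xml", "CPU",
                                                  {ov::cache_dir("llm-cache")}).create_infer_request();
    (void)tokenizer;
    (void)llm;
}

Each of the listed changes is reported above to avoid the crash on its own; they are combined here only to keep the sketch short.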
causes Segmentation fault. Removing , {ov::cache_dir("llm-cache")} fixes the problem. Strangely, CACHE_DIR works for Python.
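For context, ov::cache_dir is the typed C++ spelling of the same CACHE_DIR property that the Python API takes as a plain string key. A short sketch of both spellings, assuming a Core and model loaded as in the reproducer above (the model path is a placeholder):

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    std::shared_ptr<ov::Model> model = core.read_model("openvino_model.xml");  // placeholder path

    // The configuration that crashes in the reproducer: the typed cache_dir property.
    ov::CompiledModel with_typed_property =
        core.compile_model(model, "CPU", {ov::cache_dir("llm-cache")});

    // The same setting through its string name, which is what the Python API's
    // {"CACHE_DIR": "llm-cache"} config maps to.
    ov::CompiledModel with_string_property =
        core.compile_model(model, "CPU", ov::AnyMap{{"CACHE_DIR", "llm-cache"}});
}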