From 70f867aac575c0a59ce6fc3676ec2e474cca3938 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Wed, 26 Apr 2023 03:07:36 +0400 Subject: [PATCH 001/116] Added string tensor implementation with explicit pointer unpack --- .../sentence_piece/sentence_piece.cpp | 45 ++++++++++++++----- .../sentence_piece/sentence_piece.hpp | 2 +- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 78a31b0f2..b79eb0fb0 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -18,6 +18,8 @@ #include "openvino/opsets/opset10.hpp" #include "openvino/op/util/framework_node.hpp" +#define USE_STRING_TENSORS + using sentencepiece::ModelProto; using sentencepiece::NormalizerSpec; using sentencepiece::SentencePieceProcessor; @@ -30,9 +32,9 @@ using namespace ov::opset10; namespace { bool evaluate_helper(const ov::TensorVector& inputs, - std::vector& sparse_indices, + std::vector& sparse_indices, std::vector& sparse_values, - std::vector& sparse_dense_shape) { + std::vector& sparse_dense_shape) { // the operation has the following inputs: // 0. spm_model // 1. data input @@ -41,14 +43,24 @@ namespace { // 4. add_bos // 5. add_eos // 6. reverse + + //std::cerr << "SentencePiece Op evaluate\n"; + auto spm_model = static_cast(inputs[0].data()); auto spm_model_size = inputs[0].get_byte_size(); +#ifdef USE_STRING_TENSORS + const ov::Tensor& strings_tensor = **reinterpret_cast(inputs[1].data()); + const std::string* strings = strings_tensor.data(); + size_t batch_size = ov::shape_size(strings_tensor.get_shape()); + //std::cerr << " Batch size: " << batch_size << "\n"; +#else const uint8_t* strings = inputs[1].data(); auto batch_size = *reinterpret_cast(strings + 0); auto begin_ids = reinterpret_cast(strings + 4); auto end_ids = begin_ids + 1; auto data = strings + 4 + 4 + 4*batch_size; +#endif auto nbest_size = *static_cast(inputs[2].data()); auto alpha = *static_cast(inputs[3].data()); @@ -80,10 +92,15 @@ namespace { size_t max_token_id = 0; for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { +#ifdef USE_STRING_TENSORS + const std::string& sentence = strings[batch_ind]; + //std::cerr << " sentence: " << sentence << "\n"; +#else auto begin_ind = begin_ids[batch_ind]; auto end_ind = end_ids[batch_ind]; - std::vector ids; std::string sentence(data + begin_ind, data + end_ind); +#endif + std::vector ids; CHECK_OK(sp.SampleEncode(sentence, nbest_size, alpha, &ids)); // put into resulted vectors for (size_t token_id = 0; token_id < ids.size(); ++token_id) { @@ -109,24 +126,24 @@ SentencepieceTokenizer::SentencepieceTokenizer(const ov::OutputVector& args) void SentencepieceTokenizer::validate_and_infer_types() { // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values // and dense shape - set_output_type(0, element::i32, PartialShape{ Dimension(), Dimension(2) }); // FIXME: change to i64 after CPU fix + set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) }); // FIXME: change to i64 after CPU fix set_output_type(1, element::i32, PartialShape{ Dimension() }); - set_output_type(2, element::i32, PartialShape{ Dimension(2) }); // FIXME: change to i64 after CPU fix + set_output_type(2, element::i64, PartialShape{ Dimension(2) }); // FIXME: change to i64 after CPU 
fix } bool SentencepieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - std::vector m_sparse_indices; + std::vector m_sparse_indices; std::vector m_sparse_values; - std::vector m_sparse_dense_shape; + std::vector m_sparse_dense_shape; evaluate_helper(inputs, m_sparse_indices, m_sparse_values, m_sparse_dense_shape); outputs[0].set_shape({ m_sparse_indices.size() / 2, 2 }); - memcpy(outputs[0].data(), m_sparse_indices.data(), sizeof(int32_t) * m_sparse_indices.size()); + memcpy(outputs[0].data(), m_sparse_indices.data(), sizeof(int64_t) * m_sparse_indices.size()); outputs[1].set_shape({ m_sparse_values.size() }); memcpy(outputs[1].data(), m_sparse_values.data(), sizeof(int32_t) * m_sparse_values.size()); outputs[2].set_shape({ 2 }); - memcpy(outputs[2].data(), m_sparse_dense_shape.data(), sizeof(int32_t) * m_sparse_dense_shape.size()); + memcpy(outputs[2].data(), m_sparse_dense_shape.data(), sizeof(int64_t) * m_sparse_dense_shape.size()); return true; } @@ -148,7 +165,7 @@ OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node) { return { sp_model_const }; } -OutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node) { +frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node) { // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer, // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp FRONT_END_GENERAL_CHECK(node.get_input_size() > 0, "RaggedTensorToSparse expects at least one input."); @@ -172,15 +189,21 @@ OutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& OutputVector inputs_vector = OutputVector{ sp_model_const, inputs, nbest_size, alpha, add_bos, add_eos, reverse }; +#ifndef USE_STRING_TENSORS // Override type of input tensor if this is a Parameter if(auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { parameter->set_partial_shape(ov::PartialShape{Dimension()}); parameter->set_element_type(ov::element::u8); parameter->validate_and_infer_types(); } +#endif // create a node with custom operation auto sp_tokenizer_ext = std::make_shared(inputs_vector); - return sp_tokenizer_ext->outputs(); + return { + {"sparse_indices", sp_tokenizer_ext->output(0)}, + {"sparse_values", sp_tokenizer_ext->output(1)}, + {"sparse_dense_shape", sp_tokenizer_ext->output(2)}, + }; } diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 818f62f05..3c9ca711f 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -31,4 +31,4 @@ namespace TemplateExtension { ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node); -ov::OutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node); +ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node); From 821dee5eb2d7ac8e3133630b5f09cfdadd204bd8 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Tue, 2 May 2023 04:17:15 +0400 Subject: [PATCH 002/116] Started to migrate to extension-only support of string operations with and without string support in OV core. Moved StringTensorUnpack and reworked it to be aligned with the new approach. 
Reworked sentence piece op and translation code to be compatible with several
variants of string tensor representation and the plugin wrapping hack.
---
 .../sentence_piece/sentence_piece.cpp         | 340 +++++++++++++++++-
 1 file changed, 327 insertions(+), 13 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
index d8c1e2c67..59dcf7bb4 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
@@ -7,7 +7,20 @@

 #include "openvino/opsets/opset10.hpp"

-#define USE_STRING_TENSORS
+//#define USE_STRING_TENSORS
+
+#ifdef USE_STRING_TENSORS
+
+// A plugin can support a string tensor on inputs and outputs via a hack that wraps such a tensor into
+// a u8 tensor holding a pointer to the original string tensor. The hack lets us avoid deeper
+// plugin modifications by pre-transforming a model so that string tensor parameters and results are replaced
+// by the described wrapping tensors. Such a hack requires some pre/post processing in operations
+// that handle such wrapping tensors on the edge of a model.
+#define USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+
+#endif
+
+#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

 using sentencepiece::SentencePieceProcessor;
 using namespace TemplateExtension;
@@ -30,9 +43,6 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t
     bool add_bos, bool add_eos, bool reverse) : m_sp(std::make_shared<SentencePieceProcessor>()),
     m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos),
     m_reverse(reverse), Op(args) {
-    FRONT_END_GENERAL_CHECK(args.size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
     auto sp_model_const = as_type_ptr<Constant>(args[0].get_node_shared_ptr());
     FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant.");
     auto spm_model = static_cast<const char*>(sp_model_const->get_data_ptr());
@@ -68,6 +80,48 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
 }

 void SentencepieceTokenizer::validate_and_infer_types() {
+
+    #ifdef SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
+
+    FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, symbols)");
+    FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
+    FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second input and it should be of type i32 tensor");
+    FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third input and it should be of type i32 tensor");
+    FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
+
+    #else
+
+    FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
+    FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
+
+    #ifdef USE_STRING_TENSORS
+
+    #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+    FRONT_END_GENERAL_CHECK(
+        get_input_element_type(1) == element::string || get_input_element_type(1) == element::u8,
+        "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 or string depending on the current stage of model preparation");
+    #else
+    FRONT_END_GENERAL_CHECK(
+        get_input_element_type(1) == element::string,
+        "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
+    #endif
+
+    #else
+
+    if(get_input_element_type(1) != element::u8) {
+        std::cout << "Stopped\n";
+        std::cin.get();
+    }
+
+    FRONT_END_GENERAL_CHECK(
+        get_input_element_type(1) == element::u8,
+        "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 tensor, but got " +
+            get_input_element_type(1).get_type_name());
+
+    #endif
+
+    #endif
+
     // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
     // and dense shape
     set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) });
@@ -89,15 +143,29 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     std::vector<int64_t> sparse_indices;
     std::vector<int32_t> sparse_values;
     std::vector<int64_t> sparse_dense_shape;

-    FRONT_END_GENERAL_CHECK(inputs.size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
+#ifdef SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
+
+    auto begin_ids = inputs[1].data<const int32_t>();
+    auto end_ids = inputs[2].data<const int32_t>();
+    auto data = inputs[3].data<const uint8_t>();
+    auto batch_size = shape_size(inputs[1].get_shape());
+
+#else

 #ifdef USE_STRING_TENSORS
+
+    #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK
     const ov::Tensor& strings_tensor = **reinterpret_cast<ov::Tensor**>(inputs[1].data());
+    #else
+    const ov::Tensor& strings_tensor = inputs[1];
+    #endif
+
     const std::string* strings = strings_tensor.data<std::string>();
     size_t batch_size = ov::shape_size(strings_tensor.get_shape());
+
 #else
+
     const uint8_t* strings = inputs[1].data<const uint8_t>();
     auto bitstream_size = inputs[1].get_byte_size();

     // check the format of the input bitstream representing the string tensor
     FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
     auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
     FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size,
         "Incorrect packed string tensor format: the packed string tensor must contain the first string offset and the end indices");
     auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
     auto end_ids = begin_ids + 1;
     auto data = strings + 4 + 4 + 4 * batch_size;
+
+#endif
+
 #endif
+
     //std::cerr << "  Batch size: " << batch_size << "\n";

     size_t max_token_id = 0;
     for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
-#ifdef USE_STRING_TENSORS
-        const std::string& sentence = strings[batch_ind];
-        //std::cerr << "    sentence: " << sentence << "\n";
+#if defined(USE_STRING_TENSORS) && !defined(SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS)
+        const std::string& sentence = strings[batch_ind];
+        //std::cerr << "    sentence: " << sentence << "\n";
 #else
         auto begin_ind = begin_ids[batch_ind];
         auto end_ind = end_ids[batch_ind];
-        std::string sentence(data + begin_ind, data + end_ind);
+        //std::string sentence(data + begin_ind, data + end_ind);
+        absl::string_view sentence((const char*)data + begin_ind, end_ind - begin_ind);
+        //std::cerr << "string: " << sentence << "\n";
 #endif
         std::vector<int32_t> ids;
         CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
@@ -161,6 +235,236 @@ OutputVector translate_sentencepiece_op(const NodeContext& node) {
     return { sp_model_const };
 }
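// Editorial sketch (not part of the patch): the packed u8 layout consumed above can be
// produced on the host side with a small helper like the hypothetical pack_example below.
// Layout: int32 batch_size, then int32 offsets[batch_size + 1] with offsets[0] == 0,
// then all characters; string i occupies bytes offsets[i]..offsets[i+1] of the char block.
//
//     #include <cstring>
//     #include <string>
//     #include <vector>
//
//     std::vector<uint8_t> pack_example(const std::vector<std::string>& batch) {
//         std::vector<int32_t> header(batch.size() + 2);
//         header[0] = static_cast<int32_t>(batch.size());
//         header[1] = 0;  // offsets[0] is always 0
//         std::string chars;
//         for (size_t i = 0; i < batch.size(); ++i) {
//             chars += batch[i];
//             header[i + 2] = static_cast<int32_t>(chars.size());  // end offset of string i
//         }
//         std::vector<uint8_t> packed(header.size() * sizeof(int32_t) + chars.size());
//         std::memcpy(packed.data(), header.data(), header.size() * sizeof(int32_t));
//         std::memcpy(packed.data() + header.size() * sizeof(int32_t), chars.data(), chars.size());
//         return packed;  // e.g. {"Hi", "you"} -> 2, {0, 2, 5}, "Hiyou"
//     }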
+
+// Unpack a string tensor representation regardless of the source format, which
+// can be an OV tensor with element::string element type (if supported) or a u8
+// packed representation, to a decomposed tensor representation that may potentially
+// consist of multiple tensors. The destination format is defined by the `mode` attribute.
+// The shape of the output tensor is completely recognized from the input (if supported)
+// or defined partially by a dedicated input attribute `shape`. If `shape` is not set,
+// which defaults to a completely dynamic `shape`, then the output shape is defined
+// by an input tensor.
+class StringTensorUnpack : public ov::op::Op {
+public:
+    OPENVINO_OP("StringTensorUnpack");
+
+    StringTensorUnpack(OutputVector inputs, const std::string& mode = "begins_ends"
+        /*const std::string* _data = nullptr, PartialShape _input_shape = PartialShape::dynamic()*/)
+        : ov::op::Op(inputs), m_mode(mode) {
+        constructor_validate_and_infer_types();
+    }
+    //const std::string* data = nullptr;
+    //PartialShape input_shape;
+
+    void validate_and_infer_types() override {
+        OPENVINO_ASSERT(
+            get_input_size() == 1,
+            "Number of inputs for StringTensorUnpack is not equal to 1");
+
+        OPENVINO_ASSERT(
+            #ifdef USE_STRING_TENSORS
+            get_input_element_type(0) == element::string ||
+            #endif
+            get_input_element_type(0) == element::dynamic ||
+            get_input_element_type(0) == element::u8,
+            "Unsupported input element type for StringTensorUnpack");
+
+        OPENVINO_ASSERT(
+            get_input_partial_shape(0).rank().is_static(),
+            "StringTensorUnpack supports only static input rank");
+
+#if 0
+        // Obtain shape from rt_info.
+        auto& rt_info = get_input_node_shared_ptr(0)->get_rt_info();
+        auto ops = rt_info.find("original_partial_shape");
+        if(ops != rt_info.end()) {
+            input_shape = ops->second.as<PartialShape>();
+            std::cerr << "StringTensorUnpack: orig_partial_shape: " << input_shape << "\n";
+        } else {
+            std::cerr << "Impossible\n";
+            std::cerr << get_input_node_shared_ptr(0) << "\n";
+        }
+#endif
+
+        auto output_shape = PartialShape::dynamic();
+
+#ifdef USE_STRING_TENSORS
+
+        // In case of explicit string tensors the shape is carried by the input tensor itself
+        // OPENVINO_ASSERT(
+        //     input_shape == PartialShape::dynamic(),
+        //     "Explicitly set shape for a string tensor in the unpacking is not supported");
+
+        #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+
+        // There are two cases that affect the expected element type of the input tensor:
+        // before the hack is applied (element::string) and after it (element::u8).
+
+        OPENVINO_ASSERT(
+            get_input_element_type(0) == element::string
+            || get_input_element_type(0) == element::u8,
+            "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation");
+
+        if(get_input_element_type(0) == element::string) {
+            output_shape = get_input_partial_shape(0);
+        }
+
+        if(get_input_element_type(0) == element::u8)
+        {
+            // After the plugin hack, a tensor is represented as a wrapping u8 tensor that holds a pointer to a string tensor.
+            // The original shape of the string tensor is stored in an RT attribute of the tensor descriptor.
+            const auto& rt_info = get_input_tensor(0).get_rt_info();
+            auto it = rt_info.find("__original_partial_shape");
+
+            // StringTensorUnpack expects an __original_partial_shape attribute of type PartialShape in the input tensor.
+            // If it is not found, that means that model compilation didn't go through the expected transformation where a string tensor
+            // is wrapped into a u8 tensor holding a pointer, or that evaluation of this node is in progress and tensor attributes aren't preserved.
+            if(it != rt_info.end() && it->second.is<PartialShape>()) {
+                output_shape = it->second.as<PartialShape>();
+            }
+        }
+
+        #else
+
+        OPENVINO_ASSERT(
+            get_input_element_type(0) == element::string,
+            "StringTensorUnpack expects element::string in an input tensor, but it is " + std::string(get_input_element_type(0)));
+
+        output_shape = get_input_partial_shape(0);
+
+        #endif
+
+#else
+        // Expect the packed string tensor representation, which can carry only string tensors of shape [?].
+        // The shape is not known in advance and only the rank of the output can be set.
+
+        OPENVINO_ASSERT(
+            get_input_element_type(0) == element::u8 &&
+            get_input_partial_shape(0).rank().is_static() && get_input_partial_shape(0).rank().get_length() == 1,
+            "StringTensorUnpack expects a u8 tensor with rank 1 that holds a packed batched string tensor as an input, but observes type " +
+                get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string());
+
+        output_shape = PartialShape({Dimension()});  // [?]
+
+        #if 0
+
+        if(get_input_element_type(0) == element::u8) {
+            if(all_inputs_are_constants(this)) {
+                std::cerr << "StringTensorUnpack: u8/const\n";
+                // HACK: Tensor of strings is passed by a raw pointer to a tensor
+                auto constant = std::dynamic_pointer_cast<Constant>(get_input_node_shared_ptr(0));
+                size_t raw_size = constant->get_shape()[0];
+                if(raw_size == 0) {
+                    // means empty input
+                    std::cerr << "StringTensorUnpack: empty\n";
+                    data = nullptr;
+                    input_shape = PartialShape({0});
+                } else if(raw_size == sizeof(void*)) {
+                    std::cerr << "StringTensorUnpack: not empty, tensor HACK\n";
+                    auto tensor = *reinterpret_cast<const ov::Tensor* const*>(constant->get_data_ptr());
+                    std::cerr << "Pointer to tensor from op: " << tensor << "\n";
+                    input_shape = tensor->get_shape();
+                    data = tensor->data<std::string>();
+                } else {
+                    OPENVINO_ASSERT(
+                        false,
+                        "Unexpected size for hacked Tensor input. Something went wrong.");
+                }
+            } else {
+                std::cerr << "StringTensorUnpack: u8/not constant\n";
+            }
+        } else {
+            std::cerr << "StringTensorUnpack: string\n";
+            input_shape = get_input_partial_shape(0);
+            if(all_inputs_are_constants(this)) {
+                auto constant = std::dynamic_pointer_cast<Constant>(get_input_node_shared_ptr(0));
+                data = constant->get_data_ptr<std::string>();
+            } else {
+                input_shape = get_input_partial_shape(0);
+            }
+        }
+
+        #endif
+
+#endif
+
+        OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supports only 'begins_ends' mode, but got " + m_mode);
+
+        if (m_mode == "begins_ends") {
+            set_output_type(0, element::i32, output_shape);
+            set_output_type(1, element::i32, output_shape);
+            set_output_type(2, element::u8, PartialShape{Dimension()});
+        }
+    }
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const OutputVector& inputs) const override {
+        auto result = std::make_shared<StringTensorUnpack>(inputs, m_mode);
+        return result;
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        // FIXME: Serialization only, there is no deserialization
+        visitor.on_attribute("mode", m_mode);
+        return true;
+    }
+
+    bool has_evaluate() const {
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+
+#ifdef USE_STRING_TENSORS
+
+    #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+        auto tensor = *reinterpret_cast<ov::Tensor**>(inputs[0].data());
+    #else
+        auto tensor = inputs[0];
+    #endif
+
+        //std::cerr << "Pointer to tensor from op evaluate: " << tensor << "\n";
+        Shape input_shape = tensor->get_shape();
+        const std::string* input_strings = tensor->data<std::string>();
+        std::cerr << "input_shape = " << input_shape << "\n";
+        //std::cerr << data << "\n";
+
+        auto nelements = shape_size(input_shape);
+        size_t total = 0;
+        for(size_t i = 0; i < nelements; ++i)
+            total += input_strings[i].length();
+
+        outputs[0].set_shape(input_shape);
+        outputs[1].set_shape(input_shape);
+        outputs[2].set_shape(Shape{total});
+
+        auto begins = outputs[0].data<int32_t>();
+        auto ends = outputs[1].data<int32_t>();
+        auto output_symbols = reinterpret_cast<char*>(outputs[2].data<uint8_t>());
+        size_t offset = 0;
+
+        for(size_t i = 0; i < nelements; ++i)
+        {
+            begins[i] = offset;
+            output_symbols = std::copy(input_strings[i].begin(), input_strings[i].end(), output_symbols);
+            offset += input_strings[i].length();
+            ends[i] = offset;
+        }
+
+        return true;
+
+#else
+
+        OPENVINO_ASSERT(false, "StringTensorUnpack supports only the element::string representation");
+        return false;
+
+#endif
+    }
+
+    std::string m_mode;
+};
+
 NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
     // this is a custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
     // and RaggedTensorToSparse operations into a custom operation SentencepieceTokenizerExtensionOp
     FRONT_END_GENERAL_CHECK(node.get_input_size() > 0, "RaggedTensorToSparse expects at least one input.");

     auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
     auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

-    OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };
-
 #ifndef USE_STRING_TENSORS
     // Override type of input tensor if this is a Parameter
     if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
         parameter->set_partial_shape(PartialShape{ Dimension() });
         parameter->set_element_type(element::u8);
         parameter->validate_and_infer_types();
     }
 #endif

+#ifdef SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
+
+    OutputVector inputs_vector = OutputVector{ sp_model_const };
+    auto
unpacked_outputs = std::make_shared(OutputVector{inputs}, "begins_ends")->outputs(); + inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end()); + +#else + + OutputVector inputs_vector = OutputVector{ sp_model_const, inputs }; + +#endif + // create a node with custom operation auto sp_tokenizer_ext = std::make_shared(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse); FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3, From b9b06936fbdb76d8f168db49ee095db1ae929280 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Wed, 10 May 2023 23:15:47 +0400 Subject: [PATCH 003/116] Started to merge string/tokenizer related stuff from a dedicated OV branch to contrib in form compatible with both master and the branch with string tensors support. Added CaseFoldUTF8 from that branch. --- .../user_ie_extensions/ov_extension.cpp | 2 + .../sentence_piece/sentence_piece.cpp | 264 ++++++++++++++++-- .../sentence_piece/sentence_piece.hpp | 30 ++ .../sentence_piece/str_pack.py | 28 ++ 4 files changed, 299 insertions(+), 25 deletions(-) create mode 100644 modules/custom_operations/user_ie_extensions/sentence_piece/str_pack.py diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 47f548022..2efde15f1 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -55,6 +55,8 @@ #ifdef sentence_piece # include "sentence_piece/sentence_piece.hpp" # define SENTENSE_PIECE_EXT \ + std::make_shared>(), \ + std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), \ std::make_shared>(), \ std::make_shared("SentencepieceOp", translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 59dcf7bb4..0b17c332b 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -20,7 +20,7 @@ #endif -#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS +//#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS using sentencepiece::SentencePieceProcessor; using namespace TemplateExtension; @@ -108,15 +108,12 @@ void SentencepieceTokenizer::validate_and_infer_types() { #else - if(get_input_element_type(1) != element::u8) { - std::cout << "Stopped\n"; - std::cin.get(); - } - +#if 0 // change to 0 when compiled with master and the bug with data propagation from within inline context is not solved FRONT_END_GENERAL_CHECK( get_input_element_type(1) == element::u8, "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 tensor, but got " + get_input_element_type(1).get_type_name()); +#endif #endif @@ -138,6 +135,19 @@ bool SentencepieceTokenizer::visit_attributes(AttributeVisitor& visitor) { return true; } +void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) { + auto strings = packed.data(); + auto bitstream_size = packed.get_byte_size(); + // check the format of the input bitstream representing the string tensor + FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); + 
batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
+    FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size,
+        "Incorrect packed string tensor format: the packed string tensor must contain the first string offset and the end indices");
+    begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
+    end_ids = begin_ids + 1;
+    symbols = strings + 4 + 4 + 4 * batch_size;
+}
+
 bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const {
     std::vector<int64_t> sparse_indices;
     std::vector<int32_t> sparse_values;
     std::vector<int64_t> sparse_dense_shape;

 #else

-    const uint8_t* strings = inputs[1].data<const uint8_t>();
-    auto bitstream_size = inputs[1].get_byte_size();
-
-    // check the format of the input bitstream representing the string tensor
-    FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
-    auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
-    FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size,
-        "Incorrect packed string tensor format: the packed string tensor must contain the first string offset and the end indices");
-    auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
-    auto end_ids = begin_ids + 1;
-    auto data = strings + 4 + 4 + 4 * batch_size;
+    // const uint8_t* strings = inputs[1].data<const uint8_t>();
+    // auto bitstream_size = inputs[1].get_byte_size();
+
+    // // check the format of the input bitstream representing the string tensor
+    // FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
+    // auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
+    // FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size,
+    //     "Incorrect packed string tensor format: the packed string tensor must contain the first string offset and the end indices");
+    // auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
+    // auto end_ids = begin_ids + 1;
+    // auto data = strings + 4 + 4 + 4 * batch_size;
+    int32_t batch_size;
+    const int32_t* begin_ids;
+    const int32_t* end_ids;
+    const uint8_t* data;
+    parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);

 #endif

 OutputVector translate_sentencepiece_op(const NodeContext& node) {
     return { sp_model_const };
 }

+
+
+void check_string_input(const Node* node, size_t input_index) {
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::u8, "Expected a u8 tensor as the third part of the decomposed string representation");
+}
+
+void set_string_output(Node* node, size_t output_index, const PartialShape& shape) {
+    node->set_output_type(output_index+0, element::i32, shape);
+    node->set_output_type(output_index+1, element::i32, shape);
+    node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()});
+}
+
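// Editorial illustration (not part of the patch): in this decomposed ("begins_ends")
// representation, a hypothetical batch {"Hello", "World"} is stored as three flat tensors:
//
//     begins = [0, 5]          // i32, start offset of each string
//     ends   = [5, 10]         // i32, end offset of each string
//     chars  = "HelloWorld"    // u8, all string bytes concatenated
//
// so element i can be reconstructed as std::string(chars + begins[i], chars + ends[i]).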
+// Having a decomposed representation of a tensor, converts it to a single string tensor
+// (packed u8 or natively supported element::string, depending on whether USE_STRING_TENSORS is defined).
+class StringTensorPack : public ov::op::Op {
+public:
+    OPENVINO_OP("StringTensorPack");
+
+    StringTensorPack(OutputVector inputs, const std::string& mode = "begins_ends")
+        : ov::op::Op(inputs), m_mode(mode) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override {
+        OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but got " + m_mode);
+        check_string_input(this, 0);
+        #ifdef USE_STRING_TENSORS
+        set_output_type(0, element::string, get_input_partial_shape(0));
+        #else
+        set_output_type(0, element::u8, PartialShape{Dimension()});
+        #endif
+    }
+
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const OutputVector& inputs) const override {
+        auto result = std::make_shared<StringTensorPack>(inputs, m_mode);
+        return result;
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        visitor.on_attribute("mode", m_mode);
+        return true;
+    }
+
+    bool has_evaluate() const {
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+#ifdef USE_STRING_TENSORS
+        // TODO
+        return false;
+#else
+        auto rank = inputs[0].get_shape().size();
+        if (rank != 1) {
+            std::cerr << "[ WARNING ] StringTensorPack ignores the rank " << rank << " of the input tensor and sets rank=1 in the output\n";
+        }
+
+        auto num_elements = shape_size(inputs[0].get_shape());
+        auto num_chars = shape_size(inputs[2].get_shape());
+        auto num_output_elements = 4*(1 + 1 + num_elements) + num_chars;
+        outputs[0].set_shape(Shape{num_output_elements});
+
+        //auto begins = inputs[0].data<const int32_t>();   // this is not needed as no repacking happens in this version of code
+        auto ends = inputs[1].data<const int32_t>();
+        auto chars = inputs[2].data<const uint8_t>();
+
+        auto output = outputs[0].data<uint8_t>();
+        auto output_int32 = reinterpret_cast<int32_t*>(output);
+
+        *output_int32++ = num_elements;
+        *output_int32++ = 0;
+        output_int32 = std::copy(ends, ends + num_elements, output_int32);
+        output = reinterpret_cast<uint8_t*>(output_int32);
+        output = std::copy(chars, chars + num_chars, output);
+
+        OPENVINO_ASSERT(num_output_elements == output - outputs[0].data<uint8_t>(), "[ INTERNAL ERROR ] StringTensorPack output tensor is corrupted");
+
+        // WARNING! Chars are not repacked. If there are gaps between strings, they will remain.
+ + return true; +#endif + } + +private: + + std::string m_mode; +}; + + + // Unpack a string tensor representation regardless of the source format, which // can be an OV tensor with element::string element type (if supported) or u8 // packed representation, to a decompose tensor representation that may potentially @@ -261,13 +370,15 @@ class StringTensorUnpack : public ov::op::Op { get_input_size() == 1, "Number of inputs for StringTensorUnpack is not equal to 1"); +#if 0 // Uncomment it when the bug is fixed with type substitution in TF partition call inlining OPENVINO_ASSERT( #ifdef USE_STRING_TENSORS get_input_element_type(0) == element::string || #endif get_input_element_type(0) == element::dynamic || get_input_element_type(0) == element::u8, - "Unsupported input element type for StringTensorUnpack"); + "Unsupported input element type for StringTensorUnpack: " + get_input_element_type(0).get_type_name()); +#endif OPENVINO_ASSERT( get_input_partial_shape(0).rank().is_static(), @@ -339,7 +450,9 @@ class StringTensorUnpack : public ov::op::Op { // Shape is not known in advance and only rank of the output can be set OPENVINO_ASSERT( +#if 0 // Uncomment it when the bug is fixed with type substitution in TF partition call inlining get_input_element_type(0) == element::u8 && +#endif get_input_partial_shape(0).rank().is_static() && get_input_partial_shape(0).rank().get_length() == 1, "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); @@ -392,9 +505,7 @@ class StringTensorUnpack : public ov::op::Op { OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supporst only 'begins_ends' mode, but get " + m_mode); if (m_mode == "begins_ends") { - set_output_type(0, element::i32, output_shape); - set_output_type(1, element::i32, output_shape); - set_output_type(2, element::u8, PartialShape{Dimension()}); + set_string_output(this, 0, output_shape); } } @@ -404,7 +515,6 @@ class StringTensorUnpack : public ov::op::Op { } bool visit_attributes(ov::AttributeVisitor& visitor) override { - // FIXME: Serialization only, there is no deserialization visitor.on_attribute("mode", m_mode); return true; } @@ -415,7 +525,6 @@ class StringTensorUnpack : public ov::op::Op { bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - #ifdef USE_STRING_TENSORS #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK @@ -456,8 +565,27 @@ class StringTensorUnpack : public ov::op::Op { #else - OPENVINO_ASSERT(false, "StringTensorUnpack supporst only element::string representation"); - return false; + int32_t batch_size; + const int32_t* begin_ids; + const int32_t* end_ids; + const uint8_t* data; + parse_packed_strings(inputs[0], batch_size, begin_ids, end_ids, data); + + auto num_chars = end_ids[batch_size - 1]; + + outputs[0].set_shape(Shape({static_cast(batch_size)})); + outputs[1].set_shape(Shape({static_cast(batch_size)})); + outputs[2].set_shape(Shape{static_cast(num_chars)}); + + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); + + std::copy(begin_ids, begin_ids + batch_size, begins); + std::copy(end_ids, end_ids + batch_size, ends); + std::copy(data, data + num_chars, chars); + + return true; #endif } @@ -465,6 +593,41 @@ class StringTensorUnpack : public ov::op::Op { std::string m_mode; }; +OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t 
input_index) { + auto input = node.get_input(input_index); + auto input_node = input.get_node_shared_ptr(); + +#ifndef USE_STRING_TENSORS + // Override type of input tensor if this is a Parameter + if (auto parameter = std::dynamic_pointer_cast(input_node)) { + std::cerr << "Overriding Parameter element_type to U8 to be ready to accept a packed batch of strings\n"; + parameter->set_partial_shape(PartialShape{ Dimension() }); + parameter->set_element_type(element::u8); + parameter->validate_and_infer_types(); + } +#endif + + if (auto struct_pack = std::dynamic_pointer_cast(input_node)) { + FRONT_END_GENERAL_CHECK(struct_pack->get_input_size() == 3, "Expected 3 inputs to StringTensorPack which represents a string tensor"); + return struct_pack->input_values(); + } else { + #if defined(USE_STRING_TENSORS) || true // always + return std::make_shared(OutputVector{input}, "begins_ends")->outputs(); + #else + // Suppose this is u8 packed string tensor with a single batch dimension + // Unpack this tensor using standard operations + + // Cannot do that because there is not ReinterprectCast operation in OV + // TODO: Find a way to make it without reinterpretation operation + #endif + } +} + +ov::Output post_translate_string_tensor_output(const OutputVector& outputs) { + FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation"); + return std::make_shared(outputs, "begins_ends"); +} + NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer, // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp @@ -494,6 +657,7 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { #ifndef USE_STRING_TENSORS // Override type of input tensor if this is a Parameter if (auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { + std::cerr << "HERE\n"; parameter->set_partial_shape(PartialShape{ Dimension() }); parameter->set_element_type(element::u8); parameter->validate_and_infer_types(); @@ -530,3 +694,53 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { return named_results; } + + +void CaseFoldUTF8::validate_and_infer_types() { + check_string_input(this, 0); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +bool CaseFoldUTF8::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto chars = inputs[2].data(); + + // Stub implementation that transforms each input string "X" to "CaseFoldUTF8(X)" for debugging purposes + { + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const std::string left_side = "CaseFoldUTF8(", right_side = ")"; + const size_t num_elements = inputs[0].get_size(); + const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length())*num_elements; + outputs[2].set_shape(Shape{new_len}); + + // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions + // and only number of elements in the original tensors matter + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_chars = outputs[2].data(); + int32_t char_offset = 0; + + for(size_t i = 0; i < num_elements; ++i) { + new_begins[i] = 
char_offset;
+            std::string new_str = left_side + std::string(chars + begins[i], chars + ends[i]) + right_side;
+            std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset);
+            char_offset += new_str.length();
+            new_ends[i] = char_offset;
+        }
+        return true;
+    }
+    // End of stub implementation
+}
+
+
+ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) {
+    std::cerr << "translate_case_fold_utf8\n";
+    FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFoldUTF8 expects only 1 input");
+    return { post_translate_string_tensor_output(std::make_shared<CaseFoldUTF8>(pre_translate_string_tensor_input(node, 0))->outputs()) };
+}
+
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
index 818f62f05..756c59f4d 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
@@ -44,3 +44,33 @@ namespace TemplateExtension {
 ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node);

 ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node);
+
+// https://www.tensorflow.org/text/api_docs/python/text/case_fold_utf8
+class OPENVINO_API CaseFoldUTF8 : public ov::op::Op {
+public:
+    OPENVINO_OP("CaseFoldUTF8");
+
+    CaseFoldUTF8() = default;
+
+    CaseFoldUTF8(const ov::OutputVector& arguments) : ov::op::Op(arguments) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        return std::make_shared<CaseFoldUTF8>(inputs);
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+    bool has_evaluate() const {
+        return true;
+    }
+};
+
+ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node);
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/str_pack.py b/modules/custom_operations/user_ie_extensions/sentence_piece/str_pack.py
new file mode 100644
index 000000000..01d739661
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/str_pack.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+# Convert any list of strings to a U8/1D numpy array compatible with a converted OV model input
+def pack_strings(strings):
+    to_bytes = lambda x: x.to_bytes(4, 'little')
+    batch_size = len(strings)
+    if batch_size == 0:
+        return to_bytes(0)
+    offsets = to_bytes(0)
+    symbols = bytes()
+    for s in strings:
+        symbols += bytes(s, 'utf-8')
+        offsets += to_bytes(len(symbols))
+    return np.frombuffer(bytearray(to_bytes(batch_size) + offsets + symbols), np.uint8)
+
+# Convert an array of uint8 elements to a list of strings; the inverse of pack_strings
+# TODO: handle possible signed values in batch size and offsets
+def unpack_strings(u8_tensor):
+    from_bytes = lambda offset, size: int.from_bytes(u8_tensor[offset:offset+size], 'little')
+    batch_size = from_bytes(0, 4)
+    strings = []
+    for i in range(batch_size):
+        begin = from_bytes(4 + i*4, 4)
+        end = from_bytes(4 + (i+1)*4, 4)
+        length = end - begin
+        begin += 4*(batch_size + 2)
+        strings.append(bytes(u8_tensor[begin:begin+length]).decode('utf-8'))
+    return strings
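A quick round-trip check of the two helpers above (illustrative usage only; it assumes
the file is importable as `str_pack`):

    import numpy as np
    from str_pack import pack_strings, unpack_strings

    batch = ["Hello", "World", ""]
    packed = pack_strings(batch)             # 1D uint8 array: [batch_size][offsets][chars]
    assert packed.dtype == np.uint8 and packed.ndim == 1
    assert unpack_strings(packed) == batch   # lossless round trip, including the empty string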
From c785ec1167e6fd6b5348c63fb8cf2de057ca1bb5 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Thu, 11 May 2023 00:30:42 +0400
Subject: [PATCH 004/116] Rename CaseFoldUTF8 to the name from the opset
 proposal: CaseFold; added NormalizeUnicode
---
 .../user_ie_extensions/ov_extension.cpp       |  4 +-
 .../sentence_piece/sentence_piece.cpp         | 67 +++++++++++++++++--
 .../sentence_piece/sentence_piece.hpp         | 48 +++++++++++--
 3 files changed, 107 insertions(+), 12 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp
index 2efde15f1..073f1a731 100644
--- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp
+++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp
@@ -55,8 +55,10 @@
 #ifdef sentence_piece
 #    include "sentence_piece/sentence_piece.hpp"
 #    define SENTENSE_PIECE_EXT \
-        std::make_shared<ov::OpExtension<TemplateExtension::CaseFoldUTF8>>(), \
+        std::make_shared<ov::OpExtension<TemplateExtension::CaseFold>>(), \
         std::make_shared<ov::frontend::ConversionExtension>("CaseFoldUTF8", translate_case_fold_utf8), \
+        std::make_shared<ov::OpExtension<TemplateExtension::NormalizeUnicode>>(), \
+        std::make_shared<ov::frontend::ConversionExtension>("NormalizeUTF8", translate_normalize_utf8), \
         std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceTokenizer>>(), \
         std::make_shared<ov::frontend::ConversionExtension>("SentencepieceOp", translate_sentencepiece_op), \
         std::make_shared<ov::frontend::ConversionExtension>("RaggedTensorToSparse", translate_sentencepiece_tokenizer),
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
index 0b17c332b..f5692e442 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
@@ -696,22 +696,22 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
 }
 
 
-void CaseFoldUTF8::validate_and_infer_types() {
+void CaseFold::validate_and_infer_types() {
     check_string_input(this, 0);
     set_string_output(this, 0, get_input_partial_shape(0));
 }
 
-bool CaseFoldUTF8::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
     auto begins = inputs[0].data<const int32_t>();
     auto ends = inputs[1].data<const int32_t>();
     auto chars = inputs[2].data<const uint8_t>();
 
-    // Stub implementation that transforms each input string "X" to "CaseFoldUTF8(X)" for debugging purposes
+    // Stub implementation that transforms each input string "X" to "CaseFold(X)" for debugging purposes
     {
         // Set output shapes
         outputs[0].set_shape(inputs[0].get_shape());
         outputs[1].set_shape(inputs[1].get_shape());
-        const std::string left_side = "CaseFoldUTF8(", right_side = ")";
+        const std::string left_side = "CaseFold(", right_side = ")";
         const size_t num_elements = inputs[0].get_size();
         const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length())*num_elements;
         outputs[2].set_shape(Shape{new_len});
@@ -740,7 +740,62 @@
 ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) {
-    std::cerr << "translate_case_fold_utf8\n";
-    FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFoldUTF8 expects only 1 input");
-    return { post_translate_string_tensor_output(std::make_shared<CaseFoldUTF8>(pre_translate_string_tensor_input(node, 0))->outputs()) };
+    FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input");
+    return { post_translate_string_tensor_output(std::make_shared<CaseFold>(
+        pre_translate_string_tensor_input(node, 0))->outputs()) };
 }
+
+
+void NormalizeUnicode::validate_and_infer_types() {
+    check_string_input(this, 0);
+    set_string_output(this, 0, get_input_partial_shape(0));
+}
+
+bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    auto begins = inputs[0].data<const int32_t>();
+    auto ends = inputs[1].data<const int32_t>();
+    auto chars = inputs[2].data<const uint8_t>();
+
+#if 0
+    // TODO: Complete implementation
+#else
+    // Stub implementation that transforms each input string "X" to "NormalizeUnicode(X, normalization_form)" for debugging purposes
+    {
+        // Set output shapes
+        outputs[0].set_shape(inputs[0].get_shape());
+        outputs[1].set_shape(inputs[1].get_shape());
+        const std::string left_side = "NormalizeUnicode(", right_side = ")", delimiter = ", ";
+        const size_t num_elements = inputs[0].get_size();
+        const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length() + delimiter.length() + m_normalization_form.length())*num_elements;
+        outputs[2].set_shape(Shape{new_len});
+
+        // For the whole implementation below the input shapes can be ignored; we are working with the flattened representations,
+        // and only the number of elements in the original tensors matters
+
+        // Get pointers in the output tensors
+        auto new_begins = outputs[0].data<int32_t>();
+        auto new_ends = outputs[1].data<int32_t>();
+        auto new_chars = outputs[2].data<uint8_t>();
+        int32_t char_offset = 0;
+
+        for(size_t i = 0; i < num_elements; ++i) {
+            new_begins[i] = char_offset;
+            std::string new_str = left_side + std::string(chars + begins[i], chars + ends[i]) + delimiter + m_normalization_form + right_side;
+            std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset);
+            char_offset += new_str.length();
+            new_ends[i] = char_offset;
+        }
+        return true;
+    }
+    // End of stub implementation
+#endif
+}
+
+
+ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) {
+    FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "NormalizeUTF8 expects only 1 input");
+    return { post_translate_string_tensor_output(std::make_shared<NormalizeUnicode>(
+        pre_translate_string_tensor_input(node, 0),
+        node.get_attribute<std::string>("normalization_form"))->outputs()) };
+}
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
index 756c59f4d..9adcd6702 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
@@ -46,20 +46,20 @@ ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& nod
 ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node);
 
 // https://www.tensorflow.org/text/api_docs/python/text/case_fold_utf8
-class OPENVINO_API CaseFoldUTF8 : public ov::op::Op {
+class OPENVINO_API CaseFold : public ov::op::Op {
 public:
-    OPENVINO_OP("CaseFoldUTF8");
+    OPENVINO_OP("CaseFold");
 
-    CaseFoldUTF8() = default;
+    CaseFold() = default;
 
-    CaseFoldUTF8(const ov::OutputVector& arguments) : ov::op::Op(arguments) {
+    CaseFold(const ov::OutputVector& arguments) : ov::op::Op(arguments) {
         constructor_validate_and_infer_types();
     }
 
     void validate_and_infer_types() override;
 
     std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
-        return std::make_shared<CaseFoldUTF8>(inputs);
+        return std::make_shared<CaseFold>(inputs);
     }
 
     bool visit_attributes(ov::AttributeVisitor& visitor) override {
         return true;
     }
 
     bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
 
     bool has_evaluate() const {
         return true;
     }
 };
 
+
 ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node);
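// Editorial note (illustrative, not part of the patch): with the debug stubs above, and
// assuming normalization_form == "NFC", a graph CaseFold -> NormalizeUnicode maps
//     "Input"  ->  "CaseFold(Input)"  ->  "NormalizeUnicode(CaseFold(Input), NFC)"
// i.e. the operations only wrap the input bytes so the string plumbing can be debugged
// end to end; no real case folding or Unicode normalization happens yet.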
+
+
+class OPENVINO_API NormalizeUnicode : public ov::op::Op {
+public:
+    OPENVINO_OP("NormalizeUnicode");
+
+    NormalizeUnicode() = default;
+
+    NormalizeUnicode(const ov::OutputVector& arguments, const std::string& normalization_form) :
+        ov::op::Op(arguments),
+        m_normalization_form(normalization_form) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        return std::make_shared<NormalizeUnicode>(inputs, m_normalization_form);
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        visitor.on_attribute("normalization_form", m_normalization_form);
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+    bool has_evaluate() const {
+        return true;
+    }
+
+private:
+
+    std::string m_normalization_form;
+};
+
+ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node);

From 1d129acbaa6fd6af18999bbaa4a1bfb8a6f1302e Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Thu, 11 May 2023 21:19:24 +0400
Subject: [PATCH 005/116] Added a stub for RegexNormalization operation, WA for
 CPU bug with empty constants, register StringTensorPack and StringTensorUnpack
 as OV operations to be able to read IRs with those operations
---
 .../user_ie_extensions/ov_extension.cpp       |   4 +
 .../sentence_piece/sentence_piece.cpp         | 552 ++++++++++--------
 .../sentence_piece/sentence_piece.hpp         | 115 +++-
 3 files changed, 414 insertions(+), 257 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp
index 073f1a731..c5f844f49 100644
--- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp
+++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp
@@ -55,10 +55,14 @@
 #ifdef sentence_piece
 #    include "sentence_piece/sentence_piece.hpp"
 #    define SENTENSE_PIECE_EXT \
+        std::make_shared<ov::OpExtension<TemplateExtension::StringTensorPack>>(), \
+        std::make_shared<ov::OpExtension<TemplateExtension::StringTensorUnpack>>(), \
         std::make_shared<ov::OpExtension<TemplateExtension::CaseFold>>(), \
         std::make_shared<ov::frontend::ConversionExtension>("CaseFoldUTF8", translate_case_fold_utf8), \
         std::make_shared<ov::OpExtension<TemplateExtension::NormalizeUnicode>>(), \
         std::make_shared<ov::frontend::ConversionExtension>("NormalizeUTF8", translate_normalize_utf8), \
+        std::make_shared<ov::OpExtension<TemplateExtension::RegexNormalization>>(), \
+        std::make_shared<ov::frontend::ConversionExtension>("StaticRegexReplace", translate_static_regex_replace), \
         std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceTokenizer>>(), \
         std::make_shared<ov::frontend::ConversionExtension>("SentencepieceOp", translate_sentencepiece_op), \
         std::make_shared<ov::frontend::ConversionExtension>("RaggedTensorToSparse", translate_sentencepiece_tokenizer),
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
index f5692e442..96e42f8d5 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp

 void check_string_input(const Node* node, size_t input_index) {
     FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed string representation");
     FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed string representation");
     FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::u8, "Expected a u8 tensor as the third part of the decomposed string representation");
 }
 
+void check_string_scalar_input(const Node* node, size_t input_index) {
+    auto shape = node->get_input_partial_shape(input_index);
+    auto element_type = node->get_input_element_type(input_index);
+
+    #ifdef USE_STRING_TENSORS
+
+    OPENVINO_ASSERT(
+        (element_type == element::dynamic || element_type == element::string) &&
+        (shape.rank().is_dynamic() || shape.rank().get_length() == 0),
+        "string/0D tensor is expected");
+
+    #else
+
+    OPENVINO_ASSERT(
+        (element_type == element::dynamic || element_type == element::u8) &&
+        (shape.rank().is_dynamic() || shape.rank().get_length() == 1),
+        "u8/1D tensor is expected");
+
+    #endif
+}
+
 void set_string_output(Node* node, size_t output_index, const PartialShape& shape) {
     node->set_output_type(output_index+0, element::i32, shape);
     node->set_output_type(output_index+1, element::i32, shape);
     node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()});
 }
 
+void StringTensorPack::validate_and_infer_types() {
+    OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but got " + m_mode);
+    check_string_input(this, 0);
+    #ifdef USE_STRING_TENSORS
+    set_output_type(0, element::string, get_input_partial_shape(0));
+    #else
+    set_output_type(0, element::u8, PartialShape{Dimension()});
+    #endif
+}
+
+
+bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+#ifdef USE_STRING_TENSORS
+    // TODO
+    return false;
+#else
+    auto rank = inputs[0].get_shape().size();
+    if (rank != 1) {
+        std::cerr << "[ WARNING ] StringTensorPack ignores the rank " << rank << " of the input tensor and sets rank=1 in the output\n";
+    }
+
+    auto num_elements = shape_size(inputs[0].get_shape());
+    auto num_chars = shape_size(inputs[2].get_shape());
+    auto num_output_elements = 4*(1 + 1 + num_elements) + num_chars;
+    outputs[0].set_shape(Shape{num_output_elements});
+
+    //auto begins = 
inputs[0].data<const int32_t>();   // this is not needed as no repacking happens in this version of code
    auto ends = inputs[1].data<const int32_t>();
    auto chars = inputs[2].data<const uint8_t>();

    auto output = outputs[0].data<uint8_t>();
    auto output_int32 = reinterpret_cast<int32_t*>(output);

    *output_int32++ = num_elements;
    *output_int32++ = 0;
    output_int32 = std::copy(ends, ends + num_elements, output_int32);
    output = reinterpret_cast<uint8_t*>(output_int32);
    output = std::copy(chars, chars + num_chars, output);

    OPENVINO_ASSERT(num_output_elements == output - outputs[0].data<uint8_t>(), "[ INTERNAL ERROR ] StringTensorPack output tensor is corrupted");

    // WARNING! Chars are not repacked. If there are gaps between strings, they will remain.

    return true;
#endif
}

// Unpack a string tensor representation regardless of the source format, which
// can be an OV tensor with element::string element type (if supported) or u8
// packed representation, to a decomposed tensor representation that may potentially
// consist of multiple tensors. The destination format is defined by the `mode` attribute.
// The shape of the output tensor is completely recognized from the input (if supported)
// or defined partially by a dedicated input attribute `shape`. If `shape` is not set,
// which defaults to a completely dynamic `shape`, then the output shape is defined
// by an input tensor.
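[Editor's note] The packed u8 layout emitted by StringTensorPack::evaluate above (and parsed back by the unpacking code below) can also be produced on the host side. A minimal sketch, assuming only the layout visible in the code above -- pack_strings is a hypothetical helper, not part of this patch:

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    // Layout: [int32 N][int32 0][int32 end_0 ... end_{N-1}][all string bytes back to back];
    // each end offset doubles as the begin offset of the next string.
    std::vector<uint8_t> pack_strings(const std::vector<std::string>& strings) {
        size_t num_chars = 0;
        for (const auto& s : strings)
            num_chars += s.size();
        std::vector<uint8_t> packed(4 * (1 + 1 + strings.size()) + num_chars);
        auto header = reinterpret_cast<int32_t*>(packed.data());
        *header++ = static_cast<int32_t>(strings.size());
        *header++ = 0;  // begin offset of the first string
        auto dst = packed.data() + 4 * (2 + strings.size());
        int32_t offset = 0;
        for (const auto& s : strings) {
            std::memcpy(dst + offset, s.data(), s.size());
            offset += static_cast<int32_t>(s.size());
            *header++ = offset;  // end of this string == begin of the next one
        }
        return packed;
    }

For example, pack_strings({"Hello", "world"}) yields the header [2, 0, 5, 10] followed by the bytes "Helloworld".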
-class StringTensorUnpack : public ov::op::Op { -public: - OPENVINO_OP("StringTensorUnpack"); - StringTensorUnpack(OutputVector inputs, const std::string& mode = "begins_ends" - /*const std::string* _data = nullptr, PartialShape _input_shape = PartialShape::dynamic()*/) - : ov::op::Op(inputs), m_mode(mode) { - constructor_validate_and_infer_types(); - } - //const std::string* data = nullptr; - //PartialShape input_shape; - - void validate_and_infer_types() override { - OPENVINO_ASSERT( - get_input_size() == 1, - "Number of inputs for StringTensorUnpack is not equal to 1"); +void StringTensorUnpack::validate_and_infer_types() { + OPENVINO_ASSERT( + get_input_size() == 1, + "Number of inputs for StringTensorUnpack is not equal to 1"); #if 0 // Uncomment it when the bug is fixed with type substitution in TF partition call inlining - OPENVINO_ASSERT( - #ifdef USE_STRING_TENSORS - get_input_element_type(0) == element::string || - #endif - get_input_element_type(0) == element::dynamic || - get_input_element_type(0) == element::u8, - "Unsupported input element type for StringTensorUnpack: " + get_input_element_type(0).get_type_name()); + OPENVINO_ASSERT( + #ifdef USE_STRING_TENSORS + get_input_element_type(0) == element::string || + #endif + get_input_element_type(0) == element::dynamic || + get_input_element_type(0) == element::u8, + "Unsupported input element type for StringTensorUnpack: " + get_input_element_type(0).get_type_name()); #endif - OPENVINO_ASSERT( - get_input_partial_shape(0).rank().is_static(), - "StringTensorUnpack supports only static input rank"); + OPENVINO_ASSERT( + get_input_partial_shape(0).rank().is_static(), + "StringTensorUnpack supports only static input rank"); #if 0 - // Obtain shape from rt_info. - auto& rt_info = get_input_node_shared_ptr(0)->get_rt_info(); - auto ops = rt_info.find("original_partial_shape"); - if(ops != rt_info.end()) { - input_shape = ops->second.as(); - std::cerr << "StringTensorUnpack: orig_partial_shape: " << input_shape << "\n"; - } else { - std::cerr << "Impossible\n"; - std::cerr << get_input_node_shared_ptr(0) << "\n"; - } + // Obtain shape from rt_info. + auto& rt_info = get_input_node_shared_ptr(0)->get_rt_info(); + auto ops = rt_info.find("original_partial_shape"); + if(ops != rt_info.end()) { + input_shape = ops->second.as(); + std::cerr << "StringTensorUnpack: orig_partial_shape: " << input_shape << "\n"; + } else { + std::cerr << "Impossible\n"; + std::cerr << get_input_node_shared_ptr(0) << "\n"; + } #endif - auto output_shape = PartialShape::dynamic(); + auto output_shape = PartialShape::dynamic(); #ifdef USE_STRING_TENSORS - // In case of explicit string tensors the shape is carried by input tensor itself - // OPENVINO_ASSERT( - // input_shape == PartialShape::dynamic(), - // "Excplicitly set shape for a string tensor in the unpacking is not supported"); + // In case of explicit string tensors the shape is carried by input tensor itself + // OPENVINO_ASSERT( + // input_shape == PartialShape::dynamic(), + // "Excplicitly set shape for a string tensor in the unpacking is not supported"); - #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK + #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK - // There are two cases that affect expected element type of the input tensor: - // before the hack is applied (element::string) and after it (element::u8). + // There are two cases that affect expected element type of the input tensor: + // before the hack is applied (element::string) and after it (element::u8). 
- OPENVINO_ASSERT( - get_input_element_type(0) == element::string - || get_input_element_type(0) == element::u8, - "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation"); + OPENVINO_ASSERT( + get_input_element_type(0) == element::string + || get_input_element_type(0) == element::u8, + "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation"); - if(get_input_element_type(0) == element::string) { - output_shape = get_input_partial_shape(0); - } + if(get_input_element_type(0) == element::string) { + output_shape = get_input_partial_shape(0); + } - if(get_input_element_type(0) == element::u8) - { - // After the plugin hack, a tensor is represented as a wrapping u8 tensor that will hold a pointer to a string tensor. - // The original shape of a string tensor is stored in RT attribute of a tensor descriptor. - const auto& rt_info = get_input_tensor(0).get_rt_info(); - auto it = rt_info.find("__original_partial_shape"); - - // StringTensorUnpack expects __original_partial_shape attribute of type PartialShape in the input tensor. - // If it is not found that means that model compilation wasn't pass the expected transformation where a string tensor - // is wrapped to a u8 tensor holding a pointer, or because evaluation of this node is in progress and tensor attributes aren't preserved. - if(it != rt_info.end() && it->second.is()) { - output_shape = it->second.as(); - } + if(get_input_element_type(0) == element::u8) + { + // After the plugin hack, a tensor is represented as a wrapping u8 tensor that will hold a pointer to a string tensor. + // The original shape of a string tensor is stored in RT attribute of a tensor descriptor. + const auto& rt_info = get_input_tensor(0).get_rt_info(); + auto it = rt_info.find("__original_partial_shape"); + + // StringTensorUnpack expects __original_partial_shape attribute of type PartialShape in the input tensor. + // If it is not found that means that model compilation wasn't pass the expected transformation where a string tensor + // is wrapped to a u8 tensor holding a pointer, or because evaluation of this node is in progress and tensor attributes aren't preserved. + if(it != rt_info.end() && it->second.is()) { + output_shape = it->second.as(); } + } - #else + #else - OPENVINO_ASSERT( - get_input_element_type(0) == element::string, - "StringTensorUnpack expects element::string in an input tensor, but it is " + std::string(get_input_element_type(0))); + OPENVINO_ASSERT( + get_input_element_type(0) == element::string, + "StringTensorUnpack expects element::string in an input tensor, but it is " + std::string(get_input_element_type(0))); - output_shape = get_input_partial_shape(0); + output_shape = get_input_partial_shape(0); - #endif + #endif #else - // Expect packed string tensor represenation which can carry only a string tensors of shape [?] - // Shape is not known in advance and only rank of the output can be set + // Expect packed string tensor represenation which can carry only a string tensors of shape [?] 
+ // Shape is not known in advance and only rank of the output can be set - OPENVINO_ASSERT( + OPENVINO_ASSERT( #if 0 // Uncomment it when the bug is fixed with type substitution in TF partition call inlining - get_input_element_type(0) == element::u8 && + get_input_element_type(0) == element::u8 && #endif - get_input_partial_shape(0).rank().is_static() && get_input_partial_shape(0).rank().get_length() == 1, - "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + - get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); - - output_shape = PartialShape({Dimension()}); // [?] - - #if 0 - - if(get_input_element_type(0) == element::u8) { - if(all_inputs_are_constants(this)) { - std::cerr << "StringTensorUnpack: u8/const\n"; - // HACK: Tensor of strings is passed by a raw pointer to a tensor - auto constant = std::dynamic_pointer_cast(get_input_node_shared_ptr(0)); - size_t raw_size = constant->get_shape()[0]; - if(raw_size == 0) { - // means empty input - std::cerr << "StringTensorUnpack: empty\n"; - data = nullptr; - input_shape = PartialShape({0}); - } else if(raw_size == sizeof(void*)) { - std::cerr << "StringTensorUnpack: not empty, tensor HACK\n"; - auto tensor = *reinterpret_cast(constant->get_data_ptr()); - std::cerr << "Pointer to tensor from op: " << tensor << "\n"; - input_shape = tensor->get_shape(); - data = tensor->data(); - } else { - - OPENVINO_ASSERT( - false, - "Unexpected size for hacked Tensor input. Something went wrong."); - } + get_input_partial_shape(0).rank().is_static() && get_input_partial_shape(0).rank().get_length() == 1, + "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + + get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); + + output_shape = PartialShape({Dimension()}); // [?] + + #if 0 + + if(get_input_element_type(0) == element::u8) { + if(all_inputs_are_constants(this)) { + std::cerr << "StringTensorUnpack: u8/const\n"; + // HACK: Tensor of strings is passed by a raw pointer to a tensor + auto constant = std::dynamic_pointer_cast(get_input_node_shared_ptr(0)); + size_t raw_size = constant->get_shape()[0]; + if(raw_size == 0) { + // means empty input + std::cerr << "StringTensorUnpack: empty\n"; + data = nullptr; + input_shape = PartialShape({0}); + } else if(raw_size == sizeof(void*)) { + std::cerr << "StringTensorUnpack: not empty, tensor HACK\n"; + auto tensor = *reinterpret_cast(constant->get_data_ptr()); + std::cerr << "Pointer to tensor from op: " << tensor << "\n"; + input_shape = tensor->get_shape(); + data = tensor->data(); } else { - std::cerr << "StringTensorUnpack: u8/not constant\n"; + + OPENVINO_ASSERT( + false, + "Unexpected size for hacked Tensor input. 
Something went wrong."); } } else { - std::cerr << "StringTensorUnpack: string\n"; + std::cerr << "StringTensorUnpack: u8/not constant\n"; + } + } else { + std::cerr << "StringTensorUnpack: string\n"; + input_shape = get_input_partial_shape(0); + if(all_inputs_are_constants(this)) { + auto constant = std::dynamic_pointer_cast(get_input_node_shared_ptr(0)); + data = constant->get_data_ptr(); + } else { input_shape = get_input_partial_shape(0); - if(all_inputs_are_constants(this)) { - auto constant = std::dynamic_pointer_cast(get_input_node_shared_ptr(0)); - data = constant->get_data_ptr(); - } else { - input_shape = get_input_partial_shape(0); - } } + } - #endif + #endif #endif - OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supporst only 'begins_ends' mode, but get " + m_mode); + OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supporst only 'begins_ends' mode, but get " + m_mode); - if (m_mode == "begins_ends") { - set_string_output(this, 0, output_shape); - } + if (m_mode == "begins_ends") { + set_string_output(this, 0, output_shape); } +} - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - auto result = std::make_shared(inputs, m_mode); - return result; - } +bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - bool visit_attributes(ov::AttributeVisitor& visitor) override { - visitor.on_attribute("mode", m_mode); - return true; - } +#ifdef USE_STRING_TENSORS - bool has_evaluate() const { - return true; - } + #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK + auto tensor = *reinterpret_cast(inputs[0].data()); + #else + auto tensor = inputs[0]; + #endif - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + //std::cerr << "Pointer to tensor from op evaluate: " << tensor << "\n"; + Shape input_shape = tensor->get_shape(); + const std::string* input_strings = tensor->data(); + std::cerr << "input_shape = " << input_shape << "\n"; + //std::cerr << data << "\n"; -#ifdef USE_STRING_TENSORS + auto nelements = shape_size(input_shape); + size_t total = 0; + for(size_t i = 0; i < nelements; ++i) + total += input_strings[i].length(); - #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK - auto tensor = *reinterpret_cast(inputs[0].data()); - #else - auto tensor = inputs[0]; - #endif + outputs[0].set_shape(input_shape); + outputs[1].set_shape(input_shape); + outputs[2].set_shape(Shape{total}); - //std::cerr << "Pointer to tensor from op evaluate: " << tensor << "\n"; - Shape input_shape = tensor->get_shape(); - const std::string* input_strings = tensor->data(); - std::cerr << "input_shape = " << input_shape << "\n"; - //std::cerr << data << "\n"; - - auto nelements = shape_size(input_shape); - size_t total = 0; - for(size_t i = 0; i < nelements; ++i) - total += input_strings[i].length(); - - outputs[0].set_shape(input_shape); - outputs[1].set_shape(input_shape); - outputs[2].set_shape(Shape{total}); - - auto begins = outputs[0].data(); - auto ends = outputs[1].data(); - auto output_symbols = reinterpret_cast(outputs[2].data()); - size_t offset = 0; - - for(size_t i = 0; i < nelements; ++i) - { - begins[i] = offset; - output_symbols = std::copy(input_strings[i].begin(), input_strings[i].end(), output_symbols); - offset += input_strings[i].length(); - ends[i] = offset; - } + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto output_symbols = reinterpret_cast(outputs[2].data()); + size_t offset = 0; - return true; + for(size_t i = 0; i < nelements; ++i) + { 
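+        // element i: record where its bytes begin in the flat chars buffer, copy them, then record where they end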
+ begins[i] = offset; + output_symbols = std::copy(input_strings[i].begin(), input_strings[i].end(), output_symbols); + offset += input_strings[i].length(); + ends[i] = offset; + } + + return true; #else - int32_t batch_size; - const int32_t* begin_ids; - const int32_t* end_ids; - const uint8_t* data; - parse_packed_strings(inputs[0], batch_size, begin_ids, end_ids, data); + int32_t batch_size; + const int32_t* begin_ids; + const int32_t* end_ids; + const uint8_t* data; + parse_packed_strings(inputs[0], batch_size, begin_ids, end_ids, data); - auto num_chars = end_ids[batch_size - 1]; + auto num_chars = end_ids[batch_size - 1]; - outputs[0].set_shape(Shape({static_cast(batch_size)})); - outputs[1].set_shape(Shape({static_cast(batch_size)})); - outputs[2].set_shape(Shape{static_cast(num_chars)}); + outputs[0].set_shape(Shape{static_cast(batch_size)}); + outputs[1].set_shape(Shape{static_cast(batch_size)}); + outputs[2].set_shape(Shape{static_cast(num_chars)}); - auto begins = outputs[0].data(); - auto ends = outputs[1].data(); - auto chars = outputs[2].data(); + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); - std::copy(begin_ids, begin_ids + batch_size, begins); - std::copy(end_ids, end_ids + batch_size, ends); - std::copy(data, data + num_chars, chars); + std::copy(begin_ids, begin_ids + batch_size, begins); + std::copy(end_ids, end_ids + batch_size, ends); + std::copy(data, data + num_chars, chars); - return true; + return true; #endif - } +} - std::string m_mode; -}; OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t input_index) { auto input = node.get_input(input_index); @@ -799,3 +753,89 @@ ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) pre_translate_string_tensor_input(node, 0), node.get_attribute("normalization_form"))->outputs()) }; } + + + + +void RegexNormalization::validate_and_infer_types() { + check_string_input(this, 0); + check_string_scalar_input(this, 3); + check_string_scalar_input(this, 4); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto chars = inputs[2].data(); + +#ifdef USE_STRING_TENSORS + auto search_pattern = *inputs[3].data(); + auto replace_pattern = *inputs[4].data(); +#else + auto search_pattern_buf = inputs[3].data(); + auto replace_pattern_buf = inputs[4].data(); + auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant +#endif + +#if 0 + // TODO: Complete implementation +#else + // Stub implementation that transforms each input string "X" to "RegexNormalization(X, search_pattern, replace_pattern)" for debugging purposes + { + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const std::string left_side = "RegexNormalization(", right_side = ")", delimeter = ", "; + const size_t num_elements = inputs[0].get_size(); + const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length() + 2*delimeter.length() + search_pattern.length() + 
replace_pattern.length())*num_elements; + outputs[2].set_shape(Shape{new_len}); + + // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions + // and only number of elements in the original tensors matter + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_chars = outputs[2].data(); + int32_t char_offset = 0; + + for(size_t i = 0; i < num_elements; ++i) { + new_begins[i] = char_offset; + + std::string new_str = + left_side + std::string(chars + begins[i], chars + ends[i]) + delimeter + + std::string(search_pattern) + delimeter + + std::string(replace_pattern) + right_side; + + std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset); + char_offset += new_str.length(); + new_ends[i] = char_offset; + } + return true; + } + // End of stub implementation +#endif +} + + +std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name) { + // FIXME: using space to pad the value to work-around CPU issue with empty constants + auto value = node.get_attribute(name) + " "; + + #ifdef USE_STRING_TENSORS + return std::make_shared(element::string, {}, value); + #else + return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data()); + #endif +} + + +ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); + ov::OutputVector inputs = pre_translate_string_tensor_input(node, 0); + inputs.push_back(string_attribute_to_constant(node, "pattern")); + inputs.push_back(string_attribute_to_constant(node, "rewrite")); + return { post_translate_string_tensor_output(std::make_shared(inputs)->outputs()) }; +} diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 9adcd6702..6e77abcc9 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -11,6 +11,88 @@ namespace sentencepiece { class SentencePieceProcessor; } +// Having a decomposed representation for a tensor, converts it to a single string tensor +// (packed u8 or natively supported element::string depending on whether or not USE_STRING_TENSORS defined). 
+class StringTensorPack : public ov::op::Op { +public: + OPENVINO_OP("StringTensorPack"); + + StringTensorPack () = default; + + StringTensorPack(ov::OutputVector inputs, const std::string& mode = "begins_ends") + : ov::op::Op(inputs), m_mode(mode) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs, m_mode); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("mode", m_mode); + return true; + } + + bool has_evaluate() const { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const; + +private: + + std::string m_mode; +}; + + + +// Unpack a string tensor representation regardless of the source format, which +// can be an OV tensor with element::string element type (if supported) or u8 +// packed representation, to a decompose tensor representation that may potentially +// consist of multiple tensors. The destination format is defined by `mode` attribute. +// Shape of the output tensor is compitelly recognized from the input (if supported) +// or defined partially by a dedicated input attribute `shape`. If `shape` is not set, +// which default to completelly dynamic `shape`, then output shape is defined +// by an input tensor. +class StringTensorUnpack : public ov::op::Op { +public: + OPENVINO_OP("StringTensorUnpack"); + + StringTensorUnpack () = default; + + StringTensorUnpack(ov::OutputVector inputs, const std::string& mode = "begins_ends") + : ov::op::Op(inputs), m_mode(mode) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs, m_mode); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("mode", m_mode); + return true; + } + + bool has_evaluate() const { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const; + +private: + + std::string m_mode; +}; + + + namespace TemplateExtension { class SentencepieceTokenizer : public ov::op::Op { public: @@ -45,7 +127,7 @@ ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& nod ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node); -// https://www.tensorflow.org/text/api_docs/python/text/case_fold_utf8 + class OPENVINO_API CaseFold : public ov::op::Op { public: OPENVINO_OP("CaseFold "); @@ -112,3 +194,34 @@ class OPENVINO_API NormalizeUnicode : public ov::op::Op { }; ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node); + + +class OPENVINO_API RegexNormalization : public ov::op::Op { +public: + OPENVINO_OP("RegexNormalization"); + + RegexNormalization () = default; + + RegexNormalization(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; + 
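[Editor's note] Until element::string tensors are generally available, RegexNormalization receives its two patterns as plain u8 constants carrying the pattern bytes plus one padding byte (see string_attribute_to_constant and the matching "- 1" trim in RegexNormalization::evaluate). A hand-built graph could create such an input as follows; this is a sketch under that assumption, reusing the Constant overload already used in this patch:

    std::string pattern = "\\s+";
    pattern += " ";   // padding byte required by the CPU work-around; trimmed again at evaluate time
    auto pattern_const = std::make_shared<ov::opset10::Constant>(
        ov::element::u8, ov::Shape{pattern.length()}, (const void*)pattern.data());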
+ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node); From 71bc5bf13d48b7a62b15a0bb2c76d568bff9b9db Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Fri, 12 May 2023 01:57:41 +0400 Subject: [PATCH 006/116] Implemented Reshape for decomposed string tensors --- .../user_ie_extensions/ov_extension.cpp | 17 ++++++----- .../sentence_piece/sentence_piece.cpp | 29 +++++++++++++++++++ .../sentence_piece/sentence_piece.hpp | 2 ++ 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index c5f844f49..424776c19 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -55,14 +55,15 @@ #ifdef sentence_piece # include "sentence_piece/sentence_piece.hpp" # define SENTENSE_PIECE_EXT \ - std::make_shared>(), \ - std::make_shared>(), \ - std::make_shared>(), \ - std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), \ - std::make_shared>(), \ - std::make_shared("NormalizeUTF8", translate_normalize_utf8), \ - std::make_shared>(), \ - std::make_shared("StaticRegexReplace", translate_static_regex_replace), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), \ + std::make_shared>(), \ + std::make_shared("NormalizeUTF8", translate_normalize_utf8), \ + std::make_shared>(), \ + std::make_shared("StaticRegexReplace", translate_static_regex_replace), \ + std::make_shared("Reshape", translate_reshape), \ std::make_shared>(), \ std::make_shared("SentencepieceOp", translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 96e42f8d5..7044c424a 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -554,6 +554,7 @@ OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t i #ifndef USE_STRING_TENSORS // Override type of input tensor if this is a Parameter if (auto parameter = std::dynamic_pointer_cast(input_node)) { + // TODO: Apply this change conditionally based on real Parameter value std::cerr << "Overriding Parameter element_type to U8 to be ready to accept a packed batch of strings\n"; parameter->set_partial_shape(PartialShape{ Dimension() }); parameter->set_element_type(element::u8); @@ -839,3 +840,31 @@ ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& inputs.push_back(string_attribute_to_constant(node, "rewrite")); return { post_translate_string_tensor_output(std::make_shared(inputs)->outputs()) }; } + + +ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { + // This is a copied-and-pasted and adopted fragment of TF reshape translator from OV. + // It checks if the input tensor has string type, and then perform custom tranlation. + // Otherwise it should operate identically to the stock version of Reshape translator in TF FE. + // TODO: Introduce an API to call original translators from an extension without copying the code to an extension. 
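+    // Note for the string branch below: only the begins/ends index tensors are reshaped;
+    // the flat chars buffer is passed through unchanged, since reshaping a decomposed
+    // string tensor only rearranges indices, not bytes.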
+ + FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "Tensorflow Reshape op should have two inputs"); + auto tensor = node.get_input(0); + auto shape = node.get_input(1); + if(auto pack = dynamic_cast(tensor.get_node())) { + // TODO: If it is a beginning of the graph, how to detect strings? It falls in 'else' branch in this case. + // FIXME: Needs extension for a Parameter to prepare it first + auto begins = std::make_shared(pack->input_value(0), shape, false); + auto ends = std::make_shared(pack->input_value(1), shape, false); + auto chars = pack->input_value(2); + + auto reshape = post_translate_string_tensor_output({begins, ends, chars}); + + return {reshape}; + } else { + auto reshape = std::make_shared(tensor, shape, false); + return {reshape}; + } + // set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals +} + diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 6e77abcc9..e3aa64296 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -225,3 +225,5 @@ class OPENVINO_API RegexNormalization : public ov::op::Op { }; ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node); + +ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); \ No newline at end of file From 6c5eec0003ce211ca4f1b802aa3500df537bce0d Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Sat, 13 May 2023 01:45:33 +0400 Subject: [PATCH 007/116] Added RaggedTensorPack, sophisticated stup for RegexSplit and overridden Const translator for TF to intercept string constants --- .../user_ie_extensions/ov_extension.cpp | 4 + .../sentence_piece/sentence_piece.cpp | 247 +++++++++++++++--- .../sentence_piece/sentence_piece.hpp | 74 +++++- 3 files changed, 294 insertions(+), 31 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 424776c19..6b09e722d 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -56,6 +56,7 @@ # include "sentence_piece/sentence_piece.hpp" # define SENTENSE_PIECE_EXT \ std::make_shared>(), \ + std::make_shared>(), \ std::make_shared>(), \ std::make_shared>(), \ std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), \ @@ -63,7 +64,10 @@ std::make_shared("NormalizeUTF8", translate_normalize_utf8), \ std::make_shared>(), \ std::make_shared("StaticRegexReplace", translate_static_regex_replace), \ + std::make_shared>(), \ + std::make_shared("RegexSplitWithOffsets", translate_regex_split_with_offsets), \ std::make_shared("Reshape", translate_reshape), \ + std::make_shared("Const", translate_const), \ std::make_shared>(), \ std::make_shared("SentencepieceOp", translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 7044c424a..74015201b 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -5,8 +5,10 @@ #include "normalizer.h" 
#include "sentence_piece.hpp"
+#include "openvino/op/util/framework_node.hpp"
 #include "openvino/opsets/opset10.hpp"
 
+
 //#define USE_STRING_TENSORS
 
 #ifdef USE_STRING_TENSORS
@@ -281,9 +283,17 @@ void check_string_scalar_input(const Node* node, size_t input_index) {
 }
 
 void set_string_output(Node* node, size_t output_index, const PartialShape& shape) {
-    node->set_output_type(output_index+0, element::i32, shape);
-    node->set_output_type(output_index+1, element::i32, shape);
-    node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()});
+    node->set_output_type(output_index+0, element::i32, shape);     // byte offset in output[+2] -- begin of each string
+    node->set_output_type(output_index+1, element::i32, shape);     // byte offset in output[+2] -- end of each string
+    node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()});     // symbols from all strings concatenated
+}
+
+void set_ragged_string_output(Node* node, size_t output_index, const PartialShape& shape) {
+    node->set_output_type(output_index+0, element::i32, shape);     // element offset in output[+2] -- begin of each ragged dimension's elements
+    node->set_output_type(output_index+1, element::i32, shape);     // element offset in output[+3] -- end of each ragged dimension's elements
+    node->set_output_type(output_index+2, element::i32, PartialShape{Dimension()});     // byte offset in output[+4] -- begin of each string
+    node->set_output_type(output_index+3, element::i32, PartialShape{Dimension()});     // byte offset in output[+4] -- end of each string
+    node->set_output_type(output_index+4, element::u8, PartialShape{Dimension()});     // symbols from all strings concatenated
 }
 
@@ -335,6 +345,30 @@ bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
 }
 
+
+void RaggedTensorPack::validate_and_infer_types() {
+    OPENVINO_ASSERT(get_input_size() == 3);
+    OPENVINO_ASSERT(get_input_element_type(0) == element::i32);
+    OPENVINO_ASSERT(get_input_element_type(1) == element::i32);
+
+    // Pass through the base tensor which is used to build ragged dimensions
+    // TODO: Provide a correct implementation that saves information about the ragged structure
+    // TODO: Requires a single-tensor packed representation for the ragged tensor
+    set_output_type(0, get_input_element_type(2), get_input_partial_shape(2));
+}
+
+
+bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    // Implementation for debugging purposes: directly print ragged indices to std::cout and pass through the base tensor with elements.
+
+    // TODO: Actually print indices, skipped for now...
+
+    inputs[2].copy_to(outputs[0]);
+
+    return true;
+}
+
+
 void StringTensorUnpack::validate_and_infer_types() {
     OPENVINO_ASSERT(
         get_input_size() == 1,
         "Number of inputs for StringTensorUnpack is not equal to 1");
@@ -479,6 +513,31 @@ void StringTensorUnpack::validate_and_infer_types() {
     }
 }
 
+void unpack_strings (const std::string* strings, const Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars) { // TODO: no need for a reference to a ov::Tensor?
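+    // Editorial note: this helper is the inverse of the packing above -- it sizes the three
+    // decomposed tensors and fills begins/ends with byte offsets into the concatenated chars buffer.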
+ auto nelements = shape_size(shape); + + size_t total = 0; + for(size_t i = 0; i < nelements; ++i) + total += strings[i].length(); + + begins.set_shape(shape); + ends.set_shape(shape); + chars.set_shape(Shape{total}); + + auto pbegins = begins.data(); + auto pends = ends.data(); + auto poutput_symbols = reinterpret_cast(chars.data()); + size_t offset = 0; + + for(size_t i = 0; i < nelements; ++i) + { + pbegins[i] = offset; + poutput_symbols = std::copy(strings[i].begin(), strings[i].end(), poutput_symbols); + offset += strings[i].length(); + pends[i] = offset; + } +} + bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { #ifdef USE_STRING_TENSORS @@ -489,33 +548,10 @@ bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto tensor = inputs[0]; #endif - //std::cerr << "Pointer to tensor from op evaluate: " << tensor << "\n"; Shape input_shape = tensor->get_shape(); const std::string* input_strings = tensor->data(); - std::cerr << "input_shape = " << input_shape << "\n"; - //std::cerr << data << "\n"; - - auto nelements = shape_size(input_shape); - size_t total = 0; - for(size_t i = 0; i < nelements; ++i) - total += input_strings[i].length(); - - outputs[0].set_shape(input_shape); - outputs[1].set_shape(input_shape); - outputs[2].set_shape(Shape{total}); - - auto begins = outputs[0].data(); - auto ends = outputs[1].data(); - auto output_symbols = reinterpret_cast(outputs[2].data()); - size_t offset = 0; - - for(size_t i = 0; i < nelements; ++i) - { - begins[i] = offset; - output_symbols = std::copy(input_strings[i].begin(), input_strings[i].end(), output_symbols); - offset += input_strings[i].length(); - ends[i] = offset; - } + //std::cerr << "input_shape = " << input_shape << "\n"; + unpack_strings(input_strings, input_shape, outputs[0], outputs[1], outputs[2]); return true; @@ -583,6 +619,11 @@ ov::Output post_translate_string_tensor_output(const OutputVector& out return std::make_shared(outputs, "begins_ends"); } +ov::Output post_translate_ragged_tensor_output(const OutputVector& outputs) { + FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation"); + return std::make_shared(outputs); +} + NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer, // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp @@ -756,8 +797,6 @@ ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) } - - void RegexNormalization::validate_and_infer_types() { check_string_input(this, 0); check_string_scalar_input(this, 3); @@ -842,6 +881,126 @@ ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& } + +void RegexSplit::validate_and_infer_types() { + check_string_input(this, 0); + check_string_scalar_input(this, 3); + set_ragged_string_output(this, 0, get_input_partial_shape(0)); +} + +bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto chars = inputs[2].data(); + +#ifdef USE_STRING_TENSORS + auto split_pattern = *inputs[3].data(); +#else + auto split_pattern_buf = inputs[3].data(); + auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in 
string_attribute_to_constant
+#endif
+
+#if 0
+    // TODO: Complete implementation
+#else
+    // Stub implementation that transforms each input string "X" to multiple "RegexSplit(X, split_pattern) = part(X)" strings for debugging purposes
+    // Where part(X) is a piece of the original X divided by a predefined length, with some remainder
+    // So each element X is divided into multiple output elements along the ragged dimension, and the number of elements depends on the input X length and
+    // can vary for different X. For example, let the length = 2 and input X = "words", the output would consist of 3 elements along the corresponding
+    // ragged dimension in the output with values:
+    //   - "RegexSplit(words, split_pattern) = wo",
+    //   - "RegexSplit(words, split_pattern) = rd",
+    //   - "RegexSplit(words, split_pattern) = s"
+    // split_pattern is cut for the sake of readability of the output
+    {
+        #if 1
+        const size_t part_length = 30;    // any positive number, defines the length of each part in bytes
+
+        std::string split_pattern_part = std::string(split_pattern.substr(0, part_length));
+        std::cerr << "Split pattern part: " << split_pattern_part << "\n";
+
+        // Set output shapes
+        outputs[0].set_shape(inputs[0].get_shape());
+        outputs[1].set_shape(inputs[1].get_shape());
+
+        const std::string left_side = "RegexSplit(", right_side = ")", delimeter = ", ";
+        const size_t num_elements = inputs[0].get_size();
+        size_t num_parts = 0;   // will count the number of all parts
+        size_t num_additional_chars = 0;    // accumulated length of all parts (currently unused below)
+        // Count the resulting number of parts that we are going to obtain
+        for(size_t i = 0; i < num_elements; ++i) {
+            auto length = ends[i] - begins[i];
+            auto num_of_whole_parts = length/part_length;
+            auto remainder = length%part_length;
+            auto num_local_parts = num_of_whole_parts + int(bool(remainder));
+            num_parts += num_local_parts;
+            num_additional_chars += length*num_local_parts;
+        }
+
+        size_t num_chars = inputs[2].get_size();
+
+        // FIXME: Overestimation
+        const size_t new_num_chars = num_chars + num_parts*30/*!*/ + (left_side.length() + right_side.length() + delimeter.length() + split_pattern_part.length())*num_elements;
+        outputs[2].set_shape(Shape{num_parts});
+        outputs[3].set_shape(Shape{num_parts});
+        outputs[4].set_shape(Shape{new_num_chars});
+
+        // For the whole implementation below the input shapes can be ignored, we are working with the flattened representations
+        // and only the number of elements in the original tensors matters
+
+        // Get pointers in the output tensors
+        auto new_ragged_begins = outputs[0].data<int32_t>();
+        auto new_ragged_ends = outputs[1].data<int32_t>();
+        auto new_begins = outputs[2].data<int32_t>();
+        auto new_ends = outputs[3].data<int32_t>();
+        auto new_chars = outputs[4].data<uint8_t>();
+        int32_t ragged_offset = 0;
+        int32_t char_offset = 0;
+
+        for(size_t i = 0; i < num_elements; ++i) {
+            new_ragged_begins[i] = ragged_offset;
+            auto old_str = std::string(chars + begins[i], chars + ends[i]);
+            auto new_str_part_base = left_side + old_str + delimeter + split_pattern_part + right_side;
+
+            for(size_t j = 0; j < old_str.length(); j += part_length) {
+                new_begins[ragged_offset] = char_offset;
+                //auto new_str_part = new_str_part_base + old_str.substr(j, part_length);
+                std::string new_str_part = j == 0 ? 
new_str_part_base : "part[" + std::to_string(i) + "," + std::to_string(j) + "]"; + std::copy(new_str_part.data(), new_str_part.data() + new_str_part.length(), new_chars + char_offset); + char_offset += new_str_part.length(); + new_ends[ragged_offset] = char_offset; + ++ragged_offset; + } + + new_ragged_ends[i] = ragged_offset; + } + + outputs[4].set_shape({char_offset}); + + //OPENVINO_ASSERT(char_offset == new_num_chars, "Internal error in RegexSplit::evaluate: out of range for chars"); + OPENVINO_ASSERT(ragged_offset == num_parts, "Internal error in RegexSplit::evaluate: out of range for ragged parts"); + + #endif + return true; + } + // End of stub implementation +#endif +} + + +ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "RegexSplitWithOffsets expects 3 inputs"); + ov::OutputVector inputs = pre_translate_string_tensor_input(node, 0); + auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there + inputs.push_back(delim_regex_pattern); + std::cerr << "String constant: " << delim_regex_pattern << "\n"; + //inputs.push_back(string_attribute_to_constant(node, "rewrite")); + auto outputs = std::make_shared(inputs)->outputs(); + auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]}); + return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) }; +} + + ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { // This is a copied-and-pasted and adopted fragment of TF reshape translator from OV. // It checks if the input tensor has string type, and then perform custom tranlation. 
@@ -868,3 +1027,31 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { // set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals } + +// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes +ov::OutputVector translate_const(const ov::frontend::NodeContext& node) { + auto ov_type = node.get_attribute_as_any("dtype"); + std::shared_ptr const_node; + if (!ov_type.is() || ov_type.as() == ov::element::dynamic || + ov_type.as() == ov::element::undefined) { + if (ov_type.is() && ov_type.as() == "DT_STRING") { + auto value_as_any = node.get_attribute_as_any("value"); + const auto& values = value_as_any.as>(); + ov::Tensor begins(element::i32, {}), ends(element::i32, {}), chars(element::u8, {}); + unpack_strings(&values[0], {values.size()}, begins, ends, chars); + const_node = std::make_shared(OutputVector{ + std::make_shared(begins), + std::make_shared(ends), + std::make_shared(chars) + }); + } else { + const_node = std::make_shared(OutputVector{}); + } + } else { + auto tensor = node.get_attribute("value"); + const_node = std::make_shared(tensor.get_element_type(), tensor.get_shape(), tensor.data()); + } + //set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name + return {const_node}; +} + diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index e3aa64296..2843f7857 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -48,6 +48,37 @@ class StringTensorPack : public ov::op::Op { }; +// Having a decomposed representation for a tensor, converts it to a single string tensor for debugging purposes and to facilitate model conversion +// Base tensor on which this operation builds a ragged tensor can have any shape or type, this operation doesn't try to interpret it. 
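+// Editorial illustration of the ragged layout (values invented): for a batch of two elements
+// {"hello world", "hi"} split on spaces, the flat string tensor holds chars = "helloworldhi" with
+// begins = [0, 5, 10] and ends = [5, 10, 12], while the ragged begins/ends [0, 2] / [2, 3] state
+// that batch element 0 owns flat strings [0, 2) and element 1 owns [2, 3).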
+class RaggedTensorPack : public ov::op::Op { +public: + OPENVINO_OP("RaggedTensorPack"); + + RaggedTensorPack () = default; + + RaggedTensorPack(ov::OutputVector inputs) + : ov::op::Op(inputs) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool has_evaluate() const { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const; +}; + // Unpack a string tensor representation regardless of the source format, which // can be an OV tensor with element::string element type (if supported) or u8 @@ -226,4 +257,45 @@ class OPENVINO_API RegexNormalization : public ov::op::Op { ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node); -ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); \ No newline at end of file +class OPENVINO_API RegexSplit : public ov::op::Op { +public: + OPENVINO_OP("RegexSplit"); + + RegexSplit () = default; + + RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "removed", bool invert = false) : + ov::op::Op(arguments), + m_behaviour(behaviour), + m_invert(invert) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_behaviour, m_invert); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("behaviour", m_behaviour); + visitor.on_attribute("invert", m_invert); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + + std::string m_behaviour; + bool m_invert; +}; + +ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node); + + +ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); +ov::OutputVector translate_const(const ov::frontend::NodeContext& node); From 29dfe38640722913f6b6a9dbcbae5bc2f87fd535 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Tue, 16 May 2023 03:36:05 +0400 Subject: [PATCH 008/116] Fixes for both master and element::string branches of OpenVINO; better conditional compilation based on available features in OpenVINO --- .../sentence_piece/sentence_piece.cpp | 252 +++++++----------- 1 file changed, 90 insertions(+), 162 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 74015201b..024694c04 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -8,21 +8,21 @@ #include "openvino/op/util/framework_node.hpp" #include "openvino/opsets/opset10.hpp" +#ifndef OPENVINO_ELEMENT_STRING_SUPPORTED + #define OPENVINO_ELEMENT_STRING_SUPPORTED 0 +#endif -//#define USE_STRING_TENSORS - -#ifdef USE_STRING_TENSORS +#ifndef OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + #define OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK 0 +#endif -// A plugin can support a string tensor on inputs and outputs via the hack which wraps such tensor to 
-// a u8 tensor holding a pointer to the original string tensor. The hack lets us avoid more deep -// plugin modifications by pre-transform a model where string tensor parameters and results are replaced -// by the described wrapping tensors. Such a hack requires some pre/post processing in operations -// that handle such wrapping tensors on the edge of a model. -#define USE_INPUT_OUTPUT_STRING_TENSOR_HACK +#define USE_STRING_TENSORS 0 // modify this depending on willingness to use explicit string tensors +#if USE_STRING_TENSORS && !OPENVINO_ELEMENT_STRING_SUPPORTED + #error "USE_STRING_TENSORS = 1 can be used only when OpenVINO supports element::string that is determined by OPENVINO_ELEMENT_STRING_SUPPORTED == 1" #endif -//#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS +#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS 0 using sentencepiece::SentencePieceProcessor; using namespace TemplateExtension; @@ -83,7 +83,7 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s void SentencepieceTokenizer::validate_and_infer_types() { - #ifdef SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + #if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)"); FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); @@ -96,9 +96,9 @@ void SentencepieceTokenizer::validate_and_infer_types() { FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences"); FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); - #ifdef USE_STRING_TENSORS + #if USE_STRING_TENSORS - #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK FRONT_END_GENERAL_CHECK( get_input_element_type(1) == element::string || get_input_element_type(1) == element::u8, "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 or string depending on the current stage of model preparation"); @@ -155,7 +155,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& std::vector sparse_values; std::vector sparse_dense_shape; -#ifdef SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS +#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS auto begin_ids = inputs[1].data(); auto end_ids = inputs[2].data(); @@ -165,9 +165,9 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& #else -#ifdef USE_STRING_TENSORS +#if USE_STRING_TENSORS - #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK const ov::Tensor& strings_tensor = **reinterpret_cast(inputs[1].data()); #else const ov::Tensor& strings_tensor = inputs[1]; @@ -202,7 +202,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& size_t max_token_id = 0; for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { -#if defined(USE_STRING_TENSORS) && !defined(SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS) +#if USE_STRING_TENSORS && !SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS const std::string& sentence = strings[batch_ind]; //std::cerr << " sentence: " << sentence << "\n"; #else @@ -265,7 +265,7 @@ void check_string_scalar_input(const Node* node, size_t 
input_index) { auto shape = node->get_input_partial_shape(input_index); auto element_type = node->get_input_element_type(input_index); - #ifdef USE_STRING_TENSORS + #if USE_STRING_TENSORS OPENVINO_ASSERT( (element_type == element::dynamic || element_type == element::string) && @@ -300,7 +300,7 @@ void set_ragged_string_output(Node* node, size_t output_index, const PartialShap void StringTensorPack::validate_and_infer_types() { OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supporst only 'begins_ends' mode, but get " + m_mode); check_string_input(this, 0); - #ifdef USE_STRING_TENSORS + #if USE_STRING_TENSORS set_output_type(0, element::string, get_input_partial_shape(0)); #else set_output_type(0, element::u8, PartialShape{Dimension()}); @@ -309,7 +309,7 @@ void StringTensorPack::validate_and_infer_types() { bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { -#ifdef USE_STRING_TENSORS +#if USE_STRING_TENSORS // TODO return false; #else @@ -374,58 +374,39 @@ void StringTensorUnpack::validate_and_infer_types() { get_input_size() == 1, "Number of inputs for StringTensorUnpack is not equal to 1"); -#if 0 // Uncomment it when the bug is fixed with type substitution in TF partition call inlining - OPENVINO_ASSERT( - #ifdef USE_STRING_TENSORS - get_input_element_type(0) == element::string || - #endif - get_input_element_type(0) == element::dynamic || - get_input_element_type(0) == element::u8, - "Unsupported input element type for StringTensorUnpack: " + get_input_element_type(0).get_type_name()); -#endif - - OPENVINO_ASSERT( - get_input_partial_shape(0).rank().is_static(), - "StringTensorUnpack supports only static input rank"); - -#if 0 - // Obtain shape from rt_info. - auto& rt_info = get_input_node_shared_ptr(0)->get_rt_info(); - auto ops = rt_info.find("original_partial_shape"); - if(ops != rt_info.end()) { - input_shape = ops->second.as(); - std::cerr << "StringTensorUnpack: orig_partial_shape: " << input_shape << "\n"; - } else { - std::cerr << "Impossible\n"; - std::cerr << get_input_node_shared_ptr(0) << "\n"; - } -#endif - auto output_shape = PartialShape::dynamic(); -#ifdef USE_STRING_TENSORS // In case of explicit string tensors the shape is carried by input tensor itself // OPENVINO_ASSERT( // input_shape == PartialShape::dynamic(), // "Excplicitly set shape for a string tensor in the unpacking is not supported"); - #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK - - // There are two cases that affect expected element type of the input tensor: - // before the hack is applied (element::string) and after it (element::u8). 
+ // There are three cases that affect expected element type of the input tensor: + // - when string tensor is passed and we are before the hack is applied (element::string) and + // - when string tensor is passed and we are after the hack in CPU (element::u8) and + // - when stirng tensor is not really used, and we expect a packed string tensor in this case (element::u8) OPENVINO_ASSERT( - get_input_element_type(0) == element::string - || get_input_element_type(0) == element::u8, - "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation"); +#if OPENVINO_ELEMENT_STRING_SUPPORTED + get_input_element_type(0) == element::string || +#endif +#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS + get_input_element_type(0) == element::u8 || +#endif + get_input_element_type(0) == element::dynamic, + "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation or when element::string is not supported"); +#if OPENVINO_ELEMENT_STRING_SUPPORTED if(get_input_element_type(0) == element::string) { output_shape = get_input_partial_shape(0); } +#endif +#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS if(get_input_element_type(0) == element::u8) { + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK // After the plugin hack, a tensor is represented as a wrapping u8 tensor that will hold a pointer to a string tensor. // The original shape of a string tensor is stored in RT attribute of a tensor descriptor. const auto& rt_info = get_input_tensor(0).get_rt_info(); @@ -436,74 +417,22 @@ void StringTensorUnpack::validate_and_infer_types() { // is wrapped to a u8 tensor holding a pointer, or because evaluation of this node is in progress and tensor attributes aren't preserved. if(it != rt_info.end() && it->second.is()) { output_shape = it->second.as(); - } - } - - #else - - OPENVINO_ASSERT( - get_input_element_type(0) == element::string, - "StringTensorUnpack expects element::string in an input tensor, but it is " + std::string(get_input_element_type(0))); - - output_shape = get_input_partial_shape(0); - - #endif - -#else - // Expect packed string tensor represenation which can carry only a string tensors of shape [?] - // Shape is not known in advance and only rank of the output can be set - - OPENVINO_ASSERT( -#if 0 // Uncomment it when the bug is fixed with type substitution in TF partition call inlining - get_input_element_type(0) == element::u8 && -#endif - get_input_partial_shape(0).rank().is_static() && get_input_partial_shape(0).rank().get_length() == 1, - "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + - get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); - - output_shape = PartialShape({Dimension()}); // [?] 
- - #if 0 - - if(get_input_element_type(0) == element::u8) { - if(all_inputs_are_constants(this)) { - std::cerr << "StringTensorUnpack: u8/const\n"; - // HACK: Tensor of strings is passed by a raw pointer to a tensor - auto constant = std::dynamic_pointer_cast(get_input_node_shared_ptr(0)); - size_t raw_size = constant->get_shape()[0]; - if(raw_size == 0) { - // means empty input - std::cerr << "StringTensorUnpack: empty\n"; - data = nullptr; - input_shape = PartialShape({0}); - } else if(raw_size == sizeof(void*)) { - std::cerr << "StringTensorUnpack: not empty, tensor HACK\n"; - auto tensor = *reinterpret_cast(constant->get_data_ptr()); - std::cerr << "Pointer to tensor from op: " << tensor << "\n"; - input_shape = tensor->get_shape(); - data = tensor->data(); - } else { - - OPENVINO_ASSERT( - false, - "Unexpected size for hacked Tensor input. Something went wrong."); - } } else { - std::cerr << "StringTensorUnpack: u8/not constant\n"; - } - } else { - std::cerr << "StringTensorUnpack: string\n"; - input_shape = get_input_partial_shape(0); - if(all_inputs_are_constants(this)) { - auto constant = std::dynamic_pointer_cast(get_input_node_shared_ptr(0)); - data = constant->get_data_ptr(); - } else { - input_shape = get_input_partial_shape(0); + #endif + #if !USE_STRING_TENSORS + // If string tensors shouldn't be used, then the packed u8 format is also expected + // as an input, but in this case only rank is known + OPENVINO_ASSERT( + get_input_partial_shape(0).rank().is_dynamic() || get_input_partial_shape(0).rank().get_length() == 1, + "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + + get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); + + output_shape = PartialShape({Dimension()}); // [?] 
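+        // For reference, a sketch of the packed layout consumed by parse_packed_strings()
+        // in evaluate() below (the exact field packing is inferred, treat it as an assumption):
+        //     [int32 batch_size]
+        //     [int32 offsets[batch_size + 1]]  // begins[i] = offsets[i], ends[i] = offsets[i+1]
+        //     [uint8 chars[]]                  // all strings concatenated back to back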
+    #endif
+    #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK
        }
+    #endif
    }
-
-    #endif
-    #endif
 
    OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supports only 'begins_ends' mode, but got " + m_mode);
@@ -539,46 +468,52 @@ void unpack_strings (const std::string* strings, const Shape shape, ov::Tensor& }
 
 bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
-
-#ifdef USE_STRING_TENSORS
-
-    #ifdef USE_INPUT_OUTPUT_STRING_TENSOR_HACK
-    auto tensor = *reinterpret_cast(inputs[0].data());
-    #else
-    auto tensor = inputs[0];
+    auto ptensor = &inputs[0];
+    #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+    if(ptensor->get_element_type() == element::u8 && ptensor->get_byte_size() == sizeof(void*)) {
+        auto data = *reinterpret_cast(ptensor->data());
+        if(data != nullptr) {
+            ptensor = reinterpret_cast(data);
+        }
+    }
    #endif

-    Shape input_shape = tensor->get_shape();
-    const std::string* input_strings = tensor->data();
-    //std::cerr << "input_shape = " << input_shape << "\n";
-    unpack_strings(input_strings, input_shape, outputs[0], outputs[1], outputs[2]);
+    auto tensor = *ptensor;

-    return true;
+#if OPENVINO_ELEMENT_STRING_SUPPORTED
+    if(tensor.get_element_type() == element::string) {
+        Shape input_shape = tensor.get_shape();
+        const std::string* input_strings = tensor.data();
+        unpack_strings(input_strings, input_shape, outputs[0], outputs[1], outputs[2]);
+        return true;
+    } else {
+#endif

-#else
+#if USE_STRING_TENSORS
+    OPENVINO_ASSERT(false, "Detected a u8 tensor but an element::string tensor should be provided");
+#endif

    int32_t batch_size;
    const int32_t* begin_ids;
    const int32_t* end_ids;
    const uint8_t* data;
-    parse_packed_strings(inputs[0], batch_size, begin_ids, end_ids, data);
-
+    parse_packed_strings(tensor, batch_size, begin_ids, end_ids, data);
    auto num_chars = end_ids[batch_size - 1];

    outputs[0].set_shape(Shape{static_cast(batch_size)});
    outputs[1].set_shape(Shape{static_cast(batch_size)});
    outputs[2].set_shape(Shape{static_cast(num_chars)});
-
    auto begins = outputs[0].data();
    auto ends = outputs[1].data();
    auto chars = outputs[2].data();
-
    std::copy(begin_ids, begin_ids + batch_size, begins);
    std::copy(end_ids, end_ids + batch_size, ends);
    std::copy(data, data + num_chars, chars);

    return true;
+#if OPENVINO_ELEMENT_STRING_SUPPORTED
+    }
#endif
}

@@ -587,7 +522,7 @@ OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t i
    auto input = node.get_input(input_index);
    auto input_node = input.get_node_shared_ptr();

-#ifndef USE_STRING_TENSORS
+#if !USE_STRING_TENSORS
    // Override type of input tensor if this is a Parameter
    if (auto parameter = std::dynamic_pointer_cast(input_node)) {
        // TODO: Apply this change conditionally based on real Parameter value
@@ -602,7 +537,7 @@ OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t i
        FRONT_END_GENERAL_CHECK(struct_pack->get_input_size() == 3, "Expected 3 inputs to StringTensorPack which represents a string tensor");
        return struct_pack->input_values();
    } else {
-        #if defined(USE_STRING_TENSORS) || true // always
+        #if USE_STRING_TENSORS || true // always
        return std::make_shared(OutputVector{input}, "begins_ends")->outputs();
        #else
        // Suppose this is u8 packed string tensor with a single batch dimension
@@ -650,17 +585,16 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
    auto add_eos = extract_scalar_const_value(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
    auto reverse = 
extract_scalar_const_value(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse"); -#ifndef USE_STRING_TENSORS +#if !USE_STRING_TENSORS // Override type of input tensor if this is a Parameter if (auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { - std::cerr << "HERE\n"; parameter->set_partial_shape(PartialShape{ Dimension() }); parameter->set_element_type(element::u8); parameter->validate_and_infer_types(); } #endif -#ifdef SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS +#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS OutputVector inputs_vector = OutputVector{ sp_model_const }; auto unpacked_outputs = std::make_shared(OutputVector{inputs}, "begins_ends")->outputs(); @@ -735,7 +669,6 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { - std::cerr << "translate_case_fold_utf8\n"; FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); return { post_translate_string_tensor_output(std::make_shared( pre_translate_string_tensor_input(node, 0))->outputs()) }; @@ -809,15 +742,10 @@ bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto ends = inputs[1].data(); auto chars = inputs[2].data(); -#ifdef USE_STRING_TENSORS - auto search_pattern = *inputs[3].data(); - auto replace_pattern = *inputs[4].data(); -#else auto search_pattern_buf = inputs[3].data(); auto replace_pattern_buf = inputs[4].data(); auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant -#endif #if 0 // TODO: Complete implementation @@ -864,7 +792,7 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont // FIXME: using space to pad the value to work-around CPU issue with empty constants auto value = node.get_attribute(name) + " "; - #ifdef USE_STRING_TENSORS + #if USE_STRING_TENSORS return std::make_shared(element::string, {}, value); #else return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data()); @@ -893,12 +821,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto ends = inputs[1].data(); auto chars = inputs[2].data(); -#ifdef USE_STRING_TENSORS - auto split_pattern = *inputs[3].data(); -#else auto split_pattern_buf = inputs[3].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant -#endif + auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant #if 0 // TODO: Complete implementation @@ -913,11 +837,9 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp // - "RegexSplit(word, search_pattern, replace_pattern) = s" // split_pattern is cut for the sake of readability of ouput { - #if 1 const size_t part_length = 30; // any positive number, defines the length of each part in bytes std::string split_pattern_part = std::string(split_pattern.substr(0, 
part_length)); - std::cerr << "Split patter part: " << split_pattern_part << "\n"; // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); @@ -980,7 +902,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp //OPENVINO_ASSERT(char_offset == new_num_chars, "Internal error in RegexSplit::evaluate: out of range for chars"); OPENVINO_ASSERT(ragged_offset == num_parts, "Internal error in RegexSplit::evaluate: out of range for ragged parts"); - #endif return true; } // End of stub implementation @@ -993,8 +914,6 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont ov::OutputVector inputs = pre_translate_string_tensor_input(node, 0); auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there inputs.push_back(delim_regex_pattern); - std::cerr << "String constant: " << delim_regex_pattern << "\n"; - //inputs.push_back(string_attribute_to_constant(node, "rewrite")); auto outputs = std::make_shared(inputs)->outputs(); auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]}); return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) }; @@ -1016,9 +935,7 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { auto begins = std::make_shared(pack->input_value(0), shape, false); auto ends = std::make_shared(pack->input_value(1), shape, false); auto chars = pack->input_value(2); - auto reshape = post_translate_string_tensor_output({begins, ends, chars}); - return {reshape}; } else { auto reshape = std::make_shared(tensor, shape, false); @@ -1048,8 +965,19 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) { const_node = std::make_shared(OutputVector{}); } } else { + //static std::vector tensors; auto tensor = node.get_attribute("value"); - const_node = std::make_shared(tensor.get_element_type(), tensor.get_shape(), tensor.data()); + //tensors.push_back(tensor); + const_node = std::make_shared(tensor); + #if OPENVINO_ELEMENT_STRING_SUPPORTED + if (const_node->get_element_type() == element::string) { + if(shape_size(tensor.get_shape()) > 0) { + auto strings = std::dynamic_pointer_cast(const_node)->get_data_ptr(); + } + const_node = std::make_shared(const_node->outputs()); + const_node = std::make_shared(const_node->outputs()); + } + #endif } //set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name return {const_node}; From 40063c1d3d6ffc99311f2efb7b737bbda5862725 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Tue, 16 May 2023 04:02:43 +0400 Subject: [PATCH 009/116] Debug output of indices in RaggedTensorPack --- .../sentence_piece/sentence_piece.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 024694c04..bc49a65ec 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -359,9 +359,17 @@ void RaggedTensorPack::validate_and_infer_types() { bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - // Implementation for debuggin purposes: directly print ragged indices to 
std::cout and pass throug the base tensor with elements.
+    // Implementation for debugging purposes: directly print ragged indices to std::cout and pass the base tensor with elements through.
 
-    // TODO: Actually print indices, skipped for now...
+    auto input_shape = inputs[0].get_shape();
+    std::cout << "[ DEBUG ] RaggedTensorPack: shape = " << input_shape << "\n";
+    auto begins = inputs[0].data();
+    auto ends = inputs[1].data();
+    auto num_elements = shape_size(input_shape);
+
+    for(size_t i = 0; i < num_elements; ++i) {
+        std::cout << "[ DEBUG ] [" << i << "] " << begins[i] << ":" << ends[i] << " with size = " << ends[i] - begins[i] << "\n";
+    }
 
    inputs[2].copy_to(outputs[0]);
 
From cc47b12a10d3ad667219cfbc7ee404c972c9099a Mon Sep 17 00:00:00 2001
From: Sergey Lyalin 
Date: Wed, 17 May 2023 09:47:43 +0400
Subject: [PATCH 010/116] Implemented a stub for WordpieceTokenizer. Supported
 conversion of a combination of WordpieceTokenizeWithOffsets and
 LookupTableFindV2 from TensorFlow

---
 .../user_ie_extensions/ov_extension.cpp       |   3 +
 .../sentence_piece/sentence_piece.cpp         | 192 ++++++++++++++++--
 .../sentence_piece/sentence_piece.hpp         |  39 ++++
 3 files changed, 219 insertions(+), 15 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp
index 6b09e722d..77868a243 100644
--- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp
+++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp
@@ -66,6 +66,9 @@
    std::make_shared("StaticRegexReplace", translate_static_regex_replace), \
    std::make_shared>(), \
    std::make_shared("RegexSplitWithOffsets", translate_regex_split_with_offsets), \
+    std::make_shared>(), \
+    std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \
+    std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \
    std::make_shared("Reshape", translate_reshape), \
    std::make_shared("Const", translate_const), \
    std::make_shared>(), \
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
index bc49a65ec..20c124fd9 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
@@ -8,6 +8,8 @@
 #include "openvino/op/util/framework_node.hpp"
 #include "openvino/opsets/opset10.hpp"
 
+// TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor
+
 #ifndef OPENVINO_ELEMENT_STRING_SUPPORTED
 #define OPENVINO_ELEMENT_STRING_SUPPORTED 0
 #endif
@@ -282,6 +284,14 @@ void check_string_scalar_input(const Node* node, size_t input_index) {
 #endif
 }
 
+void check_ragged_string_input(const Node* node, size_t input_index) {
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::i32, "Expected an i32 tensor as the third part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+3) == element::i32, "Expected an i32 tensor as the fourth part of the decomposed ragged string 
representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+4) == element::u8, "Expected a u8 tensor as the fifth part of the decomposed ragged string representation"); +} + void set_string_output(Node* node, size_t output_index, const PartialShape& shape) { node->set_output_type(output_index+0, element::i32, shape); // byte offset in output[+2] -- begin of each string node->set_output_type(output_index+1, element::i32, shape); // byte offset in output[+2] -- end of each string @@ -296,6 +306,12 @@ void set_ragged_string_output(Node* node, size_t output_index, const PartialShap node->set_output_type(output_index+4, element::u8, PartialShape{Dimension()}); // symbols from all strings cnocatenated } +void set_ragged_output(Node* node, size_t output_index, const PartialShape& shape, element::Type type) { + node->set_output_type(output_index+0, element::i32, shape); // element offset in output[+2] -- begin of each ragged dimension elements + node->set_output_type(output_index+1, element::i32, shape); // element offset in output[+2] -- end of each ragged dimension elements + node->set_output_type(output_index+2, type, PartialShape{Dimension()}); // flatten elements +} + void StringTensorPack::validate_and_infer_types() { OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supporst only 'begins_ends' mode, but get " + m_mode); @@ -526,19 +542,22 @@ bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVec } -OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t input_index) { - auto input = node.get_input(input_index); - auto input_node = input.get_node_shared_ptr(); - -#if !USE_STRING_TENSORS - // Override type of input tensor if this is a Parameter - if (auto parameter = std::dynamic_pointer_cast(input_node)) { +void override_parameter (std::shared_ptr node, element::Type type, const PartialShape& shape) { + if (auto parameter = std::dynamic_pointer_cast(node)) { // TODO: Apply this change conditionally based on real Parameter value - std::cerr << "Overriding Parameter element_type to U8 to be ready to accept a packed batch of strings\n"; - parameter->set_partial_shape(PartialShape{ Dimension() }); - parameter->set_element_type(element::u8); + std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n"; + parameter->set_partial_shape(shape); + parameter->set_element_type(type); parameter->validate_and_infer_types(); } +} + +// TODO: replace NodeContext and input_index by a single input +OutputVector pre_translate_string_tensor_input(ov::Output input) { + auto input_node = input.get_node_shared_ptr(); + +#if !USE_STRING_TENSORS + override_parameter(input_node, element::u8, PartialShape{Dimension()}); #endif if (auto struct_pack = std::dynamic_pointer_cast(input_node)) { @@ -552,11 +571,31 @@ OutputVector pre_translate_string_tensor_input(const NodeContext& node, size_t i // Unpack this tensor using standard operations // Cannot do that because there is not ReinterprectCast operation in OV - // TODO: Find a way to make it without reinterpretation operation + // TODO: Find a way to make it without reinterpretation operation or introduce it as an extension (easy) #endif } } + + +OutputVector pre_translate_ragged_tensor_input(ov::Output input) { + auto ragged_pack = dynamic_cast(input.get_node()); + OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it"); + return ragged_pack->input_values(); +} + +OutputVector pre_translate_ragged_string_tensor_input(ov::Output 
input) { + // auto ragged_pack = dynamic_cast(node.get_input(input_index).get_node()); + // OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it"); + auto ragged_inputs = pre_translate_ragged_tensor_input(input); + auto string_inputs = pre_translate_string_tensor_input(ragged_inputs[2]); + ragged_inputs.pop_back(); + ragged_inputs.insert(ragged_inputs.end(), string_inputs.begin(), string_inputs.end()); + // auto string_pack = dynamic_cast(ragged_pack->get_input_node_ptr(2)); + // OPENVINO_ASSERT(string_pack, "Expected StringTensorPack as a base for RaggedTensorPack but didn't find it"); + return ragged_inputs; +} + ov::Output post_translate_string_tensor_output(const OutputVector& outputs) { FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation"); return std::make_shared(outputs, "begins_ends"); @@ -679,7 +718,7 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); return { post_translate_string_tensor_output(std::make_shared( - pre_translate_string_tensor_input(node, 0))->outputs()) }; + pre_translate_string_tensor_input(node.get_input(0)))->outputs()) }; } @@ -733,7 +772,7 @@ bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVecto ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "NormalizeUTF8 expects only 1 input"); return { post_translate_string_tensor_output(std::make_shared( - pre_translate_string_tensor_input(node, 0), + pre_translate_string_tensor_input(node.get_input(0)), node.get_attribute("normalization_form"))->outputs()) }; } @@ -810,7 +849,7 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); - ov::OutputVector inputs = pre_translate_string_tensor_input(node, 0); + ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); inputs.push_back(string_attribute_to_constant(node, "pattern")); inputs.push_back(string_attribute_to_constant(node, "rewrite")); return { post_translate_string_tensor_output(std::make_shared(inputs)->outputs()) }; @@ -919,7 +958,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "RegexSplitWithOffsets expects 3 inputs"); - ov::OutputVector inputs = pre_translate_string_tensor_input(node, 0); + ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there inputs.push_back(delim_regex_pattern); auto outputs = std::make_shared(inputs)->outputs(); @@ -928,6 +967,129 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont } + +void WordpieceTokenizer::validate_and_infer_types() { + check_ragged_string_input(this, 0); + check_string_input(this, 5); + set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); +} + +bool 
WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    auto ragged_begins = inputs[0].data();
+    auto ragged_ends = inputs[1].data();
+    auto begins = inputs[2].data();
+    auto ends = inputs[3].data();
+    auto chars = inputs[4].data();
+
+    auto vocab_begins = inputs[5].data();
+    auto vocab_ends = inputs[6].data();
+    auto vocab_chars = inputs[7].data();
+
+    OPENVINO_ASSERT(inputs.size() == 9, "Too few inputs passed to WordpieceTokenizer, which means it was not converted properly or is not used in the supported pattern");
+
+    auto unk_token_id = *inputs[8].data();
+#if 0
+    // TODO: Complete implementation
+#else
+    // Stub implementation that transforms each input string to its length, duplicating the element if the length is odd
+    {
+        std::cout << "[ DEBUG ] WordpieceTokenizer\n";
+        std::cout << "[ DEBUG ] vocab size: " << inputs[5].get_size() << "\n";
+        std::cout << "[ DEBUG ] unk_token_id: " << unk_token_id << "\n";
+
+        // Set output shapes
+        outputs[0].set_shape(inputs[0].get_shape());
+        outputs[1].set_shape(inputs[1].get_shape());
+        const size_t num_elems = inputs[0].get_size();
+
+        const size_t num_parts = inputs[2].get_size();
+        size_t new_num_parts = num_parts;
+        // Count number of output elements
+        for(size_t i = 0; i < num_parts; ++i) {
+            auto length = ends[i] - begins[i];
+            new_num_parts += length % 2;
+        }
+
+        outputs[2].set_shape({new_num_parts});
+
+        // Get pointers in the output tensors
+        auto new_begins = outputs[0].data();
+        auto new_ends = outputs[1].data();
+        auto new_elems = outputs[2].data();
+        int32_t offset = 0;
+
+        for(size_t j = 0; j < num_elems; ++j) {
+            new_begins[j] = offset;
+
+            for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) {
+
+                auto length = ends[i] - begins[i];
+                new_elems[offset++] = length;
+
+                if(length % 2) {
+                    new_elems[offset++] = length;
+                }
+            }
+
+            new_ends[j] = offset;
+        }
+
+        OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in WordpieceTokenizer::evaluate: out of range for ragged parts");
+        return true;
+    }
+    // End of stub implementation
+#endif
+}
+
+
+ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node) {
+    FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "WordpieceTokenizeWithOffsets expects 2 inputs");
+    ov::OutputVector inputs = pre_translate_ragged_string_tensor_input(node.get_input(0));
+
+    #if USE_STRING_TENSORS
+    // It may seem enough to call pre_translate_string_tensor_input that will override Parameter element
+    // type in case string tensors are not used.
+    // But a Parameter is still required to be overridden even if string tensors are used because in a TF model
+    // it is represented not as a string tensor, but as a resource with a hash table for lookup that we cannot interpret
+    // and have to replace by a 1D string tensor.
+    override_parameter(node.get_input(1).get_node_shared_ptr(), element::string, PartialShape{Dimension()});
+    #endif
+
+    auto vocab = pre_translate_string_tensor_input(node.get_input(1));
+    inputs.insert(inputs.end(), vocab.begin(), vocab.end());
+    // FIXME: Cannot set real value for unk_token_id from attributes because it is not known in this operation
+    // TODO: Set other attributes.
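+    // The TF subgraph being matched here is roughly (names illustrative, see
+    // translate_lookup_table_find_v2 below):
+    //     tokens = WordpieceTokenizeWithOffsets(words, vocab_table)
+    //     ids    = LookupTableFindV2(vocab_table, tokens, default_value)
+    // so the unknown-token id only becomes known when LookupTableFindV2 is translated,
+    // which re-clones this node with the extra unk_token_id input.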
+ auto wp_tokenizer = std::make_shared( + inputs, + node.get_attribute("suffix_indicator"), + node.get_attribute("max_bytes_per_word") + ); + return { post_translate_ragged_tensor_output(wp_tokenizer->outputs()) }; +} + + +ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFindV2 expects 3 inputs"); + + // Check if this node is used in a combination with already converted WordpieceTokenizeWithOffsets + auto wp_tokenizer_outputs = pre_translate_ragged_tensor_input(node.get_input(1)); + auto wp_tokenizer = dynamic_cast(wp_tokenizer_outputs[0].get_node()); + OPENVINO_ASSERT(wp_tokenizer, "Conversion of LookupTableFindV2 without coupled WordpieceTokenizer is not yet supported"); + + // TODO: Check vocab matching for LookupTableFindV2 and WordpieceTokenizer + + // TODO: Check if overflow really happens in real models due to i64 to i32 conversion + auto unk_token_id = std::make_shared(node.get_input(2), element::i32); + + auto wp_tokenizer_inputs = wp_tokenizer->input_values(); + wp_tokenizer_inputs.push_back(unk_token_id); + std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n"; + + auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs); + return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) }; +} + + ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { // This is a copied-and-pasted and adopted fragment of TF reshape translator from OV. // It checks if the input tensor has string type, and then perform custom tranlation. diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 2843f7857..8950b0952 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -296,6 +296,45 @@ class OPENVINO_API RegexSplit : public ov::op::Op { ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node); +class OPENVINO_API WordpieceTokenizer : public ov::op::Op { +public: + OPENVINO_OP("WordpieceTokenizer"); + + WordpieceTokenizer () = default; + + WordpieceTokenizer(const ov::OutputVector& arguments, const std::string& suffix_indicator = "##", int max_bytes_per_word = 100) : + ov::op::Op(arguments), + m_suffix_indicator(suffix_indicator), + m_max_bytes_per_word(max_bytes_per_word) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_suffix_indicator, m_max_bytes_per_word); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("suffix_indicator", m_suffix_indicator); + visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + + std::string m_suffix_indicator; + int m_max_bytes_per_word; // TODO: Why do we need to limit it in this way? Can it be done outside the op as preprocessing of the input? 
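+    // (Likely answer, an assumption based on common WordPiece implementations: words longer
+    // than the limit are mapped straight to the unknown token, the analogous HF tokenizer
+    // knob being max_input_chars_per_word, so moving the cap into input preprocessing
+    // would change which tokens are emitted.)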
+}; + +ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); +ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); ov::OutputVector translate_const(const ov::frontend::NodeContext& node); From 76442314f8deb6a827635000e6e78e0f28f94cbd Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Wed, 17 May 2023 20:47:21 +0400 Subject: [PATCH 011/116] Disabled debug output --- .../user_ie_extensions/sentence_piece/sentence_piece.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 20c124fd9..50e2ea64e 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -1083,7 +1083,7 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& auto wp_tokenizer_inputs = wp_tokenizer->input_values(); wp_tokenizer_inputs.push_back(unk_token_id); - std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n"; + //std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n"; auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs); return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) }; From 80b8023127e9356d23adfb3e4d403f4580524744 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 18 May 2023 09:19:48 +0400 Subject: [PATCH 012/116] Define default values for custom operations attributes to make attribute initialization optional (needed for core.make_node) --- .../sentence_piece/sentence_piece.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 8950b0952..b4ffc8a2e 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -44,7 +44,7 @@ class StringTensorPack : public ov::op::Op { private: - std::string m_mode; + std::string m_mode = "begins_ends"; }; @@ -119,7 +119,7 @@ class StringTensorUnpack : public ov::op::Op { private: - std::string m_mode; + std::string m_mode = "begins_ends"; }; @@ -161,7 +161,7 @@ ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::fron class OPENVINO_API CaseFold : public ov::op::Op { public: - OPENVINO_OP("CaseFold "); + OPENVINO_OP("CaseFold"); CaseFold () = default; @@ -196,7 +196,7 @@ class OPENVINO_API NormalizeUnicode : public ov::op::Op { NormalizeUnicode () = default; - NormalizeUnicode(const ov::OutputVector& arguments, const std::string& normalization_form) : + NormalizeUnicode(const ov::OutputVector& arguments, const std::string& normalization_form = "NFD") : ov::op::Op(arguments), m_normalization_form(normalization_form) { constructor_validate_and_infer_types(); @@ -221,7 +221,7 @@ class OPENVINO_API NormalizeUnicode : public ov::op::Op { private: - std::string m_normalization_form; + std::string m_normalization_form = "NFD"; }; ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node); @@ -290,8 +290,8 @@ class 
OPENVINO_API RegexSplit : public ov::op::Op { private: - std::string m_behaviour; - bool m_invert; + std::string m_behaviour = "removed"; + bool m_invert = false; }; ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node); @@ -329,8 +329,8 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op { private: - std::string m_suffix_indicator; - int m_max_bytes_per_word; // TODO: Why do we need to limit it in this way? Can it be done outside the op as preprocessing of the input? + std::string m_suffix_indicator = "##"; + int m_max_bytes_per_word = 100; // TODO: Can it be done outside the op as preprocessing of the input? }; ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); From 46c82b854c3af74b32710fe5ed6f64667122c93e Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Sat, 20 May 2023 04:03:03 +0400 Subject: [PATCH 013/116] Added fast_tokenizer lib to the build. Implemented CaseFold based on fast_tokenizer. --- .../sentence_piece/CMakeLists.txt | 15 ++++++- .../sentence_piece/sentence_piece.cpp | 42 +++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt index 5817a9ad3..1a8edc33e 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt @@ -20,6 +20,12 @@ FetchContent_Declare( URL_HASH SHA256=a7c105aca0131b4a899155a6c44ea9728e63514edaa8d71fa92e7a5de53b6ca0 ) +FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz + URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc +) + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") set(cxx_flags "-Wno-undef") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") @@ -36,6 +42,9 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}") FetchContent_MakeAvailable(sentencepiece) +FetchContent_MakeAvailable(fast_tokenizer) + +include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake") # set include dirs for specific source files target_include_directories(${TARGET_NAME} PRIVATE @@ -44,13 +53,15 @@ target_include_directories(${TARGET_NAME} PRIVATE "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite" "${sentencepiece_SOURCE_DIR}" "${sentencepiece_SOURCE_DIR}" - "${sentencepiece_BINARY_DIR}") + "${sentencepiece_BINARY_DIR}" + "${FAST_TOKENIZER_INCS}") + if(CMAKE_CL_64) target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) endif() -target_link_libraries(${TARGET_NAME} PRIVATE sentencepiece-static) +target_link_libraries(${TARGET_NAME} PRIVATE sentencepiece-static ${FAST_TOKENIZER_LIBS}) # string_view is used from cxx17 string(REPLACE " " ";" cxx_flags "${cxx_flags}") diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 50e2ea64e..ea95f188a 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -8,6 +8,10 @@ #include "openvino/op/util/framework_node.hpp" #include "openvino/opsets/opset10.hpp" +#include "fast_tokenizer/normalizers/normalizers.h" +#include 
"fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + // TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor #ifndef OPENVINO_ELEMENT_STRING_SUPPORTED @@ -683,6 +687,43 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input auto ends = inputs[1].data(); auto chars = inputs[2].data(); +#if 1 + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const size_t num_elements = inputs[0].get_size(); + + // TODO: Provide more accurate heuristics to estimate output shape + const size_t new_len = 2*inputs[2].get_size(); + + outputs[2].set_shape(Shape{new_len}); + + // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions + // and only number of elements in the original tensors matter + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_chars = outputs[2].data(); + int32_t char_offset = 0; + + for(size_t i = 0; i < num_elements; ++i) { + new_begins[i] = char_offset; + + using namespace paddlenlp::fast_tokenizer; + normalizers::NormalizedString str(std::string(chars + begins[i], chars + ends[i])); + + // Do the job + str.Lowercase(); + + const std::string& new_str = str.GetStr(); + std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset); + char_offset += new_str.length(); + new_ends[i] = char_offset; + } + std::cerr << "hey\n"; + return true; +#else // Stub implementation that transforms each input string "X" to "CaseFold(X)" for debugging purposes { // Set output shapes @@ -712,6 +753,7 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input return true; } // End of stub implementation +#endif } From d7ca2abcf8652a94b6a0e81934d2505c172d3fe0 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Sat, 20 May 2023 04:47:56 +0400 Subject: [PATCH 014/116] Removed debug output --- .../user_ie_extensions/sentence_piece/sentence_piece.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index ea95f188a..630e27b42 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -721,7 +721,6 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input char_offset += new_str.length(); new_ends[i] = char_offset; } - std::cerr << "hey\n"; return true; #else // Stub implementation that transforms each input string "X" to "CaseFold(X)" for debugging purposes From 2baac3de01bfed53420d3e93eada230e1a28fe1d Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Sat, 20 May 2023 08:22:23 +0400 Subject: [PATCH 015/116] Implemented RaggedToDense always in pad_right=true mode and with boolean mask extra output --- .../user_ie_extensions/ov_extension.cpp | 1 + .../sentence_piece/sentence_piece.cpp | 86 ++++++++++++++++++- .../sentence_piece/sentence_piece.hpp | 29 +++++++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 77868a243..3cdd2406d 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ 
b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -69,6 +69,7 @@ std::make_shared>(), \ std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \ + std::make_shared>(), \ std::make_shared("Reshape", translate_reshape), \ std::make_shared("Const", translate_const), \ std::make_shared>(), \ diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 630e27b42..11ca2a410 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -288,6 +288,13 @@ void check_string_scalar_input(const Node* node, size_t input_index) { #endif } +void check_ragged_input(const Node* node, size_t input_index) { + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged representation"); + auto rank = node->get_input_partial_shape(input_index+2).rank(); + FRONT_END_GENERAL_CHECK(rank.is_dynamic() || rank.get_length() == 1, "The last tensor in ragged tensor representation should be a 1D tensor"); +} + void check_ragged_string_input(const Node* node, size_t input_index) { FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged string representation"); FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged string representation"); @@ -1008,7 +1015,6 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont } - void WordpieceTokenizer::validate_and_infer_types() { check_ragged_string_input(this, 0); check_string_input(this, 5); @@ -1131,6 +1137,84 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& } +void RaggedToDense::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 3 + 1 + 1); + + // Input ragged tensor + check_ragged_input(this, 0); + + // Target size along ragged dimension + OPENVINO_ASSERT(get_input_element_type(3).is_integral_number()); + auto rank = get_input_partial_shape(3).rank(); + OPENVINO_ASSERT( + rank.is_dynamic() || + rank.get_length() == 0 || + rank.get_length() == 1 && get_input_partial_shape(3)[0].compatible(1), + "Target dense dimension size for RaggedToDense should be a 0D or 1D tensor with a single element"); + + // Default value to fill out of ragged range elements in output tensor + OPENVINO_ASSERT(get_input_element_type(4).compatible(get_input_element_type(2))); + auto input4_rank = get_input_partial_shape(4).rank(); + OPENVINO_ASSERT(input4_rank.compatible(0)); + + set_input_is_relevant_to_shape(3); + + if(get_input_partial_shape(0).rank().is_dynamic()) { + set_output_type(0, get_input_element_type(2), PartialShape::dynamic()); + set_output_type(1, element::boolean, PartialShape::dynamic()); + } else { + auto shape = get_input_partial_shape(0); + if(auto target_dim = dynamic_cast(get_input_node_ptr(3))) { + shape.push_back(target_dim->cast_vector()[0]); + } else { + shape.push_back(Dimension()); + } + 
set_output_type(0, get_input_element_type(2), shape);
+        set_output_type(1, element::boolean, shape);
+    }
+}
+
+
+bool RaggedToDense::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    // FIXME: Works for POD types only (not for strings!)
+    // FIXME: Output mask is calculated even if there are no consumers
+    auto begins = inputs[0].data();
+    auto ends = inputs[1].data();
+    auto nelems = inputs[0].get_size();
+    auto elems = reinterpret_cast(inputs[2].data());
+    auto elem_size = inputs[2].get_element_type().size();
+    auto default_value = reinterpret_cast(inputs[4].data());
+
+    // Suppose validate was called and set correct output shape
+    // Take a target shape value for ragged dimension
+    size_t target_dim = outputs[0].get_shape().back();
+
+    auto out_elems = reinterpret_cast(outputs[0].data());
+    auto out_mask = outputs[1].data();
+
+    auto out_elem_orig = out_elems;
+    auto out_mask_orig = out_mask;
+
+    for(size_t i = 0; i < nelems; ++i) {
+        auto begin = elems + elem_size*begins[i];
+        auto len = std::min(size_t(ends[i] - begins[i]), target_dim);   // truncation
+        auto end = begin + elem_size*len;
+        out_elems = std::copy(begin, end, out_elems);
+        out_mask = std::fill_n(out_mask, len, char(1));
+        if(len < target_dim)
+            out_mask = std::fill_n(out_mask, target_dim - len, char(0));
+        while(len < target_dim) {
+            out_elems = std::copy(default_value, default_value + elem_size, out_elems);
+            ++len;
+        }
+    }
+
+    OPENVINO_ASSERT(out_elems == out_elem_orig + outputs[0].get_byte_size());
+    OPENVINO_ASSERT(out_mask == out_mask_orig + outputs[1].get_byte_size());
+    return true;
+}
+
+
 ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
    // This is a copied-and-pasted and adapted fragment of TF reshape translator from OV.
    // It checks if the input tensor has string type, and then performs custom translation.
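To make the padding semantics concrete, a small worked example (values invented for
illustration): with begins = [0, 2], ends = [2, 3], elems = [10, 11, 12], a target
ragged dimension of 4 and default value 0, the evaluate above would produce

    dense = [[10, 11, 0, 0],        mask = [[1, 1, 0, 0],
             [12,  0, 0, 0]]                [1, 0, 0, 0]]

i.e. each ragged row is truncated or right-padded to the target size, which is the
always-pad_right behaviour named in the commit message.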
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index b4ffc8a2e..a84236726 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -336,5 +336,34 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op { ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); +// Takes a ragged tensor with one ragged right-most dimension and produces a normal tensor +class OPENVINO_API RaggedToDense : public ov::op::Op { +public: + OPENVINO_OP("RaggedToDense"); + + RaggedToDense () = default; + + RaggedToDense(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; + ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); ov::OutputVector translate_const(const ov::frontend::NodeContext& node); From d270dd677e5c525bc4f339adb1db1b3fc598a853 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Tue, 23 May 2023 04:17:39 +0400 Subject: [PATCH 016/116] Provided real implementations for NormalizeUnicode, RegexNormalization and RegexSplit based on paddle fast_tokenizer lib. Limited implementation, not all of the features of ops and TF translated ops are implemented. 
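For orientation, the normalization ops below all reduce to chained calls on
fast_tokenizer's NormalizedString; a minimal standalone sketch (illustrative only,
not part of the patch itself):

    #include "fast_tokenizer/normalizers/normalizers.h"

    using namespace paddlenlp::fast_tokenizer;

    std::string lowercase_nfd(const std::string& s) {
        // Same call chain the ops use per input string.
        return normalizers::NormalizedString(s).NFD().Lowercase().GetStr();
    }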
--- .../sentence_piece/sentence_piece.cpp | 190 ++++++++++++++---- 1 file changed, 156 insertions(+), 34 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 11ca2a410..63fd67b70 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include "normalizer.h" #include "sentence_piece.hpp" @@ -350,6 +352,7 @@ bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVecto auto num_output_elements = 4*(1 + 1 + num_elements) + num_chars; outputs[0].set_shape(Shape{num_output_elements}); + // FIXME: Do the repacking, otherwise cannot handle string tensors with gaps between strings //auto begins = inputs[0].data(); // this is not needed as no repacking happens in this version of code auto ends = inputs[1].data(); auto chars = inputs[2].data(); @@ -683,27 +686,19 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { return named_results; } - -void CaseFold::validate_and_infer_types() { - check_string_input(this, 0); - set_string_output(this, 0, get_input_partial_shape(0)); -} - -bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { +bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorVector& inputs, std::function normalizer) { auto begins = inputs[0].data(); auto ends = inputs[1].data(); auto chars = inputs[2].data(); -#if 1 // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); const size_t num_elements = inputs[0].get_size(); - // TODO: Provide more accurate heuristics to estimate output shape - const size_t new_len = 2*inputs[2].get_size(); - - outputs[2].set_shape(Shape{new_len}); + // TODO: How to avoid copying from this temporary buffer? + // TODO: It can be possible to collect output symbols directly in the output tensor memory if `normalizer` has reasonable estimation for the final size. 
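+    // (Why std::deque: it grows in fixed-size blocks, so appending never relocates the
+    // bytes already collected, while a growing std::string/std::vector would repeatedly
+    // reallocate and copy the whole buffer; the cost is the extra final copy below.)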
+    std::deque buffer;
 
    // For the whole implementation below the input shapes can be ignored, we are working with the flatten representations
    // and only number of elements in the original tensors matter
@@ -711,27 +706,46 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input
    // Get pointers in the output tensors
    auto new_begins = outputs[0].data();
    auto new_ends = outputs[1].data();
-    auto new_chars = outputs[2].data();
-    int32_t char_offset = 0;
 
    for(size_t i = 0; i < num_elements; ++i) {
-        new_begins[i] = char_offset;
+        new_begins[i] = buffer.size();
+        std::string new_str = normalizer(std::string(chars + begins[i], chars + ends[i]));
+        buffer.insert(buffer.end(), new_str.begin(), new_str.end());
+        new_ends[i] = buffer.size();
+    }
 
-        using namespace paddlenlp::fast_tokenizer;
-        normalizers::NormalizedString str(std::string(chars + begins[i], chars + ends[i]));
+    // Copy collected symbols to the target output tensor
 
-        // Do the job
-        str.Lowercase();
+    outputs[2].set_shape(Shape{buffer.size()});
+    auto new_chars = outputs[2].data();
+    std::copy(buffer.begin(), buffer.end(), new_chars);
 
-        const std::string& new_str = str.GetStr();
-        std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset);
-        char_offset += new_str.length();
-        new_ends[i] = char_offset;
-    }
    return true;
+}
+
+
+void CaseFold::validate_and_infer_types() {
+    check_string_input(this, 0);
+    set_string_output(this, 0, get_input_partial_shape(0));
+}
+
+bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+#if 1
+
+    return evaluate_normalization_helper(
+        outputs, inputs,
+        [](const std::string& str) {
+            using namespace paddlenlp::fast_tokenizer;
+            return normalizers::NormalizedString(str).Lowercase().GetStr();
+        });
+
 #else
    // Stub implementation that transforms each input string "X" to "CaseFold(X)" for debugging purposes
    {
+        auto begins = inputs[0].data();
+        auto ends = inputs[1].data();
+        auto chars = inputs[2].data();
+
        // Set output shapes
@@ -769,21 +783,37 @@ ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node)
        pre_translate_string_tensor_input(node.get_input(0)))->outputs()) };
 }
 
+namespace {
+using namespace paddlenlp::fast_tokenizer::normalizers;
+using NormalizersMap = std::map>;
+
+const NormalizersMap normalizers = {
+    {"NFD", [](const std::string& str) { return NormalizedString(str).NFD().GetStr(); }},
+    {"NFC", [](const std::string& str) { return NormalizedString(str).NFC().GetStr(); }},
+    {"NFKD", [](const std::string& str) { return NormalizedString(str).NFKD().GetStr(); }},
+    {"NFKC", [](const std::string& str) { return NormalizedString(str).NFKC().GetStr(); }},
+};
+
+}
 
 void NormalizeUnicode::validate_and_infer_types() {
    check_string_input(this, 0);
+    OPENVINO_ASSERT(normalizers.find(m_normalization_form) != normalizers.end(), "NormalizeUnicode doesn't know normalization form " + m_normalization_form);
    set_string_output(this, 0, get_input_partial_shape(0));
 }
 
 bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+#if 1
+
+    return evaluate_normalization_helper(outputs, inputs, normalizers.at(m_normalization_form));
+
+#else
+
    auto begins = inputs[0].data();
    auto ends = inputs[1].data();
    auto chars = inputs[2].data();
 
-#if 0
-    // TODO: Complete implementation
-#else
    // Stub implementation that transforms each input string "X" to "NormalizeUnicode(X, 
normalization_form)" for debugging purposes { // Set output shapes @@ -833,20 +863,29 @@ void RegexNormalization::validate_and_infer_types() { } bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - auto search_pattern_buf = inputs[3].data(); auto replace_pattern_buf = inputs[4].data(); auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant -#if 0 - // TODO: Complete implementation +#if 1 + + using namespace paddlenlp::fast_tokenizer::normalizers; + re2::RE2 search_pattern_re(search_pattern); + + return evaluate_normalization_helper( + outputs, inputs, + [&replace_pattern, &search_pattern_re](const std::string& str) { + return NormalizedString(str).Replace(search_pattern_re, std::string(replace_pattern)).GetStr(); + }); + #else // Stub implementation that transforms each input string "X" to "RegexNormalization(X, search_pattern, replace_pattern)" for debugging purposes { + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto chars = inputs[2].data(); + // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); @@ -887,6 +926,8 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont // FIXME: using space to pad the value to work-around CPU issue with empty constants auto value = node.get_attribute(name) + " "; + // TODO: How to translate attribute `replace_global`? + #if USE_STRING_TENSORS return std::make_shared(element::string, {}, value); #else @@ -904,10 +945,23 @@ ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& } +namespace { + +using paddlenlp::fast_tokenizer::core::SplitMode; +const std::map split_modes = { + {"removed", SplitMode::REMOVED}, + {"isolated", SplitMode::ISOLATED}, + {"merged_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, + {"merged_with_next", SplitMode::MERGED_WITH_NEXT}, +}; + +} + void RegexSplit::validate_and_infer_types() { check_string_input(this, 0); check_string_scalar_input(this, 3); + OPENVINO_ASSERT(split_modes.find(m_behaviour) != split_modes.end(), "RegexSplit doesn't support unknown split mode: " + m_behaviour); set_ragged_string_output(this, 0, get_input_partial_shape(0)); } @@ -919,7 +973,74 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto split_pattern_buf = inputs[3].data(); auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant -#if 0 +#if 1 + + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + + const size_t num_elements = inputs[0].get_size(); + const size_t num_chars = inputs[2].get_size(); + + // TODO: Better estimations for max size? 
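+    // (Worked check of the bound assumed below, with an invented input: splitting the
+    // 6-byte string "a,b,,c" on "," in "removed" mode yields the 3 non-empty parts
+    // {"a", "b", "c"}, so the number of parts never exceeds the number of input bytes.)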
+    // Assume we cannot have empty parts, so the number of parts cannot be bigger than the number of symbols
+    outputs[2].set_shape(Shape{num_chars});
+    outputs[3].set_shape(Shape{num_chars});
+
+    // Assume we cannot introduce new symbols to output, only existing can be distributed (with gaps)
+
+    // TODO: Can we just route input tensor directly to the output outside evaluate when graph is being constructed?
+    outputs[4] = inputs[2];  // TODO: Does it really work?
+
+    // If line above doesn't work, do this instead:
+    //outputs[4].set_shape(Shape{num_chars});
+    //inputs[2].copy_to(outputs[4]);
+
+    // For the whole implementation below the input shapes can be ignored, we are working with the flatten representations
+    // and only number of elements in the original tensors matter
+
+    // Get pointers in the output tensors
+    auto new_ragged_begins = outputs[0].data();
+    auto new_ragged_ends = outputs[1].data();
+    auto new_begins = outputs[2].data();
+    auto new_ends = outputs[3].data();
+    int32_t ragged_offset = 0;
+
+    using namespace paddlenlp::fast_tokenizer;
+
+    auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert);
+
+    for(size_t i = 0; i < num_elements; ++i) {
+        auto old_str = std::string(chars + begins[i], chars + ends[i]);
+        //std::cerr << "[ RegexSplit ] old_str: " << old_str << "\n";
+        paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(old_str);
+        pretokenizer(&pretokenized);
+        size_t num_splits = pretokenized.GetSplitsSize();
+
+        new_ragged_begins[i] = ragged_offset;
+
+        for (size_t j = 0; j < num_splits; ++j) {
+            auto split = pretokenized.GetSplit(j);
+            //const auto& value = split.normalized_.GetStr();
+            auto offset = split.normalized_.GetOrginalOffset();
+            //std::cerr << "[ RegexSplit ] split part: " << value << "\n";
+            //std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n";
+            new_begins[ragged_offset] = begins[i] + offset.first;
+            new_ends[ragged_offset] = begins[i] + offset.second;
+
+            ++ragged_offset;
+        };
+
+        new_ragged_ends[i] = ragged_offset;
+    }
+
+    // Fix real shape based on collected results
+    outputs[2].set_shape({ragged_offset});
+    outputs[3].set_shape({ragged_offset});
+    //outputs[4].set_shape({char_offset});
+
+    return true;
+
    // TODO: Complete implementation
 #else
    // Stub implementation that transforms each input string "X" to multiple "RegexSplit(X, split_pattern) = part(X)" for debugging purposes

From 119d6e923515ce1e14b0d00c0add813c14999b7e Mon Sep 17 00:00:00 2001
From: Sergey Lyalin 
Date: Tue, 23 May 2023 20:41:35 +0400
Subject: [PATCH 017/116] Implemented WordpieceTokenizer with fast_tokenizer
 library

---
 .../sentence_piece/sentence_piece.cpp         | 74 
++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 63fd67b70..7f7486728 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -1154,11 +1154,81 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto vocab_ends = inputs[6].data(); auto vocab_chars = inputs[7].data(); + auto vocab_size = inputs[5].get_size(); + OPENVINO_ASSERT(inputs.size() == 9, "Too few inputs passed to WordpieceTokenizer, it means it is not converted properly or it is not used in the supported pattern"); auto unk_token_id = *inputs[8].data(); -#if 0 - // TODO: Complete implementation + //std::cerr << "[ WordpieceTokenizer ] unk_token_id = " << unk_token_id << "\n"; + +#if 1 + + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const size_t num_elems = inputs[0].get_size(); + + //const size_t num_parts = inputs[2].get_size(); + //size_t new_num_parts = num_parts; + + // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area + // to represent different elements in ragged tensor + outputs[2].set_shape({inputs[4].get_size()}); + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_elems = outputs[2].data(); + int32_t offset = 0; + + using namespace paddlenlp::fast_tokenizer; + + std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; + core::Vocab vocab; + std::string unk_token; + if(unk_token_id < 0) + unk_token_id += vocab_size; + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + if(id == unk_token_id) + unk_token = token; + } + + std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; + std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; + std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; + + auto tokenizer = models::FastWordPiece(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); // FIXME: why true? + + std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; + + + for(size_t j = 0; j < num_elems; ++j) { + new_begins[j] = offset; + + for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { + + auto str = std::string(chars + begins[i], chars + ends[i]); + std::vector results = tokenizer.Tokenize(str); + + for (const core::Token& token : results) { + //std::cout << "[ WordpieceTokenizer ] id: " << token.id_ << ", value: " << token.value_ + // << ", offset: (" << token.offset_.first << ", " + // << token.offset_.second << ")." 
<< std::endl; + OPENVINO_ASSERT(offset < outputs[2].get_size()); + new_elems[offset++] = token.id_; + }; + } + + new_ends[j] = offset; + } + + outputs[2].set_shape({offset}); + + OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in RegexSplit::evaluate: out of range for ragged parts"); + return true; + #else // Stub implementation that transforms each input string to its length duplicating element if the length is odd { From 4d4ad89c2043f379aafefe6f0bda6c4216b0630b Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 25 May 2023 04:15:21 +0400 Subject: [PATCH 018/116] Renamed behaviours to be verbs instead of adjectives --- .../user_ie_extensions/sentence_piece/sentence_piece.cpp | 8 ++++---- .../user_ie_extensions/sentence_piece/sentence_piece.hpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 7f7486728..6e517273d 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -949,10 +949,10 @@ namespace { using paddlenlp::fast_tokenizer::core::SplitMode; const std::map split_modes = { - {"removed", SplitMode::REMOVED}, - {"isolated", SplitMode::ISOLATED}, - {"merged_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, - {"merged_with_next", SplitMode::MERGED_WITH_NEXT}, + {"remove", SplitMode::REMOVED}, + {"isolate", SplitMode::ISOLATED}, + {"merge_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, + {"merge_with_next", SplitMode::MERGED_WITH_NEXT}, }; } diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index a84236726..3b307485e 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -290,7 +290,7 @@ class OPENVINO_API RegexSplit : public ov::op::Op { private: - std::string m_behaviour = "removed"; + std::string m_behaviour = "remove"; bool m_invert = false; }; From f4eee84b142f04537d2ec9f2aa109c2cda6d63cd Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 25 May 2023 04:16:27 +0400 Subject: [PATCH 019/116] Added modified version of HF tokenizer parser from Artur; implemented necessary steps to complete HF bert preprocessing conversion (not validated) --- .../sentence_piece/convert_tokenizer.py | 39 ++ .../sentence_piece/hf_parser.py | 189 ++++++ .../sentence_piece/tokenizer_pipeline.py | 581 ++++++++++++++++++ 3 files changed, 809 insertions(+) create mode 100644 modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py create mode 100644 modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py create mode 100644 modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py new file mode 100644 index 000000000..f2d5f4630 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import sys +from typing import Any, List + +from 
openvino.runtime.exceptions import OVTypeError +from openvino.runtime import Model, Output +from tokenizer_pipeline import TokenizerPipeline + + +def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> TokenizerPipeline: + if "transformers" in sys.modules: + from transformers import PreTrainedTokenizerBase + from hf_parser import TransformersTokenizerPipelineParser + + if isinstance(tokenizer_object, PreTrainedTokenizerBase): + return TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs) + + + raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") + + +def apply_tokenizer(model: Model, tokenizer: Model) -> Model: + assert len(model.inputs) == len(tokenizer.outputs) + + # need to check if the inputs are aligned: + # - inputs_ids -> inputs_ids + # - attention_mask -> attention_mask + # - token_type_ids -> token_type_ids + aligned_model_inputs = model.inputs + aligned_tokenizer_outputs: List[Output] = tokenizer.outputs + + for model_input, tokenizer_output in zip(aligned_model_inputs, aligned_tokenizer_outputs): + for target in model_input.get_target_inputs(): + target.replace_source_output(tokenizer_output) + + return Model(model.outputs, tokenizer.inputs, name=model.name) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py new file mode 100644 index 000000000..17bf6aaad --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Optional, Dict, Callable, Union, List + +from openvino.runtime.exceptions import OVTypeError +from tokenizer_pipeline import ( + TokenizerPipeline, + NormalizationStep, + NormalizeUnicode, + NMTNormalizationStep, + CaseFoldStep, + RegExpNormalizationStep, + StripStringStep, + PreTokenizatinStep, + PunctuationSplitStep, + RegexSplitStep, + WhitespaceSplitStep, + WordPieceTokenizationStep, + TruncationStep, + PaddingStep, + CombineSegmentsStep, +) + + +def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegExpNormalizationStep: + regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"] + return RegExpNormalizationStep( + regex_search_pattern=regex_search_pattern, + replace_term=normalizer_dict["content"], + ) + + +def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]: + steps: List[NormalizationStep] = [NormalizeUnicode("NFD")] + + if normalizer_dict["lowercase"] is True: + steps.append(CaseFoldStep()) + + if normalizer_dict["clean_text"] is True: + steps.append(RegExpNormalizationStep.del_control_chars_regex()) + + if normalizer_dict["strip_accents"] is True: + steps.append(RegExpNormalizationStep.strip_accents_regex()) + + return steps + + +def parse_strip_step(split_dict: Dict[str, Any]) -> StripStringStep: + return StripStringStep( + left=split_dict["strip_left"], + right=split_dict["strip_right"], + ) + + +def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep: + split_pattern = pretokenizer_dict["pattern"].get("String") or pretokenizer_dict["pattern"]["Regex"] + return RegexSplitStep( + split_pattern=split_pattern, + invert=pretokenizer_dict["invert"], + behaviour=pretokenizer_dict["behavior"], + ) + + 
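+# A minimal usage sketch for the parser defined below (illustrative only; assumes a
+# HF *fast* tokenizer, e.g. "bert-base-uncased"):
+#
+#     from transformers import AutoTokenizer
+#     hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+#     ov_tokenizer = TransformersTokenizerPipelineParser(hf_tokenizer).parse().get_ov_subgraph()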
+class TransformersTokenizerPipelineParser: + def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None: + assert tokenizer_object.is_fast + + self.original_tokenizer = tokenizer_object + with TemporaryDirectory() as tmpdir: + tokenizer_object.save_pretrained(tmpdir) + with open(Path(tmpdir) / "tokenizer.json") as tj: + self.tokenizer_json = json.load(tj) + self.pipeline = TokenizerPipeline() + self.number_of_inputs = number_of_inputs + self.num_of_added_tokens = 0 + + def parse(self, number_of_inputs: Optional[int] = None) -> TokenizerPipeline: + self.number_of_inputs = self.number_of_inputs if number_of_inputs is None else number_of_inputs + self.pipeline.number_of_inputs = self.number_of_inputs + for add_steps in [ + self.normalization, + self.pre_tokenization, + self.tokenization_model, + self.post_tokenization, + ]: + add_steps() + + return self.pipeline + + normalizers_map: Dict[str, Callable[[Dict[str, Any]], Union[NormalizationStep, List[NormalizationStep]]]] = { + "NFC": lambda step_dict: NormalizeUnicode("NFC"), + "NFD": lambda step_dict: NormalizeUnicode("NFD"), + "NFKC": lambda step_dict: NormalizeUnicode("NFKC"), + "NFKD": lambda step_dict: NormalizeUnicode("NFKD"), + "Nmt": lambda step_dict: NMTNormalizationStep(), + "Lowercase": lambda step_dict: CaseFoldStep(), + "StripAccents": lambda step_dict: RegExpNormalizationStep.strip_accents_regex(), + "BertNormalizer": parse_bert_normalizer, + "Replace": parse_replace_normalizer, + "Strip": parse_strip_step, + } + + def parse_normalizer_step(self, step_dict: Dict[str, Any]) -> None: + try: + self.pipeline.add_steps(self.normalizers_map[step_dict["type"]](step_dict)) + except KeyError: + raise OVTypeError(f"Normalizer type '{step_dict['type']}' is not supported") + + def normalization(self) -> None: + if self.tokenizer_json["normalizer"] is None: + return + + if self.tokenizer_json["normalizer"].get("type") == "Sequence": + for normalizer in self.tokenizer_json["normalizer"]["normalizers"]: + self.parse_normalizer_step(normalizer) + else: + self.parse_normalizer_step(self.tokenizer_json["normalizer"]) + + pre_tokenization_map: Dict[str, Callable[[Dict[str, Any]], Union[PreTokenizatinStep, List[PreTokenizatinStep]]]] = { + "BertPreTokenizer": lambda step_dict: RegexSplitStep.bert_splitter(), + "Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(), + "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(), + "Split": parse_split_step, + "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]), + } + + def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None: + try: + self.pipeline.add_steps(self.pre_tokenization_map[step_dict["type"]](step_dict)) + except KeyError: + raise OVTypeError(f"Pre-tokenizer type '{step_dict['type']}' is not supported") + + def pre_tokenization(self) -> None: + if self.tokenizer_json["pre_tokenizer"] is None: + return + + if self.tokenizer_json["pre_tokenizer"].get("type") == "Sequence": + for pretokenizer in self.tokenizer_json["pre_tokenizer"]["pretokenizers"]: + self.parse_pre_tokenization_step(pretokenizer) + else: + self.parse_pre_tokenization_step(self.tokenizer_json["pre_tokenizer"]) + + def tokenization_model(self) -> None: + if self.tokenizer_json["model"]["type"] == "WordPiece": + self.pipeline.add_steps(WordPieceTokenizationStep.from_hf_json(self.tokenizer_json)) + self.pipeline.vocab = self.pipeline[-1].vocab + else: + raise OVTypeError(f"Tokenizer type '{self.tokenizer_json['model']['type']}' is not supported") + + def 
post_tokenization(self) -> None: + if self.tokenizer_json["post_processor"] is None: + return + + if self.tokenizer_json["post_processor"]["type"] == "TemplateProcessing": + combine_segments_step = CombineSegmentsStep.from_hf_json_template_postprocessor( + self.tokenizer_json, self.number_of_inputs + ) + elif self.tokenizer_json["post_processor"]["type"] == "BertProcessing": + combine_segments_step = CombineSegmentsStep.from_hf_json_bert_postprocessor( + self.tokenizer_json, self.number_of_inputs + ) + else: + raise OVTypeError(f"Post-processor type '{self.tokenizer_json['post_processor']['type']}' is not supported") + + self.num_of_added_tokens += combine_segments_step.number_of_added_tokens + combine_segments_step.set_tokens_ids(self.pipeline.vocab) + + self.add_truncation() + self.pipeline.add_steps(combine_segments_step) + + self.add_padding() + + def add_truncation(self) -> None: + if self.tokenizer_json["truncation"] is not None: + self.pipeline.add_steps(TruncationStep.from_hf_json(self.tokenizer_json, self.num_of_added_tokens)) + elif self.original_tokenizer.model_max_length is not None: + self.pipeline.add_steps(TruncationStep.from_hf_object(self.original_tokenizer, self.num_of_added_tokens)) + + def add_padding(self) -> None: + if self.tokenizer_json["padding"] is not None: + self.pipeline.add_steps(PaddingStep.from_hf_json(self.tokenizer_json)) + else: + self.pipeline.add_steps(PaddingStep(token=self.original_tokenizer.pad_token)) + self.pipeline[-1].set_token_id(self.pipeline.vocab) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py new file mode 100644 index 000000000..fdb7a2c09 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -0,0 +1,581 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import os +from dataclasses import dataclass, field +from functools import singledispatchmethod +from typing import List, Optional, Any, Dict +from unittest.mock import MagicMock +import weakref + +import numpy as np + +from openvino.runtime.exceptions import UserInputError, OVTypeError +from openvino.runtime import Type, PartialShape, op, Model, Core, Output, Node, opset10 +from openvino.runtime.utils.types import as_node, make_constant_node + + +string_ops = None #MagicMock() + + +def pack_strings(strings): + assert isinstance(strings, list) + to_bytes = lambda x: x.to_bytes(4, "little") + batch_size = len(strings) + if batch_size == 0: + return to_bytes(0) + offsets = to_bytes(0) + symbols = bytes() + for s in strings: + symbols += bytes(s, "utf-8") + offsets += to_bytes(len(symbols)) + return np.frombuffer(bytearray(to_bytes(batch_size) + offsets + symbols), np.uint8) + + +def pack_string(s): + return np.frombuffer( + bytes(s + " ", "utf-8"), dtype=np.uint8 + ) # + ' ' is WA for CPU bug + + +core = Core() +core.add_extension("/home/slyalin/openvino-contrib/muse/modules/custom_operations/build/user_ie_extensions/libuser_ov_extensions.so") + + +class BasePipelineStep: + _pipeline = field(default=None, init=False, repr=False) + + def __str__(self) -> str: + params_string = ", ".join(f"{key}={val!r}" for key, val in self.get_config().items()) + return f"{self.__class__.__name__}({params_string})" + + def get_config(self) -> Dict[str, Any]: + config = {key: value for key, value in vars(self).items() if not key.startswith("_")} + properties = { + key: 
getattr(self, key) + for key in dir(type(self)) + if not key.startswith("_") and isinstance(getattr(type(self), key), property) + } + config.update(properties) + return config + + def get_pipeline(self) -> Optional["TokenizerPipeline"]: + return self._pipeline() + + def set_pipeline(self, pipeline: "TokenizerPipeline") -> None: + self._pipeline = weakref.ref(pipeline) + + def get_ov_subgraph(self, *input_nodes: List[Output]) -> Node: + raise NotImplementedError + + @staticmethod + def create_string_constant_node(value: str) -> op.Constant: + if isinstance(value, str): + # string scalar + ps = pack_string(value) + return op.Constant(ps) + else: + # support only 1D strings for now + ps = pack_strings(value) + return core.make_node("StringTensorUnpack", op.Constant(ps).outputs()) + + +@dataclass +class NormalizationStep(BasePipelineStep): + pass + + +@dataclass +class NormalizeUnicode(NormalizationStep): + normalization_form: str = "NFD" + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + return core.make_node( + "NormalizeUnicode", input_nodes, {"normalization_form": self.normalization_form} + ).outputs() + + +@dataclass +class CaseFoldStep(NormalizationStep): + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + return core.make_node("CaseFold", input_nodes).outputs() + + +@dataclass +class RegExpNormalizationStep(NormalizationStep): + regex_search_pattern: str + replace_term: str + + @classmethod + def strip_accents_regex(cls) -> "RegExpNormalizationStep": + return cls(regex_search_pattern=r"\p{Mn}", replace_term="") + + @classmethod + def del_control_chars_regex(cls) -> "RegExpNormalizationStep": + return cls(regex_search_pattern=r"\p{Cc}|\p{Cf}", replace_term=" ") + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + input_nodes.extend( + ( + *self.create_string_constant_node("search_pattern").outputs(), + *self.create_string_constant_node("replace_pattern").outputs(), + ) + ) + return core.make_node( + "RegexNormalization", + input_nodes + ).outputs() + + +@dataclass +class NMTNormalizationStep(NormalizationStep): + """Normaization based on NMT task. + + https://github.com/huggingface/tokenizers/blob/28cd3dce2a75d106572392194ff2564574c33235/tokenizers/src/normalizers/unicode.rs#L44 + """ + + +@dataclass +class StripAccentsStep(NormalizationStep): + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + return RegExpNormalizationStep.strip_accents_regex().get_ov_subgraph(input_nodes) + + +@dataclass +class DelControlCharsStep(NormalizationStep): + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + return RegExpNormalizationStep.del_control_chars_regex().get_ov_subgraph(input_nodes) + + +@dataclass +class StripStringStep(NormalizationStep): + left: bool + right: bool + + +@dataclass +class PreTokenizatinStep(BasePipelineStep): + pass + + +@dataclass +class RegexSplitStep(PreTokenizatinStep): + split_pattern: str + invert: bool = False + behaviour: str = "Remove" + + @classmethod + def bert_splitter(cls) -> "RegexSplitStep": + """Generates a step with a standard BERT regex. 
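+        (It matches runs of whitespace, punctuation classes and CJK ideograph ranges.)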
+ + The source: + https://github.com/tensorflow/text/blob/4a098cd852c0b7ebee621e2d211c7f202dd679c2/tensorflow_text/python/ops/bert_tokenizer.py#L39 + """ + return cls( + "|".join( + [ + r"\s+", + r"|".join( + [ + r"[!-/]", + r"[:-@]", + r"[\[-`]", + r"[{-~]", + r"[\p{P}]", + ], + ), + r"|".join( + [ + r"[\x{4E00}-\x{9FFF}]", + r"[\x{3400}-\x{4DBF}]", + r"[\x{20000}-\x{2A6DF}]", + r"[\x{2A700}-\x{2B73F}]", + r"[\x{2B740}-\x{2B81F}]", + r"[\x{2B820}-\x{2CEAF}]", + r"[\x{F900}-\x{FAFF}]", + r"[\x{2F800}-\x{2FA1F}]", + ], + ), + ], + ), + ) + + @classmethod + def whitespace_splitter(cls) -> "RegexSplitStep": + return cls(r"\w+|[^\w\s]+") + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + input_nodes.extend( + self.create_string_constant_node(self.split_pattern).outputs() + ) + return core.make_node( + "RegexSplit", + input_nodes, + { + "behaviour": self.behaviour.lower(), + "invert": self.invert, + }, + ).outputs() + + +@dataclass +class WhitespaceSplitStep(PreTokenizatinStep): + """Works like python `str.split`.""" + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: + return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes) + + +@dataclass +class PunctuationSplitStep(PreTokenizatinStep): + """Splits string on punctuation chars.""" + # behaviour: str = "Isolated" + + +@dataclass +class TokenizationModelStep(BasePipelineStep): + pass + + +@dataclass +class WordPieceTokenizationStep(TokenizationModelStep): + vocab: List[str] = field(repr=False) + unk_token: str = "[UNK]" + suffix_indicator: str = "##" + max_bytes_per_word: int = 100 + unk_token_id: int = field(init=False) + + def __post_init__(self) -> None: + try: + self.unk_token_id = self.vocab.index(self.unk_token) + except ValueError: + raise UserInputError(f"Cannot find unknown token '{self.unk_token}' in the vocab") + + @property + def vocab_size(self) -> int: + return len(self.vocab) + + @classmethod + def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "WordPieceTokenizationStep": + return cls( + unk_token=tokenizer_json["model"]["unk_token"], + suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"], + vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])], + ) + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + input_nodes.extend( + ( + *self.create_string_constant_node(self.vocab).outputs(), + *as_node(self.unk_token_id).outputs(), + ) + ) + return core.make_node( + "WordpieceTokenizer", + input_nodes, + { + "suffix_indicator": self.suffix_indicator, + "max_bytes_per_word": self.max_bytes_per_word, + } + ).outputs() + + +@dataclass +class PostTokenizationStep(BasePipelineStep): + pass + + +@dataclass +class TruncationStep(PostTokenizationStep): + max_length: int + truncate_right: bool = True + axis: int = -1 + + @classmethod + def from_hf_json(cls, tokenizer_json: Dict[str, Any], num_of_added_tokens: int = 0) -> "TruncationStep": + return cls( + max_length=tokenizer_json["truncation"]["max_length"] - num_of_added_tokens, + truncate_right=tokenizer_json["truncation"]["direction"] == "Right", + ) + + @classmethod + def from_hf_object(cls, tokenizer: Any, num_of_added_tokens: int = 0) -> "TruncationStep": + return cls( + max_length=tokenizer.model_max_length - num_of_added_tokens, + truncate_right=tokenizer.truncation_side == "right", + ) + + def get_ov_subgraph(self, input_nodes: List[Output]): + # FIXME: disabled for now + # operation = string_ops.Truncation( + # *input_nodes, + # 
as_node(self.max_length), + # self.truncate_right, + # self.axis, + # ) + # operation.configure_mock(**{"outputs.return_value": [MagicMock() for _ in range(len(input_nodes))]}) + # return operation + print('[ TOKENIZER PIPELINE CONVERSION ] WARNING: Truncation is not applied because it is not implemented') + return input_nodes + + +@dataclass +class SpecialTokenWithId: + token: str + _token_id: Optional[int] = None + + def set_token_id(self, vocab: Optional[List[str]]) -> None: + if vocab is not None: + self._token_id = vocab.index(self.token) + + @property + def token_id(self) -> Optional[int]: + return self._token_id + + +@dataclass +class TokenWithTypeId: + token_type_id: Optional[int] = None + + +@dataclass +class AddToken(TokenWithTypeId, SpecialTokenWithId): + pass + + +@dataclass +class Sequence(TokenWithTypeId): + pass + + +@dataclass +class CombineSegmentsStep(PostTokenizationStep): + inputs: List[TokenWithTypeId] = field(default_factory=list) + segment_ids: Optional[List[int]] = None + axis: int = -1 + + def __post_init__(self): + if self.segment_ids is not None: + return + + segment_ids_tensor = [node.token_type_id for node in self.inputs] + if any(segment is None for segment in segment_ids_tensor): + segment_ids_tensor = [0] * len(self.inputs) + + self.segment_ids = segment_ids_tensor + + def set_tokens_ids(self, vocab: Optional[List[int]]) -> None: + for input_ in self.inputs: + if isinstance(input_, AddToken): + input_.set_token_id(vocab) + + @property + def number_of_added_tokens(self) -> int: + return sum(1 for input_ in self.inputs if isinstance(input_, AddToken)) + + @classmethod + def from_hf_json_template_postprocessor( + cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1 + ) -> "CombineSegmentsStep": + inputs: List[TokenWithTypeId] = [] + if number_of_inputs == 1: + post_processor = tokenizer_json["post_processor"]["single"] + else: + post_processor = tokenizer_json["post_processor"]["pair"] + + for template_dict in post_processor: + if "SpecialToken" in template_dict: + step = AddToken( + token=template_dict["SpecialToken"]["id"], + token_type_id=template_dict["SpecialToken"]["type_id"], + ) + inputs.append(step) + else: + inputs.append(Sequence(token_type_id=template_dict["Sequence"]["type_id"])) + + return cls(inputs) + + @classmethod + def from_hf_json_bert_postprocessor(cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1): + post_processor_dict = tokenizer_json["post_processor"] + inputs: List[TokenWithTypeId] = [ + AddToken( + token=post_processor_dict["cls"][0], + token_type_id=0, + ), + Sequence(token_type_id=0), + AddToken( + token=post_processor_dict["sep"][0], + token_type_id=0, + ), + ] + + if number_of_inputs == 2: + inputs.extend( + [ + Sequence(token_type_id=1), + AddToken( + token=post_processor_dict["sep"][0], + token_type_id=1, + ), + ] + ) + return cls(inputs) + + def get_ov_subgraph(self, input_nodes): + number_of_sequence_inputs = sum( + 1 for input_ in self.inputs if isinstance(input_, Sequence) + ) + if number_of_sequence_inputs != len(input_nodes)/3: + raise UserInputError( + f"Number of input nodes: {len(input_nodes)}, must be equal to {number_of_sequence_inputs}" + ) + + input_nodes_iter = iter(input_nodes) + op_inputs = [ + next(input_nodes_iter) if isinstance(node, Sequence) else as_node(node.token_type_id) + for node in self.inputs + ] + + # FIXME: Disabled for now, no implementation + # operation = string_ops.CombineSegments( + # *op_inputs, + # self.segment_ids, + # self.axis, + # ) + # 
operation.configure_mock(**{"outputs.return_value": [MagicMock()]}) + # return operation + + # Decomposed implementation + print(input_nodes) + assert len(input_nodes) == 3, '[ TOKENIZER PIPELINE CONVERSION ] CombineSegments can be converted for a single ragged input tensor only, this is temporary limitation' + # Make another ragged tensor with identical structure but with all values filled with self.segment_ids[0] + segment_ids_output = [input_nodes[0], input_nodes[1], opset10.broadcast(make_constant_node(self.segment_ids[0], Type.i32), opset10.shape_of(input_nodes[2])).output(0)] + print('[ TOKENIZER PIPELINE CONVERSION ] [ DEBUG ] CombineSegments outputs:', input_nodes + segment_ids_output) + return input_nodes + segment_ids_output + + + +@dataclass +class PaddingStep(PostTokenizationStep, SpecialTokenWithId): + pad_right: bool = True + token_type_id: Optional[int] = None + max_length: int = -1 + axis: int = -1 + + @classmethod + def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "PaddingStep": + padding_dict = tokenizer_json["padding"] + return cls( + token=padding_dict["pad_token"], + pad_right=padding_dict["direction"] == "Right", + token_type_id=padding_dict["pad_type_id"], + ) + + def get_ov_subgraph(self, input_nodes): + # Suppose input_nodes may have multiple tuples each with 3 tensors represented decomposed ragged tensors + # We suppose that all ragged tensors represent the same structure and produce the mask only once + assert len(input_nodes) % 3 == 0 + assert len(input_nodes) >= 3 + + #lens = opset10.subtract(input_nodes[1], input_nodes[2]) + #max_len = opset10.reduce_max(lens) + #padded_len = + outputs = [] + print(self.token) + print(self.max_length) + print('ERRROR: SETTING MAX_LENGTH = 100') + print('ERROR: Ignoring pad token and set it to id = 0') + self.max_length = 100 + #if self.token_type_id == -1: + # self.token_type_id = 0 + for i in range(len(input_nodes)//3): + print(input_nodes[3*i:3*(i+1)]) + print(as_node(self.max_length).outputs()) + print(as_node(np.array(0, dtype=int)).outputs()) + cur_outputs = core.make_node('RaggedToDense', input_nodes[3*i:3*(i+1)] + make_constant_node(self.max_length, Type.i32).outputs() + make_constant_node(0, Type.i32).outputs()).outputs() + outputs.append(cur_outputs[0]) + if i == 0: + mask = opset10.convert(cur_outputs[1], 'i32').output(0) # TODO: Change RaggedToDense to generate mask of any type + + outputs.append(mask) + + return outputs + + +@dataclass +class TokenizerPipeline: + steps: List[BasePipelineStep] = field(default_factory=list) + vocab: Optional[List[str]] = field(default=None, repr=False) + number_of_inputs: int = 1 + + def get_config(self) -> Dict[str, Dict[str, Any]]: + return {type(step).__name__: step.get_config() for step in self.steps} + + @singledispatchmethod + def add_steps(self, steps: Any) -> None: + raise OVTypeError(f"Type {type(steps)} is not supported") + + @add_steps.register + def _(self, steps: BasePipelineStep) -> None: + self.steps.append(steps) + steps.set_pipeline(self) + + @add_steps.register + def _(self, steps: list) -> None: + for step in steps: + self.steps.append(step) + step.set_pipeline(self) + + def __getitem__(self, item: int) -> BasePipelineStep: + return self.steps[item] + + @property + def processing_steps(self) -> List[BasePipelineStep]: + return [step for step in self.steps if not isinstance(step, PostTokenizationStep)] + + @property + def post_tokenization_steps(self) -> List[PostTokenizationStep]: + return [step for step in self.steps if isinstance(step, 
PostTokenizationStep)] + + def create_string_input(self) -> Node: + return op.Parameter(Type.u8, PartialShape(["?"])) + + def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[Node]: + processing_pipelines_outputs = [] + + for input_node in input_nodes: + input_node = core.make_node("StringTensorUnpack", input_node.outputs()).outputs() + print(input_node) + for step in self.processing_steps: + input_node = step.get_ov_subgraph(input_node) + print('input_node:', input_node) + #ragged_tensor_pack = core.make_node("RaggedTensorPack", input_node) + processing_pipelines_outputs += input_node + + return processing_pipelines_outputs + + def create_post_tokenization_pipeline(self, input_nodes): + #outputs = [] + for step in self.post_tokenization_steps: + pipeline_step = step.get_ov_subgraph(input_nodes) + input_nodes = pipeline_step + + #if isinstance(step, CombineSegmentsStep): + # input_nodes.append(MagicMock(name="token_type_ids")) + # outputs.append(input_nodes.pop(-1)) # token_type_ids node + #if isinstance(step, PaddingStep): + # print('HEY!!!!!!!') + # input_nodes.append(MagicMock(name="attention_mask")) + # outputs.append(input_nodes.pop(-1)) # attention_mask node + + #outputs.insert(0, input_nodes[0]) + return input_nodes + + def get_ov_subgraph(self) -> Model: + input_nodes = [self.create_string_input() for _ in range(self.number_of_inputs)] + processing_outputs = self.create_processing_pipeline(input_nodes) + outputs = self.create_post_tokenization_pipeline(processing_outputs) + + return Model(outputs, input_nodes, name="tokenizer") + + From 1e50352167b32061b2e89bf51e101647b1737918 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Fri, 26 May 2023 00:07:39 +0400 Subject: [PATCH 020/116] Renamed apply_tokenizer to connect_tokeniser and removed obsolete handling of model name --- .../user_ie_extensions/sentence_piece/convert_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index f2d5f4630..c2cee34ef 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -22,7 +22,7 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") -def apply_tokenizer(model: Model, tokenizer: Model) -> Model: +def connect_tokenizer(model: Model, tokenizer: Model) -> Model: assert len(model.inputs) == len(tokenizer.outputs) # need to check if the inputs are aligned: @@ -36,4 +36,4 @@ def apply_tokenizer(model: Model, tokenizer: Model) -> Model: for target in model_input.get_target_inputs(): target.replace_source_output(tokenizer_output) - return Model(model.outputs, tokenizer.inputs, name=model.name) + return Model(model.outputs, tokenizer.inputs) From 0966b8ac253ae2303952cfaa80ea42e196c82766 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Wed, 31 May 2023 04:30:47 +0400 Subject: [PATCH 021/116] CombineSegments is implemented, used in HF converter. 
Stitching of tokenizer and main model is fixed partially (still produces topologically incorrect model) --- .../user_ie_extensions/ov_extension.cpp | 1 + .../sentence_piece/convert_tokenizer.py | 6 +- .../sentence_piece/sentence_piece.cpp | 126 ++++++++++++++++++ .../sentence_piece/sentence_piece.hpp | 28 ++++ .../sentence_piece/tokenizer_pipeline.py | 26 +++- 5 files changed, 180 insertions(+), 7 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 3cdd2406d..cddca5f0b 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -69,6 +69,7 @@ std::make_shared>(), \ std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \ + std::make_shared>(), \ std::make_shared>(), \ std::make_shared("Reshape", translate_reshape), \ std::make_shared("Const", translate_const), \ diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index c2cee34ef..dc55d72c3 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -16,7 +16,7 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token from hf_parser import TransformersTokenizerPipelineParser if isinstance(tokenizer_object, PreTrainedTokenizerBase): - return TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs) + return TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs).get_ov_subgraph() raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") @@ -36,4 +36,6 @@ def connect_tokenizer(model: Model, tokenizer: Model) -> Model: for target in model_input.get_target_inputs(): target.replace_source_output(tokenizer_output) - return Model(model.outputs, tokenizer.inputs) + connected_model = Model(model.outputs, tokenizer.get_parameters()) + connected_model.validate_nodes_and_infer_types() + return connected_model diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 6e517273d..1df5e4708 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -1406,6 +1406,132 @@ bool RaggedToDense::evaluate(ov::TensorVector& outputs, const ov::TensorVector& return true; } +void CombineSegments::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() > 0); + OPENVINO_ASSERT((get_input_size() - 1)%3 == 0); + + // First come several ragged tensors each represented as 3 regular tesors + size_t num_inputs = (get_input_size() - 1)/3; + PartialShape ps = PartialShape::dynamic(); + element::Type et = element::dynamic; + for (size_t i = 0; i < num_inputs; ++i) { + check_ragged_input(this, 3*i); + // Check limited broadcast + // Limited means that we support only two shapes on inputs: scalar and not scalars, + // and all not-scalars should have the same shape + auto rank = get_input_partial_shape(3*i).rank(); + if(rank.is_static() && rank.get_length()) { + 
OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); + } + OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i))); + OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i + 1))); + } + + std::cerr << ps << '\n'; + + set_ragged_output(this, 0, ps, et); + // TODO: Avoid emitting ragged indices for the second ragged tensor, they should be identical to the first output ragged tensor + set_ragged_output(this, 3, ps, get_input_element_type(get_input_size() - 1)); +} + + +bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + // FIXME: Works for POD types only (not for strings!) + size_t num_of_ragged = (inputs.size() - 1)/3; + OPENVINO_ASSERT(num_of_ragged == inputs.back().get_size()); + std::vector begins; + std::vector ends; + std::vector nelems; + std::vector elems; + auto element_type = inputs[2].get_element_type(); + auto elem_size = element_type.size(); + size_t max_nelems = 0; + size_t flat_out_size = 0; + Shape ps; + + for(size_t i = 0; i < num_of_ragged; ++i) { + OPENVINO_ASSERT(inputs[3*i + 2].get_element_type() == element_type); + begins.push_back(inputs[3*i + 0].data()); + ends.push_back(inputs[3*i + 1].data()); + nelems.push_back(inputs[3*i + 0].get_size()); + std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; + elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); + if(inputs[3*i + 0].get_shape().size() > 0) { + ps = inputs[3*i + 0].get_shape(); + std::cerr << "updated\n"; + } + std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; + //OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); + } + + for(size_t i = 0; i < num_of_ragged; ++i) { + max_nelems = std::max(max_nelems, nelems[i]); + std::cerr << "max_nelems = " << max_nelems << "\n"; + if(nelems[i] == 1) { + flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast + } else { + flat_out_size += inputs[3*i + 2].get_size(); // FIXME: doesn't work for overlapped ragged regions + } + } + + auto ids = reinterpret_cast(inputs.back().data()); + size_t id_type_size = inputs.back().get_element_type().size(); + + outputs[3*0 + 0].set_shape(ps); + outputs[3*0 + 1].set_shape(ps); + OPENVINO_ASSERT(max_nelems == outputs[3*0 + 0].get_size()); + OPENVINO_ASSERT(max_nelems == outputs[3*0 + 1].get_size()); + outputs[3*0 + 2].set_shape({flat_out_size}); + + outputs[3*1 + 0].set_shape(ps); + outputs[3*1 + 1].set_shape(ps); + OPENVINO_ASSERT(max_nelems == outputs[3*1 + 0].get_size()); + OPENVINO_ASSERT(max_nelems == outputs[3*1 + 1].get_size()); + outputs[3*1 + 2].set_shape({flat_out_size}); + + auto out_elem_begins = outputs[3*0 + 0].data(); + auto out_elem_ends = outputs[3*0 + 1].data(); + auto out_elems = reinterpret_cast(outputs[3*0 + 2].data()); + auto out_id_begins = outputs[3*1 + 0].data(); + auto out_id_ends = outputs[3*1 + 1].data(); + auto out_ids = reinterpret_cast(outputs[3*1 + 2].data()); + + auto out_elems_orig = out_elems; + auto out_ids_orig = out_ids; + size_t out_offset = 0; + + for(size_t i = 0; i < max_nelems; ++i) { + out_elem_begins[i] = out_offset; + out_id_begins[i] = out_offset; + + for(size_t j = 0; j < num_of_ragged; ++j) { + const char* begin; + size_t len; + if(nelems[j] == 1) { + begin = elems[j] + elem_size*begins[j][0]; + len = ends[j][0] - begins[j][0]; + } else { + begin = elems[j] + elem_size*begins[j][i]; + len = ends[j][i] - begins[j][i]; + } + auto end = begin + elem_size*len; + out_elems = 
std::copy(begin, end, out_elems); + for(size_t k = 0; k < len; ++k) { + out_ids = std::copy(ids + id_type_size*j, ids + id_type_size*(j + 1), out_ids); + } + out_offset += len; + } + + out_elem_ends[i] = out_offset; + out_id_ends[i] = out_offset; + } + + OPENVINO_ASSERT(out_elems == out_elems_orig + outputs[3*0 + 2].get_byte_size()); + OPENVINO_ASSERT(out_ids == out_ids_orig + outputs[3*1 + 2].get_byte_size()); + OPENVINO_ASSERT(out_offset == flat_out_size); + return true; +} + ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { // This is a copied-and-pasted and adopted fragment of TF reshape translator from OV. diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 3b307485e..f990d53f2 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -336,6 +336,34 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op { ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); +class OPENVINO_API CombineSegments : public ov::op::Op { +public: + OPENVINO_OP("CombineSegments"); + + CombineSegments () = default; + + CombineSegments(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; + // Takes a ragged tensor with one ragged right-most dimension and produces a normal tensor class OPENVINO_API RaggedToDense : public ov::op::Op { public: diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index fdb7a2c09..fdc4323bb 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -425,16 +425,29 @@ def get_ov_subgraph(self, input_nodes): number_of_sequence_inputs = sum( 1 for input_ in self.inputs if isinstance(input_, Sequence) ) + print('number_of_sequence_inputs:', number_of_sequence_inputs) if number_of_sequence_inputs != len(input_nodes)/3: raise UserInputError( f"Number of input nodes: {len(input_nodes)}, must be equal to {number_of_sequence_inputs}" ) - input_nodes_iter = iter(input_nodes) - op_inputs = [ - next(input_nodes_iter) if isinstance(node, Sequence) else as_node(node.token_type_id) - for node in self.inputs - ] + op_inputs = [] + i = 0 + + for node in self.inputs: + if isinstance(node, Sequence): + op_inputs += input_nodes[3*i:3*(i+1)] + i += 1 + else: + # Put a scalar as a ragged tensor with scalar shape and a single element + op_inputs += make_constant_node(0, Type.i32).outputs() + op_inputs += make_constant_node(1, Type.i32).outputs() + print('token', node._token_id) + op_inputs.append(make_constant_node(np.array([node._token_id]), Type.i32).output(0)) + + 
op_inputs.append(make_constant_node(self.segment_ids, Type.i32).output(0)) + + print('op_inputs:', op_inputs) # FIXME: Disabled for now, no implementation # operation = string_ops.CombineSegments( @@ -446,8 +459,11 @@ def get_ov_subgraph(self, input_nodes): # return operation # Decomposed implementation + + return core.make_node('CombineSegments', op_inputs).outputs() print(input_nodes) assert len(input_nodes) == 3, '[ TOKENIZER PIPELINE CONVERSION ] CombineSegments can be converted for a single ragged input tensor only, this is temporary limitation' + print('self.segment_ids:', self.segment_ids) # Make another ragged tensor with identical structure but with all values filled with self.segment_ids[0] segment_ids_output = [input_nodes[0], input_nodes[1], opset10.broadcast(make_constant_node(self.segment_ids[0], Type.i32), opset10.shape_of(input_nodes[2])).output(0)] print('[ TOKENIZER PIPELINE CONVERSION ] [ DEBUG ] CombineSegments outputs:', input_nodes + segment_ids_output) From 61d798354946e9f0135c5f4fbe6c88f637183457 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 1 Jun 2023 00:59:31 +0400 Subject: [PATCH 022/116] Fixed stitching of two models by connecting with names of inputs/outputs, now Bert and its tokenizer are connected together correctly --- .../sentence_piece/convert_tokenizer.py | 50 +++++++++++++------ .../sentence_piece/tokenizer_pipeline.py | 1 + 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index dc55d72c3..ce5d0afc5 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -6,7 +6,7 @@ from typing import Any, List from openvino.runtime.exceptions import OVTypeError -from openvino.runtime import Model, Output +from openvino.runtime import Model from tokenizer_pipeline import TokenizerPipeline @@ -15,27 +15,47 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token from transformers import PreTrainedTokenizerBase from hf_parser import TransformersTokenizerPipelineParser + # TODO: Remove this check if isinstance(tokenizer_object, PreTrainedTokenizerBase): - return TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs).get_ov_subgraph() - + ov_tokenizer = TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs).get_ov_subgraph() + output_names = tokenizer_object.model_input_names + for i, output_name in enumerate(output_names): + ov_tokenizer.output(i).tensor.add_names({output_name}) + return ov_tokenizer raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") -def connect_tokenizer(model: Model, tokenizer: Model) -> Model: - assert len(model.inputs) == len(tokenizer.outputs) +def connect_models(model1: Model, model2: Model, name_map=None, *, by_indices=None, by_names=None) -> Model: + # TODO: Relax this limitation by not connecting some inputs/outputs together + assert len(model2.inputs) == len(model1.outputs) + + if by_indices is None and by_names is None: + by_names = True + + if name_map is not None: + by_names = True + + # TODO: Check only one of by_indices and by_names is set - # need to check if the inputs are aligned: - # - inputs_ids -> inputs_ids - # - attention_mask -> attention_mask - # - token_type_ids -> token_type_ids - 
aligned_model_inputs = model.inputs - aligned_tokenizer_outputs: List[Output] = tokenizer.outputs + if by_indices: + aligned_model1_outputs = model1.outputs + aligned_model2_inputs = model2.inputs + elif by_names: + if name_map is None: + aligned_model1_outputs = model1.outputs + aligned_model2_inputs = [model2.input(model1_output.get_any_name()) for model1_output in aligned_model1_outputs] + else: + aligned_model1_outputs = [model1.output(name1) for name1, _ in name_map] + aligned_model2_inputs = [model2.input(name2) for _, name2 in name_map] - for model_input, tokenizer_output in zip(aligned_model_inputs, aligned_tokenizer_outputs): - for target in model_input.get_target_inputs(): - target.replace_source_output(tokenizer_output) + for model2_input, model1_output in zip(aligned_model2_inputs, aligned_model1_outputs): + print(f'Connecting: {model1_output.get_any_name()} -> {model2_input.get_any_name()}') + for target in model2_input.get_target_inputs(): + target.replace_source_output(model1_output.get_node().input_value(0)) + #target.replace_source_output(model1_output) # TODO: Produces incorrect topology - connected_model = Model(model.outputs, tokenizer.get_parameters()) + connected_model = Model(model2.outputs, model1.get_parameters()) + # TODO: Cleanup model1 and mode2 to avoid using them, they are ill-formed after the reconnection connected_model.validate_nodes_and_infer_types() return connected_model diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index fdc4323bb..3f2cfc37b 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -39,6 +39,7 @@ def pack_string(s): core = Core() +# TODO: Use relative path core.add_extension("/home/slyalin/openvino-contrib/muse/modules/custom_operations/build/user_ie_extensions/libuser_ov_extensions.so") From 5609ee6120cf80d4cd9ef19525cdd4557e589274 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 1 Jun 2023 04:43:08 +0400 Subject: [PATCH 023/116] WA for CPU bug with scalar inputs, correct truncation and dynamic padding, fix bugs for batches processing --- .../sentence_piece/sentence_piece.cpp | 14 ++++++++--- .../sentence_piece/tokenizer_pipeline.py | 24 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 1df5e4708..edee163df 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -1456,16 +1456,20 @@ bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector nelems.push_back(inputs[3*i + 0].get_size()); std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); - if(inputs[3*i + 0].get_shape().size() > 0) { + // TODO: Get rank from a tensor instead of partial_shape. This is a WA for CPU bug that gives 1D tensors instead of 0D tensors. 
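+        // In this pipeline the 0D inputs come from the scalar AddToken constants;
+        // only genuine batch inputs (rank > 0) may define the output shape.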
+ if(get_input_partial_shape(3*i + 0).rank().get_length() > 0) { ps = inputs[3*i + 0].get_shape(); std::cerr << "updated\n"; } std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; //OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); + max_nelems = std::max(max_nelems, nelems.back()); } + // flat_out_size is going to be an estimation of the final size + // This is only an estimation, not the exact output size, because ragged tensor may have gaps in the representation + for(size_t i = 0; i < num_of_ragged; ++i) { - max_nelems = std::max(max_nelems, nelems[i]); std::cerr << "max_nelems = " << max_nelems << "\n"; if(nelems[i] == 1) { flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast @@ -1526,9 +1530,13 @@ bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector out_id_ends[i] = out_offset; } + OPENVINO_ASSERT(out_offset <= flat_out_size); + + outputs[3*0 + 2].set_shape({out_offset}); + outputs[3*1 + 2].set_shape({out_offset}); + OPENVINO_ASSERT(out_elems == out_elems_orig + outputs[3*0 + 2].get_byte_size()); OPENVINO_ASSERT(out_ids == out_ids_orig + outputs[3*1 + 2].get_byte_size()); - OPENVINO_ASSERT(out_offset == flat_out_size); return true; } diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index 3f2cfc37b..15102e9bd 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -315,8 +315,14 @@ def get_ov_subgraph(self, input_nodes: List[Output]): # ) # operation.configure_mock(**{"outputs.return_value": [MagicMock() for _ in range(len(input_nodes))]}) # return operation - print('[ TOKENIZER PIPELINE CONVERSION ] WARNING: Truncation is not applied because it is not implemented') - return input_nodes + #print('[ TOKENIZER PIPELINE CONVERSION ] WARNING: Truncation is not applied because it is not implemented') + print('Trunc max_length:', self.max_length) + # FIXME: Truncation side (truncate_right) is ignored + # TODO: Check if axis is the right-most dimension + assert len(input_nodes) == 3, 'Only one input ragged tensor is supported as an input for TruncationStep' + + max_length = opset10.minimum(opset10.subtract(input_nodes[1], input_nodes[0]), make_constant_node(self.max_length, Type.i32)) + return [input_nodes[0], opset10.add(input_nodes[0], max_length).output(0), input_nodes[2]] @dataclass @@ -443,6 +449,7 @@ def get_ov_subgraph(self, input_nodes): # Put a scalar as a ragged tensor with scalar shape and a single element op_inputs += make_constant_node(0, Type.i32).outputs() op_inputs += make_constant_node(1, Type.i32).outputs() + print('Should be scalar:', op_inputs[-1]) print('token', node._token_id) op_inputs.append(make_constant_node(np.array([node._token_id]), Type.i32).output(0)) @@ -486,6 +493,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "PaddingStep": token=padding_dict["pad_token"], pad_right=padding_dict["direction"] == "Right", token_type_id=padding_dict["pad_type_id"], + # TODO: Initialize max_length ) def get_ov_subgraph(self, input_nodes): @@ -499,17 +507,23 @@ def get_ov_subgraph(self, input_nodes): #padded_len = outputs = [] print(self.token) - print(self.max_length) + print('max_length =', self.max_length) print('ERRROR: SETTING MAX_LENGTH = 100') print('ERROR: Ignoring pad token and set it to 
id = 0') - self.max_length = 100 + + if self.max_length == -1: + # Calculate max_length as the maximum ragged length + max_length = opset10.reduce_max(opset10.subtract(input_nodes[1], input_nodes[0]), make_constant_node(0, Type.i32)) + else: + max_length = make_constant_node(self.max_length, Type.i32) + #if self.token_type_id == -1: # self.token_type_id = 0 for i in range(len(input_nodes)//3): print(input_nodes[3*i:3*(i+1)]) print(as_node(self.max_length).outputs()) print(as_node(np.array(0, dtype=int)).outputs()) - cur_outputs = core.make_node('RaggedToDense', input_nodes[3*i:3*(i+1)] + make_constant_node(self.max_length, Type.i32).outputs() + make_constant_node(0, Type.i32).outputs()).outputs() + cur_outputs = core.make_node('RaggedToDense', input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs()).outputs() outputs.append(cur_outputs[0]) if i == 0: mask = opset10.convert(cur_outputs[1], 'i32').output(0) # TODO: Change RaggedToDense to generate mask of any type From 062acf36147bb87b0d6547397b860edaa0f3d630 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 1 Jun 2023 08:59:07 +0400 Subject: [PATCH 024/116] Fixed conversion of HF tokenizer if part of outputs are omitted. Disabled debug output --- .../sentence_piece/convert_tokenizer.py | 24 ++++++++++++---- .../sentence_piece/sentence_piece.cpp | 28 +++++++++---------- .../sentence_piece/tokenizer_pipeline.py | 28 +++++++++---------- 3 files changed, 47 insertions(+), 33 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index ce5d0afc5..f8a091009 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -19,16 +19,22 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token if isinstance(tokenizer_object, PreTrainedTokenizerBase): ov_tokenizer = TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs).get_ov_subgraph() output_names = tokenizer_object.model_input_names - for i, output_name in enumerate(output_names): - ov_tokenizer.output(i).tensor.add_names({output_name}) - return ov_tokenizer + filtered_outputs = [] + for i, output_name in enumerate(['input_ids', 'token_type_ids', 'attention_mask']): + if output_name in output_names: + ov_tokenizer.output(i).tensor.add_names({output_name}) + filtered_outputs.append(ov_tokenizer.output(i)) + + return Model(filtered_outputs, ov_tokenizer.get_parameters()) raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") def connect_models(model1: Model, model2: Model, name_map=None, *, by_indices=None, by_names=None) -> Model: # TODO: Relax this limitation by not connecting some inputs/outputs together - assert len(model2.inputs) == len(model1.outputs) + #print(len(model2.inputs)) + #print(len(model1.outputs)) + #assert len(model2.inputs) == len(model1.outputs) if by_indices is None and by_names is None: by_names = True @@ -45,12 +51,20 @@ def connect_models(model1: Model, model2: Model, name_map=None, *, by_indices=No if name_map is None: aligned_model1_outputs = model1.outputs aligned_model2_inputs = [model2.input(model1_output.get_any_name()) for model1_output in aligned_model1_outputs] + + ''' + aligned_model1_outputs = [] + aligned_model2_inputs = [] + for model2_input in model2.inputs: + # Search for 
corresponding model1 output by all possible names + for model1_output in model2.outputs + ''' else: aligned_model1_outputs = [model1.output(name1) for name1, _ in name_map] aligned_model2_inputs = [model2.input(name2) for _, name2 in name_map] for model2_input, model1_output in zip(aligned_model2_inputs, aligned_model1_outputs): - print(f'Connecting: {model1_output.get_any_name()} -> {model2_input.get_any_name()}') + #print(f'Connecting: {model1_output.get_any_name()} -> {model2_input.get_any_name()}') for target in model2_input.get_target_inputs(): target.replace_source_output(model1_output.get_node().input_value(0)) #target.replace_source_output(model1_output) # TODO: Produces incorrect topology diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index edee163df..b8e866f10 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -392,14 +392,14 @@ bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVecto // Implementation for debuggin purposes: directly print ragged indices to std::cout and pass the base tensor with elements throug. auto input_shape = inputs[0].get_shape(); - std::cout << "[ DEBUG ] RaggedTensorPack: shape = " << input_shape << "\n"; + //std::cout << "[ DEBUG ] RaggedTensorPack: shape = " << input_shape << "\n"; auto begins = inputs[0].data(); auto ends = inputs[1].data(); auto num_elements = shape_size(input_shape); - for(size_t i = 0; i < num_elements; ++i) { - std::cout << "[ DEBUG ] [" << i << "] " << begins[i] << ":" << ends[i] << " with size = " << ends[i] - begins[i] << "\n"; - } + //for(size_t i = 0; i < num_elements; ++i) { + //std::cout << "[ DEBUG ] [" << i << "] " << begins[i] << ":" << ends[i] << " with size = " << ends[i] - begins[i] << "\n"; + //} inputs[2].copy_to(outputs[0]); @@ -1183,7 +1183,7 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec using namespace paddlenlp::fast_tokenizer; - std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; + //std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; core::Vocab vocab; std::string unk_token; if(unk_token_id < 0) @@ -1195,13 +1195,13 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec unk_token = token; } - std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; - std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; - std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; + //std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; + //std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; + //std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; auto tokenizer = models::FastWordPiece(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); // FIXME: why true? 
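    // Editorial note (not part of the original patch): a hedged sketch of how the
    // FastWordPiece model constructed above is used. It assumes FastWordPiece exposes the
    // same Tokenize(const std::string&) -> std::vector<core::Token> interface that
    // models::BPE uses later in this file; the toy vocab below is hypothetical.
    //
    //   core::Vocab toy_vocab{{"[UNK]", 0}, {"hel", 1}, {"##lo", 2}};
    //   auto wp = models::FastWordPiece(toy_vocab, "[UNK]", 100, "##", true);
    //   std::vector<core::Token> tokens = wp.Tokenize("hello");
    //   // each core::Token carries id_, value_ and offset_ into the source word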
- std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; + //std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; for(size_t j = 0; j < num_elems; ++j) { @@ -1427,7 +1427,7 @@ void CombineSegments::validate_and_infer_types() { OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i + 1))); } - std::cerr << ps << '\n'; + //std::cerr << ps << '\n'; set_ragged_output(this, 0, ps, et); // TODO: Avoid emitting ragged indices for the second ragged tensor, they should be identical to the first output ragged tensor @@ -1454,14 +1454,14 @@ bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector begins.push_back(inputs[3*i + 0].data()); ends.push_back(inputs[3*i + 1].data()); nelems.push_back(inputs[3*i + 0].get_size()); - std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; + //std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); // TODO: Get rank from a tensor instead of partial_shape. This is a WA for CPU bug that gives 1D tensors instead of 0D tensors. if(get_input_partial_shape(3*i + 0).rank().get_length() > 0) { ps = inputs[3*i + 0].get_shape(); - std::cerr << "updated\n"; + //std::cerr << "updated\n"; } - std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; + //std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; //OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); max_nelems = std::max(max_nelems, nelems.back()); } @@ -1470,7 +1470,7 @@ bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector // This is only an estimation, not the exact output size, because ragged tensor may have gaps in the representation for(size_t i = 0; i < num_of_ragged; ++i) { - std::cerr << "max_nelems = " << max_nelems << "\n"; + //std::cerr << "max_nelems = " << max_nelems << "\n"; if(nelems[i] == 1) { flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast } else { diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index 15102e9bd..17124b4c0 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -316,7 +316,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]): # operation.configure_mock(**{"outputs.return_value": [MagicMock() for _ in range(len(input_nodes))]}) # return operation #print('[ TOKENIZER PIPELINE CONVERSION ] WARNING: Truncation is not applied because it is not implemented') - print('Trunc max_length:', self.max_length) + #print('Trunc max_length:', self.max_length) # FIXME: Truncation side (truncate_right) is ignored # TODO: Check if axis is the right-most dimension assert len(input_nodes) == 3, 'Only one input ragged tensor is supported as an input for TruncationStep' @@ -432,7 +432,7 @@ def get_ov_subgraph(self, input_nodes): number_of_sequence_inputs = sum( 1 for input_ in self.inputs if isinstance(input_, Sequence) ) - print('number_of_sequence_inputs:', number_of_sequence_inputs) + #print('number_of_sequence_inputs:', number_of_sequence_inputs) if number_of_sequence_inputs != len(input_nodes)/3: raise UserInputError( f"Number of input nodes: {len(input_nodes)}, must be equal to 
{number_of_sequence_inputs}" @@ -449,13 +449,13 @@ def get_ov_subgraph(self, input_nodes): # Put a scalar as a ragged tensor with scalar shape and a single element op_inputs += make_constant_node(0, Type.i32).outputs() op_inputs += make_constant_node(1, Type.i32).outputs() - print('Should be scalar:', op_inputs[-1]) - print('token', node._token_id) + #print('Should be scalar:', op_inputs[-1]) + #print('token', node._token_id) op_inputs.append(make_constant_node(np.array([node._token_id]), Type.i32).output(0)) op_inputs.append(make_constant_node(self.segment_ids, Type.i32).output(0)) - print('op_inputs:', op_inputs) + #print('op_inputs:', op_inputs) # FIXME: Disabled for now, no implementation # operation = string_ops.CombineSegments( @@ -506,10 +506,10 @@ def get_ov_subgraph(self, input_nodes): #max_len = opset10.reduce_max(lens) #padded_len = outputs = [] - print(self.token) - print('max_length =', self.max_length) - print('ERRROR: SETTING MAX_LENGTH = 100') - print('ERROR: Ignoring pad token and set it to id = 0') + #print(self.token) + #print('max_length =', self.max_length) + #print('ERRROR: SETTING MAX_LENGTH = 100') + #print('ERROR: Ignoring pad token and set it to id = 0') if self.max_length == -1: # Calculate max_length as the maximum ragged length @@ -520,9 +520,9 @@ def get_ov_subgraph(self, input_nodes): #if self.token_type_id == -1: # self.token_type_id = 0 for i in range(len(input_nodes)//3): - print(input_nodes[3*i:3*(i+1)]) - print(as_node(self.max_length).outputs()) - print(as_node(np.array(0, dtype=int)).outputs()) + #print(input_nodes[3*i:3*(i+1)]) + #print(as_node(self.max_length).outputs()) + #print(as_node(np.array(0, dtype=int)).outputs()) cur_outputs = core.make_node('RaggedToDense', input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs()).outputs() outputs.append(cur_outputs[0]) if i == 0: @@ -576,10 +576,10 @@ def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[No for input_node in input_nodes: input_node = core.make_node("StringTensorUnpack", input_node.outputs()).outputs() - print(input_node) + #print(input_node) for step in self.processing_steps: input_node = step.get_ov_subgraph(input_node) - print('input_node:', input_node) + #print('input_node:', input_node) #ragged_tensor_pack = core.make_node("RaggedTensorPack", input_node) processing_pipelines_outputs += input_node From 0f772dc8177dd9cc9becae5c047149d7b918c819 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 19 Jun 2023 13:48:56 +0100 Subject: [PATCH 025/116] Add BPE Tokenizer --- .../user_ie_extensions/ov_extension.cpp | 1 + .../sentence_piece/hf_parser.py | 15 +- .../sentence_piece/sentence_piece.cpp | 385 +++++++++++++++--- .../sentence_piece/sentence_piece.hpp | 53 +++ .../sentence_piece/tokenizer_pipeline.py | 85 +++- 5 files changed, 468 insertions(+), 71 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index cddca5f0b..863192334 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -68,6 +68,7 @@ std::make_shared("RegexSplitWithOffsets", translate_regex_split_with_offsets), \ std::make_shared>(), \ std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ + std::make_shared>(), \ std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \ std::make_shared>(), \ std::make_shared>(), \ diff 
--git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py index 17bf6aaad..c9cb900b9 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py @@ -21,6 +21,7 @@ RegexSplitStep, WhitespaceSplitStep, WordPieceTokenizationStep, + BPETokenizationStep, TruncationStep, PaddingStep, CombineSegmentsStep, @@ -127,6 +128,7 @@ def normalization(self) -> None: "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(), "Split": parse_split_step, "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]), + "ByteLevel": lambda step_dict: WhitespaceSplitStep(), ## !!!FIX ME!!! } def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None: @@ -149,6 +151,9 @@ def tokenization_model(self) -> None: if self.tokenizer_json["model"]["type"] == "WordPiece": self.pipeline.add_steps(WordPieceTokenizationStep.from_hf_json(self.tokenizer_json)) self.pipeline.vocab = self.pipeline[-1].vocab + elif self.tokenizer_json["model"]["type"] == "BPE": + self.pipeline.add_steps(BPETokenizationStep.from_hf_json(self.tokenizer_json)) + self.pipeline.vocab = self.pipeline[-1].vocab else: raise OVTypeError(f"Tokenizer type '{self.tokenizer_json['model']['type']}' is not supported") @@ -164,6 +169,11 @@ def post_tokenization(self) -> None: combine_segments_step = CombineSegmentsStep.from_hf_json_bert_postprocessor( self.tokenizer_json, self.number_of_inputs ) + elif self.tokenizer_json["post_processor"]["type"] == "ByteLevel": # !!!FIX ME!!! + pass # test BPETokenizer + self.add_truncation() + self.add_padding() + return else: raise OVTypeError(f"Post-processor type '{self.tokenizer_json['post_processor']['type']}' is not supported") @@ -184,6 +194,7 @@ def add_truncation(self) -> None: def add_padding(self) -> None: if self.tokenizer_json["padding"] is not None: self.pipeline.add_steps(PaddingStep.from_hf_json(self.tokenizer_json)) - else: + self.pipeline[-1].set_token_id(self.pipeline.vocab) + elif self.original_tokenizer.pad_token is not None: self.pipeline.add_steps(PaddingStep(token=self.original_tokenizer.pad_token)) - self.pipeline[-1].set_token_id(self.pipeline.vocab) + self.pipeline[-1].set_token_id(self.pipeline.vocab) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index b8e866f10..a1fbc54bf 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -327,7 +327,7 @@ void set_ragged_output(Node* node, size_t output_index, const PartialShape& shap void StringTensorPack::validate_and_infer_types() { - OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supporst only 'begins_ends' mode, but get " + m_mode); + OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but get " + m_mode); check_string_input(this, 0); #if USE_STRING_TENSORS set_output_type(0, element::string, get_input_partial_shape(0)); @@ -959,85 +959,177 @@ const std::map split_modes = { void RegexSplit::validate_and_infer_types() { - check_string_input(this, 0); - check_string_scalar_input(this, 3); +// check_string_input(this, 0); +// check_string_scalar_input(this, 3); +// check_ragged_string_input(this, 0); +// check_string_input(this, 
5); OPENVINO_ASSERT(split_modes.find(m_behaviour) != split_modes.end(), "RegexSplit doesn't support unknown split mode: " + m_behaviour); set_ragged_string_output(this, 0, get_input_partial_shape(0)); } bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - auto split_pattern_buf = inputs[3].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + if (inputs.size() < 5) { + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto chars = inputs[2].data(); -#if 1 + ov::Tensor ragged_begins_tensor(ov::element::i32, inputs[0].get_shape()); + ov::Tensor ragged_ends_tensor(ov::element::i32, inputs[0].get_shape()); + auto ragged_begins = ragged_begins_tensor.data(); + auto ragged_ends = ragged_ends_tensor.data(); + for (int i=0; i < inputs[0].get_size(); ++i) { + ragged_begins[i] = i; + ragged_ends[i] = i + 1; + }; - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); + auto split_pattern_buf = inputs[3].data(); + auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - const size_t num_elements = inputs[0].get_size(); - const size_t num_chars = inputs[2].get_size(); + const size_t num_elements = inputs[0].get_size(); + const size_t num_chars = inputs[2].get_size(); - // TODO: Better estimations for max size? - // Assume we cannot have empty parts, so the number of parts cannot be bigger than the number of symbols - outputs[2].set_shape(Shape{num_chars}); - outputs[3].set_shape(Shape{num_chars}); + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); - // Assume we cannot introduce new symbols to output, only existing can be distributed (with gaps) + outputs[2].set_shape(Shape{num_chars}); + outputs[3].set_shape(Shape{num_chars}); - // TODO: Can we just route input tensor directly to the output outside evaluate when graph is being constructed? - outputs[4] = inputs[2]; // TODO: Does it really work? + outputs[4] = inputs[2]; // TODO: Does it really work? 
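        // Editorial note (not part of the original patch): in this branch the input is a
        // plain string tensor rather than a ragged one, so trivial per-row ragged indices
        // are synthesized above before splitting. Condensed, the convention is simply:
        //
        //   for (int i = 0; i < batch_size; ++i) {
        //       ragged_begins[i] = i;      // row i starts at element i...
        //       ragged_ends[i]   = i + 1;  // ...and contains exactly one string
        //   }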
-    // If line above doesn't work, do this instead:
-    //outputs[4].set_shape(Shape{num_chars});
-    //inputs[2].copy_to(outputs[4]);
+        // For the whole implementation below the input shapes can be ignored, we are working with the flatten representations
+        // and only the number of elements in the original tensors matters
-    // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions
-    // and only number of elements in the original tensors matter
+        // Get pointers in the output tensors
+        auto new_ragged_begins = outputs[0].data();
+        auto new_ragged_ends = outputs[1].data();
+        auto new_begins = outputs[2].data();
+        auto new_ends = outputs[3].data();
+        int32_t ragged_offset = 0;
-    // Get pointers in the output tensors
-    auto new_ragged_begins = outputs[0].data();
-    auto new_ragged_ends = outputs[1].data();
-    auto new_begins = outputs[2].data();
-    auto new_ends = outputs[3].data();
-    int32_t ragged_offset = 0;
+        using namespace paddlenlp::fast_tokenizer;
+        auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert);
-    using namespace paddlenlp::fast_tokenizer;
+        std::cerr << "[ RegexSplit ] regex: " << std::string(split_pattern) << "\n";
-    auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert);
+        for(size_t seq = 0; seq < num_elements; ++seq) {
+            for(size_t word = ragged_begins[seq]; word < ragged_ends[seq]; ++word) {
-    for(size_t i = 0; i < num_elements; ++i) {
-        auto old_str = std::string(chars + begins[i], chars + ends[i]);
-        //std::cerr << "[ RegexSplit ] old_str: " << old_str << "\n";
-        paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(old_str);
-        pretokenizer(&pretokenized);
-        size_t num_splits = pretokenized.GetSplitsSize();
-
-        new_ragged_begins[i] = ragged_offset;
-
-        for (size_t j = 0; j < num_splits; ++j) {
-            auto split = pretokenized.GetSplit(j);
-            //const auto& value = split.normalized_.GetStr();
-            auto offset = split.normalized_.GetOrginalOffset();
-            //std::cerr << "[ RegexSplit ] split part: " << value << "\n";
-            //std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n";
-            new_begins[ragged_offset] = begins[i] + offset.first;
-            new_ends[ragged_offset] = begins[i] + offset.second;
-
-            ++ragged_offset;
-        };
+                auto str = std::string(chars + begins[word], chars + ends[word]);
+                std::cerr << "[ RegexSplit ] old_str: " << str << "\n";
+                paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str);
+                pretokenizer(&pretokenized);
+                size_t num_splits = pretokenized.GetSplitsSize();
+
+                new_ragged_begins[seq] = ragged_offset;
+
+                for (size_t j = 0; j < num_splits; ++j) {
+                    auto split = pretokenized.GetSplit(j);
+                    const auto& value = split.normalized_.GetStr();
+                    auto offset = split.normalized_.GetOrginalOffset();
+                    std::cerr << "[ RegexSplit ] split part: '" << value << "'\n";
+                    std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n";
+                    new_begins[ragged_offset] = begins[word] + offset.first;
+                    new_ends[ragged_offset] = begins[word] + offset.second;
+
+                    ++ragged_offset;
+                };
+            }
-        new_ragged_ends[i] = ragged_offset;
+            new_ragged_ends[seq] = ragged_offset;
+        }
+
+        // Fix real shape based on collected results
+        outputs[2].set_shape({ragged_offset});
+        outputs[3].set_shape({ragged_offset});
+    } else {
+        auto ragged_begins = inputs[0].data();
+        auto ragged_ends = inputs[1].data();
+        auto begins = inputs[2].data();
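        // Editorial note (not part of the original patch): this else-branch consumes an
        // already-ragged string tensor. Judging from the indexing used here, the expected
        // input layout is: inputs[0]/inputs[1] -- ragged row begins/ends (i32);
        // inputs[2]/inputs[3] -- per-word byte begins/ends (i32); inputs[4] -- flat u8
        // character buffer; inputs[5] -- packed split pattern. A word w of row r is then
        // materialized as:
        //
        //   std::string word(chars + begins[w], chars + ends[w]);  // ragged_begins[r] <= w < ragged_ends[r]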
+ auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + auto split_pattern_buf = inputs[5].data(); + auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + + outputs[4] = inputs[4]; + const size_t num_elements = inputs[2].get_size(); + const size_t num_chars = inputs[4].get_size(); + + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + + outputs[2].set_shape(Shape{num_chars}); + outputs[3].set_shape(Shape{num_chars}); + + outputs[4] = inputs[4]; // TODO: Does it really work? + + // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions + // and only number of elements in the original tensors matter + + // Get pointers in the output tensors + auto new_ragged_begins = outputs[0].data(); + auto new_ragged_ends = outputs[1].data(); + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + int32_t ragged_offset = 0; + + using namespace paddlenlp::fast_tokenizer; + auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); + + for(size_t seq = 0; seq < num_elements; ++seq) { + for(size_t word = ragged_begins[seq]; word < ragged_ends[seq]; ++word) { + + auto str = std::string(chars + begins[word], chars + ends[word]); + std::cerr << "[ RegexSplit ] old_str: " << str << "\n"; + paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); + pretokenizer(&pretokenized); + size_t num_splits = pretokenized.GetSplitsSize(); + + new_ragged_begins[seq] = ragged_offset; + + for (size_t j = 0; j < num_splits; ++j) { + auto split = pretokenized.GetSplit(j); + const auto& value = split.normalized_.GetStr(); + auto offset = split.normalized_.GetOrginalOffset(); + std::cerr << "[ RegexSplit ] split part: " << value << "\n"; + std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; + new_begins[ragged_offset] = begins[word] + offset.first; + new_ends[ragged_offset] = begins[word] + offset.second; + + ++ragged_offset; + }; + } + + new_ragged_ends[seq] = ragged_offset; + } + + // Fix real shape based on collected results + outputs[2].set_shape({ragged_offset}); + outputs[3].set_shape({ragged_offset}); } +#if 1 + + // Set output shapes +// outputs[0].set_shape(inputs[0].get_shape()); +// outputs[1].set_shape(inputs[1].get_shape()); +// +// const size_t num_elements = inputs[0].get_size(); +// const size_t num_chars = inputs[2].get_size(); + + // TODO: Better estimations for max size? + // Assume we cannot have empty parts, so the number of parts cannot be bigger than the number of symbols +// outputs[2].set_shape(Shape{num_chars}); +// outputs[3].set_shape(Shape{num_chars}); + + // Assume we cannot introduce new symbols to output, only existing can be distributed (with gaps) + + // TODO: Can we just route input tensor directly to the output outside evaluate when graph is being constructed? +// outputs[4] = inputs[2]; // TODO: Does it really work? 
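        // Editorial note (not part of the original patch): both branches size
        // outputs[2]/outputs[3] to the total character count first -- a safe upper bound,
        // since a split can never produce more parts than there are symbols -- and then
        // shrink them to the real part count once it is known, via:
        //
        //   outputs[2].set_shape({ragged_offset});
        //   outputs[3].set_shape({ragged_offset});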
- // Fix real shape based on collected results - outputs[2].set_shape({ragged_offset}); - outputs[3].set_shape({ragged_offset}); - //outputs[4].set_shape({char_offset}); + // If line above doesn't work, do this instead: + //outputs[4].set_shape(Shape{num_chars}); + //inputs[2].copy_to(outputs[4]); return true; @@ -1307,6 +1399,181 @@ ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::N } +void BPETokenizer::validate_and_infer_types() { + check_ragged_string_input(this, 0); + check_string_input(this, 5); + check_string_input(this, 8); + set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); +} + +bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + auto vocab_begins = inputs[5].data(); + auto vocab_ends = inputs[6].data(); + auto vocab_chars = inputs[7].data(); + + auto merges_begins = inputs[8].data(); + auto merges_ends = inputs[9].data(); + auto merges_chars = inputs[10].data(); + + auto vocab_size = inputs[5].get_size(); + auto merges_size = inputs[8].get_size(); + + OPENVINO_ASSERT(inputs.size() == 11, "Too few inputs passed to BPETokenizer, it means it is not converted properly or it is not used in the supported pattern"); + +#if 1 + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const size_t num_elems = inputs[0].get_size(); + + // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area + // to represent different elements in ragged tensor + outputs[2].set_shape({inputs[4].get_size()}); + + using namespace paddlenlp::fast_tokenizer; + + std::cerr << "[ BPETokenizer ] Start vocab reading\n"; + core::Vocab vocab; + int32_t unk_token_id = -1; + + std::cerr << "[ BPETokenizer ] Vocab size is " << vocab_size << "\n"; + + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + } + + std::cerr << "[ BPETokenizer ] Finish vocab reading\n"; + + std::cerr << "[ BPETokenizer ] Start merges reading\n"; + std::cerr << "[ BPETokenizer ] Merges Size: " << merges_size << "\n"; + core::Merges merges; + std::string delim = " "; + + + for(size_t id = 0; id < merges_size; ++id) { + auto merge = std::string(merges_chars + merges_begins[id], merges_chars + merges_ends[id]); + const int delim_pos = merge.find(delim); + + std::pair merge_pair = { + merge.substr(0, delim_pos), merge.substr(delim_pos + 1) + }; + merges.emplace_back(merge_pair); + } + + std::cerr << "[ BPETokenizer ] Finish merges reading\n"; + + + std::cerr << "[ BPETokenizer ] Start tokenizer initialization\n"; + + std::vector unk_token = {}; + if (m_unk_token.size() > 0) { + unk_token.push_back(m_unk_token); + }; + std::vector suffix_indicator = {}; + if (m_suffix_indicator.size() > 0) { + suffix_indicator.push_back(m_suffix_indicator); + }; + std::vector end_suffix = {}; + if (m_end_suffix.size() > 0) { + end_suffix.push_back(m_end_suffix); + }; + + models::BPE tokenizer(vocab, merges, 10000 /* default cache size */, {} /* dropout - don't use dropout for inference */, + unk_token, suffix_indicator, end_suffix, m_fuse_unk); + + std::cerr << "[ BPETokenizer ] Finish tokenizer initialization\n"; + + // Get pointers in the output tensors + 
   auto new_begins = outputs[0].data();
+    auto new_ends = outputs[1].data();
+    auto new_elems = outputs[2].data();
+    int32_t offset = 0;
+
+
+    for(size_t j = 0; j < num_elems; ++j) {
+        new_begins[j] = offset;
+        for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) {
+            auto str = std::string(chars + begins[i], chars + ends[i]);
+
+            std::cerr << "Word: '" << str << "'\n";
+            std::vector<core::Token> results = tokenizer.Tokenize(str);
+
+            for (const core::Token& token : results) {
+                std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_
+                          << ", offset: (" << token.offset_.first << ", "
+                          << token.offset_.second << ")." << std::endl;
+                OPENVINO_ASSERT(offset < outputs[2].get_size());
+                new_elems[offset++] = token.id_;
+            };
+        }
+
+        new_ends[j] = offset;
+    }
+
+    outputs[2].set_shape({offset});
+
+    OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in BPETokenizer::evaluate: out of range for ragged parts");
+    return true;
+
+#else
+    // Stub implementation that transforms each input string to its length, duplicating the element if the length is odd
+    {
+        std::cout << "[ DEBUG ] BPETokenizer\n";
+        std::cout << "[ DEBUG ] vocab size: " << inputs[5].get_size() << "\n";
+        std::cout << "[ DEBUG ] unk_token_id: " << unk_token_id << "\n";
+
+        // Set output shapes
+        outputs[0].set_shape(inputs[0].get_shape());
+        outputs[1].set_shape(inputs[1].get_shape());
+        const size_t num_elems = inputs[0].get_size();
+
+        const size_t num_parts = inputs[2].get_size();
+        size_t new_num_parts = num_parts;
+        // Count number of output elements
+        for(size_t i = 0; i < num_parts; ++i) {
+            auto length = ends[i] - begins[i];
+            new_num_parts += length % 2;
+        }
+
+        outputs[2].set_shape({new_num_parts});
+
+        // Get pointers in the output tensors
+        auto new_begins = outputs[0].data();
+        auto new_ends = outputs[1].data();
+        auto new_elems = outputs[2].data();
+        int32_t offset = 0;
+
+        for(size_t j = 0; j < num_elems; ++j) {
+            new_begins[j] = offset;
+
+            for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) {
+
+                auto length = ends[i] - begins[i];
+                new_elems[offset++] = length;
+
+                if(length % 2) {
+                    new_elems[offset++] = length;
+                }
+            }
+
+            new_ends[j] = offset;
+        }
+
+        OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in BPETokenizer::evaluate: out of range for ragged parts");
+        return true;
+    }
+    // End of stub implementation
+#endif
+}
+
+
 ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node) {
     FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFindV2 expects 3 inputs");
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
index f990d53f2..97abb6379 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
@@ -336,6 +336,59 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op {
 ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node);
 ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node);
 
+class OPENVINO_API BPETokenizer : public ov::op::Op {
+public:
+    OPENVINO_OP("BPETokenizer");
+
+    BPETokenizer () = default;
+
+    BPETokenizer(
+        const ov::OutputVector& arguments,
+        const std::string& unk_token = "",
+        bool fuse_unk = false,
+        const std::string& suffix_indicator = "",
+        const std::string& end_suffix = "",
+        bool 
byte_fallback = false + ) : + ov::op::Op(arguments), + m_unk_token(unk_token), + m_fuse_unk(fuse_unk), + m_suffix_indicator(suffix_indicator), + m_end_suffix(end_suffix), + m_byte_fallback(byte_fallback) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_unk_token, m_fuse_unk, m_suffix_indicator, m_end_suffix, m_byte_fallback); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("unk_token", m_unk_token); + visitor.on_attribute("fuse_unk", m_fuse_unk); + visitor.on_attribute("suffix_indicator", m_suffix_indicator); + visitor.on_attribute("end_suffix", m_end_suffix); + visitor.on_attribute("byte_fallback", m_byte_fallback); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + std::string m_unk_token; + bool m_fuse_unk = false; + std::string m_suffix_indicator; + std::string m_end_suffix; + bool m_byte_fallback = false; +}; + + class OPENVINO_API CombineSegments : public ov::op::Op { public: OPENVINO_OP("CombineSegments"); diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index 17124b4c0..4fc8c901c 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -4,6 +4,7 @@ import os from dataclasses import dataclass, field from functools import singledispatchmethod +from itertools import chain from typing import List, Optional, Any, Dict from unittest.mock import MagicMock import weakref @@ -40,7 +41,7 @@ def pack_string(s): core = Core() # TODO: Use relative path -core.add_extension("/home/slyalin/openvino-contrib/muse/modules/custom_operations/build/user_ie_extensions/libuser_ov_extensions.so") +core.add_extension("/home/apaniuko/python/openvino/bin/intel64/Debug/libuser_ov_extensions.so") class BasePipelineStep: @@ -166,7 +167,11 @@ class RegexSplitStep(PreTokenizatinStep): behaviour: str = "Remove" @classmethod - def bert_splitter(cls) -> "RegexSplitStep": + def bert_whitespace_splitter(cls) -> "RegexSplitStep": + return cls(r"\s+") + + @classmethod + def bert_keep_delimeters_splitter(cls) -> "RegexSplitStep": """Generates a step with a standard BERT regex. 
The source: @@ -175,7 +180,6 @@ def bert_splitter(cls) -> "RegexSplitStep": return cls( "|".join( [ - r"\s+", r"|".join( [ r"[!-/]", @@ -199,11 +203,17 @@ def bert_splitter(cls) -> "RegexSplitStep": ), ], ), + invert=False, + behaviour="Isolate" ) + @classmethod + def bert_splitter(cls) -> List["RegexSplitStep"]: + return [cls.bert_whitespace_splitter(), cls.bert_keep_delimeters_splitter()] + @classmethod def whitespace_splitter(cls) -> "RegexSplitStep": - return cls(r"\w+|[^\w\s]+") + return cls(r"\w+|[^\w\s]+", invert=True) def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: input_nodes.extend( @@ -280,6 +290,47 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: ).outputs() +@dataclass +class BPETokenizationStep(TokenizationModelStep): + vocab: List[str] = field(repr=False) + merges: List[str] = field(repr=False) + unk_token: str = "" + fuse_unk: bool = False + suffix_indicator: str = "" + end_suffix: str = "" + byte_fallback: bool = False + + @classmethod + def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep": + return cls( + unk_token=tokenizer_json["model"]["unk_token"] or "", + fuse_unk=tokenizer_json["model"]["fuse_unk"] or False, + suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"] or "", + end_suffix=tokenizer_json["model"]["end_of_word_suffix"] or "", + vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])], + merges=tokenizer_json["model"]["merges"], + ) + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + input_nodes.extend( + ( + *self.create_string_constant_node(self.vocab).outputs(), + *self.create_string_constant_node(self.merges).outputs(), + ) + ) + return core.make_node( + "BPETokenizer", + input_nodes, + { + "unk_token": self.unk_token, + "fuse_unk": self.fuse_unk, + "suffix_indicator": self.suffix_indicator, + "end_suffix": self.end_suffix, + "byte_fallback": self.byte_fallback, + } + ).outputs() + + @dataclass class PostTokenizationStep(BasePipelineStep): pass @@ -561,8 +612,16 @@ def __getitem__(self, item: int) -> BasePipelineStep: return self.steps[item] @property - def processing_steps(self) -> List[BasePipelineStep]: - return [step for step in self.steps if not isinstance(step, PostTokenizationStep)] + def normalization_steps(self) -> List[NormalizationStep]: + return [step for step in self.steps if isinstance(step, NormalizationStep)] + + @property + def pretokenization_steps(self) -> List[PreTokenizatinStep]: + return [step for step in self.steps if isinstance(step, PreTokenizatinStep)] + + @property + def tokenization_steps(self) -> List[TokenizationModelStep]: + return [step for step in self.steps if isinstance(step, TokenizationModelStep)] @property def post_tokenization_steps(self) -> List[PostTokenizationStep]: @@ -576,11 +635,17 @@ def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[No for input_node in input_nodes: input_node = core.make_node("StringTensorUnpack", input_node.outputs()).outputs() - #print(input_node) - for step in self.processing_steps: + for step in self.normalization_steps: input_node = step.get_ov_subgraph(input_node) - #print('input_node:', input_node) - #ragged_tensor_pack = core.make_node("RaggedTensorPack", input_node) + + # batch_size = opset10.shape_of(input_node[0]) + # ragged_begins = opset10.range(as_node(0), batch_size, as_node(1)).outputs() + # ragged_ends = opset10.range(as_node(1), opset10.add(batch_size, as_node(1)).outputs(), as_node(1)) + # input_node = 
[ragged_begins, ragged_ends] + input_node + + for step in chain(self.pretokenization_steps, self.tokenization_steps): + input_node = step.get_ov_subgraph(input_node) + processing_pipelines_outputs += input_node return processing_pipelines_outputs From 10e3d1813fb56e898bf828501fb0420e86492f16 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 20 Jun 2023 16:38:57 +0100 Subject: [PATCH 026/116] Add BytesToChars Node for BBPE --- .../user_ie_extensions/ov_extension.cpp | 1 + .../sentence_piece/convert_tokenizer.py | 19 +- .../sentence_piece/hf_parser.py | 27 +- .../sentence_piece/sentence_piece.cpp | 335 ++++++++++++++++++ .../sentence_piece/sentence_piece.hpp | 38 ++ .../sentence_piece/tokenizer_pipeline.py | 62 +++- 6 files changed, 471 insertions(+), 11 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 863192334..4cc541595 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -69,6 +69,7 @@ std::make_shared>(), \ std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ std::make_shared>(), \ + std::make_shared>(), \ std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \ std::make_shared>(), \ std::make_shared>(), \ diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index f8a091009..e1ad50fb4 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -17,10 +17,24 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token # TODO: Remove this check if isinstance(tokenizer_object, PreTrainedTokenizerBase): - ov_tokenizer = TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs).get_ov_subgraph() + ov_tokenizer = TransformersTokenizerPipelineParser(tokenizer_object).parse( + number_of_inputs=number_of_inputs + ).get_ov_subgraph() output_names = tokenizer_object.model_input_names + + ov_tokenizer_output_names = ["input_ids", "attention_mask"] + if len(output_names) == 3: + ov_tokenizer_output_names.insert(1, "token_type_ids") + filtered_outputs = [] - for i, output_name in enumerate(['input_ids', 'token_type_ids', 'attention_mask']): + for i, output_name in enumerate(ov_tokenizer_output_names): + current_output = next( + (output for output in ov_tokenizer.outputs if output.any_name == output_name), False + ) + if current_output: + filtered_outputs.append(current_output) + continue + if output_name in output_names: ov_tokenizer.output(i).tensor.add_names({output_name}) filtered_outputs.append(ov_tokenizer.output(i)) @@ -59,6 +73,7 @@ def connect_models(model1: Model, model2: Model, name_map=None, *, by_indices=No # Search for corresponding model1 output by all possible names for model1_output in model2.outputs ''' + else: aligned_model1_outputs = [model1.output(name1) for name1, _ in name_map] aligned_model2_inputs = [model2.input(name2) for _, name2 in name_map] diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py index c9cb900b9..722b864d7 100644 --- 
a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py @@ -20,6 +20,7 @@ PunctuationSplitStep, RegexSplitStep, WhitespaceSplitStep, + BytesToCharsStep, WordPieceTokenizationStep, BPETokenizationStep, TruncationStep, @@ -67,6 +68,21 @@ def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep: ) +def parse_byte_level_pretokenization_step( + pretokenizer_dict: Dict[str, Any] +) -> List[Union[NormalizationStep, PreTokenizatinStep]]: + steps = [] + if pretokenizer_dict.get("add_prefix_space"): + steps.append(RegExpNormalizationStep(regex_search_pattern="^(\S)", replace_term=" $1")) + + # regex is used by default, but it does not appeared in config yet + if pretokenizer_dict.get("use_regex", True): + steps.append(RegexSplitStep.byte_level_splitter()) + + steps.append(BytesToCharsStep()) + return steps + + class TransformersTokenizerPipelineParser: def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None: assert tokenizer_object.is_fast @@ -128,7 +144,7 @@ def normalization(self) -> None: "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(), "Split": parse_split_step, "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]), - "ByteLevel": lambda step_dict: WhitespaceSplitStep(), ## !!!FIX ME!!! + "ByteLevel": parse_byte_level_pretokenization_step, } def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None: @@ -169,11 +185,14 @@ def post_tokenization(self) -> None: combine_segments_step = CombineSegmentsStep.from_hf_json_bert_postprocessor( self.tokenizer_json, self.number_of_inputs ) - elif self.tokenizer_json["post_processor"]["type"] == "ByteLevel": # !!!FIX ME!!! - pass # test BPETokenizer + elif self.tokenizer_json["post_processor"]["type"] == "ByteLevel": self.add_truncation() self.add_padding() return + elif self.tokenizer_json["post_processor"]["type"] == "RobertaProcessing": + combine_segments_step = CombineSegmentsStep.from_hf_json_roberta_processor( + self.tokenizer_json, self.number_of_inputs + ) else: raise OVTypeError(f"Post-processor type '{self.tokenizer_json['post_processor']['type']}' is not supported") @@ -198,3 +217,5 @@ def add_padding(self) -> None: elif self.original_tokenizer.pad_token is not None: self.pipeline.add_steps(PaddingStep(token=self.original_tokenizer.pad_token)) self.pipeline[-1].set_token_id(self.pipeline.vocab) + else: + self.pipeline.add_steps(PaddingStep()) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index a1fbc54bf..7519b7539 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -1229,6 +1229,341 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont } +const std::unordered_map> create_bytes_to_chars_map() { + return { + { 33 , { 33 }}, + { 34 , { 34 }}, + { 35 , { 35 }}, + { 36 , { 36 }}, + { 37 , { 37 }}, + { 38 , { 38 }}, + { 39 , { 39 }}, + { 40 , { 40 }}, + { 41 , { 41 }}, + { 42 , { 42 }}, + { 43 , { 43 }}, + { 44 , { 44 }}, + { 45 , { 45 }}, + { 46 , { 46 }}, + { 47 , { 47 }}, + { 48 , { 48 }}, + { 49 , { 49 }}, + { 50 , { 50 }}, + { 51 , { 51 }}, + { 52 , { 52 }}, + { 53 , { 53 }}, + { 54 , { 54 }}, + { 55 , { 55 }}, + { 56 , { 56 }}, + { 57 , { 57 }}, + { 58 , { 58 }}, + { 59 , { 59 }}, + { 60 
, { 60 }}, + { 61 , { 61 }}, + { 62 , { 62 }}, + { 63 , { 63 }}, + { 64 , { 64 }}, + { 65 , { 65 }}, + { 66 , { 66 }}, + { 67 , { 67 }}, + { 68 , { 68 }}, + { 69 , { 69 }}, + { 70 , { 70 }}, + { 71 , { 71 }}, + { 72 , { 72 }}, + { 73 , { 73 }}, + { 74 , { 74 }}, + { 75 , { 75 }}, + { 76 , { 76 }}, + { 77 , { 77 }}, + { 78 , { 78 }}, + { 79 , { 79 }}, + { 80 , { 80 }}, + { 81 , { 81 }}, + { 82 , { 82 }}, + { 83 , { 83 }}, + { 84 , { 84 }}, + { 85 , { 85 }}, + { 86 , { 86 }}, + { 87 , { 87 }}, + { 88 , { 88 }}, + { 89 , { 89 }}, + { 90 , { 90 }}, + { 91 , { 91 }}, + { 92 , { 92 }}, + { 93 , { 93 }}, + { 94 , { 94 }}, + { 95 , { 95 }}, + { 96 , { 96 }}, + { 97 , { 97 }}, + { 98 , { 98 }}, + { 99 , { 99 }}, + { 100 , { 100 }}, + { 101 , { 101 }}, + { 102 , { 102 }}, + { 103 , { 103 }}, + { 104 , { 104 }}, + { 105 , { 105 }}, + { 106 , { 106 }}, + { 107 , { 107 }}, + { 108 , { 108 }}, + { 109 , { 109 }}, + { 110 , { 110 }}, + { 111 , { 111 }}, + { 112 , { 112 }}, + { 113 , { 113 }}, + { 114 , { 114 }}, + { 115 , { 115 }}, + { 116 , { 116 }}, + { 117 , { 117 }}, + { 118 , { 118 }}, + { 119 , { 119 }}, + { 120 , { 120 }}, + { 121 , { 121 }}, + { 122 , { 122 }}, + { 123 , { 123 }}, + { 124 , { 124 }}, + { 125 , { 125 }}, + { 126 , { 126 }}, + { 161 , { 194, 161 }}, + { 162 , { 194, 162 }}, + { 163 , { 194, 163 }}, + { 164 , { 194, 164 }}, + { 165 , { 194, 165 }}, + { 166 , { 194, 166 }}, + { 167 , { 194, 167 }}, + { 168 , { 194, 168 }}, + { 169 , { 194, 169 }}, + { 170 , { 194, 170 }}, + { 171 , { 194, 171 }}, + { 172 , { 194, 172 }}, + { 174 , { 194, 174 }}, + { 175 , { 194, 175 }}, + { 176 , { 194, 176 }}, + { 177 , { 194, 177 }}, + { 178 , { 194, 178 }}, + { 179 , { 194, 179 }}, + { 180 , { 194, 180 }}, + { 181 , { 194, 181 }}, + { 182 , { 194, 182 }}, + { 183 , { 194, 183 }}, + { 184 , { 194, 184 }}, + { 185 , { 194, 185 }}, + { 186 , { 194, 186 }}, + { 187 , { 194, 187 }}, + { 188 , { 194, 188 }}, + { 189 , { 194, 189 }}, + { 190 , { 194, 190 }}, + { 191 , { 194, 191 }}, + { 192 , { 195, 128 }}, + { 193 , { 195, 129 }}, + { 194 , { 195, 130 }}, + { 195 , { 195, 131 }}, + { 196 , { 195, 132 }}, + { 197 , { 195, 133 }}, + { 198 , { 195, 134 }}, + { 199 , { 195, 135 }}, + { 200 , { 195, 136 }}, + { 201 , { 195, 137 }}, + { 202 , { 195, 138 }}, + { 203 , { 195, 139 }}, + { 204 , { 195, 140 }}, + { 205 , { 195, 141 }}, + { 206 , { 195, 142 }}, + { 207 , { 195, 143 }}, + { 208 , { 195, 144 }}, + { 209 , { 195, 145 }}, + { 210 , { 195, 146 }}, + { 211 , { 195, 147 }}, + { 212 , { 195, 148 }}, + { 213 , { 195, 149 }}, + { 214 , { 195, 150 }}, + { 215 , { 195, 151 }}, + { 216 , { 195, 152 }}, + { 217 , { 195, 153 }}, + { 218 , { 195, 154 }}, + { 219 , { 195, 155 }}, + { 220 , { 195, 156 }}, + { 221 , { 195, 157 }}, + { 222 , { 195, 158 }}, + { 223 , { 195, 159 }}, + { 224 , { 195, 160 }}, + { 225 , { 195, 161 }}, + { 226 , { 195, 162 }}, + { 227 , { 195, 163 }}, + { 228 , { 195, 164 }}, + { 229 , { 195, 165 }}, + { 230 , { 195, 166 }}, + { 231 , { 195, 167 }}, + { 232 , { 195, 168 }}, + { 233 , { 195, 169 }}, + { 234 , { 195, 170 }}, + { 235 , { 195, 171 }}, + { 236 , { 195, 172 }}, + { 237 , { 195, 173 }}, + { 238 , { 195, 174 }}, + { 239 , { 195, 175 }}, + { 240 , { 195, 176 }}, + { 241 , { 195, 177 }}, + { 242 , { 195, 178 }}, + { 243 , { 195, 179 }}, + { 244 , { 195, 180 }}, + { 245 , { 195, 181 }}, + { 246 , { 195, 182 }}, + { 247 , { 195, 183 }}, + { 248 , { 195, 184 }}, + { 249 , { 195, 185 }}, + { 250 , { 195, 186 }}, + { 251 , { 195, 187 }}, + { 252 , { 195, 188 }}, + { 253 , { 195, 189 }}, + 
{ 254 , { 195, 190 }}, + { 255 , { 195, 191 }}, + { 0 , { 196, 128 }}, + { 1 , { 196, 129 }}, + { 2 , { 196, 130 }}, + { 3 , { 196, 131 }}, + { 4 , { 196, 132 }}, + { 5 , { 196, 133 }}, + { 6 , { 196, 134 }}, + { 7 , { 196, 135 }}, + { 8 , { 196, 136 }}, + { 9 , { 196, 137 }}, + { 10 , { 196, 138 }}, + { 11 , { 196, 139 }}, + { 12 , { 196, 140 }}, + { 13 , { 196, 141 }}, + { 14 , { 196, 142 }}, + { 15 , { 196, 143 }}, + { 16 , { 196, 144 }}, + { 17 , { 196, 145 }}, + { 18 , { 196, 146 }}, + { 19 , { 196, 147 }}, + { 20 , { 196, 148 }}, + { 21 , { 196, 149 }}, + { 22 , { 196, 150 }}, + { 23 , { 196, 151 }}, + { 24 , { 196, 152 }}, + { 25 , { 196, 153 }}, + { 26 , { 196, 154 }}, + { 27 , { 196, 155 }}, + { 28 , { 196, 156 }}, + { 29 , { 196, 157 }}, + { 30 , { 196, 158 }}, + { 31 , { 196, 159 }}, + { 32 , { 196, 160 }}, + { 127 , { 196, 161 }}, + { 128 , { 196, 162 }}, + { 129 , { 196, 163 }}, + { 130 , { 196, 164 }}, + { 131 , { 196, 165 }}, + { 132 , { 196, 166 }}, + { 133 , { 196, 167 }}, + { 134 , { 196, 168 }}, + { 135 , { 196, 169 }}, + { 136 , { 196, 170 }}, + { 137 , { 196, 171 }}, + { 138 , { 196, 172 }}, + { 139 , { 196, 173 }}, + { 140 , { 196, 174 }}, + { 141 , { 196, 175 }}, + { 142 , { 196, 176 }}, + { 143 , { 196, 177 }}, + { 144 , { 196, 178 }}, + { 145 , { 196, 179 }}, + { 146 , { 196, 180 }}, + { 147 , { 196, 181 }}, + { 148 , { 196, 182 }}, + { 149 , { 196, 183 }}, + { 150 , { 196, 184 }}, + { 151 , { 196, 185 }}, + { 152 , { 196, 186 }}, + { 153 , { 196, 187 }}, + { 154 , { 196, 188 }}, + { 155 , { 196, 189 }}, + { 156 , { 196, 190 }}, + { 157 , { 196, 191 }}, + { 158 , { 197, 128 }}, + { 159 , { 197, 129 }}, + { 160 , { 197, 130 }}, + { 173 , { 197, 131 }} + }; +} + +void BytesToChars::validate_and_infer_types() { + check_ragged_string_input(this, 0); +// check_string_input(this, 5); + set_ragged_string_output(this, 0, get_input_partial_shape(0)); +} + +bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to BytesToChars, it means it is not converted properly or it is not used in the supported pattern"); + + // Set output shapes + outputs[0] = inputs[0]; + outputs[1] = inputs[1]; + outputs[2].set_shape(inputs[2].get_shape()); + outputs[3].set_shape(inputs[3].get_shape()); + outputs[4].set_shape(Shape({inputs[4].get_size() * 2})); + const size_t num_elems = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + auto new_chars = outputs[4].data(); + uint32_t char_pointer = 0; + + for(size_t j = 0; j < num_elems; ++j) { + new_begins[j] = char_pointer; + + for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { + const auto word_len = ends[i] - begins[i]; + for (size_t k = 0; k < word_len; ++k) { + for (auto byte : m_bytes_to_chars.at(chars[begins[i] + k])) { + new_chars[char_pointer++] = static_cast (byte); + } + } + } + new_ends[j] = char_pointer; + } + +// std::cerr << "Char pointer: " << char_pointer << "; old chars size: " << inputs[4].get_size() << "\n"; +// +// std::cerr << "Before set_shape:\n"; +// for (size_t i = 0; i < char_pointer; ++i) { +// std::cerr << outputs[4].data()[i] << ", "; +// } +// std::cerr << "\n"; +// +// for (size_t i = 0; i < char_pointer; ++i) { +// std::cerr << 
static_cast(outputs[4].data()[i]) << ", "; +// } +// std::cerr << "\n"; +// +// outputs[4].set_shape({char_pointer}); +// +// std::cerr << "After set_shape:\n"; +// +// for (size_t i = 0; i < char_pointer; ++i) { +// std::cerr << outputs[4].data()[i] << ", "; +// } +// std::cerr << "\n"; +// +// for (size_t i = 0; i < char_pointer; ++i) { +// std::cerr << static_cast(outputs[4].data()[i]) << ", "; +// } +// std::cerr << "\n"; + + return true; +} + + void WordpieceTokenizer::validate_and_infer_types() { check_ragged_string_input(this, 0); check_string_input(this, 5); diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 97abb6379..f2697644e 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -336,6 +336,44 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op { ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); + +const std::unordered_map> create_bytes_to_chars_map(); + + +class OPENVINO_API BytesToChars : public ov::op::Op { +public: + OPENVINO_OP("BytesToChars"); + + BytesToChars () = default; + + BytesToChars(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + +// bool visit_attributes(ov::AttributeVisitor& visitor) override { +// visitor.on_attribute("suffix_indicator", m_suffix_indicator); +// visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); +// return true; +// } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + const std::unordered_map> m_bytes_to_chars = create_bytes_to_chars_map(); +}; + + class OPENVINO_API BPETokenizer : public ov::op::Op { public: OPENVINO_OP("BPETokenizer"); diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index 4fc8c901c..e07c65bf2 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -215,6 +215,15 @@ def bert_splitter(cls) -> List["RegexSplitStep"]: def whitespace_splitter(cls) -> "RegexSplitStep": return cls(r"\w+|[^\w\s]+", invert=True) + @classmethod + def byte_level_splitter(cls) -> "RegexSplitStep": + return cls( + # r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+", + r"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+)", + invert=False, + behaviour="Isolate", + ) + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: input_nodes.extend( self.create_string_constant_node(self.split_pattern).outputs() @@ -242,6 +251,16 @@ class PunctuationSplitStep(PreTokenizatinStep): # behaviour: str = "Isolated" +@dataclass +class BytesToCharsStep(PreTokenizatinStep): + """Maps chars to other chars for Byte-level BPE Tokenizer""" + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + return core.make_node( 
+ "BytesToChars", + input_nodes, + ).outputs() + + @dataclass class TokenizationModelStep(BasePipelineStep): pass @@ -378,11 +397,11 @@ def get_ov_subgraph(self, input_nodes: List[Output]): @dataclass class SpecialTokenWithId: - token: str + token: Optional[str] = None _token_id: Optional[int] = None def set_token_id(self, vocab: Optional[List[str]]) -> None: - if vocab is not None: + if vocab is not None and self.token in vocab: self._token_id = vocab.index(self.token) @property @@ -453,7 +472,9 @@ def from_hf_json_template_postprocessor( return cls(inputs) @classmethod - def from_hf_json_bert_postprocessor(cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1): + def from_hf_json_bert_postprocessor( + cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1 + ) -> "CombineSegmentsStep": post_processor_dict = tokenizer_json["post_processor"] inputs: List[TokenWithTypeId] = [ AddToken( @@ -479,6 +500,27 @@ def from_hf_json_bert_postprocessor(cls, tokenizer_json: Dict[str, Any], number_ ) return cls(inputs) + @classmethod + def from_hf_json_roberta_processor( + cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1 + ) -> "CombineSegmentsStep": + post_processor_dict = tokenizer_json["post_processor"] + + inputs: List[TokenWithTypeId] = [Sequence(token_type_id=0)] + + if not post_processor_dict.get("add_special_tokens", True): + return cls(inputs) + + inputs.insert( + 0, AddToken(token=post_processor_dict["cls"][0], token_type_id=0) + ) + inputs.append(AddToken(token=post_processor_dict["sep"][0], token_type_id=0)) + + if number_of_inputs == 2: + print("WARNING: Pair of inputs not supported for RoBERTa postprocessor") + + return cls(inputs) + def get_ov_subgraph(self, input_nodes): number_of_sequence_inputs = sum( 1 for input_ in self.inputs if isinstance(input_, Sequence) @@ -570,15 +612,22 @@ def get_ov_subgraph(self, input_nodes): #if self.token_type_id == -1: # self.token_type_id = 0 - for i in range(len(input_nodes)//3): + names = ["input_ids", "token_type_ids"] + for i, name in zip(range(len(input_nodes)//3), names): #print(input_nodes[3*i:3*(i+1)]) #print(as_node(self.max_length).outputs()) #print(as_node(np.array(0, dtype=int)).outputs()) - cur_outputs = core.make_node('RaggedToDense', input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs()).outputs() + cur_outputs = core.make_node( + "RaggedToDense", + input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs() + ).outputs() + cur_outputs[0].tensor.add_names({name}) + outputs.append(cur_outputs[0]) if i == 0: - mask = opset10.convert(cur_outputs[1], 'i32').output(0) # TODO: Change RaggedToDense to generate mask of any type + mask = opset10.convert(cur_outputs[1], "i32").output(0) # TODO: Change RaggedToDense to generate mask of any type + mask.tensor.add_names({"attention_mask"}) outputs.append(mask) return outputs @@ -672,6 +721,7 @@ def get_ov_subgraph(self) -> Model: processing_outputs = self.create_processing_pipeline(input_nodes) outputs = self.create_post_tokenization_pipeline(processing_outputs) + print(self) return Model(outputs, input_nodes, name="tokenizer") From c413cb64323957653f9ae5f12ff34119c65c34e2 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 20 Jun 2023 17:39:15 +0100 Subject: [PATCH 027/116] Delete print --- .../user_ie_extensions/sentence_piece/tokenizer_pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py 
b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index e07c65bf2..f9be12daa 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -721,7 +721,6 @@ def get_ov_subgraph(self) -> Model: processing_outputs = self.create_processing_pipeline(input_nodes) outputs = self.create_post_tokenization_pipeline(processing_outputs) - print(self) return Model(outputs, input_nodes, name="tokenizer") From 8c8994c9307620b41d8467ae77bf0f6020ba3817 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 20 Jun 2023 18:50:04 +0100 Subject: [PATCH 028/116] Clip max value for max_length to int32 --- .../sentence_piece/sentence_piece.cpp | 2 +- .../sentence_piece/tokenizer_pipeline.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 7519b7539..4defdf6ec 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -1546,7 +1546,7 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i // } // std::cerr << "\n"; // -// outputs[4].set_shape({char_pointer}); + outputs[4].set_shape({char_pointer}); // // std::cerr << "After set_shape:\n"; // diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index f9be12daa..b28328130 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -363,15 +363,23 @@ class TruncationStep(PostTokenizationStep): @classmethod def from_hf_json(cls, tokenizer_json: Dict[str, Any], num_of_added_tokens: int = 0) -> "TruncationStep": + max_length = min( + tokenizer_json["truncation"]["max_length"] - num_of_added_tokens, + 2**31 - 1 - num_of_added_tokens, + ) return cls( - max_length=tokenizer_json["truncation"]["max_length"] - num_of_added_tokens, + max_length=max_length, truncate_right=tokenizer_json["truncation"]["direction"] == "Right", ) @classmethod def from_hf_object(cls, tokenizer: Any, num_of_added_tokens: int = 0) -> "TruncationStep": + max_length = min( + tokenizer.model_max_length - num_of_added_tokens, + 2 ** 31 - 1 - num_of_added_tokens, + ) return cls( - max_length=tokenizer.model_max_length - num_of_added_tokens, + max_length=max_length, truncate_right=tokenizer.truncation_side == "right", ) @@ -604,7 +612,7 @@ def get_ov_subgraph(self, input_nodes): #print('ERRROR: SETTING MAX_LENGTH = 100') #print('ERROR: Ignoring pad token and set it to id = 0') - if self.max_length == -1: + if self.max_length == -1 or self.max_length >= 2 ** 31: # Calculate max_length as the maximum ragged length max_length = opset10.reduce_max(opset10.subtract(input_nodes[1], input_nodes[0]), make_constant_node(0, Type.i32)) else: From 8750ae604617612d25f54b38c60960ab05d9b556 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 22 Jun 2023 19:14:48 +0100 Subject: [PATCH 029/116] Fix RegexNormalization and Splitter, Add Digits Splitter --- .../sentence_piece/tokenizer_pipeline.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) 
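Note on the digits splitter introduced in the diff below: `\p{Nd}` matches decimal digits, `\p{Nl}` letter-like numerals (e.g. Roman numerals), and `\p{No}` other numeric characters. As a rough illustration of the intended behaviour="isolate" semantics, here is a minimal sketch; it uses the third-party `regex` package as a stand-in for the RE2/fast_tokenizer kernel (Python's built-in `re` has no `\p{...}` classes), so the helper name is illustrative only and not part of this patch:

    import regex  # pip install regex; stand-in for the actual RE2-based kernel

    NUMERIC = regex.compile(r"\p{Nd}|\p{Nl}|\p{No}")

    def isolate_numeric(text):
        # Mimic RegexSplit with behaviour="isolate": keep every match as
        # its own split instead of dropping it, and keep the text between
        # matches intact.
        parts, last = [], 0
        for match in NUMERIC.finditer(text):
            if match.start() > last:
                parts.append(text[last:match.start()])
            parts.append(match.group())  # each numeric char becomes its own split
            last = match.end()
        if last < len(text):
            parts.append(text[last:])
        return parts

    print(isolate_numeric("room 42"))  # -> ['room ', '4', '2']

With behaviour="contiguous" (chosen when individual_digits is false in the HF config) consecutive matches would instead be merged into a single split.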
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index b28328130..a741eb282 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -114,13 +114,14 @@ def strip_accents_regex(cls) -> "RegExpNormalizationStep": @classmethod def del_control_chars_regex(cls) -> "RegExpNormalizationStep": - return cls(regex_search_pattern=r"\p{Cc}|\p{Cf}", replace_term=" ") + # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L17 + return cls(regex_search_pattern=r"((?=[^\n\t\r])\p{Cc})|((?=[^\n\t\r])\p{Cf})", replace_term=" ") def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: input_nodes.extend( ( - *self.create_string_constant_node("search_pattern").outputs(), - *self.create_string_constant_node("replace_pattern").outputs(), + *self.create_string_constant_node(self.regex_search_pattern).outputs(), + *self.create_string_constant_node(self.replace_term).outputs(), ) ) return core.make_node( @@ -164,11 +165,11 @@ class PreTokenizatinStep(BasePipelineStep): class RegexSplitStep(PreTokenizatinStep): split_pattern: str invert: bool = False - behaviour: str = "Remove" + behaviour: str = "remove" @classmethod def bert_whitespace_splitter(cls) -> "RegexSplitStep": - return cls(r"\s+") + return cls(split_pattern=r"\s+", invert=False) @classmethod def bert_keep_delimeters_splitter(cls) -> "RegexSplitStep": @@ -204,7 +205,7 @@ def bert_keep_delimeters_splitter(cls) -> "RegexSplitStep": ], ), invert=False, - behaviour="Isolate" + behaviour="isolate" ) @classmethod @@ -218,10 +219,17 @@ def whitespace_splitter(cls) -> "RegexSplitStep": @classmethod def byte_level_splitter(cls) -> "RegexSplitStep": return cls( - # r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+", - r"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+)", + r"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)", invert=False, - behaviour="Isolate", + behaviour="isolate", + ) + + @classmethod + def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep": + return cls( + r"\p{Nd}|\p{Nl}|\p{No}", + invert=False, + behaviour=behaviour, ) def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: From be6dc3fcc8769ddf195e46228fd839077a1e28c0 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 23 Jun 2023 13:37:27 +0100 Subject: [PATCH 030/116] Bug fixes --- .../sentence_piece/convert_tokenizer.py | 2 +- .../sentence_piece/hf_parser.py | 29 +- .../sentence_piece/sentence_piece.cpp | 260 ++++++++---------- .../sentence_piece/sentence_piece.hpp | 2 +- 4 files changed, 129 insertions(+), 164 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index e1ad50fb4..82839cb32 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -23,7 +23,7 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token output_names = tokenizer_object.model_input_names ov_tokenizer_output_names = ["input_ids", "attention_mask"] - if len(output_names) == 3: + if len(output_names) == 3 and 
len(ov_tokenizer.outputs) == 3: ov_tokenizer_output_names.insert(1, "token_type_ids") filtered_outputs = [] diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py index 722b864d7..5d1f951a8 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py @@ -38,17 +38,19 @@ def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegExpNormaliza def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]: - steps: List[NormalizationStep] = [NormalizeUnicode("NFD")] - - if normalizer_dict["lowercase"] is True: - steps.append(CaseFoldStep()) + steps: List[NormalizationStep] = [] if normalizer_dict["clean_text"] is True: steps.append(RegExpNormalizationStep.del_control_chars_regex()) - if normalizer_dict["strip_accents"] is True: + # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L127 + if normalizer_dict.get("strip_accents") or normalizer_dict["lowercase"]: + steps.append(NormalizeUnicode("NFD")) steps.append(RegExpNormalizationStep.strip_accents_regex()) + if normalizer_dict["lowercase"] is True: + steps.append(CaseFoldStep()) + return steps @@ -61,10 +63,11 @@ def parse_strip_step(split_dict: Dict[str, Any]) -> StripStringStep: def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep: split_pattern = pretokenizer_dict["pattern"].get("String") or pretokenizer_dict["pattern"]["Regex"] + print(pretokenizer_dict["behavior"], pretokenizer_dict["behavior"].lower().rstrip("d")) return RegexSplitStep( split_pattern=split_pattern, invert=pretokenizer_dict["invert"], - behaviour=pretokenizer_dict["behavior"], + behaviour=pretokenizer_dict["behavior"].lower().rstrip("d") ) @@ -145,6 +148,9 @@ def normalization(self) -> None: "Split": parse_split_step, "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]), "ByteLevel": parse_byte_level_pretokenization_step, + "Digits": lambda step_dict: RegexSplitStep.digits_splitter( + "isolate" if step_dict["individual_digits"] else "contiguous" + ) } def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None: @@ -174,7 +180,12 @@ def tokenization_model(self) -> None: raise OVTypeError(f"Tokenizer type '{self.tokenizer_json['model']['type']}' is not supported") def post_tokenization(self) -> None: - if self.tokenizer_json["post_processor"] is None: + if ( + self.tokenizer_json["post_processor"] is None + or self.tokenizer_json["post_processor"]["type"] == "ByteLevel" + ): + self.add_truncation() + self.add_padding() return if self.tokenizer_json["post_processor"]["type"] == "TemplateProcessing": @@ -185,10 +196,6 @@ def post_tokenization(self) -> None: combine_segments_step = CombineSegmentsStep.from_hf_json_bert_postprocessor( self.tokenizer_json, self.number_of_inputs ) - elif self.tokenizer_json["post_processor"]["type"] == "ByteLevel": - self.add_truncation() - self.add_padding() - return elif self.tokenizer_json["post_processor"]["type"] == "RobertaProcessing": combine_segments_step = CombineSegmentsStep.from_hf_json_roberta_processor( self.tokenizer_json, self.number_of_inputs diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 4defdf6ec..39dcafb3c 100644 --- 
a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -951,6 +951,7 @@ using paddlenlp::fast_tokenizer::core::SplitMode; const std::map split_modes = { {"remove", SplitMode::REMOVED}, {"isolate", SplitMode::ISOLATED}, + {"contiguous", SplitMode::CONTIGUOUS}, {"merge_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, {"merge_with_next", SplitMode::MERGED_WITH_NEXT}, }; @@ -984,9 +985,9 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp }; auto split_pattern_buf = inputs[3].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - const size_t num_elements = inputs[0].get_size(); + const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[2].get_size(); outputs[0].set_shape(inputs[0].get_shape()); @@ -995,7 +996,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp outputs[2].set_shape(Shape{num_chars}); outputs[3].set_shape(Shape{num_chars}); - outputs[4] = inputs[2]; // TODO: Does it really work? + outputs[4] = inputs[2]; // For the whole implementation below the input shapes can be ignored, we are working with the flattened representations // and only number of elements in the original tensors matter @@ -1010,29 +1011,29 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp using namespace paddlenlp::fast_tokenizer; auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - std::cerr << "[ RegexSplit ] regex: " << std::string(split_pattern) << "\n"; + for(size_t seq = 0; seq < num_rows; ++seq) { +// std::cerr << "================= Seq: " << seq << " ====================\n"; +// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; + + new_ragged_begins[seq] = ragged_offset; - for(size_t seq = 0; seq < num_elements; ++seq) { - for(size_t word = ragged_begins[seq]; word < ragged_ends[seq]; ++word) { + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - auto str = std::string(chars + begins[word], chars + ends[word]); - std::cerr << "[ RegexSplit ] old_str: " << str << "\n"; + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); +// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); pretokenizer(&pretokenized); size_t num_splits = pretokenized.GetSplitsSize(); - - new_ragged_begins[seq] = ragged_offset; +// std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; for (size_t j = 0; j < num_splits; ++j) { auto split = pretokenized.GetSplit(j); const auto& value = split.normalized_.GetStr(); auto offset = split.normalized_.GetOrginalOffset(); - std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; - std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; - new_begins[ragged_offset] = begins[word] + offset.first; - new_ends[ragged_offset] = begins[word] + offset.second; - - ++ragged_offset; +// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; +// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; + new_begins[ragged_offset] = begins[ragged_col] + offset.first; + new_ends[ragged_offset++] = begins[ragged_col] + offset.second; }; } @@ -1042,6 +1043,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp // Fix real shape based on collected results outputs[2].set_shape({ragged_offset}); outputs[3].set_shape({ragged_offset}); + } else { auto ragged_begins = inputs[0].data(); auto ragged_ends = inputs[1].data(); @@ -1052,8 +1054,10 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto split_pattern_buf = inputs[5].data(); auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant +// std::cerr << "Split Pattern: " << split_pattern << "\n"; + outputs[4] = inputs[4]; - const size_t num_elements = inputs[2].get_size(); + const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[4].get_size(); outputs[0].set_shape(inputs[0].get_shape()); @@ -1062,7 +1066,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp outputs[2].set_shape(Shape{num_chars}); outputs[3].set_shape(Shape{num_chars}); - outputs[4] = inputs[4]; // TODO: Does it really work? + outputs[4] = inputs[4]; // For the whole implementation below the input shapes can be ignored, we are working with the flattened representations // and only number of elements in the original tensors matter @@ -1077,27 +1081,40 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp using namespace paddlenlp::fast_tokenizer; auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - for(size_t seq = 0; seq < num_elements; ++seq) { - for(size_t word = ragged_begins[seq]; word < ragged_ends[seq]; ++word) { + for(size_t seq = 0; seq < num_rows; ++seq) { +// std::cerr << "----------------- Seq: " << seq << " -----------------\n"; +// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; + + new_ragged_begins[seq] = ragged_offset; - auto str = std::string(chars + begins[word], chars + ends[word]); - std::cerr << "[ RegexSplit ] old_str: " << str << "\n"; + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); +// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); pretokenizer(&pretokenized); size_t num_splits = pretokenized.GetSplitsSize(); - new_ragged_begins[seq] = ragged_offset; for (size_t j = 0; j < num_splits; ++j) { auto split = pretokenized.GetSplit(j); const auto& value = split.normalized_.GetStr(); auto offset = split.normalized_.GetOrginalOffset(); - std::cerr << "[ RegexSplit ] split part: " << value << "\n"; - std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; - new_begins[ragged_offset] = begins[word] + offset.first; - new_ends[ragged_offset] = begins[word] + offset.second; +// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; +// std::cerr << "[ RegexSplit ] split offs: " <<
offset.first << ":" << offset.second << "\n"; + new_begins[ragged_offset] = begins[ragged_col] + offset.first; + new_ends[ragged_offset++] = begins[ragged_col] + offset.second; + - ++ragged_offset; +// std::cerr << "New begins and ends:\n"; +// for (size_t i = 0; i < outputs[2].get_size(); ++i) { +// std::cerr << outputs[2].data()[i] << ", "; +// } +// std::cerr << "\n"; +// +// for (size_t i = 0; i < outputs[3].get_size(); ++i) { +// std::cerr << outputs[3].data()[i] << ", "; +// } +// std::cerr << "\n"; }; } @@ -1107,6 +1124,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp // Fix real shape based on collected results outputs[2].set_shape({ragged_offset}); outputs[3].set_shape({ragged_offset}); + } #if 1 @@ -1520,46 +1538,20 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i uint32_t char_pointer = 0; for(size_t j = 0; j < num_elems; ++j) { - new_begins[j] = char_pointer; for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { const auto word_len = ends[i] - begins[i]; + new_begins[i] = char_pointer; + for (size_t k = 0; k < word_len; ++k) { for (auto byte : m_bytes_to_chars.at(chars[begins[i] + k])) { new_chars[char_pointer++] = static_cast (byte); } } + new_ends[i] = char_pointer; } - new_ends[j] = char_pointer; } - -// std::cerr << "Char pointer: " << char_pointer << "; old chars size: " << inputs[4].get_size() << "\n"; -// -// std::cerr << "Before set_shape:\n"; -// for (size_t i = 0; i < char_pointer; ++i) { -// std::cerr << outputs[4].data()[i] << ", "; -// } -// std::cerr << "\n"; -// -// for (size_t i = 0; i < char_pointer; ++i) { -// std::cerr << static_cast(outputs[4].data()[i]) << ", "; -// } -// std::cerr << "\n"; -// outputs[4].set_shape({char_pointer}); -// -// std::cerr << "After set_shape:\n"; -// -// for (size_t i = 0; i < char_pointer; ++i) { -// std::cerr << outputs[4].data()[i] << ", "; -// } -// std::cerr << "\n"; -// -// for (size_t i = 0; i < char_pointer; ++i) { -// std::cerr << static_cast(outputs[4].data()[i]) << ", "; -// } -// std::cerr << "\n"; - return true; } @@ -1586,14 +1578,13 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec OPENVINO_ASSERT(inputs.size() == 9, "Too few inputs passed to WordpieceTokenizer, it means it is not converted properly or it is not used in the supported pattern"); auto unk_token_id = *inputs[8].data(); - //std::cerr << "[ WordpieceTokenizer ] unk_token_id = " << unk_token_id << "\n"; #if 1 // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_elems = inputs[0].get_size(); + const size_t num_rows = inputs[0].get_size(); //const size_t num_parts = inputs[2].get_size(); //size_t new_num_parts = num_parts; @@ -1606,11 +1597,11 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto new_begins = outputs[0].data(); auto new_ends = outputs[1].data(); auto new_elems = outputs[2].data(); - int32_t offset = 0; + int32_t ragged_offset = 0; using namespace paddlenlp::fast_tokenizer; - //std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; +// std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; core::Vocab vocab; std::string unk_token; if(unk_token_id < 0) @@ -1622,38 +1613,41 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec unk_token = token; } - //std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; - //std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << 
"\n"; - //std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; +// std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; +// std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; +// std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; auto tokenizer = models::FastWordPiece(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); // FIXME: why true? - //std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; +// std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; - for(size_t j = 0; j < num_elems; ++j) { - new_begins[j] = offset; + for(size_t seq = 0; seq < num_rows; ++seq) { + new_begins[seq] = ragged_offset; - for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - auto str = std::string(chars + begins[i], chars + ends[i]); + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); std::vector results = tokenizer.Tokenize(str); +// std::cerr << "[ WordpieceTokenizer ] String bytes: "; +// for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { +// std::cerr << static_cast (chars[i]) << " "; +// } +// std::cerr << "\n"; +// std::cerr << "[ WordpieceTokenizer ] String: '" << str << "'\n"; +// std::cerr << "[ WordpieceTokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; for (const core::Token& token : results) { - //std::cout << "[ WordpieceTokenizer ] id: " << token.id_ << ", value: " << token.value_ - // << ", offset: (" << token.offset_.first << ", " - // << token.offset_.second << ")." << std::endl; - OPENVINO_ASSERT(offset < outputs[2].get_size()); - new_elems[offset++] = token.id_; +// std::cout << "[ WordpieceTokenizer ] id: " << token.id_ << ", value: " << token.value_ +// << ", offset: (" << token.offset_.first << ", " +// << token.offset_.second << ")." 
<< std::endl; + OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); + new_elems[ragged_offset++] = token.id_; }; } - - new_ends[j] = offset; + new_ends[seq] = ragged_offset; } - - outputs[2].set_shape({offset}); - - OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in RegexSplit::evaluate: out of range for ragged parts"); + outputs[2].set_shape({ragged_offset}); return true; #else @@ -1765,7 +1759,7 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_elems = inputs[0].get_size(); + const size_t num_rows = inputs[0].get_size(); // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area // to represent different elements in ragged tensor @@ -1773,21 +1767,21 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i using namespace paddlenlp::fast_tokenizer; - std::cerr << "[ BPETokenizer ] Start vocab reading\n"; +// std::cerr << "[ BPETokenizer ] Start vocab reading\n"; core::Vocab vocab; int32_t unk_token_id = -1; - std::cerr << "[ BPETokenizer ] Vocab size is " << vocab_size << "\n"; +// std::cerr << "[ BPETokenizer ] Vocab size is " << vocab_size << "\n"; for(size_t id = 0; id < vocab_size; ++id) { auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); vocab[token] = int32_t(id); // TODO: Check range } - std::cerr << "[ BPETokenizer ] Finish vocab reading\n"; - - std::cerr << "[ BPETokenizer ] Start merges reading\n"; - std::cerr << "[ BPETokenizer ] Merges Size: " << merges_size << "\n"; +// std::cerr << "[ BPETokenizer ] Finish vocab reading\n"; +// +// std::cerr << "[ BPETokenizer ] Start merges reading\n"; +// std::cerr << "[ BPETokenizer ] Merges Size: " << merges_size << "\n"; core::Merges merges; std::string delim = " "; @@ -1802,10 +1796,10 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i merges.emplace_back(merge_pair); } - std::cerr << "[ BPETokenizer ] Finish merges reading\n"; +// std::cerr << "[ BPETokenizer ] Finish merges reading\n"; - std::cerr << "[ BPETokenizer ] Start tokenizer initialization\n"; +// std::cerr << "[ BPETokenizer ] Start tokenizer initialization\n"; std::vector unk_token = {}; if (m_unk_token.size() > 0) { @@ -1823,87 +1817,51 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i models::BPE tokenizer(vocab, merges, 10000 /* default cache size */, {} /* dropout - don't use dropout for inference */, unk_token, suffix_indicator, end_suffix, m_fuse_unk); - std::cerr << "[ BPETokenizer ] Finish tokenizer initialization\n"; +// std::cerr << "[ BPETokenizer ] Finish tokenizer initialization\n"; // Get pointers in the output tensors auto new_begins = outputs[0].data(); auto new_ends = outputs[1].data(); auto new_elems = outputs[2].data(); - int32_t offset = 0; + int32_t ragged_offset = 0; +// std::cerr << "Ragged Begins and ends:\n"; +// for (size_t i = 0; i < inputs[0].get_size(); ++i) { +// std::cerr << inputs[0].data()[i] << ", "; +// } +// std::cerr << "\n"; +// for (size_t i = 0; i < inputs[1].get_size(); ++i) { +// std::cerr << inputs[1].data()[i] << ", "; +// } +// std::cerr << "\n"; - for(size_t j = 0; j < num_elems; ++j) { - new_begins[j] = offset; - for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { - auto str = std::string(chars + begins[i], chars + ends[i]); - std::cerr << "Word: '" << str << 
"'\n"; + for(size_t seq = 0; seq < num_rows; ++seq) { + new_begins[seq] = ragged_offset; + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); + +// std::cerr << "[ BPETokenizer ] String: '" << str << "'\n"; +// std::cerr << "[ BPETokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; + std::vector results = tokenizer.Tokenize(str); for (const core::Token& token : results) { - std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ - << ", offset: (" << token.offset_.first << ", " - << token.offset_.second << ")." << std::endl; - OPENVINO_ASSERT(offset < outputs[2].get_size()); - new_elems[offset++] = token.id_; +// std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ +// << ", offset: (" << token.offset_.first << ", " +// << token.offset_.second << ")." << std::endl; + OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); + new_elems[ragged_offset++] = token.id_; }; } - new_ends[j] = offset; + new_ends[seq] = ragged_offset; } - - outputs[2].set_shape({offset}); - - OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in RegexSplit::evaluate: out of range for ragged parts"); + outputs[2].set_shape({ragged_offset}); return true; #else // Stub implementation that transforms each input string to its length duplicating element if the length is odd - { - std::cout << "[ DEBUG ] WordpieceTokenizer\n"; - std::cout << "[ DEBUG ] vocab size: " << inputs[5].get_size() << "\n"; - std::cout << "[ DEBUG ] unk_token_id: " << unk_token_id << "\n"; - - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_elems = inputs[0].get_size(); - - const size_t num_parts = inputs[2].get_size(); - size_t new_num_parts = num_parts; - // Count number of output elements - for(size_t i = 0; i < num_parts; ++i) { - auto length = ends[i] - begins[i]; - new_num_parts += length % 2; - } - - outputs[2].set_shape({new_num_parts}); - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_elems = outputs[2].data(); - int32_t offset = 0; - - for(size_t j = 0; j < num_elems; ++j) { - new_begins[j] = offset; - - for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { - - auto length = ends[i] - begins[i]; - new_elems[offset++] = length; - - if(length % 2) { - new_elems[offset++] = length; - } - } - - new_ends[j] = offset; - } - - OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in RegexSplit::evaluate: out of range for ragged parts"); - return true; - } // End of stub implementation #endif } diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index f2697644e..2899ab813 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -263,7 +263,7 @@ class OPENVINO_API RegexSplit : public ov::op::Op { RegexSplit () = default; - RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "removed", bool invert = false) : + RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false) : ov::op::Op(arguments), m_behaviour(behaviour), m_invert(invert) { From 
e4dcdda4e79c2a095d61757089c5f8cd9bc9f0ea Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 29 Jun 2023 19:15:35 +0100 Subject: [PATCH 031/116] Add decoding step, BytesToChars refactoring Has a bug with internal dimension for VocabNode --- .../user_ie_extensions/ov_extension.cpp | 2 + .../sentence_piece/convert_tokenizer.py | 17 +- .../sentence_piece/hf_parser.py | 12 + .../sentence_piece/sentence_piece.cpp | 671 +++++++++++------- .../sentence_piece/sentence_piece.hpp | 71 +- .../sentence_piece/tokenizer_pipeline.py | 71 +- 6 files changed, 560 insertions(+), 284 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 4cc541595..a3fa79acb 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -73,6 +73,8 @@ std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \ std::make_shared>(), \ std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ std::make_shared("Reshape", translate_reshape), \ std::make_shared("Const", translate_const), \ std::make_shared>(), \ diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py index 82839cb32..db2e4b30b 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py @@ -3,23 +3,27 @@ # SPDX-License-Identifier: Apache-2.0 import sys -from typing import Any, List +from typing import Any, Tuple, Union from openvino.runtime.exceptions import OVTypeError from openvino.runtime import Model -from tokenizer_pipeline import TokenizerPipeline -def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> TokenizerPipeline: +def convert_tokenizer( + tokenizer_object: Any, number_of_inputs: int = 1, with_decoder=False +) -> Union[Model, Tuple[Model, Model]]: if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase from hf_parser import TransformersTokenizerPipelineParser # TODO: Remove this check if isinstance(tokenizer_object, PreTrainedTokenizerBase): - ov_tokenizer = TransformersTokenizerPipelineParser(tokenizer_object).parse( + pipeline = TransformersTokenizerPipelineParser(tokenizer_object).parse( number_of_inputs=number_of_inputs - ).get_ov_subgraph() + ) + ov_tokenizer = pipeline.get_encoder_ov_subgraph() + if with_decoder: + ov_detokenizer = pipeline.get_decoder_ov_subgraph() output_names = tokenizer_object.model_input_names ov_tokenizer_output_names = ["input_ids", "attention_mask"] @@ -39,6 +43,9 @@ def convert_tokenizer(tokenizer_object: Any, number_of_inputs: int = 1) -> Token ov_tokenizer.output(i).tensor.add_names({output_name}) filtered_outputs.append(ov_tokenizer.output(i)) + if with_decoder: + return Model(filtered_outputs, ov_tokenizer.get_parameters()), ov_detokenizer + return Model(filtered_outputs, ov_tokenizer.get_parameters()) raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py index 5d1f951a8..e6936e50e 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py +++ 
b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py @@ -26,6 +26,8 @@ TruncationStep, PaddingStep, CombineSegmentsStep, + VocabDecoderStep, + CharsToBytesStep, ) @@ -107,6 +109,7 @@ def parse(self, number_of_inputs: Optional[int] = None) -> TokenizerPipeline: self.pre_tokenization, self.tokenization_model, self.post_tokenization, + self.decoding, ]: add_steps() @@ -226,3 +229,12 @@ def add_padding(self) -> None: self.pipeline[-1].set_token_id(self.pipeline.vocab) else: self.pipeline.add_steps(PaddingStep()) + + def decoding(self) -> None: + if self.tokenizer_json["decoder"] is None: + return + + if self.tokenizer_json["decoder"]["type"] == "ByteLevel": + self.pipeline.add_steps(VocabDecoderStep()) + self.pipeline.add_steps(CharsToBytesStep()) + return diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index 39dcafb3c..a1bfd5df9 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -987,6 +987,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto split_pattern_buf = inputs[3].data(); auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + std::cerr << "[ RegexSplit ] Split Pattern: `" << split_pattern << "`\n"; + const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[2].get_size(); @@ -1012,26 +1014,26 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); for(size_t seq = 0; seq < num_rows; ++seq) { -// std::cerr << "================= Seq: " << seq << " ====================\n"; -// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; + std::cerr << "================= Seq: " << seq << " ====================\n"; + std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; new_ragged_begins[seq] = ragged_offset; for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); -// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; + std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); pretokenizer(&pretokenized); size_t num_splits = pretokenized.GetSplitsSize(); -// std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; + std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; for (size_t j = 0; j < num_splits; ++j) { auto split = pretokenized.GetSplit(j); const auto& value = split.normalized_.GetStr(); auto offset = split.normalized_.GetOrginalOffset(); -// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; -// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; + std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; + std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; new_begins[ragged_offset] = begins[ragged_col] + offset.first; 
new_ends[ragged_offset++] = begins[ragged_col] + offset.second; }; @@ -1247,265 +1249,265 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont } -const std::unordered_map> create_bytes_to_chars_map() { - return { - { 33 , { 33 }}, - { 34 , { 34 }}, - { 35 , { 35 }}, - { 36 , { 36 }}, - { 37 , { 37 }}, - { 38 , { 38 }}, - { 39 , { 39 }}, - { 40 , { 40 }}, - { 41 , { 41 }}, - { 42 , { 42 }}, - { 43 , { 43 }}, - { 44 , { 44 }}, - { 45 , { 45 }}, - { 46 , { 46 }}, - { 47 , { 47 }}, - { 48 , { 48 }}, - { 49 , { 49 }}, - { 50 , { 50 }}, - { 51 , { 51 }}, - { 52 , { 52 }}, - { 53 , { 53 }}, - { 54 , { 54 }}, - { 55 , { 55 }}, - { 56 , { 56 }}, - { 57 , { 57 }}, - { 58 , { 58 }}, - { 59 , { 59 }}, - { 60 , { 60 }}, - { 61 , { 61 }}, - { 62 , { 62 }}, - { 63 , { 63 }}, - { 64 , { 64 }}, - { 65 , { 65 }}, - { 66 , { 66 }}, - { 67 , { 67 }}, - { 68 , { 68 }}, - { 69 , { 69 }}, - { 70 , { 70 }}, - { 71 , { 71 }}, - { 72 , { 72 }}, - { 73 , { 73 }}, - { 74 , { 74 }}, - { 75 , { 75 }}, - { 76 , { 76 }}, - { 77 , { 77 }}, - { 78 , { 78 }}, - { 79 , { 79 }}, - { 80 , { 80 }}, - { 81 , { 81 }}, - { 82 , { 82 }}, - { 83 , { 83 }}, - { 84 , { 84 }}, - { 85 , { 85 }}, - { 86 , { 86 }}, - { 87 , { 87 }}, - { 88 , { 88 }}, - { 89 , { 89 }}, - { 90 , { 90 }}, - { 91 , { 91 }}, - { 92 , { 92 }}, - { 93 , { 93 }}, - { 94 , { 94 }}, - { 95 , { 95 }}, - { 96 , { 96 }}, - { 97 , { 97 }}, - { 98 , { 98 }}, - { 99 , { 99 }}, - { 100 , { 100 }}, - { 101 , { 101 }}, - { 102 , { 102 }}, - { 103 , { 103 }}, - { 104 , { 104 }}, - { 105 , { 105 }}, - { 106 , { 106 }}, - { 107 , { 107 }}, - { 108 , { 108 }}, - { 109 , { 109 }}, - { 110 , { 110 }}, - { 111 , { 111 }}, - { 112 , { 112 }}, - { 113 , { 113 }}, - { 114 , { 114 }}, - { 115 , { 115 }}, - { 116 , { 116 }}, - { 117 , { 117 }}, - { 118 , { 118 }}, - { 119 , { 119 }}, - { 120 , { 120 }}, - { 121 , { 121 }}, - { 122 , { 122 }}, - { 123 , { 123 }}, - { 124 , { 124 }}, - { 125 , { 125 }}, - { 126 , { 126 }}, - { 161 , { 194, 161 }}, - { 162 , { 194, 162 }}, - { 163 , { 194, 163 }}, - { 164 , { 194, 164 }}, - { 165 , { 194, 165 }}, - { 166 , { 194, 166 }}, - { 167 , { 194, 167 }}, - { 168 , { 194, 168 }}, - { 169 , { 194, 169 }}, - { 170 , { 194, 170 }}, - { 171 , { 194, 171 }}, - { 172 , { 194, 172 }}, - { 174 , { 194, 174 }}, - { 175 , { 194, 175 }}, - { 176 , { 194, 176 }}, - { 177 , { 194, 177 }}, - { 178 , { 194, 178 }}, - { 179 , { 194, 179 }}, - { 180 , { 194, 180 }}, - { 181 , { 194, 181 }}, - { 182 , { 194, 182 }}, - { 183 , { 194, 183 }}, - { 184 , { 194, 184 }}, - { 185 , { 194, 185 }}, - { 186 , { 194, 186 }}, - { 187 , { 194, 187 }}, - { 188 , { 194, 188 }}, - { 189 , { 194, 189 }}, - { 190 , { 194, 190 }}, - { 191 , { 194, 191 }}, - { 192 , { 195, 128 }}, - { 193 , { 195, 129 }}, - { 194 , { 195, 130 }}, - { 195 , { 195, 131 }}, - { 196 , { 195, 132 }}, - { 197 , { 195, 133 }}, - { 198 , { 195, 134 }}, - { 199 , { 195, 135 }}, - { 200 , { 195, 136 }}, - { 201 , { 195, 137 }}, - { 202 , { 195, 138 }}, - { 203 , { 195, 139 }}, - { 204 , { 195, 140 }}, - { 205 , { 195, 141 }}, - { 206 , { 195, 142 }}, - { 207 , { 195, 143 }}, - { 208 , { 195, 144 }}, - { 209 , { 195, 145 }}, - { 210 , { 195, 146 }}, - { 211 , { 195, 147 }}, - { 212 , { 195, 148 }}, - { 213 , { 195, 149 }}, - { 214 , { 195, 150 }}, - { 215 , { 195, 151 }}, - { 216 , { 195, 152 }}, - { 217 , { 195, 153 }}, - { 218 , { 195, 154 }}, - { 219 , { 195, 155 }}, - { 220 , { 195, 156 }}, - { 221 , { 195, 157 }}, - { 222 , { 195, 158 }}, - { 223 , { 195, 159 }}, - { 224 , { 
195, 160 }}, - { 225 , { 195, 161 }}, - { 226 , { 195, 162 }}, - { 227 , { 195, 163 }}, - { 228 , { 195, 164 }}, - { 229 , { 195, 165 }}, - { 230 , { 195, 166 }}, - { 231 , { 195, 167 }}, - { 232 , { 195, 168 }}, - { 233 , { 195, 169 }}, - { 234 , { 195, 170 }}, - { 235 , { 195, 171 }}, - { 236 , { 195, 172 }}, - { 237 , { 195, 173 }}, - { 238 , { 195, 174 }}, - { 239 , { 195, 175 }}, - { 240 , { 195, 176 }}, - { 241 , { 195, 177 }}, - { 242 , { 195, 178 }}, - { 243 , { 195, 179 }}, - { 244 , { 195, 180 }}, - { 245 , { 195, 181 }}, - { 246 , { 195, 182 }}, - { 247 , { 195, 183 }}, - { 248 , { 195, 184 }}, - { 249 , { 195, 185 }}, - { 250 , { 195, 186 }}, - { 251 , { 195, 187 }}, - { 252 , { 195, 188 }}, - { 253 , { 195, 189 }}, - { 254 , { 195, 190 }}, - { 255 , { 195, 191 }}, - { 0 , { 196, 128 }}, - { 1 , { 196, 129 }}, - { 2 , { 196, 130 }}, - { 3 , { 196, 131 }}, - { 4 , { 196, 132 }}, - { 5 , { 196, 133 }}, - { 6 , { 196, 134 }}, - { 7 , { 196, 135 }}, - { 8 , { 196, 136 }}, - { 9 , { 196, 137 }}, - { 10 , { 196, 138 }}, - { 11 , { 196, 139 }}, - { 12 , { 196, 140 }}, - { 13 , { 196, 141 }}, - { 14 , { 196, 142 }}, - { 15 , { 196, 143 }}, - { 16 , { 196, 144 }}, - { 17 , { 196, 145 }}, - { 18 , { 196, 146 }}, - { 19 , { 196, 147 }}, - { 20 , { 196, 148 }}, - { 21 , { 196, 149 }}, - { 22 , { 196, 150 }}, - { 23 , { 196, 151 }}, - { 24 , { 196, 152 }}, - { 25 , { 196, 153 }}, - { 26 , { 196, 154 }}, - { 27 , { 196, 155 }}, - { 28 , { 196, 156 }}, - { 29 , { 196, 157 }}, - { 30 , { 196, 158 }}, - { 31 , { 196, 159 }}, - { 32 , { 196, 160 }}, - { 127 , { 196, 161 }}, - { 128 , { 196, 162 }}, - { 129 , { 196, 163 }}, - { 130 , { 196, 164 }}, - { 131 , { 196, 165 }}, - { 132 , { 196, 166 }}, - { 133 , { 196, 167 }}, - { 134 , { 196, 168 }}, - { 135 , { 196, 169 }}, - { 136 , { 196, 170 }}, - { 137 , { 196, 171 }}, - { 138 , { 196, 172 }}, - { 139 , { 196, 173 }}, - { 140 , { 196, 174 }}, - { 141 , { 196, 175 }}, - { 142 , { 196, 176 }}, - { 143 , { 196, 177 }}, - { 144 , { 196, 178 }}, - { 145 , { 196, 179 }}, - { 146 , { 196, 180 }}, - { 147 , { 196, 181 }}, - { 148 , { 196, 182 }}, - { 149 , { 196, 183 }}, - { 150 , { 196, 184 }}, - { 151 , { 196, 185 }}, - { 152 , { 196, 186 }}, - { 153 , { 196, 187 }}, - { 154 , { 196, 188 }}, - { 155 , { 196, 189 }}, - { 156 , { 196, 190 }}, - { 157 , { 196, 191 }}, - { 158 , { 197, 128 }}, - { 159 , { 197, 129 }}, - { 160 , { 197, 130 }}, - { 173 , { 197, 131 }} - }; +const std::array, 256> create_bytes_to_chars_map() { + return {{ + { 196, 128 }, + { 196, 129 }, + { 196, 130 }, + { 196, 131 }, + { 196, 132 }, + { 196, 133 }, + { 196, 134 }, + { 196, 135 }, + { 196, 136 }, + { 196, 137 }, + { 196, 138 }, + { 196, 139 }, + { 196, 140 }, + { 196, 141 }, + { 196, 142 }, + { 196, 143 }, + { 196, 144 }, + { 196, 145 }, + { 196, 146 }, + { 196, 147 }, + { 196, 148 }, + { 196, 149 }, + { 196, 150 }, + { 196, 151 }, + { 196, 152 }, + { 196, 153 }, + { 196, 154 }, + { 196, 155 }, + { 196, 156 }, + { 196, 157 }, + { 196, 158 }, + { 196, 159 }, + { 196, 160 }, + { 33 }, + { 34 }, + { 35 }, + { 36 }, + { 37 }, + { 38 }, + { 39 }, + { 40 }, + { 41 }, + { 42 }, + { 43 }, + { 44 }, + { 45 }, + { 46 }, + { 47 }, + { 48 }, + { 49 }, + { 50 }, + { 51 }, + { 52 }, + { 53 }, + { 54 }, + { 55 }, + { 56 }, + { 57 }, + { 58 }, + { 59 }, + { 60 }, + { 61 }, + { 62 }, + { 63 }, + { 64 }, + { 65 }, + { 66 }, + { 67 }, + { 68 }, + { 69 }, + { 70 }, + { 71 }, + { 72 }, + { 73 }, + { 74 }, + { 75 }, + { 76 }, + { 77 }, + { 78 }, + { 79 }, + { 80 }, + { 81 }, + { 82 }, + { 83 }, + 
{ 84 }, + { 85 }, + { 86 }, + { 87 }, + { 88 }, + { 89 }, + { 90 }, + { 91 }, + { 92 }, + { 93 }, + { 94 }, + { 95 }, + { 96 }, + { 97 }, + { 98 }, + { 99 }, + { 100 }, + { 101 }, + { 102 }, + { 103 }, + { 104 }, + { 105 }, + { 106 }, + { 107 }, + { 108 }, + { 109 }, + { 110 }, + { 111 }, + { 112 }, + { 113 }, + { 114 }, + { 115 }, + { 116 }, + { 117 }, + { 118 }, + { 119 }, + { 120 }, + { 121 }, + { 122 }, + { 123 }, + { 124 }, + { 125 }, + { 126 }, + { 196, 161 }, + { 196, 162 }, + { 196, 163 }, + { 196, 164 }, + { 196, 165 }, + { 196, 166 }, + { 196, 167 }, + { 196, 168 }, + { 196, 169 }, + { 196, 170 }, + { 196, 171 }, + { 196, 172 }, + { 196, 173 }, + { 196, 174 }, + { 196, 175 }, + { 196, 176 }, + { 196, 177 }, + { 196, 178 }, + { 196, 179 }, + { 196, 180 }, + { 196, 181 }, + { 196, 182 }, + { 196, 183 }, + { 196, 184 }, + { 196, 185 }, + { 196, 186 }, + { 196, 187 }, + { 196, 188 }, + { 196, 189 }, + { 196, 190 }, + { 196, 191 }, + { 197, 128 }, + { 197, 129 }, + { 197, 130 }, + { 194, 161 }, + { 194, 162 }, + { 194, 163 }, + { 194, 164 }, + { 194, 165 }, + { 194, 166 }, + { 194, 167 }, + { 194, 168 }, + { 194, 169 }, + { 194, 170 }, + { 194, 171 }, + { 194, 172 }, + { 197, 131 }, + { 194, 174 }, + { 194, 175 }, + { 194, 176 }, + { 194, 177 }, + { 194, 178 }, + { 194, 179 }, + { 194, 180 }, + { 194, 181 }, + { 194, 182 }, + { 194, 183 }, + { 194, 184 }, + { 194, 185 }, + { 194, 186 }, + { 194, 187 }, + { 194, 188 }, + { 194, 189 }, + { 194, 190 }, + { 194, 191 }, + { 195, 128 }, + { 195, 129 }, + { 195, 130 }, + { 195, 131 }, + { 195, 132 }, + { 195, 133 }, + { 195, 134 }, + { 195, 135 }, + { 195, 136 }, + { 195, 137 }, + { 195, 138 }, + { 195, 139 }, + { 195, 140 }, + { 195, 141 }, + { 195, 142 }, + { 195, 143 }, + { 195, 144 }, + { 195, 145 }, + { 195, 146 }, + { 195, 147 }, + { 195, 148 }, + { 195, 149 }, + { 195, 150 }, + { 195, 151 }, + { 195, 152 }, + { 195, 153 }, + { 195, 154 }, + { 195, 155 }, + { 195, 156 }, + { 195, 157 }, + { 195, 158 }, + { 195, 159 }, + { 195, 160 }, + { 195, 161 }, + { 195, 162 }, + { 195, 163 }, + { 195, 164 }, + { 195, 165 }, + { 195, 166 }, + { 195, 167 }, + { 195, 168 }, + { 195, 169 }, + { 195, 170 }, + { 195, 171 }, + { 195, 172 }, + { 195, 173 }, + { 195, 174 }, + { 195, 175 }, + { 195, 176 }, + { 195, 177 }, + { 195, 178 }, + { 195, 179 }, + { 195, 180 }, + { 195, 181 }, + { 195, 182 }, + { 195, 183 }, + { 195, 184 }, + { 195, 185 }, + { 195, 186 }, + { 195, 187 }, + { 195, 188 }, + { 195, 189 }, + { 195, 190 }, + { 195, 191 }, + }}; } void BytesToChars::validate_and_infer_types() { @@ -1544,8 +1546,8 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i new_begins[i] = char_pointer; for (size_t k = 0; k < word_len; ++k) { - for (auto byte : m_bytes_to_chars.at(chars[begins[i] + k])) { - new_chars[char_pointer++] = static_cast (byte); + for (auto byte : m_bytes_to_chars[chars[begins[i] + k]]) { + new_chars[char_pointer++] = byte; } } new_ends[i] = char_pointer; @@ -1841,15 +1843,15 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); -// std::cerr << "[ BPETokenizer ] String: '" << str << "'\n"; + std::cerr << "[ BPETokenizer ] String: '" << str << "'\n"; // std::cerr << "[ BPETokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; std::vector results = tokenizer.Tokenize(str); for (const 
core::Token& token : results) { -// std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ -// << ", offset: (" << token.offset_.first << ", " -// << token.offset_.second << ")." << std::endl; + std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ + << ", offset: (" << token.offset_.first << ", " + << token.offset_.second << ")." << std::endl; OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); new_elems[ragged_offset++] = token.id_; }; @@ -2164,3 +2166,130 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) { return {const_node}; } + +void VocabDecoder::validate_and_infer_types() { +// check_ragged_string_input(this, 0); + check_string_input(this, 1); + set_ragged_string_output(this, 0, get_input_partial_shape(0)); +} + + +bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto batch_size = inputs[0].get_shape()[0]; + auto seq_len = inputs[0].get_shape()[1]; + auto input_data = inputs[0].data(); + + auto vocab_begins = inputs[1].data(); + auto vocab_ends = inputs[2].data(); + auto vocab_chars = inputs[3].data(); + auto vocab_size = inputs[1].get_size(); + + std::vector> vocab; + vocab.resize(vocab_size); + + OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern"); + + for(size_t id = 0; id < vocab_size; ++id) { + std::vector token = std::vector(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[id] = token; + } + // Set output shapes + outputs[0].set_shape({batch_size}); + outputs[1].set_shape({batch_size}); + outputs[2].set_shape({batch_size * seq_len}); + outputs[3].set_shape({batch_size * seq_len}); + outputs[4].set_shape({batch_size * seq_len * 100}); // 100 chars - max token length + const size_t num_rows = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_ragged_begins = outputs[0].data(); + auto new_ragged_ends = outputs[1].data(); + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + auto new_chars = outputs[4].data(); + uint32_t char_offset = 0; + + for(size_t batch = 0; batch < batch_size; ++batch) { + new_ragged_begins[batch] = batch * seq_len; + new_ragged_ends[batch] = new_ragged_begins[batch] + seq_len; + + for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) { + auto token_id = input_data[seq]; + auto token = vocab[token_id]; + + std::copy(token.begin(), token.end(), &new_chars[char_offset]); + + new_begins[seq] = char_offset; + char_offset += token.size(); + new_ends[seq] = char_offset; + } + } + outputs[4].set_shape({char_offset}); + return true; +} + + +void CharsToBytes::validate_and_infer_types() { + check_ragged_string_input(this, 0); +// set_ragged_string_output(this, 0, get_input_partial_shape(0)); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +std::array, 4> CharsToBytes::create_pair_map() { + auto bytes_to_chars = create_bytes_to_chars_map(); + std::array, 4> pair_map; + + for (int i=0; i < bytes_to_chars.size(); ++i) { + std::vector chars = bytes_to_chars[i]; + if (chars.size() == 2) { + pair_map[chars[0] - 194][chars[1] - 128] = i; + }; + }; + + return pair_map; +} + + +bool CharsToBytes::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = 
inputs[4].data(); + + OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to CharsToBytes, it means it is not converted properly or it is not used in the supported pattern"); + + // Set output shapes +// outputs[0] = inputs[0]; +// outputs[1] = inputs[1]; + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + outputs[2].set_shape(Shape({inputs[4].get_size()})); + const size_t num_rows = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_chars = outputs[2].data(); + uint32_t char_pointer = 0; + + for(size_t row = 0; row < num_rows; ++row) { + new_begins[row] = char_pointer; + for(size_t col = ragged_begins[row]; col < ragged_ends[row]; ++col) { + const auto word_len = ends[col] - begins[col]; + + for (size_t k = 0; k < word_len; ++k) { + const auto first_byte = chars[begins[col] + k]; + if (first_byte < m_one_byte_border) { + new_chars[char_pointer++] = first_byte; + } else { + const auto second_byte = chars[begins[col] + (++k)]; + new_chars[char_pointer++] = m_pair_map[first_byte - 194][second_byte - 128]; + } + } + }; + new_ends[row] = char_pointer; + } + outputs[2].set_shape({char_pointer}); + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index 2899ab813..aaa65f321 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -337,7 +337,7 @@ ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::N ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); -const std::unordered_map> create_bytes_to_chars_map(); +const std::array, 256> create_bytes_to_chars_map(); class OPENVINO_API BytesToChars : public ov::op::Op { @@ -370,7 +370,7 @@ class OPENVINO_API BytesToChars : public ov::op::Op { } private: - const std::unordered_map> m_bytes_to_chars = create_bytes_to_chars_map(); + const std::array, 256> m_bytes_to_chars = create_bytes_to_chars_map(); }; @@ -486,3 +486,70 @@ class OPENVINO_API RaggedToDense : public ov::op::Op { ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); ov::OutputVector translate_const(const ov::frontend::NodeContext& node); + + +class OPENVINO_API VocabDecoder : public ov::op::Op { +public: + OPENVINO_OP("VocabDecoder"); + + VocabDecoder () = default; + + VocabDecoder(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + std::cerr << "[ clone_with_new_inputs ] Number of inputs: " << inputs.size() << "\n"; + for (int i=0; i < inputs.size(); ++i) { + std::cerr << "[ clone_with_new_inputs ] " << inputs[i].get_partial_shape().to_string() << ";\n"; + }; + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; + +class OPENVINO_API CharsToBytes : public ov::op::Op { +public: + OPENVINO_OP("CharsToBytes"); + + CharsToBytes () = default; + + CharsToBytes(const ov::OutputVector& arguments) : + 
ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + + std::array, 4> create_pair_map(); + +private: + const std::array, 4> m_pair_map = create_pair_map(); + const uint8_t m_one_byte_border = 128; // if char > 128 => it is two byte char +}; diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py index a741eb282..a6d4ffb7a 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -219,7 +219,7 @@ def whitespace_splitter(cls) -> "RegexSplitStep": @classmethod def byte_level_splitter(cls) -> "RegexSplitStep": return cls( - r"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)", + r"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\s\\p{L}\\p{N}]+|\s+", invert=False, behaviour="isolate", ) @@ -339,9 +339,11 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep": ) def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + pipeline = self.get_pipeline() + pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs() input_nodes.extend( ( - *self.create_string_constant_node(self.vocab).outputs(), + *self.get_pipeline().vocab_node_outputs, *self.create_string_constant_node(self.merges).outputs(), ) ) @@ -649,11 +651,33 @@ def get_ov_subgraph(self, input_nodes): return outputs +@dataclass +class DecodingStep(BasePipelineStep): + pass + + +@dataclass +class VocabDecoderStep(DecodingStep): + def get_vocab_node_outputs(self) -> Optional[List[Output]]: + return self.get_pipeline().vocab_node_outputs + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + input_nodes.extend(self.get_vocab_node_outputs()) + return core.make_node("VocabDecoder", input_nodes, {}).outputs() + + +@dataclass +class CharsToBytesStep(DecodingStep): + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + return core.make_node("CharsToBytes", input_nodes, {}).outputs() + + @dataclass class TokenizerPipeline: steps: List[BasePipelineStep] = field(default_factory=list) vocab: Optional[List[str]] = field(default=None, repr=False) number_of_inputs: int = 1 + vocab_node_outputs: Optional[List[Output]] = field(default=None, repr=False) def get_config(self) -> Dict[str, Dict[str, Any]]: return {type(step).__name__: step.get_config() for step in self.steps} @@ -692,9 +716,16 @@ def tokenization_steps(self) -> List[TokenizationModelStep]: def post_tokenization_steps(self) -> List[PostTokenizationStep]: return [step for step in self.steps if isinstance(step, PostTokenizationStep)] + @property + def decoding_steps(self) -> List[PostTokenizationStep]: + return [step for step in self.steps if isinstance(step, DecodingStep)] + def create_string_input(self) -> Node: return op.Parameter(Type.u8, PartialShape(["?"])) + def create_int_input(self, input_type=Type.i32) -> Node: + return op.Parameter(input_type, PartialShape(["?", "?", "?"])) + def create_processing_pipeline(self, 
input_nodes: List[op.Parameter]) -> List[Node]:
         processing_pipelines_outputs = []
 
@@ -715,7 +746,7 @@ def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[No
 
         return processing_pipelines_outputs
 
-    def create_post_tokenization_pipeline(self, input_nodes):
+    def create_post_tokenization_pipeline(self, input_nodes: List[op.Parameter]) -> List[Output]:
         #outputs = []
         for step in self.post_tokenization_steps:
             pipeline_step = step.get_ov_subgraph(input_nodes)
@@ -732,11 +763,39 @@ def create_post_tokenization_pipeline(self, input_nodes):
         #outputs.insert(0, input_nodes[0])
         return input_nodes
 
-    def get_ov_subgraph(self) -> Model:
+    def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
+        # input_nodes = inputs.outputs()
+        for step in self.decoding_steps:
+            pipeline_step = step.get_ov_subgraph(input_nodes)
+            input_nodes = pipeline_step
+
+        return core.make_node("StringTensorPack", input_nodes).outputs()
+
+    def get_encoder_ov_subgraph(self) -> Model:
         input_nodes = [self.create_string_input() for _ in range(self.number_of_inputs)]
         processing_outputs = self.create_processing_pipeline(input_nodes)
         outputs = self.create_post_tokenization_pipeline(processing_outputs)
 
+        return Model(outputs, input_nodes, name="tokenizer_encoder")
+
+    def get_greedy_decoding_ov_subgraph(self, input_node: op.Parameter) -> List[Output]:
+        argmax = opset10.topk(
+            data=input_node,
+            k=1,
+            axis=-1,
+            mode="max",
+            sort="none",
+            name="ArgMax",
+        )
+        return opset10.squeeze(
+            data=argmax.output(1),
+            axes=-1,
+        ).outputs()
 
-        return Model(outputs, input_nodes, name="tokenizer")
-
+    def get_decoder_ov_subgraph(self) -> Model:
+        input_node = self.create_int_input()
+        argmax = self.get_greedy_decoding_ov_subgraph(input_node)
+        outputs = self.create_decoding_pipeline(argmax)
+        model = Model(outputs, [input_node], name="tokenizer_decoder")
+        model.output().tensor.add_names({"string_output"})
+        return model

From b45e5ec7a42870c65ec2d3c81660ad645efe3f88 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 30 Jun 2023 13:03:22 +0100
Subject: [PATCH 032/116] Fix some regex bugs for byte-level splitter

---
 .../user_ie_extensions/sentence_piece/hf_parser.py         |  4 +++-
 .../sentence_piece/tokenizer_pipeline.py                   | 10 +++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py
index e6936e50e..bf71481ad 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py
@@ -80,8 +80,10 @@ def parse_byte_level_pretokenization_step(
     if pretokenizer_dict.get("add_prefix_space"):
         steps.append(RegExpNormalizationStep(regex_search_pattern="^(\S)", replace_term=" $1"))
 
-    # regex is used by default, but it does not appeared in config yet
+    # regex is used by default, but it does not appear in config yet
     if pretokenizer_dict.get("use_regex", True):
+        # re2 does not support negative lookahead, so two steps are used to replicate the behaviour
+        steps.append(RegexSplitStep.add_whitespace_to_the_next_word())
         steps.append(RegexSplitStep.byte_level_splitter())
 
     steps.append(BytesToCharsStep())
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py
index a741eb282..7a47137d0 100644
---
a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py @@ -219,11 +219,19 @@ def whitespace_splitter(cls) -> "RegexSplitStep": @classmethod def byte_level_splitter(cls) -> "RegexSplitStep": return cls( - r"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\s\\p{L}\\p{N}]+|\s+", + r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+", invert=False, behaviour="isolate", ) + @classmethod + def add_whitespace_to_the_next_word(cls): + return cls( + r"\s\S", + invert=False, + behaviour="merge_with_next" + ) + @classmethod def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep": return cls( From 5f03ed0df1ed5731dabdb8cbe83abadd97c4a1d0 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 7 Jul 2023 17:50:59 +0100 Subject: [PATCH 033/116] Fix bug with VocabDecoder shape --- .../sentence_piece/hf_parser.py | 1 - .../sentence_piece/sentence_piece.cpp | 21 ++++++++++--------- .../sentence_piece/sentence_piece.hpp | 4 ---- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py index bf71481ad..dae5b5a99 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py @@ -65,7 +65,6 @@ def parse_strip_step(split_dict: Dict[str, Any]) -> StripStringStep: def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep: split_pattern = pretokenizer_dict["pattern"].get("String") or pretokenizer_dict["pattern"]["Regex"] - print(pretokenizer_dict["behavior"], pretokenizer_dict["behavior"].lower().rstrip("d")) return RegexSplitStep( split_pattern=split_pattern, invert=pretokenizer_dict["invert"], diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index a1bfd5df9..fe5ba77fb 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -987,7 +987,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto split_pattern_buf = inputs[3].data(); auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - std::cerr << "[ RegexSplit ] Split Pattern: `" << split_pattern << "`\n"; +// std::cerr << "[ RegexSplit ] Split Pattern: `" << split_pattern << "`, behaviour: " << m_behaviour << "\n"; const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[2].get_size(); @@ -1014,26 +1014,26 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); for(size_t seq = 0; seq < num_rows; ++seq) { - std::cerr << "================= Seq: " << seq << " ====================\n"; - std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; +// std::cerr << "================= Seq: " << seq << " ====================\n"; +// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << 
ragged_ends[seq] << "\n"; new_ragged_begins[seq] = ragged_offset; for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); - std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; +// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); pretokenizer(&pretokenized); size_t num_splits = pretokenized.GetSplitsSize(); - std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; +// std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; for (size_t j = 0; j < num_splits; ++j) { auto split = pretokenized.GetSplit(j); const auto& value = split.normalized_.GetStr(); auto offset = split.normalized_.GetOrginalOffset(); - std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; - std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; +// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; +// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; new_begins[ragged_offset] = begins[ragged_col] + offset.first; new_ends[ragged_offset++] = begins[ragged_col] + offset.second; }; @@ -1056,7 +1056,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto split_pattern_buf = inputs[5].data(); auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant -// std::cerr << "Split Pattern: " << split_pattern << "\n"; +// std::cerr << "Split Pattern: `" << split_pattern << "`, behaviour: " << m_behaviour << "\n"; outputs[4] = inputs[4]; const size_t num_rows = inputs[0].get_size(); @@ -1242,7 +1242,7 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there inputs.push_back(delim_regex_pattern); - // TODO: Use node.get_input(2) with keep_delim_regex_pattern, most likely it should be handled in another RegexSplit with `isolated` behaviour + // TODO: Use node.get_input(2) with keep_delim_regex_pattern, most likely it should be handled in another RegexSplit with `isolate` behaviour auto outputs = std::make_shared(inputs)->outputs(); auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]}); return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) }; @@ -2170,7 +2170,8 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) { void VocabDecoder::validate_and_infer_types() { // check_ragged_string_input(this, 0); check_string_input(this, 1); - set_ragged_string_output(this, 0, get_input_partial_shape(0)); + const auto shape = get_input_partial_shape(0); + set_ragged_string_output(this, 0, {shape[0]}); } diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp index aaa65f321..7e57f51d0 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ 
b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp @@ -502,10 +502,6 @@ class OPENVINO_API VocabDecoder : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - std::cerr << "[ clone_with_new_inputs ] Number of inputs: " << inputs.size() << "\n"; - for (int i=0; i < inputs.size(); ++i) { - std::cerr << "[ clone_with_new_inputs ] " << inputs[i].get_partial_shape().to_string() << ";\n"; - }; return std::make_shared(inputs); } From 2a655024c25b590f31808944c21bde2a48f4e106 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Mon, 10 Jul 2023 19:11:28 +0400 Subject: [PATCH 034/116] Minor changes for natively supported strings --- .../user_ie_extensions/sentence_piece/sentence_piece.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp index b8e866f10..a28b8fdc0 100644 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp @@ -273,12 +273,13 @@ void check_string_scalar_input(const Node* node, size_t input_index) { auto shape = node->get_input_partial_shape(input_index); auto element_type = node->get_input_element_type(input_index); - #if USE_STRING_TENSORS + #if false && USE_STRING_TENSORS + // This block is not used when we convert ops to decomposed representation (and we really do) OPENVINO_ASSERT( (element_type == element::dynamic || element_type == element::string) && (shape.rank().is_dynamic() || shape.rank().get_length() == 0), - "string/0D tensor is expected"); + "string/0D tensor is expected, but observed: " + element_type.get_type_name() + shape.to_string()); #else @@ -528,7 +529,7 @@ bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVec #endif #if USE_STRING_TENSORS - OPENVINO_ASSERT(false, "Detected a u8 tensor but element::string tensor should be provided") + OPENVINO_ASSERT(false, "Detected a u8 tensor but element::string tensor should be provided"); #endif int32_t batch_size; @@ -929,7 +930,7 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont // TODO: How to translate attribute `replace_global`? 
#if USE_STRING_TENSORS
-    return std::make_shared(element::string, {}, value);
+    return std::make_shared(element::string, Shape{}, &value);
 #else
     return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data());
 #endif

From a6f91101540099b80cefcd5a465ae847c0776fe6 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Mon, 10 Jul 2023 19:22:02 +0400
Subject: [PATCH 035/116] Suppressed minor warnings about int32 -> unsigned
 implicit conversions

---
 .../sentence_piece/sentence_piece.cpp         | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
index 856df79e8..86bf63638 100644
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp
@@ -1044,8 +1044,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
     }
 
     // Fix real shape based on collected results
-    outputs[2].set_shape({ragged_offset});
-    outputs[3].set_shape({ragged_offset});
+    outputs[2].set_shape({size_t(ragged_offset)});
+    outputs[3].set_shape({size_t(ragged_offset)});
 
     } else {
         auto ragged_begins = inputs[0].data();
@@ -1125,8 +1125,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
     }
 
     // Fix real shape based on collected results
-    outputs[2].set_shape({ragged_offset});
-    outputs[3].set_shape({ragged_offset});
+    outputs[2].set_shape({size_t(ragged_offset)});
+    outputs[3].set_shape({size_t(ragged_offset)});
 
     }
 
 #if 1
@@ -1650,7 +1650,7 @@ bool WordpieceTokenizer::evaluate(ov::TensorVec
         }
         new_ends[seq] = ragged_offset;
     }
-    outputs[2].set_shape({ragged_offset});
+    outputs[2].set_shape({size_t(ragged_offset)});
     return true;
 #else
@@ -1860,7 +1860,7 @@ bool BPETokenizer::evaluate(ov::TensorVector& i
         new_ends[seq] = ragged_offset;
     }
 
-    outputs[2].set_shape({ragged_offset});
+    outputs[2].set_shape({size_t(ragged_offset)});
     return true;
 #else

From 5c29254e2a311f72edf475f3310baf61f14a3153 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Mon, 10 Jul 2023 22:08:17 +0400
Subject: [PATCH 036/116] Restructured sentence_piece directory to tokenizer
 directory: split all ops, translators, and helpers into individual files. To
 build, use the tokenizer custom op name in cmake instead of sentence_piece.
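
Note: the new umbrella header included from ov_extension.cpp presumably just
aggregates the per-op headers created below. A minimal sketch of what
tokenizer/tokenizer.hpp might look like, assuming it only re-exports the
headers listed in the file list that follows (the real 23-line file is not
shown in this patch, so its exact contents are an assumption):

    // tokenizer/tokenizer.hpp -- hypothetical aggregate header (sketch only);
    // pulls every split-out op and the TF translators into one include
    #pragma once

    #include "bpe_tokenizer.hpp"
    #include "bytes_to_chars.hpp"
    #include "case_fold.hpp"
    #include "chars_to_bytes.hpp"
    #include "combine_segments.hpp"
    #include "normalize_unicode.hpp"
    #include "ragged_tensor_pack.hpp"
    #include "ragged_to_dense.hpp"
    #include "regex_normalization.hpp"
    #include "regex_split.hpp"
    #include "sentence_piece.hpp"
    #include "string_tensor_pack.hpp"
    #include "string_tensor_unpack.hpp"
    #include "tensorflow_translators.hpp"
    #include "utils.hpp"
    #include "vocab_decoder.hpp"
    #include "wordpiece_tokenizer.hpp"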
--- .../user_ie_extensions/CMakeLists.txt | 5 +- .../user_ie_extensions/ov_extension.cpp | 10 +- .../sentence_piece/sentence_piece.cpp | 2297 ----------------- .../sentence_piece/sentence_piece.hpp | 551 ---- .../CMakeLists.txt | 0 .../tokenizer/bpe_tokenizer.cpp | 153 ++ .../tokenizer/bpe_tokenizer.hpp | 59 + .../tokenizer/bytes_to_chars.cpp | 321 +++ .../tokenizer/bytes_to_chars.hpp | 43 + .../tokenizer/case_fold.cpp | 25 + .../tokenizer/case_fold.hpp | 34 + .../tokenizer/chars_to_bytes.cpp | 77 + .../tokenizer/chars_to_bytes.hpp | 41 + .../tokenizer/combine_segments.cpp | 142 + .../tokenizer/combine_segments.hpp | 35 + .../convert_tokenizer.py | 0 .../hf_parser.py | 0 .../tokenizer/normalize_unicode.cpp | 35 + .../tokenizer/normalize_unicode.hpp | 41 + .../tokenizer/ragged_tensor_pack.cpp | 39 + .../tokenizer/ragged_tensor_pack.hpp | 38 + .../tokenizer/ragged_to_dense.cpp | 88 + .../tokenizer/ragged_to_dense.hpp | 36 + .../tokenizer/regex_normalization.cpp | 38 + .../tokenizer/regex_normalization.hpp | 35 + .../tokenizer/regex_split.cpp | 205 ++ .../tokenizer/regex_split.hpp | 44 + .../tokenizer/sentence_piece.cpp | 209 ++ .../tokenizer/sentence_piece.hpp | 41 + .../{sentence_piece => tokenizer}/str_pack.py | 0 .../tokenizer/string_tensor_pack.cpp | 56 + .../tokenizer/string_tensor_pack.hpp | 43 + .../tokenizer/string_tensor_unpack.cpp | 131 + .../tokenizer/string_tensor_unpack.hpp | 49 + .../tokenizer/tensorflow_translators.cpp | 251 ++ .../tokenizer/tensorflow_translators.hpp | 18 + .../tokenizer/tokenizer.hpp | 23 + .../tokenizer_pipeline.py | 0 .../user_ie_extensions/tokenizer/utils.cpp | 228 ++ .../user_ie_extensions/tokenizer/utils.hpp | 70 + .../tokenizer/vocab_decoder.cpp | 74 + .../tokenizer/vocab_decoder.hpp | 35 + .../tokenizer/wordpiece_tokenizer.cpp | 109 + .../tokenizer/wordpiece_tokenizer.hpp | 44 + 44 files changed, 2918 insertions(+), 2855 deletions(-) delete mode 100644 modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp delete mode 100644 modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp rename modules/custom_operations/user_ie_extensions/{sentence_piece => tokenizer}/CMakeLists.txt (100%) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp rename modules/custom_operations/user_ie_extensions/{sentence_piece => tokenizer}/convert_tokenizer.py (100%) rename modules/custom_operations/user_ie_extensions/{sentence_piece => tokenizer}/hf_parser.py (100%) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp create mode 100644 
modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp rename modules/custom_operations/user_ie_extensions/{sentence_piece => tokenizer}/str_pack.py (100%) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp rename modules/custom_operations/user_ie_extensions/{sentence_piece => tokenizer}/tokenizer_pipeline.py (100%) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt index 65a13360e..6c6a9fcbe 100644 --- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt @@ -80,8 +80,9 @@ if(TBB_FOUND) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb TBB::tbbmalloc) endif() -if(sentence_piece IN_LIST CUSTOM_OPERATIONS) - add_subdirectory(sentence_piece) +# Left sentence_piece for backward compatibility +if(tokenizer IN_LIST CUSTOM_OPERATIONS) + add_subdirectory(tokenizer) endif() target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index a3fa79acb..1fec891c9 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -52,9 +52,9 @@ # define S_CONV_EXT #endif -#ifdef sentence_piece -# include "sentence_piece/sentence_piece.hpp" -# define SENTENSE_PIECE_EXT \ +#ifdef tokenizer +# include 
"tokenizer/tokenizer.hpp" +# define TOKENIZER_EXT \ std::make_shared>(), \ std::make_shared>(), \ std::make_shared>(), \ @@ -81,7 +81,7 @@ std::make_shared("SentencepieceOp", translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), #else -# define SENTENSE_PIECE_EXT +# define TOKENIZER_EXT #endif OPENVINO_CREATE_EXTENSIONS(std::vector( @@ -91,5 +91,5 @@ OPENVINO_CREATE_EXTENSIONS(std::vector( S_CONV_TRANSPOSE_EXT S_CONV_EXT COMPLEX_MUL_EXT - SENTENSE_PIECE_EXT + TOKENIZER_EXT })); diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp deleted file mode 100644 index 86bf63638..000000000 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ /dev/null @@ -1,2297 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include "normalizer.h" -#include "sentence_piece.hpp" - -#include "openvino/op/util/framework_node.hpp" -#include "openvino/opsets/opset10.hpp" - -#include "fast_tokenizer/normalizers/normalizers.h" -#include "fast_tokenizer/models/models.h" -#include "fast_tokenizer/pretokenizers/pretokenizers.h" - -// TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor - -#ifndef OPENVINO_ELEMENT_STRING_SUPPORTED - #define OPENVINO_ELEMENT_STRING_SUPPORTED 0 -#endif - -#ifndef OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK - #define OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK 0 -#endif - -#define USE_STRING_TENSORS 0 // modify this depending on willingness to use explicit string tensors - -#if USE_STRING_TENSORS && !OPENVINO_ELEMENT_STRING_SUPPORTED - #error "USE_STRING_TENSORS = 1 can be used only when OpenVINO supports element::string that is determined by OPENVINO_ELEMENT_STRING_SUPPORTED == 1" -#endif - -#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS 0 - -using sentencepiece::SentencePieceProcessor; -using namespace TemplateExtension; -using namespace ov; -using namespace ov::frontend; -using namespace ov::opset10; - -namespace { - template - T extract_scalar_const_value(const std::shared_ptr& node, const std::string& const_name) { - auto const_node = as_type_ptr(node); - FRONT_END_GENERAL_CHECK(const_node, "Conversion expects " + const_name + " to be constant."); - std::vector const_value = const_node->cast_vector(); - FRONT_END_GENERAL_CHECK(const_value.size() == 1, "Conversion expects " + const_name + " to be a scalar."); - return const_value[0]; - } -} // namespace - -SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t nbest_size, float alpha, - bool add_bos, bool add_eos, bool reverse) : m_sp(std::make_shared()), - m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), - m_reverse(reverse), Op(args) { - auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); - FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant."); - auto spm_model = static_cast(sp_model_const->get_data_ptr()); - auto spm_model_size = sp_model_const->get_byte_size(); - - // configure SentencePieceProcessor - std::string model_proto(spm_model, spm_model_size); - CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); - - // form extra options to configure SentencePieceProcessor - std::string extra_options = ""; - if (m_add_bos) { - extra_options += "bos"; - } - if (m_add_eos) { - extra_options = 
extra_options.empty() ? extra_options : extra_options + ":"; - extra_options += "eos"; - } - /* TODO: TF ignores this option, so we are ignoring it as well; need to understand what should we do - if (m_reverse) { - extra_options = extra_options.empty() ? extra_options : extra_options + ":"; - extra_options += "reverse"; - } - */ - // example of extra_options, if "bos:eos:reverse" - CHECK_OK(m_sp->SetEncodeExtraOptions(extra_options)); - constructor_validate_and_infer_types(); -} - -SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const std::shared_ptr& sp, - int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse) : m_sp(sp), - m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), - m_reverse(reverse), Op(args) { - constructor_validate_and_infer_types(); -} - -void SentencepieceTokenizer::validate_and_infer_types() { - - #if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS - - FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)"); - FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); - FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor"); - FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor"); - FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor"); - - #else - - FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences"); - FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); - - #if USE_STRING_TENSORS - - #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK - FRONT_END_GENERAL_CHECK( - get_input_element_type(1) == element::string || get_input_element_type(1) == element::u8, - "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 or string depending on the current stage of model preparation"); - #else - FRONT_END_GENERAL_CHECK( - get_input_element_type(1) == element::string, - "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor"); - #endif - - #else - -#if 0 // change to 0 when compiled with master and the bug with data propagation from within inline context is not solved - FRONT_END_GENERAL_CHECK( - get_input_element_type(1) == element::u8, - "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 tensor, but got " + - get_input_element_type(1).get_type_name()); -#endif - - #endif - - #endif - - // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values - // and dense shape - set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) }); - set_output_type(1, element::i32, PartialShape{ Dimension() }); - set_output_type(2, element::i64, PartialShape{ Dimension(2) }); -} - -bool SentencepieceTokenizer::visit_attributes(AttributeVisitor& visitor) { - visitor.on_attribute("nbest_size", m_nbest_size); - 
visitor.on_attribute("alpha", m_alpha); - visitor.on_attribute("add_bos", m_add_bos); - visitor.on_attribute("add_eos", m_add_eos); - visitor.on_attribute("reverse", m_reverse); - return true; -} - -void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) { - auto strings = packed.data(); - auto bitstream_size = packed.get_byte_size(); - // check the format of the input bitstream representing the string tensor - FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); - batch_size = *reinterpret_cast(strings + 0); - FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size, - "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); - begin_ids = reinterpret_cast(strings + 4); - end_ids = begin_ids + 1; - symbols = strings + 4 + 4 + 4 * batch_size; -} - -bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { - std::vector sparse_indices; - std::vector sparse_values; - std::vector sparse_dense_shape; - -#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS - - auto begin_ids = inputs[1].data(); - auto end_ids = inputs[2].data(); - auto data = inputs[3].data(); - - auto batch_size = shape_size(inputs[1].get_shape()); - -#else - -#if USE_STRING_TENSORS - - #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK - const ov::Tensor& strings_tensor = **reinterpret_cast(inputs[1].data()); - #else - const ov::Tensor& strings_tensor = inputs[1]; - #endif - - const std::string* strings = strings_tensor.data(); - size_t batch_size = ov::shape_size(strings_tensor.get_shape()); - -#else - - // const uint8_t* strings = inputs[1].data(); - // auto bitstream_size = inputs[1].get_byte_size(); - - // // check the format of the input bitstream representing the string tensor - // FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); - // auto batch_size = *reinterpret_cast(strings + 0); - // FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size, - // "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); - // auto begin_ids = reinterpret_cast(strings + 4); - // auto end_ids = begin_ids + 1; - // auto data = strings + 4 + 4 + 4 * batch_size; - int32_t batch_size; - const int32_t* begin_ids; - const int32_t* end_ids; - const uint8_t* data; - parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data); - -#endif - -#endif - //std::cerr << " Batch size: " << batch_size << "\n"; - - size_t max_token_id = 0; - for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { -#if USE_STRING_TENSORS && !SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS - const std::string& sentence = strings[batch_ind]; - //std::cerr << " sentence: " << sentence << "\n"; -#else - auto begin_ind = begin_ids[batch_ind]; - auto end_ind = end_ids[batch_ind]; - //std::string sentence(data + begin_ind, data + end_ind); - absl::string_view sentence((const char*)data + begin_ind, end_ind - begin_ind); - //std::cerr << "string: " << sentence << "\n"; -#endif - std::vector ids; - CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids)); - // put into resulted vectors - for (size_t token_id = 0; token_id < ids.size(); ++token_id) { - sparse_indices.push_back(static_cast(batch_ind)); - 
sparse_indices.push_back(static_cast(token_id)); - sparse_values.push_back(static_cast(ids[token_id])); - } - max_token_id = max_token_id < ids.size() ? ids.size() : max_token_id; - } - sparse_dense_shape.push_back(static_cast(batch_size)); - sparse_dense_shape.push_back(static_cast(max_token_id)); - - outputs[0].set_shape({ sparse_indices.size() / 2, 2 }); - memcpy(outputs[0].data(), sparse_indices.data(), sizeof(int64_t) * sparse_indices.size()); - outputs[1].set_shape({ sparse_values.size() }); - memcpy(outputs[1].data(), sparse_values.data(), sizeof(int32_t) * sparse_values.size()); - outputs[2].set_shape({ 2 }); - memcpy(outputs[2].data(), sparse_dense_shape.data(), sizeof(int64_t) * sparse_dense_shape.size()); - return true; -} - -bool SentencepieceTokenizer::has_evaluate() const { - return true; -} - -std::shared_ptr SentencepieceTokenizer::clone_with_new_inputs(const OutputVector& new_args) const { - return std::make_shared(new_args, m_sp, m_nbest_size, m_alpha, m_add_bos, m_add_eos, m_reverse); -} - -OutputVector translate_sentencepiece_op(const NodeContext& node) { - // extract model to configure SentencePieceTokenizer - auto sp_model_ov_any = node.get_attribute_as_any("model"); - FRONT_END_GENERAL_CHECK(sp_model_ov_any.is(), - "SentencePieceOp configuration model is in incorrect format"); - auto str_spm_model = sp_model_ov_any.as(); - auto sp_model_const = std::make_shared(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data()); - return { sp_model_const }; -} - - - - -void check_string_input(const Node* node, size_t input_index) { - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed string representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed string representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::u8, "Expected a u8 tensor as the third part of the decomposed string representation"); -} - -void check_string_scalar_input(const Node* node, size_t input_index) { - auto shape = node->get_input_partial_shape(input_index); - auto element_type = node->get_input_element_type(input_index); - - #if false && USE_STRING_TENSORS - // This block is not used when we convert ops to decomposed representation (and we really do) - - OPENVINO_ASSERT( - (element_type == element::dynamic || element_type == element::string) && - (shape.rank().is_dynamic() || shape.rank().get_length() == 0), - "string/0D tensor is expected, but observed: " + element_type.get_type_name() + shape.to_string()); - - #else - - OPENVINO_ASSERT( - (element_type == element::dynamic || element_type == element::u8) && - (shape.rank().is_dynamic() || shape.rank().get_length() == 1), - "u8/1D tensor is expected"); - - #endif -} - -void check_ragged_input(const Node* node, size_t input_index) { - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged representation"); - auto rank = node->get_input_partial_shape(input_index+2).rank(); - FRONT_END_GENERAL_CHECK(rank.is_dynamic() || rank.get_length() == 1, "The last tensor in ragged tensor representation should be a 1D tensor"); -} - -void 
check_ragged_string_input(const Node* node, size_t input_index) { - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged string representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged string representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::i32, "Expected an i32 tensor as the third part of the decomposed ragged string representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+3) == element::i32, "Expected an i32 tensor as the forth part of the decomposed ragged string representation"); - FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+4) == element::u8, "Expected a u8 tensor as the fifth part of the decomposed ragged string representation"); -} - -void set_string_output(Node* node, size_t output_index, const PartialShape& shape) { - node->set_output_type(output_index+0, element::i32, shape); // byte offset in output[+2] -- begin of each string - node->set_output_type(output_index+1, element::i32, shape); // byte offset in output[+2] -- end of each string - node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()}); // symbols from all strings concatenated -} - -void set_ragged_string_output(Node* node, size_t output_index, const PartialShape& shape) { - node->set_output_type(output_index+0, element::i32, shape); // element offset in output[+2] -- begin of each ragged dimension elements - node->set_output_type(output_index+1, element::i32, shape); // element offset in output[+3] -- end of each ragged dimension elements - node->set_output_type(output_index+2, element::i32, PartialShape{Dimension()}); // byte offset in output[+4] -- begin of each string - node->set_output_type(output_index+3, element::i32, PartialShape{Dimension()}); // byte offset in output[+4] -- end of each string - node->set_output_type(output_index+4, element::u8, PartialShape{Dimension()}); // symbols from all strings cnocatenated -} - -void set_ragged_output(Node* node, size_t output_index, const PartialShape& shape, element::Type type) { - node->set_output_type(output_index+0, element::i32, shape); // element offset in output[+2] -- begin of each ragged dimension elements - node->set_output_type(output_index+1, element::i32, shape); // element offset in output[+2] -- end of each ragged dimension elements - node->set_output_type(output_index+2, type, PartialShape{Dimension()}); // flatten elements -} - - -void StringTensorPack::validate_and_infer_types() { - OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but get " + m_mode); - check_string_input(this, 0); - #if USE_STRING_TENSORS - set_output_type(0, element::string, get_input_partial_shape(0)); - #else - set_output_type(0, element::u8, PartialShape{Dimension()}); - #endif -} - - -bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { -#if USE_STRING_TENSORS - // TODO - return false; -#else - auto rank = inputs[0].get_shape().size(); - if (rank != 1) { - std::cerr << "[ WARNING ] StringTensorPack ignores the rank " << rank << " of input tensor and set rank=1 in the output\n"; - } - - auto num_elements = shape_size(inputs[0].get_shape()); - auto num_chars = shape_size(inputs[2].get_shape()); - auto num_output_elements = 4*(1 + 1 + num_elements) + num_chars; - 
outputs[0].set_shape(Shape{num_output_elements}); - - // FIXME: Do the repacking, otherwise cannot handle string tensors with gaps between strings - //auto begins = inputs[0].data(); // this is not needed as no repacking happens in this version of code - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - auto output = outputs[0].data(); - auto output_int32 = reinterpret_cast(output); - - *output_int32++ = num_elements; - *output_int32++ = 0; - output_int32 = std::copy(ends, ends + num_elements, output_int32); - output = reinterpret_cast(output_int32); - output = std::copy(chars, chars + num_chars, output); - - OPENVINO_ASSERT(num_output_elements == output - outputs[0].data(), "[ INTERNAL ERROR ] StringTensorPack output tensor is corrupted"); - - // WARNING! Chars are not repacked. If there are gaps between strings, they will remain. - - return true; -#endif -} - - - -void RaggedTensorPack::validate_and_infer_types() { - OPENVINO_ASSERT(get_input_size() == 3); - OPENVINO_ASSERT(get_input_element_type(0) == element::i32); - OPENVINO_ASSERT(get_input_element_type(1) == element::i32); - - // Pass through the base tensor which is used to build ragged dimensions - // TODO: Provide correct implementation that saves information about ragged structure - // TODO: Requires single-tensor packed representation for ragged tensor - set_output_type(0, get_input_element_type(2), get_input_partial_shape(2)); -} - - -bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - // Implementation for debuggin purposes: directly print ragged indices to std::cout and pass the base tensor with elements throug. - - auto input_shape = inputs[0].get_shape(); - //std::cout << "[ DEBUG ] RaggedTensorPack: shape = " << input_shape << "\n"; - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto num_elements = shape_size(input_shape); - - //for(size_t i = 0; i < num_elements; ++i) { - //std::cout << "[ DEBUG ] [" << i << "] " << begins[i] << ":" << ends[i] << " with size = " << ends[i] - begins[i] << "\n"; - //} - - inputs[2].copy_to(outputs[0]); - - return true; -} - - -void StringTensorUnpack::validate_and_infer_types() { - OPENVINO_ASSERT( - get_input_size() == 1, - "Number of inputs for StringTensorUnpack is not equal to 1"); - - auto output_shape = PartialShape::dynamic(); - - - // In case of explicit string tensors the shape is carried by input tensor itself - // OPENVINO_ASSERT( - // input_shape == PartialShape::dynamic(), - // "Excplicitly set shape for a string tensor in the unpacking is not supported"); - - // There are three cases that affect expected element type of the input tensor: - // - when string tensor is passed and we are before the hack is applied (element::string) and - // - when string tensor is passed and we are after the hack in CPU (element::u8) and - // - when stirng tensor is not really used, and we expect a packed string tensor in this case (element::u8) - - OPENVINO_ASSERT( -#if OPENVINO_ELEMENT_STRING_SUPPORTED - get_input_element_type(0) == element::string || -#endif -#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS - get_input_element_type(0) == element::u8 || -#endif - get_input_element_type(0) == element::dynamic, - "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation or when element::string is not supported"); - -#if OPENVINO_ELEMENT_STRING_SUPPORTED - if(get_input_element_type(0) == element::string) { - 
output_shape = get_input_partial_shape(0); - } -#endif - -#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS - if(get_input_element_type(0) == element::u8) - { - #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK - // After the plugin hack, a tensor is represented as a wrapping u8 tensor that will hold a pointer to a string tensor. - // The original shape of a string tensor is stored in RT attribute of a tensor descriptor. - const auto& rt_info = get_input_tensor(0).get_rt_info(); - auto it = rt_info.find("__original_partial_shape"); - - // StringTensorUnpack expects __original_partial_shape attribute of type PartialShape in the input tensor. - // If it is not found that means that model compilation wasn't pass the expected transformation where a string tensor - // is wrapped to a u8 tensor holding a pointer, or because evaluation of this node is in progress and tensor attributes aren't preserved. - if(it != rt_info.end() && it->second.is()) { - output_shape = it->second.as(); - } else { - #endif - #if !USE_STRING_TENSORS - // If string tensors shouldn't be used, then the packed u8 format is also expected - // as an input, but in this case only rank is known - OPENVINO_ASSERT( - get_input_partial_shape(0).rank().is_dynamic() || get_input_partial_shape(0).rank().get_length() == 1, - "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + - get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); - - output_shape = PartialShape({Dimension()}); // [?] - #endif - #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK - } - #endif - } -#endif - - OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supporst only 'begins_ends' mode, but get " + m_mode); - - if (m_mode == "begins_ends") { - set_string_output(this, 0, output_shape); - } -} - -void unpack_strings (const std::string* strings, const Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars) { // TODO: no need for a reference to a ov::Tensor? 
- auto nelements = shape_size(shape); - - size_t total = 0; - for(size_t i = 0; i < nelements; ++i) - total += strings[i].length(); - - begins.set_shape(shape); - ends.set_shape(shape); - chars.set_shape(Shape{total}); - - auto pbegins = begins.data(); - auto pends = ends.data(); - auto poutput_symbols = reinterpret_cast(chars.data()); - size_t offset = 0; - - for(size_t i = 0; i < nelements; ++i) - { - pbegins[i] = offset; - poutput_symbols = std::copy(strings[i].begin(), strings[i].end(), poutput_symbols); - offset += strings[i].length(); - pends[i] = offset; - } -} - -bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto ptensor = &inputs[0]; - #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK - if(ptensor->get_element_type() == element::u8 && ptensor->get_byte_size() == sizeof(void*)) { - auto data = *reinterpret_cast(ptensor->data()); - if(data != nullptr) { - ptensor = reinterpret_cast(data); - } - } - #endif - - auto tensor = *ptensor; - -#if OPENVINO_ELEMENT_STRING_SUPPORTED - if(tensor.get_element_type() == element::string) { - Shape input_shape = tensor.get_shape(); - const std::string* input_strings = tensor.data(); - unpack_strings(input_strings, input_shape, outputs[0], outputs[1], outputs[2]); - return true; - } else { -#endif - -#if USE_STRING_TENSORS - OPENVINO_ASSERT(false, "Detected a u8 tensor but element::string tensor should be provided"); -#endif - - int32_t batch_size; - const int32_t* begin_ids; - const int32_t* end_ids; - const uint8_t* data; - parse_packed_strings(tensor, batch_size, begin_ids, end_ids, data); - auto num_chars = end_ids[batch_size - 1]; - - outputs[0].set_shape(Shape{static_cast(batch_size)}); - outputs[1].set_shape(Shape{static_cast(batch_size)}); - outputs[2].set_shape(Shape{static_cast(num_chars)}); - auto begins = outputs[0].data(); - auto ends = outputs[1].data(); - auto chars = outputs[2].data(); - std::copy(begin_ids, begin_ids + batch_size, begins); - std::copy(end_ids, end_ids + batch_size, ends); - std::copy(data, data + num_chars, chars); - - return true; - -#if OPENVINO_ELEMENT_STRING_SUPPORTED - } -#endif -} - - -void override_parameter (std::shared_ptr node, element::Type type, const PartialShape& shape) { - if (auto parameter = std::dynamic_pointer_cast(node)) { - // TODO: Apply this change conditionally based on real Parameter value - std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n"; - parameter->set_partial_shape(shape); - parameter->set_element_type(type); - parameter->validate_and_infer_types(); - } -} - -// TODO: replace NodeContext and input_index by a single input -OutputVector pre_translate_string_tensor_input(ov::Output input) { - auto input_node = input.get_node_shared_ptr(); - -#if !USE_STRING_TENSORS - override_parameter(input_node, element::u8, PartialShape{Dimension()}); -#endif - - if (auto struct_pack = std::dynamic_pointer_cast(input_node)) { - FRONT_END_GENERAL_CHECK(struct_pack->get_input_size() == 3, "Expected 3 inputs to StringTensorPack which represents a string tensor"); - return struct_pack->input_values(); - } else { - #if USE_STRING_TENSORS || true // always - return std::make_shared(OutputVector{input}, "begins_ends")->outputs(); - #else - // Suppose this is u8 packed string tensor with a single batch dimension - // Unpack this tensor using standard operations - - // Cannot do that because there is not ReinterprectCast operation in OV - // TODO: Find a way to make it without reinterpretation operation or 
introduce it as an extension (easy) - #endif - } -} - - - -OutputVector pre_translate_ragged_tensor_input(ov::Output input) { - auto ragged_pack = dynamic_cast(input.get_node()); - OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it"); - return ragged_pack->input_values(); -} - -OutputVector pre_translate_ragged_string_tensor_input(ov::Output input) { - // auto ragged_pack = dynamic_cast(node.get_input(input_index).get_node()); - // OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it"); - auto ragged_inputs = pre_translate_ragged_tensor_input(input); - auto string_inputs = pre_translate_string_tensor_input(ragged_inputs[2]); - ragged_inputs.pop_back(); - ragged_inputs.insert(ragged_inputs.end(), string_inputs.begin(), string_inputs.end()); - // auto string_pack = dynamic_cast(ragged_pack->get_input_node_ptr(2)); - // OPENVINO_ASSERT(string_pack, "Expected StringTensorPack as a base for RaggedTensorPack but didn't find it"); - return ragged_inputs; -} - -ov::Output post_translate_string_tensor_output(const OutputVector& outputs) { - FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation"); - return std::make_shared(outputs, "begins_ends"); -} - -ov::Output post_translate_ragged_tensor_output(const OutputVector& outputs) { - FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation"); - return std::make_shared(outputs); -} - -NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { - // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer, - // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp - FRONT_END_GENERAL_CHECK(node.get_input_size() > 0, "RaggedTensorToSparse expects at least one input."); - auto node_name = node.get_name(); - - // check that producers of RaggedTensorToSparse is SentencePieceTokenizer - auto sp_tokenize_op = node.get_input(0).get_node_shared_ptr(); - FRONT_END_GENERAL_CHECK(sp_tokenize_op->get_input_size() > 6, - "SentencepieceTokenizeOp expects at least six inputs"); - - // prepare inputs that go to custom operation - // prepare input 0 - SentencePieceTokenizer configuration model - auto sp_model_const = as_type_ptr(sp_tokenize_op->input_value(0).get_node_shared_ptr()); - FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant."); - - // prepare input six inputs - auto inputs = sp_tokenize_op->input_value(1); - - // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes - auto nbest_size = extract_scalar_const_value(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size"); - auto alpha = extract_scalar_const_value(sp_tokenize_op->input_value(3).get_node_shared_ptr(), "alpha"); - auto add_bos = extract_scalar_const_value(sp_tokenize_op->input_value(4).get_node_shared_ptr(), "add_bos"); - auto add_eos = extract_scalar_const_value(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos"); - auto reverse = extract_scalar_const_value(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse"); - -#if !USE_STRING_TENSORS - // Override type of input tensor if this is a Parameter - if (auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { - parameter->set_partial_shape(PartialShape{ Dimension() }); - parameter->set_element_type(element::u8); - parameter->validate_and_infer_types(); - } -#endif - -#if 
SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS - - OutputVector inputs_vector = OutputVector{ sp_model_const }; - auto unpacked_outputs = std::make_shared(OutputVector{inputs}, "begins_ends")->outputs(); - inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end()); - -#else - - OutputVector inputs_vector = OutputVector{ sp_model_const, inputs }; - -#endif - - // create a node with custom operation - auto sp_tokenizer_ext = std::make_shared(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse); - FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3, - "Internal error: SentencepieceTokenizer operation extension must have three outputs."); - - // set tensor names - sp_tokenizer_ext->output(0).add_names({ node_name + ":0" }); - sp_tokenizer_ext->output(1).add_names({ node_name + ":1" }); - sp_tokenizer_ext->output(2).add_names({ node_name + ":2" }); - - // create named outputs for the conversion extension - NamedOutputVector named_results; - named_results.push_back({ "sparse_indices", sp_tokenizer_ext->output(0) }); - named_results.push_back({ "sparse_values", sp_tokenizer_ext->output(1) }); - named_results.push_back({ "sparse_dense_shape", sp_tokenizer_ext->output(2) }); - - return named_results; -} - -bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorVector& inputs, std::function normalizer) { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_elements = inputs[0].get_size(); - - // TODO: How to avoid copying from this temporary buffer? - // TODO: It can be possible to collect output symbols directly in the output tensor memory if `normalizer` has reasonable estimation for the final size. 
- std::deque buffer; - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - - for(size_t i = 0; i < num_elements; ++i) { - new_begins[i] = buffer.size(); - std::string new_str = normalizer(std::string(chars + begins[i], chars + ends[i])); - buffer.insert(buffer.end(), new_str.begin(), new_str.end()); - new_ends[i] = buffer.size(); - } - - // Copy collected symbols to the target output tensor - - outputs[2].set_shape(Shape{buffer.size()}); - auto new_chars = outputs[2].data(); - std::copy(buffer.begin(), buffer.end(), new_chars); - - return true; -} - - -void CaseFold::validate_and_infer_types() { - check_string_input(this, 0); - set_string_output(this, 0, get_input_partial_shape(0)); -} - -bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { -#if 1 - - return evaluate_normalization_helper( - outputs, inputs, - [](const std::string& str) { - using namespace paddlenlp::fast_tokenizer; - return normalizers::NormalizedString(str).Lowercase().GetStr(); - }); - -#else - // Stub implementation that transforms each input string "X" to "CaseFold(X)" for debugging purposes - { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const std::string left_side = "CaseFold(", right_side = ")"; - const size_t num_elements = inputs[0].get_size(); - const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length())*num_elements; - outputs[2].set_shape(Shape{new_len}); - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_chars = outputs[2].data(); - int32_t char_offset = 0; - - for(size_t i = 0; i < num_elements; ++i) { - new_begins[i] = char_offset; - std::string new_str = left_side + std::string(chars + begins[i], chars + ends[i]) + right_side; - std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset); - char_offset += new_str.length(); - new_ends[i] = char_offset; - } - return true; - } - // End of stub implementation -#endif -} - - -ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { - FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); - return { post_translate_string_tensor_output(std::make_shared( - pre_translate_string_tensor_input(node.get_input(0)))->outputs()) }; -} - -namespace { -using namespace paddlenlp::fast_tokenizer::normalizers; -using NormalizersMap = std::map>; - -const NormalizersMap normalizers = { - {"NFD", [](const std::string& str) { return NormalizedString(str).NFD().GetStr(); }}, - {"NFC", [](const std::string& str) { return NormalizedString(str).NFC().GetStr(); }}, - {"NFKD", [](const std::string& str) { return NormalizedString(str).NFKD().GetStr(); }}, - {"NFKC", [](const std::string& str) { return NormalizedString(str).NFKC().GetStr(); }}, -}; - -} - - -void NormalizeUnicode::validate_and_infer_types() { - check_string_input(this, 0); - 
OPENVINO_ASSERT(normalizers.find(m_normalization_form) != normalizers.end(), "NormalizeUnicode doesn't know normalization form " + m_normalization_form); - set_string_output(this, 0, get_input_partial_shape(0)); -} - -bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { -#if 1 - - return evaluate_normalization_helper(outputs, inputs, normalizers.at(m_normalization_form)); - -#else - - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - // Stub implementation that transforms each input string "X" to "NormalizeUnicode(X, normalization_form)" for debugging purposes - { - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const std::string left_side = "NormalizeUnicode(", right_side = ")", delimeter = ", "; - const size_t num_elements = inputs[0].get_size(); - const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length() + delimeter.length() + m_normalization_form.length())*num_elements; - outputs[2].set_shape(Shape{new_len}); - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_chars = outputs[2].data(); - int32_t char_offset = 0; - - for(size_t i = 0; i < num_elements; ++i) { - new_begins[i] = char_offset; - std::string new_str = left_side + std::string(chars + begins[i], chars + ends[i]) + delimeter + m_normalization_form + right_side; - std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset); - char_offset += new_str.length(); - new_ends[i] = char_offset; - } - return true; - } - // End of stub implementation -#endif -} - - -ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) { - FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "NormalizeUTF8 expects only 1 input"); - return { post_translate_string_tensor_output(std::make_shared( - pre_translate_string_tensor_input(node.get_input(0)), - node.get_attribute("normalization_form"))->outputs()) }; -} - - -void RegexNormalization::validate_and_infer_types() { - check_string_input(this, 0); - check_string_scalar_input(this, 3); - check_string_scalar_input(this, 4); - set_string_output(this, 0, get_input_partial_shape(0)); -} - -bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto search_pattern_buf = inputs[3].data(); - auto replace_pattern_buf = inputs[4].data(); - auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - -#if 1 - - using namespace paddlenlp::fast_tokenizer::normalizers; - re2::RE2 search_pattern_re(search_pattern); - - return evaluate_normalization_helper( - outputs, inputs, - [&replace_pattern, &search_pattern_re](const std::string& str) { - return NormalizedString(str).Replace(search_pattern_re, std::string(replace_pattern)).GetStr(); - }); - -#else - // Stub implementation that transforms each input string "X" to 
"RegexNormalization(X, search_pattern, replace_pattern)" for debugging purposes - { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const std::string left_side = "RegexNormalization(", right_side = ")", delimeter = ", "; - const size_t num_elements = inputs[0].get_size(); - const size_t new_len = inputs[2].get_size() + (left_side.length() + right_side.length() + 2*delimeter.length() + search_pattern.length() + replace_pattern.length())*num_elements; - outputs[2].set_shape(Shape{new_len}); - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_chars = outputs[2].data(); - int32_t char_offset = 0; - - for(size_t i = 0; i < num_elements; ++i) { - new_begins[i] = char_offset; - - std::string new_str = - left_side + std::string(chars + begins[i], chars + ends[i]) + delimeter + - std::string(search_pattern) + delimeter + - std::string(replace_pattern) + right_side; - - std::copy(new_str.data(), new_str.data() + new_str.length(), new_chars + char_offset); - char_offset += new_str.length(); - new_ends[i] = char_offset; - } - return true; - } - // End of stub implementation -#endif -} - - -std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name) { - // FIXME: using space to pad the value to work-around CPU issue with empty constants - auto value = node.get_attribute(name) + " "; - - // TODO: How to translate attribute `replace_global`? 
- - #if USE_STRING_TENSORS - return std::make_shared(element::string, Shape{}, &value); - #else - return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data()); - #endif -} - - -ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) { - FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); - ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); - inputs.push_back(string_attribute_to_constant(node, "pattern")); - inputs.push_back(string_attribute_to_constant(node, "rewrite")); - return { post_translate_string_tensor_output(std::make_shared(inputs)->outputs()) }; -} - - -namespace { - -using paddlenlp::fast_tokenizer::core::SplitMode; -const std::map split_modes = { - {"remove", SplitMode::REMOVED}, - {"isolate", SplitMode::ISOLATED}, - {"contiguous", SplitMode::CONTIGUOUS}, - {"merge_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, - {"merge_with_next", SplitMode::MERGED_WITH_NEXT}, -}; - -} - - -void RegexSplit::validate_and_infer_types() { -// check_string_input(this, 0); -// check_string_scalar_input(this, 3); -// check_ragged_string_input(this, 0); -// check_string_input(this, 5); - OPENVINO_ASSERT(split_modes.find(m_behaviour) != split_modes.end(), "RegexSplit doesn't support unknown split mode: " + m_behaviour); - set_ragged_string_output(this, 0, get_input_partial_shape(0)); -} - -bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - - if (inputs.size() < 5) { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - ov::Tensor ragged_begins_tensor(ov::element::i32, inputs[0].get_shape()); - ov::Tensor ragged_ends_tensor(ov::element::i32, inputs[0].get_shape()); - auto ragged_begins = ragged_begins_tensor.data(); - auto ragged_ends = ragged_ends_tensor.data(); - for (int i=0; i < inputs[0].get_size(); ++i) { - ragged_begins[i] = i; - ragged_ends[i] = i + 1; - }; - - auto split_pattern_buf = inputs[3].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - -// std::cerr << "[ RegexSplit ] Split Pattern: `" << split_pattern << "`, behaviour: " << m_behaviour << "\n"; - - const size_t num_rows = inputs[0].get_size(); - const size_t num_chars = inputs[2].get_size(); - - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - - outputs[2].set_shape(Shape{num_chars}); - outputs[3].set_shape(Shape{num_chars}); - - outputs[4] = inputs[2]; - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_ragged_begins = outputs[0].data(); - auto new_ragged_ends = outputs[1].data(); - auto new_begins = outputs[2].data(); - auto new_ends = outputs[3].data(); - int32_t ragged_offset = 0; - - using namespace paddlenlp::fast_tokenizer; - auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - - for(size_t seq = 0; seq < num_rows; ++seq) { -// std::cerr << "================= Seq: " << seq << " ====================\n"; -// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; - - 
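// Note (not in the original patch): FastTokenizer reports split offsets
// relative to the current string, so the loop below rebases them with
// begins[ragged_col] to keep global character offsets. A worked example,
// assuming a split on space over the element "a b" stored at chars[10..13):
//
//   split 0: local (0, 1) -> global (10, 11)   // "a"
//   split 1: local (2, 3) -> global (12, 13)   // "b"
//
// ragged_offset counts emitted parts and becomes the row boundary below.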
new_ragged_begins[seq] = ragged_offset; - - for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - - auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); -// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; - paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); - pretokenizer(&pretokenized); - size_t num_splits = pretokenized.GetSplitsSize(); -// std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; - - for (size_t j = 0; j < num_splits; ++j) { - auto split = pretokenized.GetSplit(j); - const auto& value = split.normalized_.GetStr(); - auto offset = split.normalized_.GetOrginalOffset(); -// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; -// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; - new_begins[ragged_offset] = begins[ragged_col] + offset.first; - new_ends[ragged_offset++] = begins[ragged_col] + offset.second; - }; - } - - new_ragged_ends[seq] = ragged_offset; - } - - // Fix real shape based on collected results - outputs[2].set_shape({size_t(ragged_offset)}); - outputs[3].set_shape({size_t(ragged_offset)}); - - } else { - auto ragged_begins = inputs[0].data(); - auto ragged_ends = inputs[1].data(); - auto begins = inputs[2].data(); - auto ends = inputs[3].data(); - auto chars = inputs[4].data(); - - auto split_pattern_buf = inputs[5].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - -// std::cerr << "Split Pattern: `" << split_pattern << "`, behaviour: " << m_behaviour << "\n"; - - outputs[4] = inputs[4]; - const size_t num_rows = inputs[0].get_size(); - const size_t num_chars = inputs[4].get_size(); - - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - - outputs[2].set_shape(Shape{num_chars}); - outputs[3].set_shape(Shape{num_chars}); - - outputs[4] = inputs[4]; - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_ragged_begins = outputs[0].data(); - auto new_ragged_ends = outputs[1].data(); - auto new_begins = outputs[2].data(); - auto new_ends = outputs[3].data(); - int32_t ragged_offset = 0; - - using namespace paddlenlp::fast_tokenizer; - auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - - for(size_t seq = 0; seq < num_rows; ++seq) { -// std::cerr << "----------------- Seq: " << seq << " -----------------\n"; -// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; - - new_ragged_begins[seq] = ragged_offset; - - for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); -// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; - paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); - pretokenizer(&pretokenized); - size_t num_splits = pretokenized.GetSplitsSize(); - - - for (size_t j = 0; j < num_splits; ++j) { - auto split = pretokenized.GetSplit(j); - const auto& value = split.normalized_.GetStr(); - auto offset = 
split.normalized_.GetOrginalOffset(); -// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; -// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; - new_begins[ragged_offset] = begins[ragged_col] + offset.first; - new_ends[ragged_offset++] = begins[ragged_col] + offset.second; - - -// std::cerr << "New begins and ends:\n"; -// for (size_t i = 0; i < outputs[2].get_size(); ++i) { -// std::cerr << outputs[2].data()[i] << ", "; -// } -// std::cerr << "\n"; -// -// for (size_t i = 0; i < outputs[3].get_size(); ++i) { -// std::cerr << outputs[3].data()[i] << ", "; -// } -// std::cerr << "\n"; - }; - } - - new_ragged_ends[seq] = ragged_offset; - } - - // Fix real shape based on collected results - outputs[2].set_shape({size_t(ragged_offset)}); - outputs[3].set_shape({size_t(ragged_offset)}); - - } -#if 1 - - // Set output shapes -// outputs[0].set_shape(inputs[0].get_shape()); -// outputs[1].set_shape(inputs[1].get_shape()); -// -// const size_t num_elements = inputs[0].get_size(); -// const size_t num_chars = inputs[2].get_size(); - - // TODO: Better estimations for max size? - // Assume we cannot have empty parts, so the number of parts cannot be bigger than the number of symbols -// outputs[2].set_shape(Shape{num_chars}); -// outputs[3].set_shape(Shape{num_chars}); - - // Assume we cannot introduce new symbols to output, only existing can be distributed (with gaps) - - // TODO: Can we just route input tensor directly to the output outside evaluate when graph is being constructed? -// outputs[4] = inputs[2]; // TODO: Does it really work? - - // If line above doesn't work, do this instead: - //outputs[4].set_shape(Shape{num_chars}); - //inputs[2].copy_to(outputs[4]); - - return true; - - // TODO: Complete implementation -#else - // Stub implementation that transforms each input string "X" to multiple "RegexSplit(X, split_pattern) = part(X)" for debugging purposes - // Where part(X) is a part of original X divided by predefined length with some reminder - // So each element X is divided into multiple output elements along ragged dimension, and the number of elements depends on the input X length and - // can vary for different X. 
For example, let the length = 2 and input X = "words", the output would consist of 3 elements along the corresponding
- // ragged dimension in the output with values:
- // - "RegexSplit(words, search_pattern, replace_pattern) = wo",
- // - "RegexSplit(words, search_pattern, replace_pattern) = rd",
- // - "RegexSplit(words, search_pattern, replace_pattern) = s"
- // split_pattern is cut for the sake of readability of the output
- {
- const size_t part_length = 30; // any positive number, defines the length of each part in bytes
-
- std::string split_pattern_part = std::string(split_pattern.substr(0, part_length));
-
- // Set output shapes
- outputs[0].set_shape(inputs[0].get_shape());
- outputs[1].set_shape(inputs[1].get_shape());
-
- const std::string left_side = "RegexSplit(", right_side = ")", delimiter = ", ";
- const size_t num_elements = inputs[0].get_size();
- size_t num_parts = 0; // will count the number of all parts
- size_t num_additional_chars = 0; //
- // Count the resulting number of parts that we are going to obtain
- for(size_t i = 0; i < num_elements; ++i) {
- auto length = ends[i] - begins[i];
- auto num_of_whole_parts = length/part_length;
- auto remainder = length%part_length;
- auto num_local_parts = num_of_whole_parts + int(bool(remainder));
- num_parts += num_local_parts;
- num_additional_chars += length*num_local_parts;
- }
-
- size_t num_chars = inputs[2].get_size();
-
- // FIXME: Overestimation
- const size_t new_num_chars = num_chars + num_parts*30/*!*/ + (left_side.length() + right_side.length() + delimiter.length() + split_pattern_part.length())*num_elements;
- outputs[2].set_shape(Shape{num_parts});
- outputs[3].set_shape(Shape{num_parts});
- outputs[4].set_shape(Shape{new_num_chars});
-
- // For the whole implementation below the input shapes can be ignored; we work with the flattened representations,
- // and only the number of elements in the original tensors matters
-
- // Get pointers in the output tensors
- auto new_ragged_begins = outputs[0].data();
- auto new_ragged_ends = outputs[1].data();
- auto new_begins = outputs[2].data();
- auto new_ends = outputs[3].data();
- auto new_chars = outputs[4].data();
- int32_t ragged_offset = 0;
- int32_t char_offset = 0;
-
- for(size_t i = 0; i < num_elements; ++i) {
- new_ragged_begins[i] = ragged_offset;
- auto old_str = std::string(chars + begins[i], chars + ends[i]);
- auto new_str_part_base = left_side + old_str + delimiter + split_pattern_part + right_side;
-
- for(size_t j = 0; j < old_str.length(); j += part_length) {
- new_begins[ragged_offset] = char_offset;
- //auto new_str_part = new_str_part_base + old_str.substr(j, part_length);
- std::string new_str_part = j == 0 ?
new_str_part_base : "part[" + std::to_string(i) + "," + std::to_string(j) + "]"; - std::copy(new_str_part.data(), new_str_part.data() + new_str_part.length(), new_chars + char_offset); - char_offset += new_str_part.length(); - new_ends[ragged_offset] = char_offset; - ++ragged_offset; - } - - new_ragged_ends[i] = ragged_offset; - } - - outputs[4].set_shape({char_offset}); - - //OPENVINO_ASSERT(char_offset == new_num_chars, "Internal error in RegexSplit::evaluate: out of range for chars"); - OPENVINO_ASSERT(ragged_offset == num_parts, "Internal error in RegexSplit::evaluate: out of range for ragged parts"); - - return true; - } - // End of stub implementation -#endif -} - - -ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node) { - FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "RegexSplitWithOffsets expects 3 inputs"); - ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); - auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there - inputs.push_back(delim_regex_pattern); - // TODO: Use node.get_input(2) with keep_delim_regex_pattern, most likely it should be handled in another RegexSplit with `isolate` behaviour - auto outputs = std::make_shared(inputs)->outputs(); - auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]}); - return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) }; -} - - -const std::array, 256> create_bytes_to_chars_map() { - return {{ - { 196, 128 }, - { 196, 129 }, - { 196, 130 }, - { 196, 131 }, - { 196, 132 }, - { 196, 133 }, - { 196, 134 }, - { 196, 135 }, - { 196, 136 }, - { 196, 137 }, - { 196, 138 }, - { 196, 139 }, - { 196, 140 }, - { 196, 141 }, - { 196, 142 }, - { 196, 143 }, - { 196, 144 }, - { 196, 145 }, - { 196, 146 }, - { 196, 147 }, - { 196, 148 }, - { 196, 149 }, - { 196, 150 }, - { 196, 151 }, - { 196, 152 }, - { 196, 153 }, - { 196, 154 }, - { 196, 155 }, - { 196, 156 }, - { 196, 157 }, - { 196, 158 }, - { 196, 159 }, - { 196, 160 }, - { 33 }, - { 34 }, - { 35 }, - { 36 }, - { 37 }, - { 38 }, - { 39 }, - { 40 }, - { 41 }, - { 42 }, - { 43 }, - { 44 }, - { 45 }, - { 46 }, - { 47 }, - { 48 }, - { 49 }, - { 50 }, - { 51 }, - { 52 }, - { 53 }, - { 54 }, - { 55 }, - { 56 }, - { 57 }, - { 58 }, - { 59 }, - { 60 }, - { 61 }, - { 62 }, - { 63 }, - { 64 }, - { 65 }, - { 66 }, - { 67 }, - { 68 }, - { 69 }, - { 70 }, - { 71 }, - { 72 }, - { 73 }, - { 74 }, - { 75 }, - { 76 }, - { 77 }, - { 78 }, - { 79 }, - { 80 }, - { 81 }, - { 82 }, - { 83 }, - { 84 }, - { 85 }, - { 86 }, - { 87 }, - { 88 }, - { 89 }, - { 90 }, - { 91 }, - { 92 }, - { 93 }, - { 94 }, - { 95 }, - { 96 }, - { 97 }, - { 98 }, - { 99 }, - { 100 }, - { 101 }, - { 102 }, - { 103 }, - { 104 }, - { 105 }, - { 106 }, - { 107 }, - { 108 }, - { 109 }, - { 110 }, - { 111 }, - { 112 }, - { 113 }, - { 114 }, - { 115 }, - { 116 }, - { 117 }, - { 118 }, - { 119 }, - { 120 }, - { 121 }, - { 122 }, - { 123 }, - { 124 }, - { 125 }, - { 126 }, - { 196, 161 }, - { 196, 162 }, - { 196, 163 }, - { 196, 164 }, - { 196, 165 }, - { 196, 166 }, - { 196, 167 }, - { 196, 168 }, - { 196, 169 }, - { 196, 170 }, - { 196, 171 }, - { 196, 172 }, - { 196, 173 }, - { 196, 174 }, - { 196, 175 }, - { 196, 176 }, - { 196, 177 }, - { 196, 178 }, - { 196, 179 }, - { 196, 180 }, - { 196, 181 }, - { 196, 182 }, - { 196, 183 }, - { 196, 184 }, - { 196, 185 }, - { 
196, 186 }, - { 196, 187 }, - { 196, 188 }, - { 196, 189 }, - { 196, 190 }, - { 196, 191 }, - { 197, 128 }, - { 197, 129 }, - { 197, 130 }, - { 194, 161 }, - { 194, 162 }, - { 194, 163 }, - { 194, 164 }, - { 194, 165 }, - { 194, 166 }, - { 194, 167 }, - { 194, 168 }, - { 194, 169 }, - { 194, 170 }, - { 194, 171 }, - { 194, 172 }, - { 197, 131 }, - { 194, 174 }, - { 194, 175 }, - { 194, 176 }, - { 194, 177 }, - { 194, 178 }, - { 194, 179 }, - { 194, 180 }, - { 194, 181 }, - { 194, 182 }, - { 194, 183 }, - { 194, 184 }, - { 194, 185 }, - { 194, 186 }, - { 194, 187 }, - { 194, 188 }, - { 194, 189 }, - { 194, 190 }, - { 194, 191 }, - { 195, 128 }, - { 195, 129 }, - { 195, 130 }, - { 195, 131 }, - { 195, 132 }, - { 195, 133 }, - { 195, 134 }, - { 195, 135 }, - { 195, 136 }, - { 195, 137 }, - { 195, 138 }, - { 195, 139 }, - { 195, 140 }, - { 195, 141 }, - { 195, 142 }, - { 195, 143 }, - { 195, 144 }, - { 195, 145 }, - { 195, 146 }, - { 195, 147 }, - { 195, 148 }, - { 195, 149 }, - { 195, 150 }, - { 195, 151 }, - { 195, 152 }, - { 195, 153 }, - { 195, 154 }, - { 195, 155 }, - { 195, 156 }, - { 195, 157 }, - { 195, 158 }, - { 195, 159 }, - { 195, 160 }, - { 195, 161 }, - { 195, 162 }, - { 195, 163 }, - { 195, 164 }, - { 195, 165 }, - { 195, 166 }, - { 195, 167 }, - { 195, 168 }, - { 195, 169 }, - { 195, 170 }, - { 195, 171 }, - { 195, 172 }, - { 195, 173 }, - { 195, 174 }, - { 195, 175 }, - { 195, 176 }, - { 195, 177 }, - { 195, 178 }, - { 195, 179 }, - { 195, 180 }, - { 195, 181 }, - { 195, 182 }, - { 195, 183 }, - { 195, 184 }, - { 195, 185 }, - { 195, 186 }, - { 195, 187 }, - { 195, 188 }, - { 195, 189 }, - { 195, 190 }, - { 195, 191 }, - }}; -} - -void BytesToChars::validate_and_infer_types() { - check_ragged_string_input(this, 0); -// check_string_input(this, 5); - set_ragged_string_output(this, 0, get_input_partial_shape(0)); -} - -bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto ragged_begins = inputs[0].data(); - auto ragged_ends = inputs[1].data(); - auto begins = inputs[2].data(); - auto ends = inputs[3].data(); - auto chars = inputs[4].data(); - - OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to BytesToChars, it means it is not converted properly or it is not used in the supported pattern"); - - // Set output shapes - outputs[0] = inputs[0]; - outputs[1] = inputs[1]; - outputs[2].set_shape(inputs[2].get_shape()); - outputs[3].set_shape(inputs[3].get_shape()); - outputs[4].set_shape(Shape({inputs[4].get_size() * 2})); - const size_t num_elems = inputs[0].get_size(); - - // Get pointers in the output tensors - auto new_begins = outputs[2].data(); - auto new_ends = outputs[3].data(); - auto new_chars = outputs[4].data(); - uint32_t char_pointer = 0; - - for(size_t j = 0; j < num_elems; ++j) { - - for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { - const auto word_len = ends[i] - begins[i]; - new_begins[i] = char_pointer; - - for (size_t k = 0; k < word_len; ++k) { - for (auto byte : m_bytes_to_chars[chars[begins[i] + k]]) { - new_chars[char_pointer++] = byte; - } - } - new_ends[i] = char_pointer; - } - } - outputs[4].set_shape({char_pointer}); - return true; -} - - -void WordpieceTokenizer::validate_and_infer_types() { - check_ragged_string_input(this, 0); - check_string_input(this, 5); - set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); -} - -bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto ragged_begins = inputs[0].data(); - auto 
ragged_ends = inputs[1].data(); - auto begins = inputs[2].data(); - auto ends = inputs[3].data(); - auto chars = inputs[4].data(); - - auto vocab_begins = inputs[5].data(); - auto vocab_ends = inputs[6].data(); - auto vocab_chars = inputs[7].data(); - - auto vocab_size = inputs[5].get_size(); - - OPENVINO_ASSERT(inputs.size() == 9, "Too few inputs passed to WordpieceTokenizer, it means it is not converted properly or it is not used in the supported pattern"); - - auto unk_token_id = *inputs[8].data(); - -#if 1 - - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_rows = inputs[0].get_size(); - - //const size_t num_parts = inputs[2].get_size(); - //size_t new_num_parts = num_parts; - - // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area - // to represent different elements in ragged tensor - outputs[2].set_shape({inputs[4].get_size()}); - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_elems = outputs[2].data(); - int32_t ragged_offset = 0; - - using namespace paddlenlp::fast_tokenizer; - -// std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; - core::Vocab vocab; - std::string unk_token; - if(unk_token_id < 0) - unk_token_id += vocab_size; - for(size_t id = 0; id < vocab_size; ++id) { - auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); - vocab[token] = int32_t(id); // TODO: Check range - if(id == unk_token_id) - unk_token = token; - } - -// std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; -// std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; -// std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; - - auto tokenizer = models::FastWordPiece(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); // FIXME: why true? - -// std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; - - - for(size_t seq = 0; seq < num_rows; ++seq) { - new_begins[seq] = ragged_offset; - - for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - - auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); - std::vector results = tokenizer.Tokenize(str); - -// std::cerr << "[ WordpieceTokenizer ] String bytes: "; -// for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { -// std::cerr << static_cast (chars[i]) << " "; -// } -// std::cerr << "\n"; -// std::cerr << "[ WordpieceTokenizer ] String: '" << str << "'\n"; -// std::cerr << "[ WordpieceTokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; - for (const core::Token& token : results) { -// std::cout << "[ WordpieceTokenizer ] id: " << token.id_ << ", value: " << token.value_ -// << ", offset: (" << token.offset_.first << ", " -// << token.offset_.second << ")." 
<< std::endl; - OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); - new_elems[ragged_offset++] = token.id_; - }; - } - new_ends[seq] = ragged_offset; - } - outputs[2].set_shape({size_t(ragged_offset)}); - return true; - -#else - // Stub implementation that transforms each input string to its length duplicating element if the length is odd - { - std::cout << "[ DEBUG ] WordpieceTokenizer\n"; - std::cout << "[ DEBUG ] vocab size: " << inputs[5].get_size() << "\n"; - std::cout << "[ DEBUG ] unk_token_id: " << unk_token_id << "\n"; - - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_elems = inputs[0].get_size(); - - const size_t num_parts = inputs[2].get_size(); - size_t new_num_parts = num_parts; - // Count number of output elements - for(size_t i = 0; i < num_parts; ++i) { - auto length = ends[i] - begins[i]; - new_num_parts += length % 2; - } - - outputs[2].set_shape({new_num_parts}); - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_elems = outputs[2].data(); - int32_t offset = 0; - - for(size_t j = 0; j < num_elems; ++j) { - new_begins[j] = offset; - - for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { - - auto length = ends[i] - begins[i]; - new_elems[offset++] = length; - - if(length % 2) { - new_elems[offset++] = length; - } - } - - new_ends[j] = offset; - } - - OPENVINO_ASSERT(offset == outputs[2].get_size(), "Internal error in RegexSplit::evaluate: out of range for ragged parts"); - return true; - } - // End of stub implementation -#endif -} - - -ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node) { - FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "WordpieceTokenizeWithOffsets expects 2 inputs"); - ov::OutputVector inputs = pre_translate_ragged_string_tensor_input(node.get_input(0)); - - #if USE_STRING_TENSORS - // It may seem enough to call pre_translate_string_tensor_input that will override Parameter element - // type in case if string tensors are not used. - // But a Parameter is still required to be overridden even if string tensors are used because in TF model - // it is represented not as a string tensor, but as a resource with hash table for lookup that we cannot interpret - // and have to replace by 1D string tensor. - override_parameter(node.get_input(1).get_node_shared_ptr(), element::string, PartialShape{Dimension()}); - #endif - - auto vocab = pre_translate_string_tensor_input(node.get_input(1)); - inputs.insert(inputs.end(), vocab.begin(), vocab.end()); - // FIXME: Cannot set real value for unk_token_id from attributes because it is not known in this operation - // TODO: Set other attributes. 
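// Note (not in the original patch): unk_token_id is attached later by
// translate_lookup_table_find_v2 below, which finds this WordpieceTokenizer,
// appends the id as an extra input and rebuilds the node, roughly:
//
//   auto extended = wp_tokenizer->input_values();
//   extended.push_back(unk_token_id);   // comes from LookupTableFindV2
//   auto rebuilt = wp_tokenizer->clone_with_new_inputs(extended);
//
// Until that happens the node has one input fewer than evaluate() expects.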
- auto wp_tokenizer = std::make_shared( - inputs, - node.get_attribute("suffix_indicator"), - node.get_attribute("max_bytes_per_word") - ); - return { post_translate_ragged_tensor_output(wp_tokenizer->outputs()) }; -} - - -void BPETokenizer::validate_and_infer_types() { - check_ragged_string_input(this, 0); - check_string_input(this, 5); - check_string_input(this, 8); - set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); -} - -bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto ragged_begins = inputs[0].data(); - auto ragged_ends = inputs[1].data(); - auto begins = inputs[2].data(); - auto ends = inputs[3].data(); - auto chars = inputs[4].data(); - - auto vocab_begins = inputs[5].data(); - auto vocab_ends = inputs[6].data(); - auto vocab_chars = inputs[7].data(); - - auto merges_begins = inputs[8].data(); - auto merges_ends = inputs[9].data(); - auto merges_chars = inputs[10].data(); - - auto vocab_size = inputs[5].get_size(); - auto merges_size = inputs[8].get_size(); - - OPENVINO_ASSERT(inputs.size() == 11, "Too few inputs passed to BPETokenizer, it means it is not converted properly or it is not used in the supported pattern"); - -#if 1 - // Set output shapes - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - const size_t num_rows = inputs[0].get_size(); - - // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area - // to represent different elements in ragged tensor - outputs[2].set_shape({inputs[4].get_size()}); - - using namespace paddlenlp::fast_tokenizer; - -// std::cerr << "[ BPETokenizer ] Start vocab reading\n"; - core::Vocab vocab; - int32_t unk_token_id = -1; - -// std::cerr << "[ BPETokenizer ] Vocab size is " << vocab_size << "\n"; - - for(size_t id = 0; id < vocab_size; ++id) { - auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); - vocab[token] = int32_t(id); // TODO: Check range - } - -// std::cerr << "[ BPETokenizer ] Finish vocab reading\n"; -// -// std::cerr << "[ BPETokenizer ] Start merges reading\n"; -// std::cerr << "[ BPETokenizer ] Merges Size: " << merges_size << "\n"; - core::Merges merges; - std::string delim = " "; - - - for(size_t id = 0; id < merges_size; ++id) { - auto merge = std::string(merges_chars + merges_begins[id], merges_chars + merges_ends[id]); - const int delim_pos = merge.find(delim); - - std::pair merge_pair = { - merge.substr(0, delim_pos), merge.substr(delim_pos + 1) - }; - merges.emplace_back(merge_pair); - } - -// std::cerr << "[ BPETokenizer ] Finish merges reading\n"; - - -// std::cerr << "[ BPETokenizer ] Start tokenizer initialization\n"; - - std::vector unk_token = {}; - if (m_unk_token.size() > 0) { - unk_token.push_back(m_unk_token); - }; - std::vector suffix_indicator = {}; - if (m_suffix_indicator.size() > 0) { - suffix_indicator.push_back(m_suffix_indicator); - }; - std::vector end_suffix = {}; - if (m_end_suffix.size() > 0) { - end_suffix.push_back(m_end_suffix); - }; - - models::BPE tokenizer(vocab, merges, 10000 /* default cache size */, {} /* dropout - don't use dropout for inference */, - unk_token, suffix_indicator, end_suffix, m_fuse_unk); - -// std::cerr << "[ BPETokenizer ] Finish tokenizer initialization\n"; - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_elems = outputs[2].data(); - int32_t ragged_offset = 0; - -// std::cerr << "Ragged Begins and 
ends:\n"; -// for (size_t i = 0; i < inputs[0].get_size(); ++i) { -// std::cerr << inputs[0].data()[i] << ", "; -// } -// std::cerr << "\n"; -// for (size_t i = 0; i < inputs[1].get_size(); ++i) { -// std::cerr << inputs[1].data()[i] << ", "; -// } -// std::cerr << "\n"; - - - for(size_t seq = 0; seq < num_rows; ++seq) { - new_begins[seq] = ragged_offset; - for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); - - std::cerr << "[ BPETokenizer ] String: '" << str << "'\n"; -// std::cerr << "[ BPETokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; - - std::vector results = tokenizer.Tokenize(str); - - for (const core::Token& token : results) { - std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ - << ", offset: (" << token.offset_.first << ", " - << token.offset_.second << ")." << std::endl; - OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); - new_elems[ragged_offset++] = token.id_; - }; - } - - new_ends[seq] = ragged_offset; - } - outputs[2].set_shape({size_t(ragged_offset)}); - return true; - -#else - // Stub implementation that transforms each input string to its length duplicating element if the length is odd - // End of stub implementation -#endif -} - - -ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node) { - FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFindV2 expects 3 inputs"); - - // Check if this node is used in a combination with already converted WordpieceTokenizeWithOffsets - auto wp_tokenizer_outputs = pre_translate_ragged_tensor_input(node.get_input(1)); - auto wp_tokenizer = dynamic_cast(wp_tokenizer_outputs[0].get_node()); - OPENVINO_ASSERT(wp_tokenizer, "Conversion of LookupTableFindV2 without coupled WordpieceTokenizer is not yet supported"); - - // TODO: Check vocab matching for LookupTableFindV2 and WordpieceTokenizer - - // TODO: Check if overflow really happens in real models due to i64 to i32 conversion - auto unk_token_id = std::make_shared(node.get_input(2), element::i32); - - auto wp_tokenizer_inputs = wp_tokenizer->input_values(); - wp_tokenizer_inputs.push_back(unk_token_id); - //std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n"; - - auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs); - return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) }; -} - - -void RaggedToDense::validate_and_infer_types() { - OPENVINO_ASSERT(get_input_size() == 3 + 1 + 1); - - // Input ragged tensor - check_ragged_input(this, 0); - - // Target size along ragged dimension - OPENVINO_ASSERT(get_input_element_type(3).is_integral_number()); - auto rank = get_input_partial_shape(3).rank(); - OPENVINO_ASSERT( - rank.is_dynamic() || - rank.get_length() == 0 || - rank.get_length() == 1 && get_input_partial_shape(3)[0].compatible(1), - "Target dense dimension size for RaggedToDense should be a 0D or 1D tensor with a single element"); - - // Default value to fill out of ragged range elements in output tensor - OPENVINO_ASSERT(get_input_element_type(4).compatible(get_input_element_type(2))); - auto input4_rank = get_input_partial_shape(4).rank(); - OPENVINO_ASSERT(input4_rank.compatible(0)); - - set_input_is_relevant_to_shape(3); - - if(get_input_partial_shape(0).rank().is_dynamic()) { - set_output_type(0, get_input_element_type(2), PartialShape::dynamic()); - 
set_output_type(1, element::boolean, PartialShape::dynamic()); - } else { - auto shape = get_input_partial_shape(0); - if(auto target_dim = dynamic_cast(get_input_node_ptr(3))) { - shape.push_back(target_dim->cast_vector()[0]); - } else { - shape.push_back(Dimension()); - } - set_output_type(0, get_input_element_type(2), shape); - set_output_type(1, element::boolean, shape); - } -} - - -bool RaggedToDense::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - // FIXME: Works for POD types only (not for strings!) - // FIXME: Output mask is calculated even if there are no consumers - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto nelems = inputs[0].get_size(); - auto elems = reinterpret_cast(inputs[2].data()); - auto elem_size = inputs[2].get_element_type().size(); - auto default_value = reinterpret_cast(inputs[4].data()); - - // Suppose validate was called and set correct output shape - // Take a target shape value for ragged dimension - size_t target_dim = outputs[0].get_shape().back(); - - auto out_elems = reinterpret_cast(outputs[0].data()); - auto out_mask = outputs[1].data(); - - auto out_elem_orig = out_elems; - auto out_mask_orig = out_mask; - - for(size_t i = 0; i < nelems; ++i) { - auto begin = elems + elem_size*begins[i]; - auto len = std::min(size_t(ends[i] - begins[i]), target_dim); // truncation - auto end = begin + elem_size*len; - out_elems = std::copy(begin, end, out_elems); - out_mask = std::fill_n(out_mask, len, char(1)); - if(len < target_dim) - out_mask = std::fill_n(out_mask, target_dim - len, char(0)); - while(len < target_dim) { - out_elems = std::copy(default_value, default_value + elem_size, out_elems); - ++len; - } - } - - OPENVINO_ASSERT(out_elems == out_elem_orig + outputs[0].get_byte_size()); - OPENVINO_ASSERT(out_mask == out_mask_orig + outputs[1].get_byte_size()); - return true; -} - -void CombineSegments::validate_and_infer_types() { - OPENVINO_ASSERT(get_input_size() > 0); - OPENVINO_ASSERT((get_input_size() - 1)%3 == 0); - - // First come several ragged tensors each represented as 3 regular tesors - size_t num_inputs = (get_input_size() - 1)/3; - PartialShape ps = PartialShape::dynamic(); - element::Type et = element::dynamic; - for (size_t i = 0; i < num_inputs; ++i) { - check_ragged_input(this, 3*i); - // Check limited broadcast - // Limited means that we support only two shapes on inputs: scalar and not scalars, - // and all not-scalars should have the same shape - auto rank = get_input_partial_shape(3*i).rank(); - if(rank.is_static() && rank.get_length()) { - OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); - } - OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i))); - OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i + 1))); - } - - //std::cerr << ps << '\n'; - - set_ragged_output(this, 0, ps, et); - // TODO: Avoid emitting ragged indices for the second ragged tensor, they should be identical to the first output ragged tensor - set_ragged_output(this, 3, ps, get_input_element_type(get_input_size() - 1)); -} - - -bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - // FIXME: Works for POD types only (not for strings!) 
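// Worked example (not in the original patch) of what this evaluate produces,
// assuming two ragged inputs per row plus the trailing segment-ids tensor:
// row slices of every input are concatenated (single-element inputs are
// broadcast) and a parallel tensor of segment ids is emitted:
//
//   input A row: {101}    input B row: {2023, 2003}    ids = {0, 1}
//   elements out: 101 2023 2003
//   ids out:        0    1    1
//
// i.e. the usual [CLS] sentence [SEP] style assembly for BERT-like inputs.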
- size_t num_of_ragged = (inputs.size() - 1)/3; - OPENVINO_ASSERT(num_of_ragged == inputs.back().get_size()); - std::vector begins; - std::vector ends; - std::vector nelems; - std::vector elems; - auto element_type = inputs[2].get_element_type(); - auto elem_size = element_type.size(); - size_t max_nelems = 0; - size_t flat_out_size = 0; - Shape ps; - - for(size_t i = 0; i < num_of_ragged; ++i) { - OPENVINO_ASSERT(inputs[3*i + 2].get_element_type() == element_type); - begins.push_back(inputs[3*i + 0].data()); - ends.push_back(inputs[3*i + 1].data()); - nelems.push_back(inputs[3*i + 0].get_size()); - //std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; - elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); - // TODO: Get rank from a tensor instead of partial_shape. This is a WA for CPU bug that gives 1D tensors instead of 0D tensors. - if(get_input_partial_shape(3*i + 0).rank().get_length() > 0) { - ps = inputs[3*i + 0].get_shape(); - //std::cerr << "updated\n"; - } - //std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; - //OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); - max_nelems = std::max(max_nelems, nelems.back()); - } - - // flat_out_size is going to be an estimation of the final size - // This is only an estimation, not the exact output size, because ragged tensor may have gaps in the representation - - for(size_t i = 0; i < num_of_ragged; ++i) { - //std::cerr << "max_nelems = " << max_nelems << "\n"; - if(nelems[i] == 1) { - flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast - } else { - flat_out_size += inputs[3*i + 2].get_size(); // FIXME: doesn't work for overlapped ragged regions - } - } - - auto ids = reinterpret_cast(inputs.back().data()); - size_t id_type_size = inputs.back().get_element_type().size(); - - outputs[3*0 + 0].set_shape(ps); - outputs[3*0 + 1].set_shape(ps); - OPENVINO_ASSERT(max_nelems == outputs[3*0 + 0].get_size()); - OPENVINO_ASSERT(max_nelems == outputs[3*0 + 1].get_size()); - outputs[3*0 + 2].set_shape({flat_out_size}); - - outputs[3*1 + 0].set_shape(ps); - outputs[3*1 + 1].set_shape(ps); - OPENVINO_ASSERT(max_nelems == outputs[3*1 + 0].get_size()); - OPENVINO_ASSERT(max_nelems == outputs[3*1 + 1].get_size()); - outputs[3*1 + 2].set_shape({flat_out_size}); - - auto out_elem_begins = outputs[3*0 + 0].data(); - auto out_elem_ends = outputs[3*0 + 1].data(); - auto out_elems = reinterpret_cast(outputs[3*0 + 2].data()); - auto out_id_begins = outputs[3*1 + 0].data(); - auto out_id_ends = outputs[3*1 + 1].data(); - auto out_ids = reinterpret_cast(outputs[3*1 + 2].data()); - - auto out_elems_orig = out_elems; - auto out_ids_orig = out_ids; - size_t out_offset = 0; - - for(size_t i = 0; i < max_nelems; ++i) { - out_elem_begins[i] = out_offset; - out_id_begins[i] = out_offset; - - for(size_t j = 0; j < num_of_ragged; ++j) { - const char* begin; - size_t len; - if(nelems[j] == 1) { - begin = elems[j] + elem_size*begins[j][0]; - len = ends[j][0] - begins[j][0]; - } else { - begin = elems[j] + elem_size*begins[j][i]; - len = ends[j][i] - begins[j][i]; - } - auto end = begin + elem_size*len; - out_elems = std::copy(begin, end, out_elems); - for(size_t k = 0; k < len; ++k) { - out_ids = std::copy(ids + id_type_size*j, ids + id_type_size*(j + 1), out_ids); - } - out_offset += len; - } - - out_elem_ends[i] = out_offset; - out_id_ends[i] = out_offset; - } - - OPENVINO_ASSERT(out_offset <= flat_out_size); - - outputs[3*0 + 
2].set_shape({out_offset}); - outputs[3*1 + 2].set_shape({out_offset}); - - OPENVINO_ASSERT(out_elems == out_elems_orig + outputs[3*0 + 2].get_byte_size()); - OPENVINO_ASSERT(out_ids == out_ids_orig + outputs[3*1 + 2].get_byte_size()); - return true; -} - - -ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { - // This is a copied-and-pasted and adopted fragment of TF reshape translator from OV. - // It checks if the input tensor has string type, and then perform custom tranlation. - // Otherwise it should operate identically to the stock version of Reshape translator in TF FE. - // TODO: Introduce an API to call original translators from an extension without copying the code to an extension. - - FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "Tensorflow Reshape op should have two inputs"); - auto tensor = node.get_input(0); - auto shape = node.get_input(1); - if(auto pack = dynamic_cast(tensor.get_node())) { - // TODO: If it is a beginning of the graph, how to detect strings? It falls in 'else' branch in this case. - // FIXME: Needs extension for a Parameter to prepare it first - auto begins = std::make_shared(pack->input_value(0), shape, false); - auto ends = std::make_shared(pack->input_value(1), shape, false); - auto chars = pack->input_value(2); - auto reshape = post_translate_string_tensor_output({begins, ends, chars}); - return {reshape}; - } else { - auto reshape = std::make_shared(tensor, shape, false); - return {reshape}; - } - // set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals -} - - -// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes -ov::OutputVector translate_const(const ov::frontend::NodeContext& node) { - auto ov_type = node.get_attribute_as_any("dtype"); - std::shared_ptr const_node; - if (!ov_type.is() || ov_type.as() == ov::element::dynamic || - ov_type.as() == ov::element::undefined) { - if (ov_type.is() && ov_type.as() == "DT_STRING") { - auto value_as_any = node.get_attribute_as_any("value"); - const auto& values = value_as_any.as>(); - ov::Tensor begins(element::i32, {}), ends(element::i32, {}), chars(element::u8, {}); - unpack_strings(&values[0], {values.size()}, begins, ends, chars); - const_node = std::make_shared(OutputVector{ - std::make_shared(begins), - std::make_shared(ends), - std::make_shared(chars) - }); - } else { - const_node = std::make_shared(OutputVector{}); - } - } else { - //static std::vector tensors; - auto tensor = node.get_attribute("value"); - //tensors.push_back(tensor); - const_node = std::make_shared(tensor); - #if OPENVINO_ELEMENT_STRING_SUPPORTED - if (const_node->get_element_type() == element::string) { - if(shape_size(tensor.get_shape()) > 0) { - auto strings = std::dynamic_pointer_cast(const_node)->get_data_ptr(); - } - const_node = std::make_shared(const_node->outputs()); - const_node = std::make_shared(const_node->outputs()); - } - #endif - } - //set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name - return {const_node}; -} - - -void VocabDecoder::validate_and_infer_types() { -// check_ragged_string_input(this, 0); - check_string_input(this, 1); - const auto shape = get_input_partial_shape(0); - set_ragged_string_output(this, 0, {shape[0]}); -} - - -bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto batch_size = inputs[0].get_shape()[0]; - auto seq_len = inputs[0].get_shape()[1]; - auto input_data = inputs[0].data(); - - 
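// Note (not in the original patch): the decoder takes a dense [batch, seq_len]
// id tensor and emits a ragged string tensor. The shape bookkeeping below
// relies on the 100-bytes-per-token cap:
//
//   size_t worst_case_chars = batch_size * seq_len * 100;
//   // outputs[4] is sized to worst_case_chars first and shrunk to the number
//   // of bytes actually written (char_offset) at the end of evaluate().
//
// A vocabulary entry longer than 100 bytes would overflow this bound.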
auto vocab_begins = inputs[1].data(); - auto vocab_ends = inputs[2].data(); - auto vocab_chars = inputs[3].data(); - auto vocab_size = inputs[1].get_size(); - - std::vector> vocab; - vocab.resize(vocab_size); - - OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern"); - - for(size_t id = 0; id < vocab_size; ++id) { - std::vector token = std::vector(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); - vocab[id] = token; - } - // Set output shapes - outputs[0].set_shape({batch_size}); - outputs[1].set_shape({batch_size}); - outputs[2].set_shape({batch_size * seq_len}); - outputs[3].set_shape({batch_size * seq_len}); - outputs[4].set_shape({batch_size * seq_len * 100}); // 100 chars - max token length - const size_t num_rows = inputs[0].get_size(); - - // Get pointers in the output tensors - auto new_ragged_begins = outputs[0].data(); - auto new_ragged_ends = outputs[1].data(); - auto new_begins = outputs[2].data(); - auto new_ends = outputs[3].data(); - auto new_chars = outputs[4].data(); - uint32_t char_offset = 0; - - for(size_t batch = 0; batch < batch_size; ++batch) { - new_ragged_begins[batch] = batch * seq_len; - new_ragged_ends[batch] = new_ragged_begins[batch] + seq_len; - - for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) { - auto token_id = input_data[seq]; - auto token = vocab[token_id]; - - std::copy(token.begin(), token.end(), &new_chars[char_offset]); - - new_begins[seq] = char_offset; - char_offset += token.size(); - new_ends[seq] = char_offset; - } - } - outputs[4].set_shape({char_offset}); - return true; -} - - -void CharsToBytes::validate_and_infer_types() { - check_ragged_string_input(this, 0); -// set_ragged_string_output(this, 0, get_input_partial_shape(0)); - set_string_output(this, 0, get_input_partial_shape(0)); -} - -std::array, 4> CharsToBytes::create_pair_map() { - auto bytes_to_chars = create_bytes_to_chars_map(); - std::array, 4> pair_map; - - for (int i=0; i < bytes_to_chars.size(); ++i) { - std::vector chars = bytes_to_chars[i]; - if (chars.size() == 2) { - pair_map[chars[0] - 194][chars[1] - 128] = i; - }; - }; - - return pair_map; -} - - -bool CharsToBytes::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto ragged_begins = inputs[0].data(); - auto ragged_ends = inputs[1].data(); - auto begins = inputs[2].data(); - auto ends = inputs[3].data(); - auto chars = inputs[4].data(); - - OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to CharsToBytes, it means it is not converted properly or it is not used in the supported pattern"); - - // Set output shapes -// outputs[0] = inputs[0]; -// outputs[1] = inputs[1]; - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - outputs[2].set_shape(Shape({inputs[4].get_size()})); - const size_t num_rows = inputs[0].get_size(); - - // Get pointers in the output tensors - auto new_begins = outputs[0].data(); - auto new_ends = outputs[1].data(); - auto new_chars = outputs[2].data(); - uint32_t char_pointer = 0; - - for(size_t row = 0; row < num_rows; ++row) { - new_begins[row] = char_pointer; - for(size_t col = ragged_begins[row]; col < ragged_ends[row]; ++col) { - const auto word_len = ends[col] - begins[col]; - - for (size_t k = 0; k < word_len; ++k) { - const auto first_byte = chars[begins[col] + k]; - if (first_byte < m_one_byte_border) { - new_chars[char_pointer++] = first_byte; - } else { - const 
auto second_byte = chars[begins[col] + (++k)];
- new_chars[char_pointer++] = m_pair_map[first_byte - 194][second_byte - 128];
- }
- }
- };
- new_ends[row] = char_pointer;
- }
- outputs[2].set_shape({char_pointer});
- return true;
-}
diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
deleted file mode 100644
index 7e57f51d0..000000000
--- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-// Copyright (C) 2018-2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include
-#include
-
-namespace sentencepiece {
- class SentencePieceProcessor;
-}
-
-// Having a decomposed representation for a tensor, converts it to a single string tensor
-// (packed u8 or natively supported element::string, depending on whether or not USE_STRING_TENSORS is defined).
-class StringTensorPack : public ov::op::Op {
-public:
- OPENVINO_OP("StringTensorPack");
-
- StringTensorPack () = default;
-
- StringTensorPack(ov::OutputVector inputs, const std::string& mode = "begins_ends")
- : ov::op::Op(inputs), m_mode(mode) {
- constructor_validate_and_infer_types();
- }
-
- void validate_and_infer_types() override;
-
- std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override {
- auto result = std::make_shared(inputs, m_mode);
- return result;
- }
-
- bool visit_attributes(ov::AttributeVisitor& visitor) override {
- visitor.on_attribute("mode", m_mode);
- return true;
- }
-
- bool has_evaluate() const {
- return true;
- }
-
- bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const;
-
-private:
-
- std::string m_mode = "begins_ends";
-};
-
-
-// Having a decomposed representation for a tensor, converts it to a single string tensor for debugging purposes and to facilitate model conversion.
-// The base tensor on which this operation builds a ragged tensor can have any shape or type; this operation doesn't try to interpret it.
-class RaggedTensorPack : public ov::op::Op {
-public:
- OPENVINO_OP("RaggedTensorPack");
-
- RaggedTensorPack () = default;
-
- RaggedTensorPack(ov::OutputVector inputs)
- : ov::op::Op(inputs) {
- constructor_validate_and_infer_types();
- }
-
- void validate_and_infer_types() override;
-
- std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override {
- auto result = std::make_shared(inputs);
- return result;
- }
-
- bool visit_attributes(ov::AttributeVisitor& visitor) override {
- return true;
- }
-
- bool has_evaluate() const {
- return true;
- }
-
- bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const;
-};
-
-
-// Unpack a string tensor representation regardless of the source format, which
-// can be an OV tensor with element::string element type (if supported) or a u8
-// packed representation, to a decomposed tensor representation that may potentially
-// consist of multiple tensors. The destination format is defined by the `mode` attribute.
-// The shape of the output tensor is completely recognized from the input (if supported)
-// or defined partially by a dedicated input attribute `shape`. If `shape` is not set,
-// which defaults to a completely dynamic `shape`, then the output shape is defined
-// by an input tensor.
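// Construction sketch (not in the original header) for the pack/unpack pair
// declared here, assuming the default "begins_ends" mode and the i32/i32/u8
// output convention used by the .cpp; names are illustrative:
//
//   auto param = std::make_shared<ov::opset10::Parameter>(ov::element::u8, ov::PartialShape{ov::Dimension()});
//   auto unpacked = std::make_shared<StringTensorUnpack>(ov::OutputVector{param}, "begins_ends");
//   // unpacked outputs: begins (i32), ends (i32), chars (u8)
//   auto packed = std::make_shared<StringTensorPack>(unpacked->outputs(), "begins_ends");
//
// so pack(unpack(x)) is intended to be the identity on the packed form.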
-class StringTensorUnpack : public ov::op::Op { -public: - OPENVINO_OP("StringTensorUnpack"); - - StringTensorUnpack () = default; - - StringTensorUnpack(ov::OutputVector inputs, const std::string& mode = "begins_ends") - : ov::op::Op(inputs), m_mode(mode) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - auto result = std::make_shared(inputs, m_mode); - return result; - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - visitor.on_attribute("mode", m_mode); - return true; - } - - bool has_evaluate() const { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const; - -private: - - std::string m_mode = "begins_ends"; -}; - - - -namespace TemplateExtension { - class SentencepieceTokenizer : public ov::op::Op { - public: - OPENVINO_OP("SentencepieceTokenizer"); - - SentencepieceTokenizer() = default; - SentencepieceTokenizer(const ov::OutputVector& args, int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse); - SentencepieceTokenizer(const ov::OutputVector& args, const std::shared_ptr& sp, int32_t nbest_size, float alpha, - bool add_bos, bool add_eos, bool reverse); - - bool visit_attributes(ov::AttributeVisitor& visitor) override; - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const override; - - private: - std::shared_ptr m_sp; - int32_t m_nbest_size; - float m_alpha; - bool m_add_bos; - bool m_add_eos; - bool m_reverse; - }; -} // namespace TemplateExtension - -ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node); - -ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node); - - -class OPENVINO_API CaseFold : public ov::op::Op { -public: - OPENVINO_OP("CaseFold"); - - CaseFold () = default; - - CaseFold (const ov::OutputVector& arguments) : ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } -}; - - -ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node); - - -class OPENVINO_API NormalizeUnicode : public ov::op::Op { -public: - OPENVINO_OP("NormalizeUnicode"); - - NormalizeUnicode () = default; - - NormalizeUnicode(const ov::OutputVector& arguments, const std::string& normalization_form = "NFD") : - ov::op::Op(arguments), - m_normalization_form(normalization_form) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalization_form); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - visitor.on_attribute("normalization_form", m_normalization_form); - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; 
- - bool has_evaluate() const { - return true; - } - -private: - - std::string m_normalization_form = "NFD"; -}; - -ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node); - - -class OPENVINO_API RegexNormalization : public ov::op::Op { -public: - OPENVINO_OP("RegexNormalization"); - - RegexNormalization () = default; - - RegexNormalization(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } -}; - -ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node); - -class OPENVINO_API RegexSplit : public ov::op::Op { -public: - OPENVINO_OP("RegexSplit"); - - RegexSplit () = default; - - RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false) : - ov::op::Op(arguments), - m_behaviour(behaviour), - m_invert(invert) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_behaviour, m_invert); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - visitor.on_attribute("behaviour", m_behaviour); - visitor.on_attribute("invert", m_invert); - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } - -private: - - std::string m_behaviour = "remove"; - bool m_invert = false; -}; - -ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node); - -class OPENVINO_API WordpieceTokenizer : public ov::op::Op { -public: - OPENVINO_OP("WordpieceTokenizer"); - - WordpieceTokenizer () = default; - - WordpieceTokenizer(const ov::OutputVector& arguments, const std::string& suffix_indicator = "##", int max_bytes_per_word = 100) : - ov::op::Op(arguments), - m_suffix_indicator(suffix_indicator), - m_max_bytes_per_word(max_bytes_per_word) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_suffix_indicator, m_max_bytes_per_word); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - visitor.on_attribute("suffix_indicator", m_suffix_indicator); - visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } - -private: - - std::string m_suffix_indicator = "##"; - int m_max_bytes_per_word = 100; // TODO: Can it be done outside the op as preprocessing of the input? 
-}; - -ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); -ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); - - -const std::array, 256> create_bytes_to_chars_map(); - - -class OPENVINO_API BytesToChars : public ov::op::Op { -public: - OPENVINO_OP("BytesToChars"); - - BytesToChars () = default; - - BytesToChars(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - -// bool visit_attributes(ov::AttributeVisitor& visitor) override { -// visitor.on_attribute("suffix_indicator", m_suffix_indicator); -// visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); -// return true; -// } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } - -private: - const std::array, 256> m_bytes_to_chars = create_bytes_to_chars_map(); -}; - - -class OPENVINO_API BPETokenizer : public ov::op::Op { -public: - OPENVINO_OP("BPETokenizer"); - - BPETokenizer () = default; - - BPETokenizer( - const ov::OutputVector& arguments, - const std::string& unk_token = "", - bool fuse_unk = false, - const std::string& suffix_indicator = "", - const std::string& end_suffix = "", - bool byte_fallback = false - ) : - ov::op::Op(arguments), - m_unk_token(unk_token), - m_fuse_unk(fuse_unk), - m_suffix_indicator(suffix_indicator), - m_end_suffix(end_suffix), - m_byte_fallback(byte_fallback) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_unk_token, m_fuse_unk, m_suffix_indicator, m_end_suffix, m_byte_fallback); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - visitor.on_attribute("unk_token", m_unk_token); - visitor.on_attribute("fuse_unk", m_fuse_unk); - visitor.on_attribute("suffix_indicator", m_suffix_indicator); - visitor.on_attribute("end_suffix", m_end_suffix); - visitor.on_attribute("byte_fallback", m_byte_fallback); - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } - -private: - std::string m_unk_token; - bool m_fuse_unk = false; - std::string m_suffix_indicator; - std::string m_end_suffix; - bool m_byte_fallback = false; -}; - - -class OPENVINO_API CombineSegments : public ov::op::Op { -public: - OPENVINO_OP("CombineSegments"); - - CombineSegments () = default; - - CombineSegments(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } -}; - -// Takes a ragged tensor with one ragged right-most dimension and produces a normal tensor -class OPENVINO_API RaggedToDense : public ov::op::Op { -public: - OPENVINO_OP("RaggedToDense"); - - RaggedToDense () = default; 
- - RaggedToDense(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } -}; - -ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); -ov::OutputVector translate_const(const ov::frontend::NodeContext& node); - - -class OPENVINO_API VocabDecoder : public ov::op::Op { -public: - OPENVINO_OP("VocabDecoder"); - - VocabDecoder () = default; - - VocabDecoder(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } -}; - -class OPENVINO_API CharsToBytes : public ov::op::Op { -public: - OPENVINO_OP("CharsToBytes"); - - CharsToBytes () = default; - - CharsToBytes(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); - } - - bool visit_attributes(ov::AttributeVisitor& visitor) override { - return true; - } - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const { - return true; - } - - std::array, 4> create_pair_map(); - -private: - const std::array, 4> m_pair_map = create_pair_map(); - const uint8_t m_one_byte_border = 128; // if char > 128 => it is two byte char -}; diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt similarity index 100% rename from modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt rename to modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp new file mode 100644 index 000000000..25ad3db31 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp @@ -0,0 +1,153 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "bpe_tokenizer.hpp" +#include "utils.hpp" + +using namespace ov; + +#undef tokenizer + +void BPETokenizer::validate_and_infer_types() { + check_ragged_string_input(this, 0); + check_string_input(this, 5); + check_string_input(this, 8); + set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); +} + +bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto 
ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + auto vocab_begins = inputs[5].data(); + auto vocab_ends = inputs[6].data(); + auto vocab_chars = inputs[7].data(); + + auto merges_begins = inputs[8].data(); + auto merges_ends = inputs[9].data(); + auto merges_chars = inputs[10].data(); + + auto vocab_size = inputs[5].get_size(); + auto merges_size = inputs[8].get_size(); + + OPENVINO_ASSERT(inputs.size() == 11, "Too few inputs passed to BPETokenizer, it means it is not converted properly or it is not used in the supported pattern"); + +#if 1 + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const size_t num_rows = inputs[0].get_size(); + + // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area + // to represent different elements in ragged tensor + outputs[2].set_shape({inputs[4].get_size()}); + + using namespace paddlenlp::fast_tokenizer; + +// std::cerr << "[ BPETokenizer ] Start vocab reading\n"; + core::Vocab vocab; + int32_t unk_token_id = -1; + +// std::cerr << "[ BPETokenizer ] Vocab size is " << vocab_size << "\n"; + + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + } + +// std::cerr << "[ BPETokenizer ] Finish vocab reading\n"; +// +// std::cerr << "[ BPETokenizer ] Start merges reading\n"; +// std::cerr << "[ BPETokenizer ] Merges Size: " << merges_size << "\n"; + core::Merges merges; + std::string delim = " "; + + + for(size_t id = 0; id < merges_size; ++id) { + auto merge = std::string(merges_chars + merges_begins[id], merges_chars + merges_ends[id]); + const int delim_pos = merge.find(delim); + + std::pair merge_pair = { + merge.substr(0, delim_pos), merge.substr(delim_pos + 1) + }; + merges.emplace_back(merge_pair); + } + +// std::cerr << "[ BPETokenizer ] Finish merges reading\n"; + + +// std::cerr << "[ BPETokenizer ] Start tokenizer initialization\n"; + + std::vector unk_token = {}; + if (m_unk_token.size() > 0) { + unk_token.push_back(m_unk_token); + }; + std::vector suffix_indicator = {}; + if (m_suffix_indicator.size() > 0) { + suffix_indicator.push_back(m_suffix_indicator); + }; + std::vector end_suffix = {}; + if (m_end_suffix.size() > 0) { + end_suffix.push_back(m_end_suffix); + }; + + models::BPE tokenizer(vocab, merges, 10000 /* default cache size */, {} /* dropout - don't use dropout for inference */, + unk_token, suffix_indicator, end_suffix, m_fuse_unk); + +// std::cerr << "[ BPETokenizer ] Finish tokenizer initialization\n"; + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_elems = outputs[2].data(); + int32_t ragged_offset = 0; + +// std::cerr << "Ragged Begins and ends:\n"; +// for (size_t i = 0; i < inputs[0].get_size(); ++i) { +// std::cerr << inputs[0].data()[i] << ", "; +// } +// std::cerr << "\n"; +// for (size_t i = 0; i < inputs[1].get_size(); ++i) { +// std::cerr << inputs[1].data()[i] << ", "; +// } +// std::cerr << "\n"; + + + for(size_t seq = 0; seq < num_rows; ++seq) { + new_begins[seq] = ragged_offset; + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); + + std::cerr << "[ 
BPETokenizer ] String: '" << str << "'\n"; +// std::cerr << "[ BPETokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; + + std::vector results = tokenizer.Tokenize(str); + + for (const core::Token& token : results) { + std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ + << ", offset: (" << token.offset_.first << ", " + << token.offset_.second << ")." << std::endl; + OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); + new_elems[ragged_offset++] = token.id_; + }; + } + + new_ends[seq] = ragged_offset; + } + outputs[2].set_shape({size_t(ragged_offset)}); + return true; + +#else + // Stub implementation that transforms each input string to its length duplicating element if the length is odd + // End of stub implementation +#endif +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp new file mode 100644 index 000000000..dc0a8dd4b --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp @@ -0,0 +1,59 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API BPETokenizer : public ov::op::Op { +public: + OPENVINO_OP("BPETokenizer"); + + BPETokenizer () = default; + + BPETokenizer( + const ov::OutputVector& arguments, + const std::string& unk_token = "", + bool fuse_unk = false, + const std::string& suffix_indicator = "", + const std::string& end_suffix = "", + bool byte_fallback = false + ) : + ov::op::Op(arguments), + m_unk_token(unk_token), + m_fuse_unk(fuse_unk), + m_suffix_indicator(suffix_indicator), + m_end_suffix(end_suffix), + m_byte_fallback(byte_fallback) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_unk_token, m_fuse_unk, m_suffix_indicator, m_end_suffix, m_byte_fallback); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("unk_token", m_unk_token); + visitor.on_attribute("fuse_unk", m_fuse_unk); + visitor.on_attribute("suffix_indicator", m_suffix_indicator); + visitor.on_attribute("end_suffix", m_end_suffix); + visitor.on_attribute("byte_fallback", m_byte_fallback); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + std::string m_unk_token; + bool m_fuse_unk = false; + std::string m_suffix_indicator; + std::string m_end_suffix; + bool m_byte_fallback = false; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp new file mode 100644 index 000000000..2919098a3 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp @@ -0,0 +1,321 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "bytes_to_chars.hpp" +#include "utils.hpp" + +using namespace ov; + + +const std::array, 256> create_bytes_to_chars_map() { + return {{ + { 196, 128 }, + { 196, 129 }, + { 196, 130 }, + { 196, 131 }, + { 196, 132 }, + { 196, 133 }, + 
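+    // (Editor's note, an assumption based on the GPT-2 byte-level BPE convention this table
+    // appears to mirror: bytes without a printable ASCII form are remapped to code points
+    // U+0100 and up, stored here as their two UTF-8 bytes, e.g. {196, 134} encodes U+0106;
+    // printable bytes such as {33} for '!' map to themselves.)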
{ 196, 134 }, + { 196, 135 }, + { 196, 136 }, + { 196, 137 }, + { 196, 138 }, + { 196, 139 }, + { 196, 140 }, + { 196, 141 }, + { 196, 142 }, + { 196, 143 }, + { 196, 144 }, + { 196, 145 }, + { 196, 146 }, + { 196, 147 }, + { 196, 148 }, + { 196, 149 }, + { 196, 150 }, + { 196, 151 }, + { 196, 152 }, + { 196, 153 }, + { 196, 154 }, + { 196, 155 }, + { 196, 156 }, + { 196, 157 }, + { 196, 158 }, + { 196, 159 }, + { 196, 160 }, + { 33 }, + { 34 }, + { 35 }, + { 36 }, + { 37 }, + { 38 }, + { 39 }, + { 40 }, + { 41 }, + { 42 }, + { 43 }, + { 44 }, + { 45 }, + { 46 }, + { 47 }, + { 48 }, + { 49 }, + { 50 }, + { 51 }, + { 52 }, + { 53 }, + { 54 }, + { 55 }, + { 56 }, + { 57 }, + { 58 }, + { 59 }, + { 60 }, + { 61 }, + { 62 }, + { 63 }, + { 64 }, + { 65 }, + { 66 }, + { 67 }, + { 68 }, + { 69 }, + { 70 }, + { 71 }, + { 72 }, + { 73 }, + { 74 }, + { 75 }, + { 76 }, + { 77 }, + { 78 }, + { 79 }, + { 80 }, + { 81 }, + { 82 }, + { 83 }, + { 84 }, + { 85 }, + { 86 }, + { 87 }, + { 88 }, + { 89 }, + { 90 }, + { 91 }, + { 92 }, + { 93 }, + { 94 }, + { 95 }, + { 96 }, + { 97 }, + { 98 }, + { 99 }, + { 100 }, + { 101 }, + { 102 }, + { 103 }, + { 104 }, + { 105 }, + { 106 }, + { 107 }, + { 108 }, + { 109 }, + { 110 }, + { 111 }, + { 112 }, + { 113 }, + { 114 }, + { 115 }, + { 116 }, + { 117 }, + { 118 }, + { 119 }, + { 120 }, + { 121 }, + { 122 }, + { 123 }, + { 124 }, + { 125 }, + { 126 }, + { 196, 161 }, + { 196, 162 }, + { 196, 163 }, + { 196, 164 }, + { 196, 165 }, + { 196, 166 }, + { 196, 167 }, + { 196, 168 }, + { 196, 169 }, + { 196, 170 }, + { 196, 171 }, + { 196, 172 }, + { 196, 173 }, + { 196, 174 }, + { 196, 175 }, + { 196, 176 }, + { 196, 177 }, + { 196, 178 }, + { 196, 179 }, + { 196, 180 }, + { 196, 181 }, + { 196, 182 }, + { 196, 183 }, + { 196, 184 }, + { 196, 185 }, + { 196, 186 }, + { 196, 187 }, + { 196, 188 }, + { 196, 189 }, + { 196, 190 }, + { 196, 191 }, + { 197, 128 }, + { 197, 129 }, + { 197, 130 }, + { 194, 161 }, + { 194, 162 }, + { 194, 163 }, + { 194, 164 }, + { 194, 165 }, + { 194, 166 }, + { 194, 167 }, + { 194, 168 }, + { 194, 169 }, + { 194, 170 }, + { 194, 171 }, + { 194, 172 }, + { 197, 131 }, + { 194, 174 }, + { 194, 175 }, + { 194, 176 }, + { 194, 177 }, + { 194, 178 }, + { 194, 179 }, + { 194, 180 }, + { 194, 181 }, + { 194, 182 }, + { 194, 183 }, + { 194, 184 }, + { 194, 185 }, + { 194, 186 }, + { 194, 187 }, + { 194, 188 }, + { 194, 189 }, + { 194, 190 }, + { 194, 191 }, + { 195, 128 }, + { 195, 129 }, + { 195, 130 }, + { 195, 131 }, + { 195, 132 }, + { 195, 133 }, + { 195, 134 }, + { 195, 135 }, + { 195, 136 }, + { 195, 137 }, + { 195, 138 }, + { 195, 139 }, + { 195, 140 }, + { 195, 141 }, + { 195, 142 }, + { 195, 143 }, + { 195, 144 }, + { 195, 145 }, + { 195, 146 }, + { 195, 147 }, + { 195, 148 }, + { 195, 149 }, + { 195, 150 }, + { 195, 151 }, + { 195, 152 }, + { 195, 153 }, + { 195, 154 }, + { 195, 155 }, + { 195, 156 }, + { 195, 157 }, + { 195, 158 }, + { 195, 159 }, + { 195, 160 }, + { 195, 161 }, + { 195, 162 }, + { 195, 163 }, + { 195, 164 }, + { 195, 165 }, + { 195, 166 }, + { 195, 167 }, + { 195, 168 }, + { 195, 169 }, + { 195, 170 }, + { 195, 171 }, + { 195, 172 }, + { 195, 173 }, + { 195, 174 }, + { 195, 175 }, + { 195, 176 }, + { 195, 177 }, + { 195, 178 }, + { 195, 179 }, + { 195, 180 }, + { 195, 181 }, + { 195, 182 }, + { 195, 183 }, + { 195, 184 }, + { 195, 185 }, + { 195, 186 }, + { 195, 187 }, + { 195, 188 }, + { 195, 189 }, + { 195, 190 }, + { 195, 191 }, + }}; +} + +void BytesToChars::validate_and_infer_types() { + check_ragged_string_input(this, 
0); +// check_string_input(this, 5); + set_ragged_string_output(this, 0, get_input_partial_shape(0)); +} + +bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to BytesToChars, it means it is not converted properly or it is not used in the supported pattern"); + + // Set output shapes + outputs[0] = inputs[0]; + outputs[1] = inputs[1]; + outputs[2].set_shape(inputs[2].get_shape()); + outputs[3].set_shape(inputs[3].get_shape()); + outputs[4].set_shape(Shape({inputs[4].get_size() * 2})); + const size_t num_elems = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + auto new_chars = outputs[4].data(); + uint32_t char_pointer = 0; + + for(size_t j = 0; j < num_elems; ++j) { + + for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { + const auto word_len = ends[i] - begins[i]; + new_begins[i] = char_pointer; + + for (size_t k = 0; k < word_len; ++k) { + for (auto byte : m_bytes_to_chars[chars[begins[i] + k]]) { + new_chars[char_pointer++] = byte; + } + } + new_ends[i] = char_pointer; + } + } + outputs[4].set_shape({char_pointer}); + return true; +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp new file mode 100644 index 000000000..2b7598c50 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + + +const std::array, 256> create_bytes_to_chars_map(); + +class OPENVINO_API BytesToChars : public ov::op::Op { +public: + OPENVINO_OP("BytesToChars"); + + BytesToChars () = default; + + BytesToChars(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + +// bool visit_attributes(ov::AttributeVisitor& visitor) override { +// visitor.on_attribute("suffix_indicator", m_suffix_indicator); +// visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); +// return true; +// } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + const std::array, 256> m_bytes_to_chars = create_bytes_to_chars_map(); +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp new file mode 100644 index 000000000..7a8cff580 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "case_fold.hpp" +#include "utils.hpp" + +#include "fast_tokenizer/normalizers/normalizers.h" + +using namespace ov; + + +void CaseFold::validate_and_infer_types() { + check_string_input(this, 0); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +bool CaseFold::evaluate(ov::TensorVector& outputs, 
const ov::TensorVector& inputs) const { + return evaluate_normalization_helper( + outputs, inputs, + [](const std::string& str) { + using namespace paddlenlp::fast_tokenizer; + return normalizers::NormalizedString(str).Lowercase().GetStr(); + }); +} \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp new file mode 100644 index 000000000..3f6e86e65 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API CaseFold : public ov::op::Op { +public: + OPENVINO_OP("CaseFold"); + + CaseFold () = default; + + CaseFold (const ov::OutputVector& arguments) : ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp new file mode 100644 index 000000000..d87645d4b --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "chars_to_bytes.hpp" +#include "bytes_to_chars.hpp" +#include "utils.hpp" + +using namespace ov; + +void CharsToBytes::validate_and_infer_types() { + check_ragged_string_input(this, 0); +// set_ragged_string_output(this, 0, get_input_partial_shape(0)); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +std::array, 4> CharsToBytes::create_pair_map() { + auto bytes_to_chars = create_bytes_to_chars_map(); + std::array, 4> pair_map; + + for (int i=0; i < bytes_to_chars.size(); ++i) { + std::vector chars = bytes_to_chars[i]; + if (chars.size() == 2) { + pair_map[chars[0] - 194][chars[1] - 128] = i; + }; + }; + + return pair_map; +} + +bool CharsToBytes::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to CharsToBytes, it means it is not converted properly or it is not used in the supported pattern"); + + // Set output shapes +// outputs[0] = inputs[0]; +// outputs[1] = inputs[1]; + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + outputs[2].set_shape(Shape({inputs[4].get_size()})); + const size_t num_rows = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_chars = outputs[2].data(); + uint32_t char_pointer = 0; + + for(size_t row = 0; row < num_rows; ++row) { + new_begins[row] = 
char_pointer; + for(size_t col = ragged_begins[row]; col < ragged_ends[row]; ++col) { + const auto word_len = ends[col] - begins[col]; + + for (size_t k = 0; k < word_len; ++k) { + const auto first_byte = chars[begins[col] + k]; + if (first_byte < m_one_byte_border) { + new_chars[char_pointer++] = first_byte; + } else { + const auto second_byte = chars[begins[col] + (++k)]; + new_chars[char_pointer++] = m_pair_map[first_byte - 194][second_byte - 128]; + } + } + }; + new_ends[row] = char_pointer; + } + outputs[2].set_shape({char_pointer}); + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp new file mode 100644 index 000000000..428788610 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API CharsToBytes : public ov::op::Op { +public: + OPENVINO_OP("CharsToBytes"); + + CharsToBytes () = default; + + CharsToBytes(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + + std::array, 4> create_pair_map(); + +private: + const std::array, 4> m_pair_map = create_pair_map(); + const uint8_t m_one_byte_border = 128; // if char > 128 => it is two byte char +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp new file mode 100644 index 000000000..7d8ca05a6 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp @@ -0,0 +1,142 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "combine_segments.hpp" +#include "utils.hpp" + +using namespace ov; + +void CombineSegments::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() > 0); + OPENVINO_ASSERT((get_input_size() - 1)%3 == 0); + + // First come several ragged tensors each represented as 3 regular tesors + size_t num_inputs = (get_input_size() - 1)/3; + PartialShape ps = PartialShape::dynamic(); + element::Type et = element::dynamic; + for (size_t i = 0; i < num_inputs; ++i) { + check_ragged_input(this, 3*i); + // Check limited broadcast + // Limited means that we support only two shapes on inputs: scalar and not scalars, + // and all not-scalars should have the same shape + auto rank = get_input_partial_shape(3*i).rank(); + if(rank.is_static() && rank.get_length()) { + OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); + } + OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i))); + OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i + 1))); + } + + //std::cerr << ps << '\n'; + + set_ragged_output(this, 0, ps, et); + // TODO: Avoid emitting ragged indices for the second ragged tensor, they should be identical to the first output ragged tensor + set_ragged_output(this, 3, ps, 
get_input_element_type(get_input_size() - 1)); +} + +bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + // FIXME: Works for POD types only (not for strings!) + size_t num_of_ragged = (inputs.size() - 1)/3; + OPENVINO_ASSERT(num_of_ragged == inputs.back().get_size()); + std::vector begins; + std::vector ends; + std::vector nelems; + std::vector elems; + auto element_type = inputs[2].get_element_type(); + auto elem_size = element_type.size(); + size_t max_nelems = 0; + size_t flat_out_size = 0; + Shape ps; + + for(size_t i = 0; i < num_of_ragged; ++i) { + OPENVINO_ASSERT(inputs[3*i + 2].get_element_type() == element_type); + begins.push_back(inputs[3*i + 0].data()); + ends.push_back(inputs[3*i + 1].data()); + nelems.push_back(inputs[3*i + 0].get_size()); + //std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; + elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); + // TODO: Get rank from a tensor instead of partial_shape. This is a WA for CPU bug that gives 1D tensors instead of 0D tensors. + if(get_input_partial_shape(3*i + 0).rank().get_length() > 0) { + ps = inputs[3*i + 0].get_shape(); + //std::cerr << "updated\n"; + } + //std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; + //OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); + max_nelems = std::max(max_nelems, nelems.back()); + } + + // flat_out_size is going to be an estimation of the final size + // This is only an estimation, not the exact output size, because ragged tensor may have gaps in the representation + + for(size_t i = 0; i < num_of_ragged; ++i) { + //std::cerr << "max_nelems = " << max_nelems << "\n"; + if(nelems[i] == 1) { + flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast + } else { + flat_out_size += inputs[3*i + 2].get_size(); // FIXME: doesn't work for overlapped ragged regions + } + } + + auto ids = reinterpret_cast(inputs.back().data()); + size_t id_type_size = inputs.back().get_element_type().size(); + + outputs[3*0 + 0].set_shape(ps); + outputs[3*0 + 1].set_shape(ps); + OPENVINO_ASSERT(max_nelems == outputs[3*0 + 0].get_size()); + OPENVINO_ASSERT(max_nelems == outputs[3*0 + 1].get_size()); + outputs[3*0 + 2].set_shape({flat_out_size}); + + outputs[3*1 + 0].set_shape(ps); + outputs[3*1 + 1].set_shape(ps); + OPENVINO_ASSERT(max_nelems == outputs[3*1 + 0].get_size()); + OPENVINO_ASSERT(max_nelems == outputs[3*1 + 1].get_size()); + outputs[3*1 + 2].set_shape({flat_out_size}); + + auto out_elem_begins = outputs[3*0 + 0].data(); + auto out_elem_ends = outputs[3*0 + 1].data(); + auto out_elems = reinterpret_cast(outputs[3*0 + 2].data()); + auto out_id_begins = outputs[3*1 + 0].data(); + auto out_id_ends = outputs[3*1 + 1].data(); + auto out_ids = reinterpret_cast(outputs[3*1 + 2].data()); + + auto out_elems_orig = out_elems; + auto out_ids_orig = out_ids; + size_t out_offset = 0; + + for(size_t i = 0; i < max_nelems; ++i) { + out_elem_begins[i] = out_offset; + out_id_begins[i] = out_offset; + + for(size_t j = 0; j < num_of_ragged; ++j) { + const char* begin; + size_t len; + if(nelems[j] == 1) { + begin = elems[j] + elem_size*begins[j][0]; + len = ends[j][0] - begins[j][0]; + } else { + begin = elems[j] + elem_size*begins[j][i]; + len = ends[j][i] - begins[j][i]; + } + auto end = begin + elem_size*len; + out_elems = std::copy(begin, end, out_elems); + for(size_t k = 0; k < len; ++k) { + out_ids = std::copy(ids + id_type_size*j, ids + 
id_type_size*(j + 1), out_ids); + } + out_offset += len; + } + + out_elem_ends[i] = out_offset; + out_id_ends[i] = out_offset; + } + + OPENVINO_ASSERT(out_offset <= flat_out_size); + + outputs[3*0 + 2].set_shape({out_offset}); + outputs[3*1 + 2].set_shape({out_offset}); + + OPENVINO_ASSERT(out_elems == out_elems_orig + outputs[3*0 + 2].get_byte_size()); + OPENVINO_ASSERT(out_ids == out_ids_orig + outputs[3*1 + 2].get_byte_size()); + return true; +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp new file mode 100644 index 000000000..9399f959a --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API CombineSegments : public ov::op::Op { +public: + OPENVINO_OP("CombineSegments"); + + CombineSegments () = default; + + CombineSegments(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py similarity index 100% rename from modules/custom_operations/user_ie_extensions/sentence_piece/convert_tokenizer.py rename to modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py similarity index 100% rename from modules/custom_operations/user_ie_extensions/sentence_piece/hf_parser.py rename to modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp new file mode 100644 index 000000000..83514b801 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "normalize_unicode.hpp" +#include "utils.hpp" + +using namespace ov; + +namespace { +using namespace paddlenlp::fast_tokenizer::normalizers; +using NormalizersMap = std::map>; + +const NormalizersMap normalizers = { + {"NFD", [](const std::string& str) { return NormalizedString(str).NFD().GetStr(); }}, + {"NFC", [](const std::string& str) { return NormalizedString(str).NFC().GetStr(); }}, + {"NFKD", [](const std::string& str) { return NormalizedString(str).NFKD().GetStr(); }}, + {"NFKC", [](const std::string& str) { return NormalizedString(str).NFKC().GetStr(); }}, +}; + +} + +void NormalizeUnicode::validate_and_infer_types() { + check_string_input(this, 0); + 
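+    // The four forms above are the standard Unicode normalization forms; e.g. NFC composes
+    // "e" followed by U+0301 (combining acute accent) into the single code point U+00E9.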
OPENVINO_ASSERT(normalizers.find(m_normalization_form) != normalizers.end(), "NormalizeUnicode doesn't know normalization form " + m_normalization_form);
+    set_string_output(this, 0, get_input_partial_shape(0));
+}
+
+bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    return evaluate_normalization_helper(outputs, inputs, normalizers.at(m_normalization_form));
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp
new file mode 100644
index 000000000..0d2e27d89
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/op/op.hpp>
+
+class OPENVINO_API NormalizeUnicode : public ov::op::Op {
+public:
+    OPENVINO_OP("NormalizeUnicode");
+
+    NormalizeUnicode () = default;
+
+    NormalizeUnicode(const ov::OutputVector& arguments, const std::string& normalization_form = "NFD") :
+        ov::op::Op(arguments),
+        m_normalization_form(normalization_form) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        return std::make_shared<NormalizeUnicode>(inputs, m_normalization_form);
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        visitor.on_attribute("normalization_form", m_normalization_form);
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+    bool has_evaluate() const {
+        return true;
+    }
+
+private:
+
+    std::string m_normalization_form = "NFD";
+};
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp
new file mode 100644
index 000000000..aaf2ad78f
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ragged_tensor_pack.hpp"
+#include "utils.hpp"
+
+using namespace ov;
+
+
+void RaggedTensorPack::validate_and_infer_types() {
+    OPENVINO_ASSERT(get_input_size() == 3);
+    OPENVINO_ASSERT(get_input_element_type(0) == element::i32);
+    OPENVINO_ASSERT(get_input_element_type(1) == element::i32);
+
+    // Pass through the base tensor which is used to build ragged dimensions
+    // TODO: Provide correct implementation that saves information about ragged structure
+    // TODO: Requires single-tensor packed representation for ragged tensor
+    set_output_type(0, get_input_element_type(2), get_input_partial_shape(2));
+}
+
+
+bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    // Implementation for debugging purposes: directly print ragged indices to std::cout and pass the base tensor with elements through.
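+    // A ragged row i spans [begins[i], ends[i]) in the base tensor; rows may overlap or
+    // leave gaps, so only the base tensor itself can be passed through unchanged here.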
+ + auto input_shape = inputs[0].get_shape(); + //std::cout << "[ DEBUG ] RaggedTensorPack: shape = " << input_shape << "\n"; + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto num_elements = shape_size(input_shape); + + //for(size_t i = 0; i < num_elements; ++i) { + //std::cout << "[ DEBUG ] [" << i << "] " << begins[i] << ":" << ends[i] << " with size = " << ends[i] - begins[i] << "\n"; + //} + + inputs[2].copy_to(outputs[0]); + + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp new file mode 100644 index 000000000..edcbf4bbc --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Having a decomposed representation for a tensor, converts it to a single string tensor for debugging purposes and to facilitate model conversion +// Base tensor on which this operation builds a ragged tensor can have any shape or type, this operation doesn't try to interpret it. +class RaggedTensorPack : public ov::op::Op { +public: + OPENVINO_OP("RaggedTensorPack"); + + RaggedTensorPack () = default; + + RaggedTensorPack(ov::OutputVector inputs) + : ov::op::Op(inputs) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool has_evaluate() const { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp new file mode 100644 index 000000000..acb145d11 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "ragged_to_dense.hpp" +#include "utils.hpp" + +using namespace ov; +using op::v0::Constant; + +void RaggedToDense::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 3 + 1 + 1); + + // Input ragged tensor + check_ragged_input(this, 0); + + // Target size along ragged dimension + OPENVINO_ASSERT(get_input_element_type(3).is_integral_number()); + auto rank = get_input_partial_shape(3).rank(); + OPENVINO_ASSERT( + rank.is_dynamic() || + rank.get_length() == 0 || + rank.get_length() == 1 && get_input_partial_shape(3)[0].compatible(1), + "Target dense dimension size for RaggedToDense should be a 0D or 1D tensor with a single element"); + + // Default value to fill out of ragged range elements in output tensor + OPENVINO_ASSERT(get_input_element_type(4).compatible(get_input_element_type(2))); + auto input4_rank = get_input_partial_shape(4).rank(); + OPENVINO_ASSERT(input4_rank.compatible(0)); + + set_input_is_relevant_to_shape(3); + + if(get_input_partial_shape(0).rank().is_dynamic()) { + set_output_type(0, get_input_element_type(2), PartialShape::dynamic()); + set_output_type(1, element::boolean, PartialShape::dynamic()); + } else { + auto shape = get_input_partial_shape(0); + if(auto target_dim = 
dynamic_cast(get_input_node_ptr(3))) { + shape.push_back(target_dim->cast_vector()[0]); + } else { + shape.push_back(Dimension()); + } + set_output_type(0, get_input_element_type(2), shape); + set_output_type(1, element::boolean, shape); + } +} + + +bool RaggedToDense::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + // FIXME: Works for POD types only (not for strings!) + // FIXME: Output mask is calculated even if there are no consumers + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto nelems = inputs[0].get_size(); + auto elems = reinterpret_cast(inputs[2].data()); + auto elem_size = inputs[2].get_element_type().size(); + auto default_value = reinterpret_cast(inputs[4].data()); + + // Suppose validate was called and set correct output shape + // Take a target shape value for ragged dimension + size_t target_dim = outputs[0].get_shape().back(); + + auto out_elems = reinterpret_cast(outputs[0].data()); + auto out_mask = outputs[1].data(); + + auto out_elem_orig = out_elems; + auto out_mask_orig = out_mask; + + for(size_t i = 0; i < nelems; ++i) { + auto begin = elems + elem_size*begins[i]; + auto len = std::min(size_t(ends[i] - begins[i]), target_dim); // truncation + auto end = begin + elem_size*len; + out_elems = std::copy(begin, end, out_elems); + out_mask = std::fill_n(out_mask, len, char(1)); + if(len < target_dim) + out_mask = std::fill_n(out_mask, target_dim - len, char(0)); + while(len < target_dim) { + out_elems = std::copy(default_value, default_value + elem_size, out_elems); + ++len; + } + } + + OPENVINO_ASSERT(out_elems == out_elem_orig + outputs[0].get_byte_size()); + OPENVINO_ASSERT(out_mask == out_mask_orig + outputs[1].get_byte_size()); + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp new file mode 100644 index 000000000..698b16157 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Takes a ragged tensor with one ragged right-most dimension and produces a normal tensor +class OPENVINO_API RaggedToDense : public ov::op::Op { +public: + OPENVINO_OP("RaggedToDense"); + + RaggedToDense () = default; + + RaggedToDense(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp new file mode 100644 index 000000000..95a88e603 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "normalizer.h" // for absl::string_view + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include 
"fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "regex_normalization.hpp" +#include "utils.hpp" + +using namespace ov; + + +void RegexNormalization::validate_and_infer_types() { + check_string_input(this, 0); + check_string_scalar_input(this, 3); + check_string_scalar_input(this, 4); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto search_pattern_buf = inputs[3].data(); + auto replace_pattern_buf = inputs[4].data(); + auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + + using namespace paddlenlp::fast_tokenizer::normalizers; + re2::RE2 search_pattern_re(search_pattern); + + return evaluate_normalization_helper( + outputs, inputs, + [&replace_pattern, &search_pattern_re](const std::string& str) { + return NormalizedString(str).Replace(search_pattern_re, std::string(replace_pattern)).GetStr(); + }); +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp new file mode 100644 index 000000000..7272d3086 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API RegexNormalization : public ov::op::Op { +public: + OPENVINO_OP("RegexNormalization"); + + RegexNormalization () = default; + + RegexNormalization(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp new file mode 100644 index 000000000..10c64e704 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -0,0 +1,205 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "normalizer.h" // for absl::string_view + +#include "openvino/op/util/framework_node.hpp" +#include "openvino/opsets/opset10.hpp" + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "regex_split.hpp" +#include "utils.hpp" + +using namespace ov; + + +namespace { + +using paddlenlp::fast_tokenizer::core::SplitMode; +const std::map split_modes = { + {"remove", SplitMode::REMOVED}, + {"isolate", SplitMode::ISOLATED}, + {"contiguous", SplitMode::CONTIGUOUS}, + {"merge_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, + {"merge_with_next", SplitMode::MERGED_WITH_NEXT}, +}; + +} 
+
+
+void RegexSplit::validate_and_infer_types() {
+//    check_string_input(this, 0);
+//    check_string_scalar_input(this, 3);
+//    check_ragged_string_input(this, 0);
+//    check_string_input(this, 5);
+    OPENVINO_ASSERT(split_modes.find(m_behaviour) != split_modes.end(), "RegexSplit doesn't support unknown split mode: " + m_behaviour);
+    set_ragged_string_output(this, 0, get_input_partial_shape(0));
+}
+
+bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+
+    if (inputs.size() < 5) {
+        auto begins = inputs[0].data<const int32_t>();
+        auto ends = inputs[1].data<const int32_t>();
+        auto chars = inputs[2].data<const uint8_t>();
+
+        ov::Tensor ragged_begins_tensor(ov::element::i32, inputs[0].get_shape());
+        ov::Tensor ragged_ends_tensor(ov::element::i32, inputs[0].get_shape());
+        auto ragged_begins = ragged_begins_tensor.data<int32_t>();
+        auto ragged_ends = ragged_ends_tensor.data<int32_t>();
+        for (size_t i = 0; i < inputs[0].get_size(); ++i) {
+            ragged_begins[i] = i;
+            ragged_ends[i] = i + 1;
+        }
+
+        auto split_pattern_buf = inputs[3].data<const uint8_t>();
+        auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1);   // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant and shouldn't be needed here
+
+        const size_t num_rows = inputs[0].get_size();
+        const size_t num_chars = inputs[2].get_size();
+
+        outputs[0].set_shape(inputs[0].get_shape());
+        outputs[1].set_shape(inputs[1].get_shape());
+
+        outputs[2].set_shape(Shape{num_chars});
+        outputs[3].set_shape(Shape{num_chars});
+
+        outputs[4] = inputs[2];
+
+        // For the whole implementation below the input shapes can be ignored; we work with the flattened representations
+        // and only the number of elements in the original tensors matters
+
+        // Get pointers in the output tensors
+        auto new_ragged_begins = outputs[0].data<int32_t>();
+        auto new_ragged_ends = outputs[1].data<int32_t>();
+        auto new_begins = outputs[2].data<int32_t>();
+        auto new_ends = outputs[3].data<int32_t>();
+        int32_t ragged_offset = 0;
+
+        using namespace paddlenlp::fast_tokenizer;
+        auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert);
+
+        for(size_t seq = 0; seq < num_rows; ++seq) {
+            new_ragged_begins[seq] = ragged_offset;
+
+            for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) {
+                auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]);
+                paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str);
+                pretokenizer(&pretokenized);
+                size_t num_splits = pretokenized.GetSplitsSize();
+
+                for (size_t j = 0; j < num_splits; ++j) {
+                    auto split = pretokenized.GetSplit(j);
+                    const auto& value = split.normalized_.GetStr();
+                    auto offset = split.normalized_.GetOrginalOffset();
+                    new_begins[ragged_offset] = begins[ragged_col] + offset.first;
+                    new_ends[ragged_offset++] = begins[ragged_col] + offset.second;
+                }
+            }
+
+            new_ragged_ends[seq] = ragged_offset;
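+            // At this point row `seq` covers pieces [new_ragged_begins[seq], new_ragged_ends[seq]),
+            // and every piece still indexes into the original character buffer of the input strings.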
+        }
+
+        // Fix real shape based on collected results
+        outputs[2].set_shape({size_t(ragged_offset)});
+        outputs[3].set_shape({size_t(ragged_offset)});
+
+    } else {
+        auto ragged_begins = inputs[0].data<const int32_t>();
+        auto ragged_ends = inputs[1].data<const int32_t>();
+        auto begins = inputs[2].data<const int32_t>();
+        auto ends = inputs[3].data<const int32_t>();
+        auto chars = inputs[4].data<const uint8_t>();
+
+        auto split_pattern_buf = inputs[5].data<const uint8_t>();
+        auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/);   // FIXME: the -1 (a complementary change to a WA applied in string_attribute_to_constant) shouldn't be applied here
+
+        const size_t num_rows = inputs[0].get_size();
+        const size_t num_chars = inputs[4].get_size();
+
+        outputs[0].set_shape(inputs[0].get_shape());
+        outputs[1].set_shape(inputs[1].get_shape());
+
+        outputs[2].set_shape(Shape{num_chars});
+        outputs[3].set_shape(Shape{num_chars});
+
+        outputs[4] = inputs[4];
+
+        // For the whole implementation below the input shapes can be ignored; we work with the flattened representations
+        // and only the number of elements in the original tensors matters
+
+        // Get pointers in the output tensors
+        auto new_ragged_begins = outputs[0].data<int32_t>();
+        auto new_ragged_ends = outputs[1].data<int32_t>();
+        auto new_begins = outputs[2].data<int32_t>();
+        auto new_ends = outputs[3].data<int32_t>();
+        int32_t ragged_offset = 0;
+
+        using namespace paddlenlp::fast_tokenizer;
+        auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert);
+
+        for(size_t seq = 0; seq < num_rows; ++seq) {
+            new_ragged_begins[seq] = ragged_offset;
+
+            for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) {
+                auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]);
+                paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str);
+                pretokenizer(&pretokenized);
+                size_t num_splits = pretokenized.GetSplitsSize();
+
+                for (size_t j = 0; j < num_splits; ++j) {
+                    auto split = pretokenized.GetSplit(j);
+                    const auto& value = split.normalized_.GetStr();
+                    auto offset = split.normalized_.GetOrginalOffset();
+                    new_begins[ragged_offset] = begins[ragged_col] + offset.first;
+                    new_ends[ragged_offset++] = begins[ragged_col] + offset.second;
+                }
+            }
+
+            new_ragged_ends[seq] = ragged_offset;
+        }
+
+        // Fix real shape based on collected results
+        outputs[2].set_shape({size_t(ragged_offset)});
+        outputs[3].set_shape({size_t(ragged_offset)});
+
+    }
+
+    return true;
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp
new file mode 100644
index
000000000..438b47d64 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API RegexSplit : public ov::op::Op { +public: + OPENVINO_OP("RegexSplit"); + + RegexSplit () = default; + + RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false) : + ov::op::Op(arguments), + m_behaviour(behaviour), + m_invert(invert) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_behaviour, m_invert); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("behaviour", m_behaviour); + visitor.on_attribute("invert", m_invert); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + + std::string m_behaviour = "remove"; + bool m_invert = false; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp new file mode 100644 index 000000000..f13ad3eed --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp @@ -0,0 +1,209 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "normalizer.h" + +#include "openvino/op/util/framework_node.hpp" +#include "openvino/opsets/opset10.hpp" + +#include "sentence_piece.hpp" +#include "utils.hpp" + +using sentencepiece::SentencePieceProcessor; +using namespace TemplateExtension; +using namespace ov; +using namespace ov::frontend; +using namespace ov::opset10; + +// TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor + +SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse) : m_sp(std::make_shared()), + m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), + m_reverse(reverse), Op(args) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + + // form extra options to configure SentencePieceProcessor + std::string extra_options = ""; + if (m_add_bos) { + extra_options += "bos"; + } + if (m_add_eos) { + extra_options = extra_options.empty() ? extra_options : extra_options + ":"; + extra_options += "eos"; + } + /* TODO: TF ignores this option, so we are ignoring it as well; need to understand what should we do + if (m_reverse) { + extra_options = extra_options.empty() ? 
extra_options : extra_options + ":"; + extra_options += "reverse"; + } + */ + // example of extra_options, if "bos:eos:reverse" + CHECK_OK(m_sp->SetEncodeExtraOptions(extra_options)); + constructor_validate_and_infer_types(); +} + +SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const std::shared_ptr& sp, + int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse) : m_sp(sp), + m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), + m_reverse(reverse), Op(args) { + constructor_validate_and_infer_types(); +} + +void SentencepieceTokenizer::validate_and_infer_types() { + + #if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + + FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)"); + FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); + FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor"); + FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor"); + FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor"); + + #else + + FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences"); + FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); + + #if USE_STRING_TENSORS + + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + FRONT_END_GENERAL_CHECK( + get_input_element_type(1) == element::string || get_input_element_type(1) == element::u8, + "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 or string depending on the current stage of model preparation"); + #else + FRONT_END_GENERAL_CHECK( + get_input_element_type(1) == element::string, + "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor"); + #endif + + #else + +#if 0 // change to 0 when compiled with master and the bug with data propagation from within inline context is not solved + FRONT_END_GENERAL_CHECK( + get_input_element_type(1) == element::u8, + "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 tensor, but got " + + get_input_element_type(1).get_type_name()); +#endif + + #endif + + #endif + + // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values + // and dense shape + set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) }); + set_output_type(1, element::i32, PartialShape{ Dimension() }); + set_output_type(2, element::i64, PartialShape{ Dimension(2) }); +} + +bool SentencepieceTokenizer::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("nbest_size", m_nbest_size); + visitor.on_attribute("alpha", m_alpha); + visitor.on_attribute("add_bos", m_add_bos); + visitor.on_attribute("add_eos", m_add_eos); + visitor.on_attribute("reverse", m_reverse); + return true; +} + +bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const 
TensorVector& inputs) const { + std::vector sparse_indices; + std::vector sparse_values; + std::vector sparse_dense_shape; + +#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + + auto begin_ids = inputs[1].data(); + auto end_ids = inputs[2].data(); + auto data = inputs[3].data(); + + auto batch_size = shape_size(inputs[1].get_shape()); + +#else + +#if USE_STRING_TENSORS + + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + const ov::Tensor& strings_tensor = **reinterpret_cast(inputs[1].data()); + #else + const ov::Tensor& strings_tensor = inputs[1]; + #endif + + const std::string* strings = strings_tensor.data(); + size_t batch_size = ov::shape_size(strings_tensor.get_shape()); + +#else + + // const uint8_t* strings = inputs[1].data(); + // auto bitstream_size = inputs[1].get_byte_size(); + + // // check the format of the input bitstream representing the string tensor + // FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); + // auto batch_size = *reinterpret_cast(strings + 0); + // FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size, + // "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); + // auto begin_ids = reinterpret_cast(strings + 4); + // auto end_ids = begin_ids + 1; + // auto data = strings + 4 + 4 + 4 * batch_size; + int32_t batch_size; + const int32_t* begin_ids; + const int32_t* end_ids; + const uint8_t* data; + parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data); + +#endif + +#endif + //std::cerr << " Batch size: " << batch_size << "\n"; + + size_t max_token_id = 0; + for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { +#if USE_STRING_TENSORS && !SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + const std::string& sentence = strings[batch_ind]; + //std::cerr << " sentence: " << sentence << "\n"; +#else + auto begin_ind = begin_ids[batch_ind]; + auto end_ind = end_ids[batch_ind]; + //std::string sentence(data + begin_ind, data + end_ind); + absl::string_view sentence((const char*)data + begin_ind, end_ind - begin_ind); + //std::cerr << "string: " << sentence << "\n"; +#endif + std::vector ids; + CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids)); + // put into resulted vectors + for (size_t token_id = 0; token_id < ids.size(); ++token_id) { + sparse_indices.push_back(static_cast(batch_ind)); + sparse_indices.push_back(static_cast(token_id)); + sparse_values.push_back(static_cast(ids[token_id])); + } + max_token_id = max_token_id < ids.size() ? 
ids.size() : max_token_id; + } + sparse_dense_shape.push_back(static_cast(batch_size)); + sparse_dense_shape.push_back(static_cast(max_token_id)); + + outputs[0].set_shape({ sparse_indices.size() / 2, 2 }); + memcpy(outputs[0].data(), sparse_indices.data(), sizeof(int64_t) * sparse_indices.size()); + outputs[1].set_shape({ sparse_values.size() }); + memcpy(outputs[1].data(), sparse_values.data(), sizeof(int32_t) * sparse_values.size()); + outputs[2].set_shape({ 2 }); + memcpy(outputs[2].data(), sparse_dense_shape.data(), sizeof(int64_t) * sparse_dense_shape.size()); + return true; +} + +bool SentencepieceTokenizer::has_evaluate() const { + return true; +} + +std::shared_ptr SentencepieceTokenizer::clone_with_new_inputs(const OutputVector& new_args) const { + return std::make_shared(new_args, m_sp, m_nbest_size, m_alpha, m_add_bos, m_add_eos, m_reverse); +} \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp new file mode 100644 index 000000000..cec0a9532 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace sentencepiece { + class SentencePieceProcessor; +} + +namespace TemplateExtension { + class SentencepieceTokenizer : public ov::op::Op { + public: + OPENVINO_OP("SentencepieceTokenizer"); + + SentencepieceTokenizer() = default; + SentencepieceTokenizer(const ov::OutputVector& args, int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse); + SentencepieceTokenizer(const ov::OutputVector& args, const std::shared_ptr& sp, int32_t nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const override; + + private: + std::shared_ptr m_sp; + int32_t m_nbest_size; + float m_alpha; + bool m_add_bos; + bool m_add_eos; + bool m_reverse; + }; +} // namespace TemplateExtension diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/str_pack.py b/modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py similarity index 100% rename from modules/custom_operations/user_ie_extensions/sentence_piece/str_pack.py rename to modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp new file mode 100644 index 000000000..f89c465c6 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp @@ -0,0 +1,56 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "string_tensor_pack.hpp" +#include "utils.hpp" + +using namespace ov; + + +void StringTensorPack::validate_and_infer_types() { + OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but get " + m_mode); + check_string_input(this, 0); + #if USE_STRING_TENSORS + set_output_type(0, element::string, get_input_partial_shape(0)); + #else + set_output_type(0, 
element::u8, PartialShape{Dimension()});
+    #endif
+}
+
+bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+#if USE_STRING_TENSORS
+    // TODO
+    return false;
+#else
+    auto rank = inputs[0].get_shape().size();
+    if (rank != 1) {
+        std::cerr << "[ WARNING ] StringTensorPack ignores the rank " << rank << " of input tensor and sets rank=1 in the output\n";
+    }
+
+    auto num_elements = shape_size(inputs[0].get_shape());
+    auto num_chars = shape_size(inputs[2].get_shape());
+    auto num_output_elements = 4*(1 + 1 + num_elements) + num_chars;
+    outputs[0].set_shape(Shape{num_output_elements});
+
+    // FIXME: Do the repacking, otherwise cannot handle string tensors with gaps between strings
+    //auto begins = inputs[0].data<const int32_t>();    // not needed as no repacking happens in this version of code
+    auto ends = inputs[1].data<const int32_t>();
+    auto chars = inputs[2].data<const uint8_t>();
+
+    auto output = outputs[0].data<uint8_t>();
+    auto output_int32 = reinterpret_cast<int32_t*>(output);
+
+    *output_int32++ = num_elements;
+    *output_int32++ = 0;
+    output_int32 = std::copy(ends, ends + num_elements, output_int32);
+    output = reinterpret_cast<uint8_t*>(output_int32);
+    output = std::copy(chars, chars + num_chars, output);
+
+    OPENVINO_ASSERT(num_output_elements == output - outputs[0].data<uint8_t>(), "[ INTERNAL ERROR ] StringTensorPack output tensor is corrupted");
+
+    // WARNING! Chars are not repacked. If there are gaps between strings, they will remain.
+
+    return true;
+#endif
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp
new file mode 100644
index 000000000..8766e6062
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/op/op.hpp>
+
+// Having a decomposed representation for a tensor, converts it to a single string tensor
+// (packed u8 or natively supported element::string, depending on whether USE_STRING_TENSORS is defined).
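+// A hedged illustration (editorial note, not part of the op contract): under the packed u8
+// format produced above, the 1D string tensor ["Hi", "!"] would be laid out as
+//     int32: 2           -- batch size
+//     int32: 0           -- offset of the first string (each end doubles as the next begin)
+//     int32: 2, 3        -- end offset of each string
+//     u8:    'H','i','!' -- concatenated symbols
+// i.e. 4*(1 + 1 + batch) header bytes followed by the characters, which is exactly what
+// parse_packed_strings() in utils.cpp decodes on the consumer side.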
+class StringTensorPack : public ov::op::Op {
+public:
+    OPENVINO_OP("StringTensorPack");
+
+    StringTensorPack() = default;
+
+    StringTensorPack(ov::OutputVector inputs, const std::string& mode = "begins_ends")
+        : ov::op::Op(inputs), m_mode(mode) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        auto result = std::make_shared<StringTensorPack>(inputs, m_mode);
+        return result;
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        visitor.on_attribute("mode", m_mode);
+        return true;
+    }
+
+    bool has_evaluate() const override {
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+private:
+
+    std::string m_mode = "begins_ends";
+};
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
new file mode 100644
index 000000000..35854e685
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "string_tensor_unpack.hpp"
+#include "utils.hpp"
+
+using namespace ov;
+
+
+void StringTensorUnpack::validate_and_infer_types() {
+    OPENVINO_ASSERT(
+        get_input_size() == 1,
+        "Number of inputs for StringTensorUnpack is not equal to 1");
+
+    auto output_shape = PartialShape::dynamic();
+
+    // In case of explicit string tensors the shape is carried by the input tensor itself
+    // OPENVINO_ASSERT(
+    //     input_shape == PartialShape::dynamic(),
+    //     "Explicitly set shape for a string tensor in the unpacking is not supported");
+
+    // There are three cases that affect the expected element type of the input tensor:
+    // - a string tensor is passed and we are before the hack is applied (element::string),
+    // - a string tensor is passed and we are after the hack in CPU (element::u8),
+    // - a string tensor is not really used and a packed string tensor is expected instead (element::u8)
+
+    OPENVINO_ASSERT(
+#if OPENVINO_ELEMENT_STRING_SUPPORTED
+        get_input_element_type(0) == element::string ||
+#endif
+#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS
+        get_input_element_type(0) == element::u8 ||
+#endif
+        get_input_element_type(0) == element::dynamic,
+        "Type of StringTensorUnpack input is expected to be element::string before a model compilation, or element::u8 after the compilation or when element::string is not supported");
+
+#if OPENVINO_ELEMENT_STRING_SUPPORTED
+    if(get_input_element_type(0) == element::string) {
+        output_shape = get_input_partial_shape(0);
+    }
+#endif
+
+#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS
+    if(get_input_element_type(0) == element::u8)
+    {
+        #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+        // After the plugin hack, a tensor is represented as a wrapping u8 tensor that will hold a pointer to a string tensor.
+        // The original shape of a string tensor is stored in the RT attribute of the tensor descriptor.
+        const auto& rt_info = get_input_tensor(0).get_rt_info();
+        auto it = rt_info.find("__original_partial_shape");
+
+        // StringTensorUnpack expects the __original_partial_shape attribute of type PartialShape in the input tensor.
+        // If it is not found, that means the model didn't pass the expected transformation where a string tensor
+        // is wrapped into a u8 tensor holding a pointer, or evaluation of this node is in progress and tensor attributes aren't preserved.
+        if(it != rt_info.end() && it->second.is<PartialShape>()) {
+            output_shape = it->second.as<PartialShape>();
+        } else {
+        #endif
+        #if !USE_STRING_TENSORS
+            // If string tensors shouldn't be used, then the packed u8 format is also expected
+            // as an input, but in this case only the rank is known
+            OPENVINO_ASSERT(
+                get_input_partial_shape(0).rank().is_dynamic() || get_input_partial_shape(0).rank().get_length() == 1,
+                "StringTensorUnpack expects a u8 tensor with rank 1 that holds a packed batched string tensor as an input, but observes type " +
+                    get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string());
+
+            output_shape = PartialShape({Dimension()});  // [?]
+        #endif
+        #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+        }
+        #endif
+    }
+#endif
+
+    OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supports only 'begins_ends' mode, but got " + m_mode);
+
+    if (m_mode == "begins_ends") {
+        set_string_output(this, 0, output_shape);
+    }
+}
+
+bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    auto ptensor = &inputs[0];
+    #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK
+    if(ptensor->get_element_type() == element::u8 && ptensor->get_byte_size() == sizeof(void*)) {
+        auto data = *reinterpret_cast<void* const*>(ptensor->data());
+        if(data != nullptr) {
+            ptensor = reinterpret_cast<const ov::Tensor*>(data);
+        }
+    }
+    #endif
+
+    auto tensor = *ptensor;
+
+#if OPENVINO_ELEMENT_STRING_SUPPORTED
+    if(tensor.get_element_type() == element::string) {
+        Shape input_shape = tensor.get_shape();
+        const std::string* input_strings = tensor.data<std::string>();
+        unpack_strings(input_strings, input_shape, outputs[0], outputs[1], outputs[2]);
+        return true;
+    } else {
+#endif
+
+#if USE_STRING_TENSORS
+    OPENVINO_ASSERT(false, "Detected a u8 tensor but an element::string tensor should be provided");
+#endif
+
+    int32_t batch_size;
+    const int32_t* begin_ids;
+    const int32_t* end_ids;
+    const uint8_t* data;
+    parse_packed_strings(tensor, batch_size, begin_ids, end_ids, data);
+    auto num_chars = end_ids[batch_size - 1];
+
+    outputs[0].set_shape(Shape{static_cast<size_t>(batch_size)});
+    outputs[1].set_shape(Shape{static_cast<size_t>(batch_size)});
+    outputs[2].set_shape(Shape{static_cast<size_t>(num_chars)});
+    auto begins = outputs[0].data<int32_t>();
+    auto ends = outputs[1].data<int32_t>();
+    auto chars = outputs[2].data<uint8_t>();
+    std::copy(begin_ids, begin_ids + batch_size, begins);
+    std::copy(end_ids, end_ids + batch_size, ends);
+    std::copy(data, data + num_chars, chars);
+
+    return true;
+
+#if OPENVINO_ELEMENT_STRING_SUPPORTED
+    }
+#endif
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp
new file mode 100644
index 000000000..2570b9596
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/op/op.hpp>
+
+// Unpack a string tensor representation regardless of the source format, which
+// can be an OV tensor with element::string element type (if supported) or a u8
+// packed representation, to a decomposed tensor representation that may potentially
+// consist of multiple tensors. The destination format is defined by the `mode` attribute.
+// The shape of the output tensor is completely recognized from the input (if supported)
+// or partially defined by a dedicated attribute `shape`. If `shape` is not set,
+// which defaults to a completely dynamic `shape`, then the output shape is defined
+// by the input tensor.
+class StringTensorUnpack : public ov::op::Op {
+public:
+    OPENVINO_OP("StringTensorUnpack");
+
+    StringTensorUnpack() = default;
+
+    StringTensorUnpack(ov::OutputVector inputs, const std::string& mode = "begins_ends")
+        : ov::op::Op(inputs), m_mode(mode) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        auto result = std::make_shared<StringTensorUnpack>(inputs, m_mode);
+        return result;
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        visitor.on_attribute("mode", m_mode);
+        return true;
+    }
+
+    bool has_evaluate() const override {
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+private:
+
+    std::string m_mode = "begins_ends";
+};
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
new file mode 100644
index 000000000..6eea48158
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
@@ -0,0 +1,251 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/op/util/framework_node.hpp"
+#include "openvino/opsets/opset10.hpp"
+
+#include "tensorflow_translators.hpp"
+#include "utils.hpp"
+
+#include "string_tensor_pack.hpp"
+#include "string_tensor_unpack.hpp"
+#include "sentence_piece.hpp"
+#include "case_fold.hpp"
+#include "normalize_unicode.hpp"
+#include "regex_normalization.hpp"
+#include "regex_split.hpp"
+
+#include "wordpiece_tokenizer.hpp"
+
+using namespace TemplateExtension;
+using namespace ov;
+using namespace ov::frontend;
+using namespace ov::opset10;
+
+namespace {
+    template <typename T>
+    T extract_scalar_const_value(const std::shared_ptr<Node>& node, const std::string& const_name) {
+        auto const_node = as_type_ptr<Constant>(node);
+        FRONT_END_GENERAL_CHECK(const_node, "Conversion expects " + const_name + " to be constant.");
+        std::vector<T> const_value = const_node->cast_vector<T>();
+        FRONT_END_GENERAL_CHECK(const_value.size() == 1, "Conversion expects " + const_name + " to be a scalar.");
+        return const_value[0];
+    }
+}  // namespace
+
+OutputVector translate_sentencepiece_op(const NodeContext& node) {
+    // extract model to configure SentencePieceTokenizer
+    auto sp_model_ov_any = node.get_attribute_as_any("model");
+    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
+        "SentencePieceOp configuration model is in incorrect format");
+    auto str_spm_model = sp_model_ov_any.as<std::string>();
+    auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
+    return { sp_model_const };
+}
+
+NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
+    // this is a custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
+    // and RaggedTensorToSparse operations into a custom operation SentencepieceTokenizerExtensionOp
+    FRONT_END_GENERAL_CHECK(node.get_input_size() > 0, "RaggedTensorToSparse expects at least one input.");
+    auto node_name = node.get_name();
+
+    // check that the producer of
RaggedTensorToSparse is SentencePieceTokenizer + auto sp_tokenize_op = node.get_input(0).get_node_shared_ptr(); + FRONT_END_GENERAL_CHECK(sp_tokenize_op->get_input_size() > 6, + "SentencepieceTokenizeOp expects at least six inputs"); + + // prepare inputs that go to custom operation + // prepare input 0 - SentencePieceTokenizer configuration model + auto sp_model_const = as_type_ptr(sp_tokenize_op->input_value(0).get_node_shared_ptr()); + FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant."); + + // prepare input six inputs + auto inputs = sp_tokenize_op->input_value(1); + + // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes + auto nbest_size = extract_scalar_const_value(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size"); + auto alpha = extract_scalar_const_value(sp_tokenize_op->input_value(3).get_node_shared_ptr(), "alpha"); + auto add_bos = extract_scalar_const_value(sp_tokenize_op->input_value(4).get_node_shared_ptr(), "add_bos"); + auto add_eos = extract_scalar_const_value(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos"); + auto reverse = extract_scalar_const_value(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse"); + +#if !USE_STRING_TENSORS + // Override type of input tensor if this is a Parameter + if (auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { + parameter->set_partial_shape(PartialShape{ Dimension() }); + parameter->set_element_type(element::u8); + parameter->validate_and_infer_types(); + } +#endif + +#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + + OutputVector inputs_vector = OutputVector{ sp_model_const }; + auto unpacked_outputs = std::make_shared(OutputVector{inputs}, "begins_ends")->outputs(); + inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end()); + +#else + + OutputVector inputs_vector = OutputVector{ sp_model_const, inputs }; + +#endif + + // create a node with custom operation + auto sp_tokenizer_ext = std::make_shared(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse); + FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3, + "Internal error: SentencepieceTokenizer operation extension must have three outputs."); + + // set tensor names + sp_tokenizer_ext->output(0).add_names({ node_name + ":0" }); + sp_tokenizer_ext->output(1).add_names({ node_name + ":1" }); + sp_tokenizer_ext->output(2).add_names({ node_name + ":2" }); + + // create named outputs for the conversion extension + NamedOutputVector named_results; + named_results.push_back({ "sparse_indices", sp_tokenizer_ext->output(0) }); + named_results.push_back({ "sparse_values", sp_tokenizer_ext->output(1) }); + named_results.push_back({ "sparse_dense_shape", sp_tokenizer_ext->output(2) }); + + return named_results; +} + +ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); + return { post_translate_string_tensor_output(std::make_shared( + pre_translate_string_tensor_input(node.get_input(0)))->outputs()) }; +} + +ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "NormalizeUTF8 expects only 1 input"); + return { post_translate_string_tensor_output(std::make_shared( + pre_translate_string_tensor_input(node.get_input(0)), + node.get_attribute("normalization_form"))->outputs()) }; +} + +ov::OutputVector 
translate_static_regex_replace(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); + ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); + inputs.push_back(string_attribute_to_constant(node, "pattern")); + inputs.push_back(string_attribute_to_constant(node, "rewrite")); + return { post_translate_string_tensor_output(std::make_shared(inputs)->outputs()) }; +} + +ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "RegexSplitWithOffsets expects 3 inputs"); + ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); + auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there + inputs.push_back(delim_regex_pattern); + // TODO: Use node.get_input(2) with keep_delim_regex_pattern, most likely it should be handled in another RegexSplit with `isolate` behaviour + auto outputs = std::make_shared(inputs)->outputs(); + auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]}); + return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) }; +} + +ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "WordpieceTokenizeWithOffsets expects 2 inputs"); + ov::OutputVector inputs = pre_translate_ragged_string_tensor_input(node.get_input(0)); + + #if USE_STRING_TENSORS + // It may seem enough to call pre_translate_string_tensor_input that will override Parameter element + // type in case if string tensors are not used. + // But a Parameter is still required to be overridden even if string tensors are used because in TF model + // it is represented not as a string tensor, but as a resource with hash table for lookup that we cannot interpret + // and have to replace by 1D string tensor. + override_parameter(node.get_input(1).get_node_shared_ptr(), element::string, PartialShape{Dimension()}); + #endif + + auto vocab = pre_translate_string_tensor_input(node.get_input(1)); + inputs.insert(inputs.end(), vocab.begin(), vocab.end()); + // FIXME: Cannot set real value for unk_token_id from attributes because it is not known in this operation + // TODO: Set other attributes. 
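+    // Editorial sketch of the expected input layout at this point (derived from the checks in
+    // WordpieceTokenizer::validate_and_infer_types and its evaluate(), not an additional contract):
+    //   inputs[0..1] -- i32 ragged row begins/ends
+    //   inputs[2..4] -- i32 string begins/ends + u8 chars of the words to tokenize
+    //   inputs[5..7] -- i32 begins/ends + u8 chars of the vocabulary
+    // The i32 unk_token_id becomes inputs[8] later, appended by translate_lookup_table_find_v2 below.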
+    auto wp_tokenizer = std::make_shared<WordpieceTokenizer>(
+        inputs,
+        node.get_attribute<std::string>("suffix_indicator"),
+        node.get_attribute<int64_t>("max_bytes_per_word")
+    );
+    return { post_translate_ragged_tensor_output(wp_tokenizer->outputs()) };
+}
+
+ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node) {
+    FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFindV2 expects 3 inputs");
+
+    // Check if this node is used in a combination with an already converted WordpieceTokenizeWithOffsets
+    auto wp_tokenizer_outputs = pre_translate_ragged_tensor_input(node.get_input(1));
+    auto wp_tokenizer = dynamic_cast<WordpieceTokenizer*>(wp_tokenizer_outputs[0].get_node());
+    OPENVINO_ASSERT(wp_tokenizer, "Conversion of LookupTableFindV2 without a coupled WordpieceTokenizer is not yet supported");
+
+    // TODO: Check vocab matching for LookupTableFindV2 and WordpieceTokenizer
+
+    // TODO: Check if overflow really happens in real models due to i64 to i32 conversion
+    auto unk_token_id = std::make_shared<Convert>(node.get_input(2), element::i32);
+
+    auto wp_tokenizer_inputs = wp_tokenizer->input_values();
+    wp_tokenizer_inputs.push_back(unk_token_id);
+    //std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n";
+
+    auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs);
+    return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) };
+}
+
+ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
+    // This is a fragment of the TF Reshape translator, copied from OV and adapted.
+    // It checks whether the input tensor has string type and then performs a custom translation;
+    // otherwise it operates identically to the stock Reshape translator in the TF FE.
+    // TODO: Introduce an API to call original translators from an extension without copying the code to an extension.
+
+    FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "Tensorflow Reshape op should have two inputs");
+    auto tensor = node.get_input(0);
+    auto shape = node.get_input(1);
+    if(auto pack = dynamic_cast<StringTensorPack*>(tensor.get_node())) {
+        // TODO: If it is at the beginning of the graph, how to detect strings? It falls into the 'else' branch in this case.
+        // FIXME: Needs an extension for a Parameter to prepare it first
+        auto begins = std::make_shared<Reshape>(pack->input_value(0), shape, false);
+        auto ends = std::make_shared<Reshape>(pack->input_value(1), shape, false);
+        auto chars = pack->input_value(2);
+        auto reshape = post_translate_string_tensor_output({begins, ends, chars});
+        return {reshape};
+    } else {
+        auto reshape = std::make_shared<Reshape>(tensor, shape, false);
+        return {reshape};
+    }
+    // set_node_name(node.get_name(), reshape);  // TODO: requires dependencies from TF FE internals
+}
+
+// Copied and pasted from the TF FE and adapted to avoid using internal TF FE operation classes
+ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
+    auto ov_type = node.get_attribute_as_any("dtype");
+    std::shared_ptr<Node> const_node;
+    if (!ov_type.is<ov::element::Type>() || ov_type.as<ov::element::Type>() == ov::element::dynamic ||
+        ov_type.as<ov::element::Type>() == ov::element::undefined) {
+        if (ov_type.is<std::string>() && ov_type.as<std::string>() == "DT_STRING") {
+            auto value_as_any = node.get_attribute_as_any("value");
+            const auto& values = value_as_any.as<std::vector<std::string>>();
+            ov::Tensor begins(element::i32, {}), ends(element::i32, {}), chars(element::u8, {});
+            unpack_strings(&values[0], {values.size()}, begins, ends, chars);
+            const_node = std::make_shared<StringTensorPack>(OutputVector{
+                std::make_shared<Constant>(begins),
+                std::make_shared<Constant>(ends),
+                std::make_shared<Constant>(chars)
+            });
+        } else {
+            const_node = std::make_shared<ov::op::util::FrameworkNode>(OutputVector{});
+        }
+    } else {
+        //static std::vector<ov::Tensor> tensors;
+        auto tensor = node.get_attribute<ov::Tensor>("value");
+        //tensors.push_back(tensor);
+        const_node = std::make_shared<Constant>(tensor);
+        #if OPENVINO_ELEMENT_STRING_SUPPORTED
+        if (const_node->get_element_type() == element::string) {
+            if(shape_size(tensor.get_shape()) > 0) {
+                auto strings = std::dynamic_pointer_cast<Constant>(const_node)->get_data_ptr<std::string>();
+            }
+            const_node = std::make_shared<StringTensorUnpack>(const_node->outputs());
+            const_node = std::make_shared<StringTensorPack>(const_node->outputs());
+        }
+        #endif
+    }
+    //set_node_name(node.get_name(), const_node);  // TODO: Provide an alternative to the internal function set_node_name
+    return {const_node};
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp
new file mode 100644
index 000000000..8d501dced
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/frontend/node_context.hpp>
+
+ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node);
+ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node);
+ov::OutputVector translate_const(const ov::frontend::NodeContext& node);
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp
new file mode 100644
index 000000000..2eb4dcb20 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "string_tensor_pack.hpp" +#include "string_tensor_unpack.hpp" +#include "ragged_tensor_pack.hpp" +#include "sentence_piece.hpp" +#include "case_fold.hpp" +#include "normalize_unicode.hpp" +#include "regex_normalization.hpp" +#include "regex_split.hpp" +#include "combine_segments.hpp" +#include "bytes_to_chars.hpp" +#include "wordpiece_tokenizer.hpp" +#include "bpe_tokenizer.hpp" +#include "ragged_to_dense.hpp" +#include "vocab_decoder.hpp" +#include "chars_to_bytes.hpp" + +#include "tensorflow_translators.hpp" diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py similarity index 100% rename from modules/custom_operations/user_ie_extensions/sentence_piece/tokenizer_pipeline.py rename to modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp new file mode 100644 index 000000000..a3e12be73 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/util/framework_node.hpp" +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" +#include "string_tensor_pack.hpp" +#include "string_tensor_unpack.hpp" +#include "ragged_tensor_pack.hpp" + +using namespace ov; +using namespace ov::frontend; +using namespace ov::opset10; + +void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) { + auto strings = packed.data(); + auto bitstream_size = packed.get_byte_size(); + // check the format of the input bitstream representing the string tensor + FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); + batch_size = *reinterpret_cast(strings + 0); + FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size, + "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); + begin_ids = reinterpret_cast(strings + 4); + end_ids = begin_ids + 1; + symbols = strings + 4 + 4 + 4 * batch_size; +} + +void check_string_input(const Node* node, size_t input_index) { + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed string representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed string representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::u8, "Expected a u8 tensor as the third part of the decomposed string representation"); +} + +void check_string_scalar_input(const Node* node, size_t input_index) { + auto shape = node->get_input_partial_shape(input_index); + auto element_type = node->get_input_element_type(input_index); + + #if false && USE_STRING_TENSORS + // This block is not used when we convert ops to decomposed representation (and we really do) + + 
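+    // (Editorial note on the disabled branch below: with native element::string support a scalar
+    //  pattern would arrive as a genuine 0D string tensor, hence the string/0D check; under the
+    //  current u8 workaround the same value arrives as a 1D u8 byte tensor, checked in #else.)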
OPENVINO_ASSERT(
+        (element_type == element::dynamic || element_type == element::string) &&
+        (shape.rank().is_dynamic() || shape.rank().get_length() == 0),
+        "string/0D tensor is expected, but observed: " + element_type.get_type_name() + shape.to_string());
+
+    #else
+
+    OPENVINO_ASSERT(
+        (element_type == element::dynamic || element_type == element::u8) &&
+        (shape.rank().is_dynamic() || shape.rank().get_length() == 1),
+        "u8/1D tensor is expected");
+
+    #endif
+}
+
+void check_ragged_input(const Node* node, size_t input_index) {
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged representation");
+    auto rank = node->get_input_partial_shape(input_index+2).rank();
+    FRONT_END_GENERAL_CHECK(rank.is_dynamic() || rank.get_length() == 1, "The last tensor in the ragged tensor representation should be a 1D tensor");
+}
+
+void check_ragged_string_input(const Node* node, size_t input_index) {
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::i32, "Expected an i32 tensor as the third part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+3) == element::i32, "Expected an i32 tensor as the fourth part of the decomposed ragged string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+4) == element::u8, "Expected a u8 tensor as the fifth part of the decomposed ragged string representation");
+}
+
+void set_string_output(Node* node, size_t output_index, const PartialShape& shape) {
+    node->set_output_type(output_index+0, element::i32, shape);     // byte offset in output[+2] -- begin of each string
+    node->set_output_type(output_index+1, element::i32, shape);     // byte offset in output[+2] -- end of each string
+    node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()});     // symbols from all strings concatenated
+}
+
+void set_ragged_string_output(Node* node, size_t output_index, const PartialShape& shape) {
+    node->set_output_type(output_index+0, element::i32, shape);     // element offset in output[+2] -- begin of each ragged dimension's elements
+    node->set_output_type(output_index+1, element::i32, shape);     // element offset in output[+3] -- end of each ragged dimension's elements
+    node->set_output_type(output_index+2, element::i32, PartialShape{Dimension()});    // byte offset in output[+4] -- begin of each string
+    node->set_output_type(output_index+3, element::i32, PartialShape{Dimension()});    // byte offset in output[+4] -- end of each string
+    node->set_output_type(output_index+4, element::u8, PartialShape{Dimension()});     // symbols from all strings concatenated
+}
+
+void set_ragged_output(Node* node, size_t output_index, const PartialShape& shape, element::Type type) {
+    node->set_output_type(output_index+0, element::i32, shape);     // element offset in output[+2] -- begin of each ragged dimension's elements
+    node->set_output_type(output_index+1, element::i32, shape);     // element offset in output[+2] -- end of each ragged dimension's elements
+    node->set_output_type(output_index+2, type, PartialShape{Dimension()});     // flattened elements
+}
+
+
+void unpack_strings (const std::string* strings, const Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars) {  // TODO: no need for a reference to an ov::Tensor?
+    auto nelements = shape_size(shape);
+
+    size_t total = 0;
+    for(size_t i = 0; i < nelements; ++i)
+        total += strings[i].length();
+
+    begins.set_shape(shape);
+    ends.set_shape(shape);
+    chars.set_shape(Shape{total});
+
+    auto pbegins = begins.data<int32_t>();
+    auto pends = ends.data<int32_t>();
+    auto poutput_symbols = reinterpret_cast<char*>(chars.data<uint8_t>());
+    size_t offset = 0;
+
+    for(size_t i = 0; i < nelements; ++i)
+    {
+        pbegins[i] = offset;
+        poutput_symbols = std::copy(strings[i].begin(), strings[i].end(), poutput_symbols);
+        offset += strings[i].length();
+        pends[i] = offset;
+    }
+}
+
+void override_parameter (std::shared_ptr<ov::Node> node, element::Type type, const PartialShape& shape) {
+    if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) {
+        // TODO: Apply this change conditionally based on the real Parameter value
+        std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n";
+        parameter->set_partial_shape(shape);
+        parameter->set_element_type(type);
+        parameter->validate_and_infer_types();
+    }
+}
+
+// TODO: replace NodeContext and input_index by a single input
+OutputVector pre_translate_string_tensor_input(ov::Output<ov::Node> input) {
+    auto input_node = input.get_node_shared_ptr();
+
+#if !USE_STRING_TENSORS
+    override_parameter(input_node, element::u8, PartialShape{Dimension()});
+#endif
+
+    if (auto struct_pack = std::dynamic_pointer_cast<StringTensorPack>(input_node)) {
+        FRONT_END_GENERAL_CHECK(struct_pack->get_input_size() == 3, "Expected 3 inputs to StringTensorPack which represents a string tensor");
+        return struct_pack->input_values();
+    } else {
+        #if USE_STRING_TENSORS || true     // always
+        return std::make_shared<StringTensorUnpack>(OutputVector{input}, "begins_ends")->outputs();
+        #else
+        // Suppose this is a u8 packed string tensor with a single batch dimension
+        // Unpack this tensor using standard operations
+
+        // Cannot do that because there is no ReinterpretCast operation in OV
+        // TODO: Find a way to do it without a reinterpretation operation or introduce one as an extension (easy)
+        #endif
+    }
+}
+
+OutputVector pre_translate_ragged_tensor_input(ov::Output<ov::Node> input) {
+    auto ragged_pack = dynamic_cast<RaggedTensorPack*>(input.get_node());
+    OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it");
+    return ragged_pack->input_values();
+}
+
+OutputVector pre_translate_ragged_string_tensor_input(ov::Output<ov::Node> input) {
+    // auto ragged_pack = dynamic_cast<RaggedTensorPack*>(node.get_input(input_index).get_node());
+    // OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it");
+    auto ragged_inputs = pre_translate_ragged_tensor_input(input);
+    auto string_inputs = pre_translate_string_tensor_input(ragged_inputs[2]);
+    ragged_inputs.pop_back();
+    ragged_inputs.insert(ragged_inputs.end(), string_inputs.begin(), string_inputs.end());
+    // auto string_pack = dynamic_cast<StringTensorPack*>(ragged_pack->get_input_node_ptr(2));
+    // OPENVINO_ASSERT(string_pack, "Expected StringTensorPack as a base for RaggedTensorPack but didn't find it");
+    return ragged_inputs;
+}
+
+ov::Output<ov::Node> post_translate_string_tensor_output(const OutputVector& outputs) {
+    FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation");
+    return std::make_shared<StringTensorPack>(outputs, "begins_ends");
+}
+
+ov::Output<ov::Node> post_translate_ragged_tensor_output(const OutputVector& outputs) {
+    FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed ragged tensor representation");
+    return std::make_shared<RaggedTensorPack>(outputs);
+}
+
+bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorVector& inputs, std::function<std::string(const std::string&)> normalizer) {
+    auto begins = inputs[0].data<const int32_t>();
+    auto ends = inputs[1].data<const int32_t>();
+    auto chars = inputs[2].data<const uint8_t>();
+
+    // Set output shapes
+    outputs[0].set_shape(inputs[0].get_shape());
+    outputs[1].set_shape(inputs[1].get_shape());
+    const size_t num_elements = inputs[0].get_size();
+
+    // TODO: How to avoid copying from this temporary buffer?
+    // TODO: It may be possible to collect output symbols directly in the output tensor memory if `normalizer` has a reasonable estimation for the final size.
+    std::deque<uint8_t> buffer;
+
+    // For the whole implementation below the input shapes can be ignored; we are working with the flattened representations,
+    // and only the number of elements in the original tensors matters
+
+    // Get pointers in the output tensors
+    auto new_begins = outputs[0].data<int32_t>();
+    auto new_ends = outputs[1].data<int32_t>();
+
+    for(size_t i = 0; i < num_elements; ++i) {
+        new_begins[i] = buffer.size();
+        std::string new_str = normalizer(std::string(chars + begins[i], chars + ends[i]));
+        buffer.insert(buffer.end(), new_str.begin(), new_str.end());
+        new_ends[i] = buffer.size();
+    }
+
+    // Copy collected symbols to the target output tensor
+
+    outputs[2].set_shape(Shape{buffer.size()});
+    auto new_chars = outputs[2].data<uint8_t>();
+    std::copy(buffer.begin(), buffer.end(), new_chars);
+
+    return true;
+}
+
+std::shared_ptr<Node> string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name) {
+    // FIXME: using a space to pad the value to work around a CPU issue with empty constants
+    auto value = node.get_attribute<std::string>(name) + " ";
+
+    // TODO: How to translate attribute `replace_global`?
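+    // Editorial example (illustrative only): for pattern = "\s+" this currently produces a u8
+    // Constant of shape {4} holding '\','s','+',' ' -- note the padding space; consumers such as
+    // RegexSplit compensate with the complementary "-1" length adjustment noted in regex_split.cpp.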
+ + #if USE_STRING_TENSORS + return std::make_shared(element::string, Shape{}, &value); + #else + return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data()); + #endif +} \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp new file mode 100644 index 000000000..f44333553 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + + +#ifndef OPENVINO_ELEMENT_STRING_SUPPORTED + #define OPENVINO_ELEMENT_STRING_SUPPORTED 0 +#endif + +#ifndef OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + #define OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK 0 +#endif + +#define USE_STRING_TENSORS 0 // modify this depending on willingness to use explicit string tensors + +#if USE_STRING_TENSORS && !OPENVINO_ELEMENT_STRING_SUPPORTED + #error "USE_STRING_TENSORS = 1 can be used only when OpenVINO supports element::string that is determined by OPENVINO_ELEMENT_STRING_SUPPORTED == 1" +#endif + +#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS 0 + + +void parse_packed_strings ( + const ov::Tensor& packed, + int32_t& batch_size, + const int32_t*& begin_ids, + const int32_t*& end_ids, + const uint8_t*& symbols); + + +void check_string_input(const ov::Node* node, size_t input_index); + +void check_string_scalar_input(const ov::Node* node, size_t input_index); + +void check_ragged_input(const ov::Node* node, size_t input_index); + +void check_ragged_string_input(const ov::Node* node, size_t input_index); + +void set_string_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape); + +void set_ragged_string_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape); + +void set_ragged_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape, ov::element::Type type); + +void unpack_strings (const std::string* strings, const ov::Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars); + +void override_parameter (std::shared_ptr node, ov::element::Type type, const ov::PartialShape& shape); + +ov::OutputVector pre_translate_string_tensor_input(ov::Output input); + +ov::OutputVector pre_translate_ragged_tensor_input(ov::Output input); + +ov::OutputVector pre_translate_ragged_string_tensor_input(ov::Output input); + +ov::Output post_translate_string_tensor_output(const ov::OutputVector& outputs); + +ov::Output post_translate_ragged_tensor_output(const ov::OutputVector& outputs); + +bool evaluate_normalization_helper ( + ov::TensorVector& outputs, + const ov::TensorVector& inputs, + std::function normalizer); + +std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp new file mode 100644 index 000000000..e5284babd --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "vocab_decoder.hpp" +#include "utils.hpp" + +using 
namespace ov; + +void VocabDecoder::validate_and_infer_types() { +// check_ragged_string_input(this, 0); + check_string_input(this, 1); + const auto shape = get_input_partial_shape(0); + set_ragged_string_output(this, 0, {shape[0]}); +} + +bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto batch_size = inputs[0].get_shape()[0]; + auto seq_len = inputs[0].get_shape()[1]; + auto input_data = inputs[0].data(); + + auto vocab_begins = inputs[1].data(); + auto vocab_ends = inputs[2].data(); + auto vocab_chars = inputs[3].data(); + auto vocab_size = inputs[1].get_size(); + + std::vector> vocab; + vocab.resize(vocab_size); + + OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern"); + + for(size_t id = 0; id < vocab_size; ++id) { + std::vector token = std::vector(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[id] = token; + } + // Set output shapes + outputs[0].set_shape({batch_size}); + outputs[1].set_shape({batch_size}); + outputs[2].set_shape({batch_size * seq_len}); + outputs[3].set_shape({batch_size * seq_len}); + outputs[4].set_shape({batch_size * seq_len * 100}); // 100 chars - max token length + const size_t num_rows = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_ragged_begins = outputs[0].data(); + auto new_ragged_ends = outputs[1].data(); + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + auto new_chars = outputs[4].data(); + uint32_t char_offset = 0; + + for(size_t batch = 0; batch < batch_size; ++batch) { + new_ragged_begins[batch] = batch * seq_len; + new_ragged_ends[batch] = new_ragged_begins[batch] + seq_len; + + for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) { + auto token_id = input_data[seq]; + auto token = vocab[token_id]; + + std::copy(token.begin(), token.end(), &new_chars[char_offset]); + + new_begins[seq] = char_offset; + char_offset += token.size(); + new_ends[seq] = char_offset; + } + } + outputs[4].set_shape({char_offset}); + return true; +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp new file mode 100644 index 000000000..1479dcb5a --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API VocabDecoder : public ov::op::Op { +public: + OPENVINO_OP("VocabDecoder"); + + VocabDecoder () = default; + + VocabDecoder(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp new file mode 100644 index 000000000..a5cf696ac --- /dev/null +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "wordpiece_tokenizer.hpp" +#include "utils.hpp" + +using namespace ov; + + +void WordpieceTokenizer::validate_and_infer_types() { + check_ragged_string_input(this, 0); + check_string_input(this, 5); + set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); +} + +#undef tokenizer + +bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + auto vocab_begins = inputs[5].data(); + auto vocab_ends = inputs[6].data(); + auto vocab_chars = inputs[7].data(); + + auto vocab_size = inputs[5].get_size(); + + OPENVINO_ASSERT(inputs.size() == 9, "Too few inputs passed to WordpieceTokenizer, it means it is not converted properly or it is not used in the supported pattern"); + + auto unk_token_id = *inputs[8].data(); + + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const size_t num_rows = inputs[0].get_size(); + + //const size_t num_parts = inputs[2].get_size(); + //size_t new_num_parts = num_parts; + + // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area + // to represent different elements in ragged tensor + outputs[2].set_shape({inputs[4].get_size()}); + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_elems = outputs[2].data(); + int32_t ragged_offset = 0; + + using namespace paddlenlp::fast_tokenizer; + +// std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; + core::Vocab vocab; + std::string unk_token; + if(unk_token_id < 0) + unk_token_id += vocab_size; + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + if(id == unk_token_id) + unk_token = token; + } + +// std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; +// std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; +// std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; + + auto tokenizer = models::FastWordPiece(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); // FIXME: why true? 
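+
+    // Illustrative sketch of the decomposed string-tensor layout consumed above
+    // (example values, not taken from a real model): for a two-token vocab
+    // {"a", "##b"} the three vocab inputs would be
+    //     vocab_begins = [0, 1]
+    //     vocab_ends   = [1, 4]
+    //     vocab_chars  = "a##b"
+    // so token id 1 denotes the byte range [1, 4) of vocab_chars, i.e. "##b".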
+ +// std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; + + + for(size_t seq = 0; seq < num_rows; ++seq) { + new_begins[seq] = ragged_offset; + + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); + std::vector results = tokenizer.Tokenize(str); + +// std::cerr << "[ WordpieceTokenizer ] String bytes: "; +// for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { +// std::cerr << static_cast (chars[i]) << " "; +// } +// std::cerr << "\n"; +// std::cerr << "[ WordpieceTokenizer ] String: '" << str << "'\n"; +// std::cerr << "[ WordpieceTokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; + for (const core::Token& token : results) { +// std::cout << "[ WordpieceTokenizer ] id: " << token.id_ << ", value: " << token.value_ +// << ", offset: (" << token.offset_.first << ", " +// << token.offset_.second << ")." << std::endl; + OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); + new_elems[ragged_offset++] = token.id_; + }; + } + new_ends[seq] = ragged_offset; + } + outputs[2].set_shape({size_t(ragged_offset)}); + return true; +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp new file mode 100644 index 000000000..6467a9376 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class OPENVINO_API WordpieceTokenizer : public ov::op::Op { +public: + OPENVINO_OP("WordpieceTokenizer"); + + WordpieceTokenizer () = default; + + WordpieceTokenizer(const ov::OutputVector& arguments, const std::string& suffix_indicator = "##", int max_bytes_per_word = 100) : + ov::op::Op(arguments), + m_suffix_indicator(suffix_indicator), + m_max_bytes_per_word(max_bytes_per_word) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_suffix_indicator, m_max_bytes_per_word); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("suffix_indicator", m_suffix_indicator); + visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + + std::string m_suffix_indicator = "##"; + int m_max_bytes_per_word = 100; // TODO: Can it be done outside the op as preprocessing of the input? 
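+
+    // Expected runtime inputs (see evaluate): ragged begins/ends (0-1),
+    // string begins/ends/chars (2-4), vocab begins/ends/chars (5-7) and
+    // unk_token_id (8) - 9 inputs in total.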
+}; From f8d0e0d759bc53d2069229d31bcab73187b5bbef Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 17 Jul 2023 19:23:15 +0100 Subject: [PATCH 037/116] Add regex to detokenizer pipeline, all splitters have 5 inputs --- .../user_ie_extensions/tokenizer/hf_parser.py | 18 +- .../tokenizer/regex_normalization.cpp | 6 +- .../tokenizer/regex_split.cpp | 215 +++++------------- .../tokenizer/tokenizer_pipeline.py | 80 +++++-- 4 files changed, 125 insertions(+), 194 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py index dae5b5a99..1b2772bde 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py @@ -14,7 +14,7 @@ NormalizeUnicode, NMTNormalizationStep, CaseFoldStep, - RegExpNormalizationStep, + RegexNormalizationStep, StripStringStep, PreTokenizatinStep, PunctuationSplitStep, @@ -28,12 +28,13 @@ CombineSegmentsStep, VocabDecoderStep, CharsToBytesStep, + RegexDecodingStep, ) -def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegExpNormalizationStep: +def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep: regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"] - return RegExpNormalizationStep( + return RegexNormalizationStep( regex_search_pattern=regex_search_pattern, replace_term=normalizer_dict["content"], ) @@ -43,12 +44,12 @@ def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[Normalization steps: List[NormalizationStep] = [] if normalizer_dict["clean_text"] is True: - steps.append(RegExpNormalizationStep.del_control_chars_regex()) + steps.append(RegexNormalizationStep.del_control_chars_regex()) # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L127 if normalizer_dict.get("strip_accents") or normalizer_dict["lowercase"]: steps.append(NormalizeUnicode("NFD")) - steps.append(RegExpNormalizationStep.strip_accents_regex()) + steps.append(RegexNormalizationStep.strip_accents_regex()) if normalizer_dict["lowercase"] is True: steps.append(CaseFoldStep()) @@ -77,7 +78,7 @@ def parse_byte_level_pretokenization_step( ) -> List[Union[NormalizationStep, PreTokenizatinStep]]: steps = [] if pretokenizer_dict.get("add_prefix_space"): - steps.append(RegExpNormalizationStep(regex_search_pattern="^(\S)", replace_term=" $1")) + steps.append(RegexNormalizationStep.add_prefix_whitespace_regex()) # regex is used by default, but it does not appear in config yet if pretokenizer_dict.get("use_regex", True): @@ -123,7 +124,7 @@ def parse(self, number_of_inputs: Optional[int] = None) -> TokenizerPipeline: "NFKD": lambda step_dict: NormalizeUnicode("NFKD"), "Nmt": lambda step_dict: NMTNormalizationStep(), "Lowercase": lambda step_dict: CaseFoldStep(), - "StripAccents": lambda step_dict: RegExpNormalizationStep.strip_accents_regex(), + "StripAccents": lambda step_dict: RegexNormalizationStep.strip_accents_regex(), "BertNormalizer": parse_bert_normalizer, "Replace": parse_replace_normalizer, "Strip": parse_strip_step, @@ -238,4 +239,7 @@ def decoding(self) -> None: if self.tokenizer_json["decoder"]["type"] == "ByteLevel": self.pipeline.add_steps(VocabDecoderStep()) self.pipeline.add_steps(CharsToBytesStep()) + + if self.original_tokenizer.clean_up_tokenization_spaces: + 
self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces()) return diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp index 95a88e603..b35994809 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -27,12 +27,12 @@ bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - using namespace paddlenlp::fast_tokenizer::normalizers; re2::RE2 search_pattern_re(search_pattern); - return evaluate_normalization_helper( outputs, inputs, [&replace_pattern, &search_pattern_re](const std::string& str) { - return NormalizedString(str).Replace(search_pattern_re, std::string(replace_pattern)).GetStr(); + std::string result = static_cast(str); + re2::RE2::GlobalReplace(&result, search_pattern_re, replace_pattern); + return result; }); } diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp index 10c64e704..f1c61e6ae 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -32,174 +32,69 @@ const std::map split_modes = { void RegexSplit::validate_and_infer_types() { -// check_string_input(this, 0); -// check_string_scalar_input(this, 3); -// check_ragged_string_input(this, 0); -// check_string_input(this, 5); + check_ragged_string_input(this, 0); + check_string_scalar_input(this, 5); OPENVINO_ASSERT(split_modes.find(m_behaviour) != split_modes.end(), "RegexSplit doesn't support unknown split mode: " + m_behaviour); set_ragged_string_output(this, 0, get_input_partial_shape(0)); } bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - - if (inputs.size() < 5) { - auto begins = inputs[0].data(); - auto ends = inputs[1].data(); - auto chars = inputs[2].data(); - - ov::Tensor ragged_begins_tensor(ov::element::i32, inputs[0].get_shape()); - ov::Tensor ragged_ends_tensor(ov::element::i32, inputs[0].get_shape()); - auto ragged_begins = ragged_begins_tensor.data(); - auto ragged_ends = ragged_ends_tensor.data(); - for (int i=0; i < inputs[0].get_size(); ++i) { - ragged_begins[i] = i; - ragged_ends[i] = i + 1; - }; - - auto split_pattern_buf = inputs[3].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - -// std::cerr << "[ RegexSplit ] Split Pattern: `" << split_pattern << "`, behaviour: " << m_behaviour << "\n"; - - const size_t num_rows = inputs[0].get_size(); - const size_t num_chars = inputs[2].get_size(); - - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - - outputs[2].set_shape(Shape{num_chars}); - outputs[3].set_shape(Shape{num_chars}); - - outputs[4] = inputs[2]; - - 
// For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_ragged_begins = outputs[0].data(); - auto new_ragged_ends = outputs[1].data(); - auto new_begins = outputs[2].data(); - auto new_ends = outputs[3].data(); - int32_t ragged_offset = 0; - - using namespace paddlenlp::fast_tokenizer; - auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - - for(size_t seq = 0; seq < num_rows; ++seq) { -// std::cerr << "================= Seq: " << seq << " ====================\n"; -// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; - - new_ragged_begins[seq] = ragged_offset; - - for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - - auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); -// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; - paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); - pretokenizer(&pretokenized); - size_t num_splits = pretokenized.GetSplitsSize(); -// std::cerr << "[ RegexSplit ] num_splits: " << num_splits << "\n"; - - for (size_t j = 0; j < num_splits; ++j) { - auto split = pretokenized.GetSplit(j); - const auto& value = split.normalized_.GetStr(); - auto offset = split.normalized_.GetOrginalOffset(); -// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; -// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; - new_begins[ragged_offset] = begins[ragged_col] + offset.first; - new_ends[ragged_offset++] = begins[ragged_col] + offset.second; - }; - } - - new_ragged_ends[seq] = ragged_offset; + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + auto split_pattern_buf = inputs[5].data(); + auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + + outputs[4] = inputs[4]; + const size_t num_rows = inputs[0].get_size(); + const size_t num_chars = inputs[4].get_size(); + + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + + outputs[2].set_shape(Shape{num_chars}); + outputs[3].set_shape(Shape{num_chars}); + + outputs[4] = inputs[4]; + + // Get pointers in the output tensors + auto new_ragged_begins = outputs[0].data(); + auto new_ragged_ends = outputs[1].data(); + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + int32_t ragged_offset = 0; + + using namespace paddlenlp::fast_tokenizer; + auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); + + for(size_t seq = 0; seq < num_rows; ++seq) { + new_ragged_begins[seq] = ragged_offset; + + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); + paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); + pretokenizer(&pretokenized); + size_t num_splits = pretokenized.GetSplitsSize(); + + + for (size_t j = 0; j < num_splits; ++j) { + 
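+                // Each split keeps the offsets of its substring within the
+                // original string, so the new begins/ends written below stay
+                // relative to the unchanged chars buffer that is forwarded
+                // through outputs[4].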
auto split = pretokenized.GetSplit(j); + const auto& value = split.normalized_.GetStr(); + auto offset = split.normalized_.GetOrginalOffset(); + new_begins[ragged_offset] = begins[ragged_col] + offset.first; + new_ends[ragged_offset++] = begins[ragged_col] + offset.second; + }; } - // Fix real shape based on collected results - outputs[2].set_shape({size_t(ragged_offset)}); - outputs[3].set_shape({size_t(ragged_offset)}); - - } else { - auto ragged_begins = inputs[0].data(); - auto ragged_ends = inputs[1].data(); - auto begins = inputs[2].data(); - auto ends = inputs[3].data(); - auto chars = inputs[4].data(); - - auto split_pattern_buf = inputs[5].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - -// std::cerr << "Split Pattern: `" << split_pattern << "`, behaviour: " << m_behaviour << "\n"; - - outputs[4] = inputs[4]; - const size_t num_rows = inputs[0].get_size(); - const size_t num_chars = inputs[4].get_size(); - - outputs[0].set_shape(inputs[0].get_shape()); - outputs[1].set_shape(inputs[1].get_shape()); - - outputs[2].set_shape(Shape{num_chars}); - outputs[3].set_shape(Shape{num_chars}); - - outputs[4] = inputs[4]; - - // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions - // and only number of elements in the original tensors matter - - // Get pointers in the output tensors - auto new_ragged_begins = outputs[0].data(); - auto new_ragged_ends = outputs[1].data(); - auto new_begins = outputs[2].data(); - auto new_ends = outputs[3].data(); - int32_t ragged_offset = 0; - - using namespace paddlenlp::fast_tokenizer; - auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - - for(size_t seq = 0; seq < num_rows; ++seq) { -// std::cerr << "----------------- Seq: " << seq << " -----------------\n"; -// std::cerr << "Ragged begins: " << ragged_begins[seq] << "; Ragged Ends: " << ragged_ends[seq] << "\n"; - - new_ragged_begins[seq] = ragged_offset; - - for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { - auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); -// std::cerr << "[ RegexSplit ] old_str: '" << str << "'\n"; - paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); - pretokenizer(&pretokenized); - size_t num_splits = pretokenized.GetSplitsSize(); - - - for (size_t j = 0; j < num_splits; ++j) { - auto split = pretokenized.GetSplit(j); - const auto& value = split.normalized_.GetStr(); - auto offset = split.normalized_.GetOrginalOffset(); -// std::cerr << "[ RegexSplit ] split part: '" << value << "'\n"; -// std::cerr << "[ RegexSplit ] split offs: " << offset.first << ":" << offset.second << "\n"; - new_begins[ragged_offset] = begins[ragged_col] + offset.first; - new_ends[ragged_offset++] = begins[ragged_col] + offset.second; - - -// std::cerr << "New begins and ends:\n"; -// for (size_t i = 0; i < outputs[2].get_size(); ++i) { -// std::cerr << outputs[2].data()[i] << ", "; -// } -// std::cerr << "\n"; -// -// for (size_t i = 0; i < outputs[3].get_size(); ++i) { -// std::cerr << outputs[3].data()[i] << ", "; -// } -// std::cerr << "\n"; - }; - } - - new_ragged_ends[seq] = ragged_offset; - } - - // Fix real shape based on collected results - outputs[2].set_shape({size_t(ragged_offset)}); - 
outputs[3].set_shape({size_t(ragged_offset)}); - + new_ragged_ends[seq] = ragged_offset; } + // Fix real shape based on collected results + outputs[2].set_shape({size_t(ragged_offset)}); + outputs[3].set_shape({size_t(ragged_offset)}); + return true; } diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py index 7a47137d0..a049e3054 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py @@ -16,9 +16,6 @@ from openvino.runtime.utils.types import as_node, make_constant_node -string_ops = None #MagicMock() - - def pack_strings(strings): assert isinstance(strings, list) to_bytes = lambda x: x.to_bytes(4, "little") @@ -104,19 +101,27 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: @dataclass -class RegExpNormalizationStep(NormalizationStep): +class RegexNormalizationStep(NormalizationStep): regex_search_pattern: str replace_term: str @classmethod - def strip_accents_regex(cls) -> "RegExpNormalizationStep": + def strip_accents_regex(cls) -> "RegexNormalizationStep": return cls(regex_search_pattern=r"\p{Mn}", replace_term="") @classmethod - def del_control_chars_regex(cls) -> "RegExpNormalizationStep": + def add_prefix_whitespace_regex(cls) -> "RegexNormalizationStep": + return cls(regex_search_pattern=r"^(\S)", replace_term=r" \1") + + @classmethod + def del_control_chars_regex(cls) -> "RegexNormalizationStep": # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L17 return cls(regex_search_pattern=r"((?=[^\n\t\r])\p{Cc})|((?=[^\n\t\r])\p{Cf})", replace_term=" ") + @classmethod + def clean_up_tokenization_spaces(cls) -> "RegexNormalizationStep": + return cls(regex_search_pattern=r" ([\.\?\!\,])| ('[ms])| (') | ('[rv]e)", replace_term="\1") + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: input_nodes.extend( ( @@ -141,13 +146,13 @@ class NMTNormalizationStep(NormalizationStep): @dataclass class StripAccentsStep(NormalizationStep): def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: - return RegExpNormalizationStep.strip_accents_regex().get_ov_subgraph(input_nodes) + return RegexNormalizationStep.strip_accents_regex().get_ov_subgraph(input_nodes) @dataclass class DelControlCharsStep(NormalizationStep): def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: - return RegExpNormalizationStep.del_control_chars_regex().get_ov_subgraph(input_nodes) + return RegexNormalizationStep.del_control_chars_regex().get_ov_subgraph(input_nodes) @dataclass @@ -680,6 +685,31 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: return core.make_node("CharsToBytes", input_nodes, {}).outputs() +@dataclass +class RegexDecodingStep(DecodingStep): + regex_search_pattern: str + replace_term: str + + @classmethod + def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep": + return cls( + regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)", + replace_term=r"\1", + ) + + def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + input_nodes.extend( + ( + *self.create_string_constant_node(self.regex_search_pattern).outputs(), + *self.create_string_constant_node(self.replace_term).outputs(), + ) + ) + return core.make_node( + "RegexNormalization", + input_nodes + ).outputs() + + @dataclass class TokenizerPipeline: steps: 
List[BasePipelineStep] = field(default_factory=list) @@ -725,7 +755,7 @@ def post_tokenization_steps(self) -> List[PostTokenizationStep]: return [step for step in self.steps if isinstance(step, PostTokenizationStep)] @property - def decoding_steps(self) -> List[PostTokenizationStep]: + def decoding_steps(self) -> List[DecodingStep]: return [step for step in self.steps if isinstance(step, DecodingStep)] def create_string_input(self) -> Node: @@ -742,10 +772,23 @@ def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[No for step in self.normalization_steps: input_node = step.get_ov_subgraph(input_node) - # batch_size = opset10.shape_of(input_node[0]) - # ragged_begins = opset10.range(as_node(0), batch_size, as_node(1)).outputs() - # ragged_ends = opset10.range(as_node(1), opset10.add(batch_size, as_node(1)).outputs(), as_node(1)) - # input_node = [ragged_begins, ragged_ends] + input_node + shape = opset10.shape_of(input_node[0]) + batch_size = opset10.gather(shape, as_node(0), as_node(0)) + + # FIXME: Cannot create range with specific data type from python + ragged_begins = opset10.convert( + opset10.range(as_node(0), batch_size, as_node(1)), + "i32", + ).outputs() + ragged_ends = opset10.convert( + opset10.range( + as_node(1), + opset10.add(batch_size, as_node(1)), + as_node(1), + ), + "i32", + ).outputs() + input_node = ragged_begins + ragged_ends + input_node for step in chain(self.pretokenization_steps, self.tokenization_steps): input_node = step.get_ov_subgraph(input_node) @@ -755,24 +798,13 @@ def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[No return processing_pipelines_outputs def create_post_tokenization_pipeline(self, input_nodes: List[op.Parameter]) -> List[Output]: - #outputs = [] for step in self.post_tokenization_steps: pipeline_step = step.get_ov_subgraph(input_nodes) input_nodes = pipeline_step - #if isinstance(step, CombineSegmentsStep): - # input_nodes.append(MagicMock(name="token_type_ids")) - # outputs.append(input_nodes.pop(-1)) # token_type_ids node - #if isinstance(step, PaddingStep): - # print('HEY!!!!!!!') - # input_nodes.append(MagicMock(name="attention_mask")) - # outputs.append(input_nodes.pop(-1)) # attention_mask node - - #outputs.insert(0, input_nodes[0]) return input_nodes def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: - # input_nodes = inputs.outputs() for step in self.decoding_steps: pipeline_step = step.get_ov_subgraph(input_nodes) input_nodes = pipeline_step From 10c10c5e545434e959a5c595bae5114304b34737 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 27 Jul 2023 15:28:01 +0100 Subject: [PATCH 038/116] Add Caching for RegexNormalization --- .../tokenizer/regex_normalization.cpp | 35 ++++++++++++------- .../tokenizer/regex_normalization.hpp | 23 ++++++++---- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp index b35994809..b369f4bb5 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -2,11 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "normalizer.h" // for absl::string_view -#include "fast_tokenizer/normalizers/normalizers.h" -#include "fast_tokenizer/models/models.h" -#include "fast_tokenizer/pretokenizers/pretokenizers.h" #include 
"regex_normalization.hpp" #include "utils.hpp" @@ -14,6 +10,27 @@ using namespace ov; +RegexNormalization::RegexNormalization(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + auto search_pattern_const = as_type_ptr(arguments[3].get_node_shared_ptr()); + auto replace_pattern_const = as_type_ptr(arguments[4].get_node_shared_ptr()); + auto search_pattern_buf = static_cast(search_pattern_const->get_data_ptr()); + auto replace_pattern_buf = static_cast(replace_pattern_const->get_data_ptr()); + auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + m_search_pattern_re = std::make_shared(search_pattern); + + constructor_validate_and_infer_types(); + } + + +RegexNormalization::RegexNormalization( + const ov::OutputVector& arguments, const std::shared_ptr search_pattern_re, const absl::string_view replace_pattern + ) : ov::op::Op(arguments), m_search_pattern_re(search_pattern_re), m_replace_pattern(replace_pattern) { + constructor_validate_and_infer_types(); + } + + void RegexNormalization::validate_and_infer_types() { check_string_input(this, 0); check_string_scalar_input(this, 3); @@ -22,17 +39,11 @@ void RegexNormalization::validate_and_infer_types() { } bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto search_pattern_buf = inputs[3].data(); - auto replace_pattern_buf = inputs[4].data(); - auto search_pattern = absl::string_view((const char*)search_pattern_buf, shape_size(inputs[3].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - auto replace_pattern = absl::string_view((const char*)replace_pattern_buf, shape_size(inputs[4].get_shape()) - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - - re2::RE2 search_pattern_re(search_pattern); return evaluate_normalization_helper( outputs, inputs, - [&replace_pattern, &search_pattern_re](const std::string& str) { + [this](const std::string& str) { std::string result = static_cast(str); - re2::RE2::GlobalReplace(&result, search_pattern_re, replace_pattern); + re2::RE2::GlobalReplace(&result, *this->m_search_pattern_re, this->m_replace_pattern); return result; }); } diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp index 7272d3086..e67d69d3f 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp @@ -4,23 +4,31 @@ #pragma once +#include "normalizer.h" // for absl::string_view + #include +#include "openvino/opsets/opset10.hpp" +#include "fast_tokenizer/normalizers/normalizers.h" + +using namespace ov; +using namespace ov::opset10; class OPENVINO_API RegexNormalization : public ov::op::Op { public: OPENVINO_OP("RegexNormalization"); RegexNormalization () = default; - - RegexNormalization(const ov::OutputVector& arguments) : - ov::op::Op(arguments) { - constructor_validate_and_infer_types(); - } + RegexNormalization(const ov::OutputVector& arguments); + RegexNormalization( + const ov::OutputVector& 
arguments, + const std::shared_ptr search_pattern_re, + const absl::string_view replace_pattern + ); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs); + return std::make_shared(inputs, m_search_pattern_re, m_replace_pattern); } bool visit_attributes(ov::AttributeVisitor& visitor) override { @@ -32,4 +40,7 @@ class OPENVINO_API RegexNormalization : public ov::op::Op { bool has_evaluate() const { return true; } +private: + std::shared_ptr m_search_pattern_re; + absl::string_view m_replace_pattern; }; From 4eb12f856abc0bdf8348b3779a49ee95489f7ed5 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 27 Jul 2023 21:15:17 +0100 Subject: [PATCH 039/116] Add Caching for RegexSplit --- .../tokenizer/regex_normalization.cpp | 6 +-- .../tokenizer/regex_split.cpp | 45 ++++++++++++------- .../tokenizer/regex_split.hpp | 28 ++++++++---- 3 files changed, 52 insertions(+), 27 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp index b369f4bb5..f4f57af03 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -12,9 +12,9 @@ using namespace ov; RegexNormalization::RegexNormalization(const ov::OutputVector& arguments) : ov::op::Op(arguments) { - auto search_pattern_const = as_type_ptr(arguments[3].get_node_shared_ptr()); - auto replace_pattern_const = as_type_ptr(arguments[4].get_node_shared_ptr()); - auto search_pattern_buf = static_cast(search_pattern_const->get_data_ptr()); + auto search_pattern_const = as_type_ptr(arguments[3].get_node_shared_ptr()); + auto replace_pattern_const = as_type_ptr(arguments[4].get_node_shared_ptr()); + auto search_pattern_buf = static_cast(search_pattern_const->get_data_ptr()); auto replace_pattern_buf = static_cast(replace_pattern_const->get_data_ptr()); auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp index f1c61e6ae..32a2eb65d 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -2,21 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "normalizer.h" // for absl::string_view - #include "openvino/op/util/framework_node.hpp" #include "openvino/opsets/opset10.hpp" - -#include "fast_tokenizer/normalizers/normalizers.h" -#include "fast_tokenizer/models/models.h" -#include "fast_tokenizer/pretokenizers/pretokenizers.h" +// #include "regex_split.hpp" #include "utils.hpp" +#include "fast_tokenizer/normalizers/normalizers.h" using namespace ov; - namespace { using paddlenlp::fast_tokenizer::core::SplitMode; @@ -31,6 +26,33 @@ const std::map split_modes = { } +RegexSplit::RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour, bool invert) : + ov::op::Op(arguments), + 
m_behaviour(behaviour), + m_invert(invert) { + auto split_pattern_const = as_type_ptr(arguments[5].get_node_shared_ptr()); + auto split_pattern_buf = static_cast(split_pattern_const->get_data_ptr()); + auto split_pattern = std::string(split_pattern_buf, split_pattern_const->get_byte_size()); + auto m_pretokenizer = std::make_shared(split_pattern, split_modes.at(behaviour), invert); + + constructor_validate_and_infer_types(); +} + + +RegexSplit::RegexSplit( + const ov::OutputVector& arguments, + const std::shared_ptr pretokenizer, + const std::string& behaviour, + bool invert +) : + ov::op::Op(arguments), + m_pretokenizer(pretokenizer), + m_behaviour(behaviour), + m_invert(invert) { + constructor_validate_and_infer_types(); +} + + void RegexSplit::validate_and_infer_types() { check_ragged_string_input(this, 0); check_string_scalar_input(this, 5); @@ -45,9 +67,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto ends = inputs[3].data(); auto chars = inputs[4].data(); - auto split_pattern_buf = inputs[5].data(); - auto split_pattern = absl::string_view((const char*)split_pattern_buf, shape_size(inputs[5].get_shape())/* - 1*/); // Shouldn't be applied FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - outputs[4] = inputs[4]; const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[4].get_size(); @@ -67,19 +86,15 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp auto new_ends = outputs[3].data(); int32_t ragged_offset = 0; - using namespace paddlenlp::fast_tokenizer; - auto pretokenizer = pretokenizers::SplitPreTokenizer(std::string(split_pattern), split_modes.at(m_behaviour), m_invert); - for(size_t seq = 0; seq < num_rows; ++seq) { new_ragged_begins[seq] = ragged_offset; for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); - pretokenizer(&pretokenized); + (*m_pretokenizer)(&pretokenized); size_t num_splits = pretokenized.GetSplitsSize(); - for (size_t j = 0; j < num_splits; ++j) { auto split = pretokenized.GetSplit(j); const auto& value = split.normalized_.GetStr(); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp index 438b47d64..edda8cccb 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp @@ -4,25 +4,35 @@ #pragma once +#include "normalizer.h" // for absl::string_view + #include +#include "openvino/opsets/opset10.hpp" +#include "fast_tokenizer/normalizers/normalizers.h" // for re2::RE2 +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +using namespace ov; +using namespace ov::opset10; +using namespace paddlenlp::fast_tokenizer; + class OPENVINO_API RegexSplit : public ov::op::Op { public: OPENVINO_OP("RegexSplit"); RegexSplit () = default; - - RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false) : - ov::op::Op(arguments), - m_behaviour(behaviour), - m_invert(invert) { - constructor_validate_and_infer_types(); - } + RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false); + RegexSplit( + const ov::OutputVector& arguments, + const std::shared_ptr 
pretokenizer, + const std::string& behaviour = "remove", + bool invert = false + ); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_behaviour, m_invert); + return std::make_shared(inputs, m_pretokenizer, m_behaviour, m_invert); } bool visit_attributes(ov::AttributeVisitor& visitor) override { @@ -38,7 +48,7 @@ class OPENVINO_API RegexSplit : public ov::op::Op { } private: - + std::shared_ptr m_pretokenizer; std::string m_behaviour = "remove"; bool m_invert = false; }; From c5efaf0724ed45938ada683fe5fe0556161cc9d9 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 28 Jul 2023 17:28:21 +0100 Subject: [PATCH 040/116] Add Wordpiece Cache --- .../tokenizer/regex_split.cpp | 1 + .../tokenizer/regex_split.hpp | 1 - .../tokenizer/wordpiece_tokenizer.cpp | 106 +++++++++++------- .../tokenizer/wordpiece_tokenizer.hpp | 28 +++-- 4 files changed, 86 insertions(+), 50 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp index 32a2eb65d..b520f49bd 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -11,6 +11,7 @@ #include "fast_tokenizer/normalizers/normalizers.h" using namespace ov; +using namespace ov::opset10; namespace { diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp index edda8cccb..7e7202e34 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp @@ -12,7 +12,6 @@ #include "fast_tokenizer/pretokenizers/pretokenizers.h" using namespace ov; -using namespace ov::opset10; using namespace paddlenlp::fast_tokenizer; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp index a5cf696ac..aa60639df 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp @@ -2,14 +2,74 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "fast_tokenizer/normalizers/normalizers.h" -#include "fast_tokenizer/models/models.h" -#include "fast_tokenizer/pretokenizers/pretokenizers.h" +//#include "fast_tokenizer/normalizers/normalizers.h" +// +//#include "fast_tokenizer/pretokenizers/pretokenizers.h" #include "wordpiece_tokenizer.hpp" #include "utils.hpp" +#include "openvino/opsets/opset10.hpp" using namespace ov; +using namespace ov::opset10; + + + +WordpieceTokenizer::WordpieceTokenizer( + const ov::OutputVector& arguments, + const std::string& suffix_indicator, + int max_bytes_per_word +) : + ov::op::Op(arguments), + m_suffix_indicator(suffix_indicator), + m_max_bytes_per_word(max_bytes_per_word) { + +// std::cerr << "Slow\n"; + + using namespace paddlenlp::fast_tokenizer; + + auto vocab_begins_const = as_type_ptr(arguments[5].get_node_shared_ptr()); + auto vocab_begins = static_cast(vocab_begins_const->get_data_ptr()); + auto vocab_size = vocab_begins_const->get_shape()[0]; + + auto vocab_ends_const = as_type_ptr(arguments[6].get_node_shared_ptr()); + auto vocab_ends = static_cast(vocab_ends_const->get_data_ptr()); + + auto 
vocab_chars_const = as_type_ptr(arguments[7].get_node_shared_ptr()); + auto vocab_chars = static_cast(vocab_chars_const->get_data_ptr()); + + auto unk_token_id_const = as_type_ptr(arguments[8].get_node_shared_ptr()); + auto unk_token_id = *static_cast(vocab_begins_const->get_data_ptr()); + + core::Vocab vocab; + std::string unk_token; + if(unk_token_id < 0) + unk_token_id += vocab_size; + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + if(id == unk_token_id) + unk_token = token; + } + + m_tokenizer = std::make_shared(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); + constructor_validate_and_infer_types(); +} + +WordpieceTokenizer::WordpieceTokenizer( + const ov::OutputVector& arguments, + const std::shared_ptr tokenizer, + const std::string& suffix_indicator, + int max_bytes_per_word +) : + ov::op::Op(arguments), + m_tokenizer(tokenizer), + m_suffix_indicator(suffix_indicator), + m_max_bytes_per_word(max_bytes_per_word) { + +// std::cerr << "Fast\n"; + constructor_validate_and_infer_types(); +} void WordpieceTokenizer::validate_and_infer_types() { @@ -18,7 +78,6 @@ void WordpieceTokenizer::validate_and_infer_types() { set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); } -#undef tokenizer bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { auto ragged_begins = inputs[0].data(); @@ -27,16 +86,6 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto ends = inputs[3].data(); auto chars = inputs[4].data(); - auto vocab_begins = inputs[5].data(); - auto vocab_ends = inputs[6].data(); - auto vocab_chars = inputs[7].data(); - - auto vocab_size = inputs[5].get_size(); - - OPENVINO_ASSERT(inputs.size() == 9, "Too few inputs passed to WordpieceTokenizer, it means it is not converted properly or it is not used in the supported pattern"); - - auto unk_token_id = *inputs[8].data(); - // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); @@ -57,39 +106,18 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec using namespace paddlenlp::fast_tokenizer; -// std::cerr << "[ WordpieceTokenizer ] Start vocab reading\n"; - core::Vocab vocab; - std::string unk_token; - if(unk_token_id < 0) - unk_token_id += vocab_size; - for(size_t id = 0; id < vocab_size; ++id) { - auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); - vocab[token] = int32_t(id); // TODO: Check range - if(id == unk_token_id) - unk_token = token; - } - -// std::cerr << "[ WordpieceTokenizer ] Finish vocab reading\n"; -// std::cerr << "[ WordpieceTokenizer ] unk_token = " << unk_token << "\n"; -// std::cerr << "[ WordpieceTokenizer ] Start tokenizer initialization\n"; - - auto tokenizer = models::FastWordPiece(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); // FIXME: why true? 
- -// std::cerr << "[ WordpieceTokenizer ] Finish tokenizer initialization\n"; - - for(size_t seq = 0; seq < num_rows; ++seq) { new_begins[seq] = ragged_offset; for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); - std::vector results = tokenizer.Tokenize(str); + std::vector results = m_tokenizer->Tokenize(str); // std::cerr << "[ WordpieceTokenizer ] String bytes: "; -// for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { -// std::cerr << static_cast (chars[i]) << " "; -// } + for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { + std::cerr << static_cast (chars[i]) << " "; + } // std::cerr << "\n"; // std::cerr << "[ WordpieceTokenizer ] String: '" << str << "'\n"; // std::cerr << "[ WordpieceTokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp index 6467a9376..cce8c5a21 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp @@ -5,24 +5,32 @@ #pragma once #include +#include "fast_tokenizer/models/models.h" + +using namespace paddlenlp::fast_tokenizer; + +#undef tokenizer class OPENVINO_API WordpieceTokenizer : public ov::op::Op { public: OPENVINO_OP("WordpieceTokenizer"); WordpieceTokenizer () = default; - - WordpieceTokenizer(const ov::OutputVector& arguments, const std::string& suffix_indicator = "##", int max_bytes_per_word = 100) : - ov::op::Op(arguments), - m_suffix_indicator(suffix_indicator), - m_max_bytes_per_word(max_bytes_per_word) { - constructor_validate_and_infer_types(); - } - + WordpieceTokenizer( + const ov::OutputVector& arguments, + const std::string& suffix_indicator = "##", + int max_bytes_per_word = 100 + ); + WordpieceTokenizer( + const ov::OutputVector& arguments, + const std::shared_ptr tokenizer, + const std::string& suffix_indicator = "##", + int max_bytes_per_word = 100 + ); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_suffix_indicator, m_max_bytes_per_word); + return std::make_shared(inputs, m_tokenizer, m_suffix_indicator, m_max_bytes_per_word); } bool visit_attributes(ov::AttributeVisitor& visitor) override { @@ -38,7 +46,7 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op { } private: - + std::shared_ptr m_tokenizer; std::string m_suffix_indicator = "##"; int m_max_bytes_per_word = 100; // TODO: Can it be done outside the op as preprocessing of the input? 
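+
+    // The FastWordPiece instance is built once in the constructor from the
+    // constant vocab inputs and reused across evaluate() calls, instead of
+    // being rebuilt on every inference.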
}; From 239acc469260d02673d501ec885a51ffdd953ced Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 31 Jul 2023 19:14:45 +0100 Subject: [PATCH 041/116] Add NodeFactory --- .../tokenizer/tokenizer_pipeline.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py index a049e3054..933c4e7d8 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py @@ -12,7 +12,8 @@ import numpy as np from openvino.runtime.exceptions import UserInputError, OVTypeError -from openvino.runtime import Type, PartialShape, op, Model, Core, Output, Node, opset10 +from openvino.runtime import Type, PartialShape, op, Model, Output, Node, opset10 +from openvino.runtime.utils.node_factory import NodeFactory from openvino.runtime.utils.types import as_node, make_constant_node @@ -36,9 +37,9 @@ def pack_string(s): ) # + ' ' is WA for CPU bug -core = Core() +factory = NodeFactory() # TODO: Use relative path -core.add_extension("/home/apaniuko/python/openvino/bin/intel64/Debug/libuser_ov_extensions.so") +factory.add_extension("/home/apaniuko/python/openvino/bin/intel64/Debug/libuser_ov_extensions.so") class BasePipelineStep: @@ -76,7 +77,7 @@ def create_string_constant_node(value: str) -> op.Constant: else: # support only 1D strings for now ps = pack_strings(value) - return core.make_node("StringTensorUnpack", op.Constant(ps).outputs()) + return factory.create("StringTensorUnpack", op.Constant(ps).outputs()) @dataclass @@ -89,7 +90,7 @@ class NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: - return core.make_node( + return factory.create( "NormalizeUnicode", input_nodes, {"normalization_form": self.normalization_form} ).outputs() @@ -97,7 +98,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: @dataclass class CaseFoldStep(NormalizationStep): def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: - return core.make_node("CaseFold", input_nodes).outputs() + return factory.create("CaseFold", input_nodes).outputs() @dataclass @@ -129,7 +130,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: *self.create_string_constant_node(self.replace_term).outputs(), ) ) - return core.make_node( + return factory.create( "RegexNormalization", input_nodes ).outputs() @@ -249,7 +250,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: input_nodes.extend( self.create_string_constant_node(self.split_pattern).outputs() ) - return core.make_node( + return factory.create( "RegexSplit", input_nodes, { @@ -276,7 +277,7 @@ class PunctuationSplitStep(PreTokenizatinStep): class BytesToCharsStep(PreTokenizatinStep): """Maps chars to other chars for Byte-level BPE Tokenizer""" def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: - return core.make_node( + return factory.create( "BytesToChars", input_nodes, ).outputs() @@ -320,7 +321,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: *as_node(self.unk_token_id).outputs(), ) ) - return core.make_node( + return factory.create( "WordpieceTokenizer", input_nodes, { @@ -360,7 +361,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: *self.create_string_constant_node(self.merges).outputs(), ) ) - return core.make_node( + return 
factory.create( "BPETokenizer", input_nodes, { @@ -592,7 +593,7 @@ def get_ov_subgraph(self, input_nodes): # Decomposed implementation - return core.make_node('CombineSegments', op_inputs).outputs() + return factory.create('CombineSegments', op_inputs).outputs() print(input_nodes) assert len(input_nodes) == 3, '[ TOKENIZER PIPELINE CONVERSION ] CombineSegments can be converted for a single ragged input tensor only, this is temporary limitation' print('self.segment_ids:', self.segment_ids) @@ -648,7 +649,7 @@ def get_ov_subgraph(self, input_nodes): #print(input_nodes[3*i:3*(i+1)]) #print(as_node(self.max_length).outputs()) #print(as_node(np.array(0, dtype=int)).outputs()) - cur_outputs = core.make_node( + cur_outputs = factory.create( "RaggedToDense", input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs() ).outputs() @@ -676,13 +677,13 @@ def get_vocab_node_outputs(self) -> Optional[List[Output]]: def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: input_nodes.extend(self.get_vocab_node_outputs()) - return core.make_node("VocabDecoder", input_nodes, {}).outputs() + return factory.create("VocabDecoder", input_nodes, {}).outputs() @dataclass class CharsToBytesStep(DecodingStep): def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: - return core.make_node("CharsToBytes", input_nodes, {}).outputs() + return factory.create("CharsToBytes", input_nodes, {}).outputs() @dataclass @@ -704,7 +705,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: *self.create_string_constant_node(self.replace_term).outputs(), ) ) - return core.make_node( + return factory.create( "RegexNormalization", input_nodes ).outputs() @@ -768,7 +769,7 @@ def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[No processing_pipelines_outputs = [] for input_node in input_nodes: - input_node = core.make_node("StringTensorUnpack", input_node.outputs()).outputs() + input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs() for step in self.normalization_steps: input_node = step.get_ov_subgraph(input_node) @@ -809,7 +810,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: pipeline_step = step.get_ov_subgraph(input_nodes) input_nodes = pipeline_step - return core.make_node("StringTensorPack", input_nodes).outputs() + return factory.create("StringTensorPack", input_nodes).outputs() def get_encoder_ov_subgraph(self) -> Model: input_nodes = [self.create_string_input() for _ in range(self.number_of_inputs)] From 38552b09222fc0f04cdbc59cf29c3b0e52169468 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 4 Aug 2023 15:01:46 +0100 Subject: [PATCH 042/116] Fix regex nodes init --- .../tokenizer/regex_normalization.cpp | 26 +++++++++++-------- .../tokenizer/regex_normalization.hpp | 4 +-- .../tokenizer/regex_split.cpp | 21 ++++++++++----- .../tokenizer/regex_split.hpp | 2 +- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp index f4f57af03..9d6529b4e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -12,21 +12,24 @@ using namespace ov; RegexNormalization::RegexNormalization(const ov::OutputVector& arguments) : ov::op::Op(arguments) { - auto search_pattern_const = 
as_type_ptr<Constant>(arguments[3].get_node_shared_ptr());
-    auto replace_pattern_const = as_type_ptr<Constant>(arguments[4].get_node_shared_ptr());
-    auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
-    auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
-    auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant
-    m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant
-    m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern);
-
     constructor_validate_and_infer_types();
     }
 
 
 RegexNormalization::RegexNormalization(
-    const ov::OutputVector& arguments, const std::shared_ptr<re2::RE2> search_pattern_re, const absl::string_view replace_pattern
-    ) : ov::op::Op(arguments), m_search_pattern_re(search_pattern_re), m_replace_pattern(replace_pattern) {
+    const ov::OutputVector& arguments,
+    const std::shared_ptr<re2::RE2>& search_pattern_re,
+    const absl::string_view replace_pattern
+) : ov::op::Op(arguments), m_search_pattern_re(search_pattern_re), m_replace_pattern(replace_pattern) {
+    if (m_search_pattern_re == nullptr) {
+        auto search_pattern_const = as_type_ptr<Constant>(arguments[3].get_node_shared_ptr());
+        auto replace_pattern_const = as_type_ptr<Constant>(arguments[4].get_node_shared_ptr());
+        auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
+        auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
+        auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant
+        m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant
+        m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern);
+    };
     constructor_validate_and_infer_types();
 }
 
@@ -38,12 +41,13 @@ void RegexNormalization::validate_and_infer_types() {
     set_string_output(this, 0, get_input_partial_shape(0));
 }
 
+
 bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
     return evaluate_normalization_helper(
         outputs, inputs,
         [this](const std::string& str) {
-            std::string result = static_cast<std::string>(str);
-            re2::RE2::GlobalReplace(&result, *m_search_pattern_re, m_replace_pattern);
+            std::string result;
+            re2::RE2::Extract(str, *m_search_pattern_re, m_replace_pattern, &result);
             return result;
         });
 }
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp
index e67d69d3f..a3ec22397 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp
@@ -18,10 +18,10 @@ class OPENVINO_API RegexNormalization : public ov::op::Op {
     OPENVINO_OP("RegexNormalization");
 
     RegexNormalization () = default;
-    RegexNormalization(const ov::OutputVector& arguments);
+    RegexNormalization(const ov::OutputVector& arguments);  // currently unused
     RegexNormalization(
         const ov::OutputVector& arguments,
-        const std::shared_ptr<re2::RE2> search_pattern_re,
+        const std::shared_ptr<re2::RE2>& search_pattern_re,
         const absl::string_view replace_pattern
     );
 
diff --git 
a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp index b520f49bd..f187d877e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -31,18 +31,13 @@ RegexSplit::RegexSplit(const ov::OutputVector& arguments, const std::string& beh ov::op::Op(arguments), m_behaviour(behaviour), m_invert(invert) { - auto split_pattern_const = as_type_ptr(arguments[5].get_node_shared_ptr()); - auto split_pattern_buf = static_cast(split_pattern_const->get_data_ptr()); - auto split_pattern = std::string(split_pattern_buf, split_pattern_const->get_byte_size()); - auto m_pretokenizer = std::make_shared(split_pattern, split_modes.at(behaviour), invert); - constructor_validate_and_infer_types(); } RegexSplit::RegexSplit( const ov::OutputVector& arguments, - const std::shared_ptr pretokenizer, + const std::shared_ptr& pretokenizer, const std::string& behaviour, bool invert ) : @@ -50,6 +45,14 @@ RegexSplit::RegexSplit( m_pretokenizer(pretokenizer), m_behaviour(behaviour), m_invert(invert) { + + if (m_pretokenizer == nullptr) { + auto split_pattern_const = as_type_ptr(arguments[5].get_node_shared_ptr()); + auto split_pattern_buf = static_cast(split_pattern_const->get_data_ptr()); + auto split_pattern = std::string(split_pattern_buf, split_pattern_const->get_byte_size()); + m_pretokenizer = std::make_shared(split_pattern, split_modes.at(behaviour), invert); + }; + constructor_validate_and_infer_types(); } @@ -62,6 +65,8 @@ void RegexSplit::validate_and_infer_types() { } bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + std::cerr << "[ RegexSplit ] Eval \n"; + auto ragged_begins = inputs[0].data(); auto ragged_ends = inputs[1].data(); auto begins = inputs[2].data(); @@ -72,9 +77,13 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[4].get_size(); + std::cerr << "[ RegexSplit ] Before Shape \n"; + outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); + std::cerr << "[ RegexSplit ] After Shape \n"; + outputs[2].set_shape(Shape{num_chars}); outputs[3].set_shape(Shape{num_chars}); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp index 7e7202e34..a7d9b2851 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp @@ -23,7 +23,7 @@ class OPENVINO_API RegexSplit : public ov::op::Op { RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false); RegexSplit( const ov::OutputVector& arguments, - const std::shared_ptr pretokenizer, + const std::shared_ptr& pretokenizer, const std::string& behaviour = "remove", bool invert = false ); From 597ccd4eb8fbe6d5bc0e94ff8868b38e11516db7 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 10 Aug 2023 15:09:00 +0100 Subject: [PATCH 043/116] Fix Wordpiece Cache --- .../tokenizer/wordpiece_tokenizer.cpp | 72 +++++++------------ .../tokenizer/wordpiece_tokenizer.hpp | 2 +- 2 files changed, 26 insertions(+), 48 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp 
b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp index aa60639df..3983c3f50 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp @@ -2,10 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // -//#include "fast_tokenizer/normalizers/normalizers.h" -// -//#include "fast_tokenizer/pretokenizers/pretokenizers.h" - #include "wordpiece_tokenizer.hpp" #include "utils.hpp" #include "openvino/opsets/opset10.hpp" @@ -14,7 +10,6 @@ using namespace ov; using namespace ov::opset10; - WordpieceTokenizer::WordpieceTokenizer( const ov::OutputVector& arguments, const std::string& suffix_indicator, @@ -24,41 +19,12 @@ WordpieceTokenizer::WordpieceTokenizer( m_suffix_indicator(suffix_indicator), m_max_bytes_per_word(max_bytes_per_word) { -// std::cerr << "Slow\n"; - - using namespace paddlenlp::fast_tokenizer; - - auto vocab_begins_const = as_type_ptr(arguments[5].get_node_shared_ptr()); - auto vocab_begins = static_cast(vocab_begins_const->get_data_ptr()); - auto vocab_size = vocab_begins_const->get_shape()[0]; - - auto vocab_ends_const = as_type_ptr(arguments[6].get_node_shared_ptr()); - auto vocab_ends = static_cast(vocab_ends_const->get_data_ptr()); - - auto vocab_chars_const = as_type_ptr(arguments[7].get_node_shared_ptr()); - auto vocab_chars = static_cast(vocab_chars_const->get_data_ptr()); - - auto unk_token_id_const = as_type_ptr(arguments[8].get_node_shared_ptr()); - auto unk_token_id = *static_cast(vocab_begins_const->get_data_ptr()); - - core::Vocab vocab; - std::string unk_token; - if(unk_token_id < 0) - unk_token_id += vocab_size; - for(size_t id = 0; id < vocab_size; ++id) { - auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); - vocab[token] = int32_t(id); // TODO: Check range - if(id == unk_token_id) - unk_token = token; - } - - m_tokenizer = std::make_shared(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true); constructor_validate_and_infer_types(); } WordpieceTokenizer::WordpieceTokenizer( const ov::OutputVector& arguments, - const std::shared_ptr tokenizer, + const std::shared_ptr& tokenizer, const std::string& suffix_indicator, int max_bytes_per_word ) : @@ -67,7 +33,30 @@ WordpieceTokenizer::WordpieceTokenizer( m_suffix_indicator(suffix_indicator), m_max_bytes_per_word(max_bytes_per_word) { -// std::cerr << "Fast\n"; + if (m_tokenizer == nullptr) { + // vocab constant folding doesn't work, get packed constant + auto packed_vocab_const = as_type_ptr(arguments[5].get_node_shared_ptr()->get_input_node_shared_ptr(0)); + auto packed_vocab_buf = static_cast(packed_vocab_const->get_data_ptr()); + auto vocab_size = *reinterpret_cast(packed_vocab_buf + 0); + auto vocab_begins = reinterpret_cast(packed_vocab_buf + 4); + auto vocab_ends = reinterpret_cast(packed_vocab_buf + 4 + 4); + auto vocab_chars = packed_vocab_buf + 4 + 4 + 4 * vocab_size; + + auto unk_token_id_const = as_type_ptr(arguments[8].get_node_shared_ptr()); + auto unk_token_id = *static_cast(unk_token_id_const->get_data_ptr()); + + core::Vocab vocab; + std::string unk_token; + if(unk_token_id < 0) + unk_token_id += vocab_size; + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + if(id == unk_token_id) + unk_token = token; + } + m_tokenizer = std::make_shared(vocab, unk_token, m_max_bytes_per_word, 
m_suffix_indicator, true); + } constructor_validate_and_infer_types(); } @@ -91,8 +80,6 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec outputs[1].set_shape(inputs[1].get_shape()); const size_t num_rows = inputs[0].get_size(); - //const size_t num_parts = inputs[2].get_size(); - //size_t new_num_parts = num_parts; // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area // to represent different elements in ragged tensor @@ -104,8 +91,6 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto new_elems = outputs[2].data(); int32_t ragged_offset = 0; - using namespace paddlenlp::fast_tokenizer; - for(size_t seq = 0; seq < num_rows; ++seq) { new_begins[seq] = ragged_offset; @@ -114,17 +99,10 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); std::vector results = m_tokenizer->Tokenize(str); -// std::cerr << "[ WordpieceTokenizer ] String bytes: "; for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { std::cerr << static_cast (chars[i]) << " "; } -// std::cerr << "\n"; -// std::cerr << "[ WordpieceTokenizer ] String: '" << str << "'\n"; -// std::cerr << "[ WordpieceTokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; for (const core::Token& token : results) { -// std::cout << "[ WordpieceTokenizer ] id: " << token.id_ << ", value: " << token.value_ -// << ", offset: (" << token.offset_.first << ", " -// << token.offset_.second << ")." << std::endl; OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); new_elems[ragged_offset++] = token.id_; }; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp index cce8c5a21..cbfe664a8 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp @@ -23,7 +23,7 @@ class OPENVINO_API WordpieceTokenizer : public ov::op::Op { ); WordpieceTokenizer( const ov::OutputVector& arguments, - const std::shared_ptr tokenizer, + const std::shared_ptr& tokenizer, const std::string& suffix_indicator = "##", int max_bytes_per_word = 100 ); From e6933b782d606a4e7fcbd4438ec902f92ebd4dc3 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 10 Aug 2023 16:34:49 +0100 Subject: [PATCH 044/116] Add BPE Cache --- .../tokenizer/bpe_tokenizer.cpp | 199 +++++++++--------- .../tokenizer/bpe_tokenizer.hpp | 30 ++- 2 files changed, 121 insertions(+), 108 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp index 25ad3db31..47e4eb7ab 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp @@ -2,17 +2,114 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "fast_tokenizer/normalizers/normalizers.h" -#include "fast_tokenizer/models/models.h" -#include "fast_tokenizer/pretokenizers/pretokenizers.h" - #include "bpe_tokenizer.hpp" #include "utils.hpp" +#include "openvino/opsets/opset10.hpp" using namespace ov; +using namespace ov::opset10; #undef tokenizer + +BPETokenizer::BPETokenizer( + const ov::OutputVector& arguments, + const std::string& unk_token, + bool fuse_unk, + 
const std::string& suffix_indicator, + const std::string& end_suffix, + bool byte_fallback +) : + ov::op::Op(arguments), + m_unk_token(unk_token), + m_fuse_unk(fuse_unk), + m_suffix_indicator(suffix_indicator), + m_end_suffix(end_suffix), + m_byte_fallback(byte_fallback) { + + constructor_validate_and_infer_types(); +} +BPETokenizer::BPETokenizer( + const ov::OutputVector& arguments, + const std::shared_ptr& tokenizer, + const std::string& unk_token, + bool fuse_unk, + const std::string& suffix_indicator, + const std::string& end_suffix, + bool byte_fallback +) : + ov::op::Op(arguments), + m_tokenizer(tokenizer), + m_unk_token(unk_token), + m_fuse_unk(fuse_unk), + m_suffix_indicator(suffix_indicator), + m_end_suffix(end_suffix), + m_byte_fallback(byte_fallback) { + + if (m_tokenizer == nullptr) { + std::cerr << "[ BPETokenizer ] Slow Init\n"; + // vocab constant folding doesn't work, get packed constant + auto packed_vocab_const = as_type_ptr(arguments[5].get_node_shared_ptr()->get_input_node_shared_ptr(0)); + auto packed_vocab_buf = static_cast(packed_vocab_const->get_data_ptr()); + auto vocab_size = *reinterpret_cast(packed_vocab_buf + 0); + auto vocab_begins = reinterpret_cast(packed_vocab_buf + 4); + auto vocab_ends = reinterpret_cast(packed_vocab_buf + 4 + 4); + auto vocab_chars = packed_vocab_buf + 4 + 4 + 4 * vocab_size; + + core::Vocab vocab; + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + } + + auto packed_merges_const = as_type_ptr(arguments[8].get_node_shared_ptr()->get_input_node_shared_ptr(0)); + auto packed_merges_buf = static_cast(packed_merges_const->get_data_ptr()); + auto merges_size = *reinterpret_cast(packed_merges_buf + 0); + auto merges_begins = reinterpret_cast(packed_merges_buf + 4); + auto merges_ends = reinterpret_cast(packed_merges_buf + 4 + 4); + auto merges_chars = packed_merges_buf + 4 + 4 + 4 * merges_size; + + core::Merges merges; + std::string delim = " "; + for(size_t id = 0; id < merges_size; ++id) { + auto merge = std::string(merges_chars + merges_begins[id], merges_chars + merges_ends[id]); + const int delim_pos = merge.find(delim); + + std::pair merge_pair = { + merge.substr(0, delim_pos), merge.substr(delim_pos + 1) + }; + merges.emplace_back(merge_pair); + } + + std::vector unk_token = {}; + if (m_unk_token.size() > 0) { + unk_token.push_back(m_unk_token); + }; + std::vector suffix_indicator = {}; + if (m_suffix_indicator.size() > 0) { + suffix_indicator.push_back(m_suffix_indicator); + }; + std::vector end_suffix = {}; + if (m_end_suffix.size() > 0) { + end_suffix.push_back(m_end_suffix); + }; + + m_tokenizer = std::make_shared( + vocab, + merges, + 10000 /* default cache size */, + std::vector {} /* dropout - don't use dropout for inference */, + unk_token, + suffix_indicator, + end_suffix, + m_fuse_unk + ); + } + + constructor_validate_and_infer_types(); +} + + void BPETokenizer::validate_and_infer_types() { check_ragged_string_input(this, 0); check_string_input(this, 5); @@ -27,20 +124,8 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i auto ends = inputs[3].data(); auto chars = inputs[4].data(); - auto vocab_begins = inputs[5].data(); - auto vocab_ends = inputs[6].data(); - auto vocab_chars = inputs[7].data(); - - auto merges_begins = inputs[8].data(); - auto merges_ends = inputs[9].data(); - auto merges_chars = inputs[10].data(); - - auto vocab_size = inputs[5].get_size(); 
- auto merges_size = inputs[8].get_size(); - OPENVINO_ASSERT(inputs.size() == 11, "Too few inputs passed to BPETokenizer, it means it is not converted properly or it is not used in the supported pattern"); -#if 1 // Set output shapes outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); @@ -50,104 +135,24 @@ bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i // to represent different elements in ragged tensor outputs[2].set_shape({inputs[4].get_size()}); - using namespace paddlenlp::fast_tokenizer; - -// std::cerr << "[ BPETokenizer ] Start vocab reading\n"; - core::Vocab vocab; - int32_t unk_token_id = -1; - -// std::cerr << "[ BPETokenizer ] Vocab size is " << vocab_size << "\n"; - - for(size_t id = 0; id < vocab_size; ++id) { - auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); - vocab[token] = int32_t(id); // TODO: Check range - } - -// std::cerr << "[ BPETokenizer ] Finish vocab reading\n"; -// -// std::cerr << "[ BPETokenizer ] Start merges reading\n"; -// std::cerr << "[ BPETokenizer ] Merges Size: " << merges_size << "\n"; - core::Merges merges; - std::string delim = " "; - - - for(size_t id = 0; id < merges_size; ++id) { - auto merge = std::string(merges_chars + merges_begins[id], merges_chars + merges_ends[id]); - const int delim_pos = merge.find(delim); - - std::pair merge_pair = { - merge.substr(0, delim_pos), merge.substr(delim_pos + 1) - }; - merges.emplace_back(merge_pair); - } - -// std::cerr << "[ BPETokenizer ] Finish merges reading\n"; - - -// std::cerr << "[ BPETokenizer ] Start tokenizer initialization\n"; - - std::vector unk_token = {}; - if (m_unk_token.size() > 0) { - unk_token.push_back(m_unk_token); - }; - std::vector suffix_indicator = {}; - if (m_suffix_indicator.size() > 0) { - suffix_indicator.push_back(m_suffix_indicator); - }; - std::vector end_suffix = {}; - if (m_end_suffix.size() > 0) { - end_suffix.push_back(m_end_suffix); - }; - - models::BPE tokenizer(vocab, merges, 10000 /* default cache size */, {} /* dropout - don't use dropout for inference */, - unk_token, suffix_indicator, end_suffix, m_fuse_unk); - -// std::cerr << "[ BPETokenizer ] Finish tokenizer initialization\n"; - // Get pointers in the output tensors auto new_begins = outputs[0].data(); auto new_ends = outputs[1].data(); auto new_elems = outputs[2].data(); int32_t ragged_offset = 0; -// std::cerr << "Ragged Begins and ends:\n"; -// for (size_t i = 0; i < inputs[0].get_size(); ++i) { -// std::cerr << inputs[0].data()[i] << ", "; -// } -// std::cerr << "\n"; -// for (size_t i = 0; i < inputs[1].get_size(); ++i) { -// std::cerr << inputs[1].data()[i] << ", "; -// } -// std::cerr << "\n"; - - for(size_t seq = 0; seq < num_rows; ++seq) { new_begins[seq] = ragged_offset; for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); - - std::cerr << "[ BPETokenizer ] String: '" << str << "'\n"; -// std::cerr << "[ BPETokenizer ] String len: " << ends[ragged_col] - begins[ragged_col] << "\n"; - - std::vector results = tokenizer.Tokenize(str); - + std::vector results = m_tokenizer->Tokenize(str); for (const core::Token& token : results) { - std::cout << "[ BPETokenizer ] id: " << token.id_ << ", value: " << token.value_ - << ", offset: (" << token.offset_.first << ", " - << token.offset_.second << ")." 
<< std::endl; OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); new_elems[ragged_offset++] = token.id_; }; } - new_ends[seq] = ragged_offset; } outputs[2].set_shape({size_t(ragged_offset)}); return true; - -#else - // Stub implementation that transforms each input string to its length duplicating element if the length is odd - // End of stub implementation -#endif } - diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp index dc0a8dd4b..99073b1b2 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp @@ -5,13 +5,19 @@ #pragma once #include +#include "fast_tokenizer/models/models.h" + + +using namespace paddlenlp::fast_tokenizer; + +#undef tokenizer +#undef m_tokenizer class OPENVINO_API BPETokenizer : public ov::op::Op { public: OPENVINO_OP("BPETokenizer"); BPETokenizer () = default; - BPETokenizer( const ov::OutputVector& arguments, const std::string& unk_token = "", @@ -19,20 +25,21 @@ class OPENVINO_API BPETokenizer : public ov::op::Op { const std::string& suffix_indicator = "", const std::string& end_suffix = "", bool byte_fallback = false - ) : - ov::op::Op(arguments), - m_unk_token(unk_token), - m_fuse_unk(fuse_unk), - m_suffix_indicator(suffix_indicator), - m_end_suffix(end_suffix), - m_byte_fallback(byte_fallback) { - constructor_validate_and_infer_types(); - } + ); + BPETokenizer( + const ov::OutputVector& arguments, + const std::shared_ptr& tokenizer, + const std::string& unk_token = "", + bool fuse_unk = false, + const std::string& suffix_indicator = "", + const std::string& end_suffix = "", + bool byte_fallback = false + ); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_unk_token, m_fuse_unk, m_suffix_indicator, m_end_suffix, m_byte_fallback); + return std::make_shared(inputs, m_tokenizer, m_unk_token, m_fuse_unk, m_suffix_indicator, m_end_suffix, m_byte_fallback); } bool visit_attributes(ov::AttributeVisitor& visitor) override { @@ -51,6 +58,7 @@ class OPENVINO_API BPETokenizer : public ov::op::Op { } private: + std::shared_ptr m_tokenizer; std::string m_unk_token; bool m_fuse_unk = false; std::string m_suffix_indicator; From bd7f9d918bc8d7652daa912e8fbe05cffcae6e9d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 11 Aug 2023 15:30:36 +0100 Subject: [PATCH 045/116] Fix RegexNormalization --- .../user_ie_extensions/tokenizer/bpe_tokenizer.cpp | 1 - .../user_ie_extensions/tokenizer/regex_normalization.cpp | 8 ++++++-- .../user_ie_extensions/tokenizer/regex_split.cpp | 5 ----- .../user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp | 6 +++--- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp index 47e4eb7ab..a16d294f9 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp @@ -47,7 +47,6 @@ BPETokenizer::BPETokenizer( m_byte_fallback(byte_fallback) { if (m_tokenizer == nullptr) { - std::cerr << "[ BPETokenizer ] Slow Init\n"; // vocab constant folding doesn't work, get packed constant auto packed_vocab_const = 
as_type_ptr(arguments[5].get_node_shared_ptr()->get_input_node_shared_ptr(0)); auto packed_vocab_buf = static_cast(packed_vocab_const->get_data_ptr()); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp index 9d6529b4e..bebcfe297 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -46,8 +46,12 @@ bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVec return evaluate_normalization_helper( outputs, inputs, [this](const std::string& str) { - std::string result; - re2::RE2::Extract(str, *m_search_pattern_re, m_replace_pattern, &result); + // FIXME: if regex is not valid re2, return string without changing (use another regex engine) + if (m_search_pattern_re->NumberOfCapturingGroups() == -1) + return str; + + std::string result = str; + re2::RE2::GlobalReplace(&result, *m_search_pattern_re, m_replace_pattern); return result; }); } diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp index f187d877e..103e02049 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -65,7 +65,6 @@ void RegexSplit::validate_and_infer_types() { } bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - std::cerr << "[ RegexSplit ] Eval \n"; auto ragged_begins = inputs[0].data(); auto ragged_ends = inputs[1].data(); @@ -77,13 +76,9 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp const size_t num_rows = inputs[0].get_size(); const size_t num_chars = inputs[4].get_size(); - std::cerr << "[ RegexSplit ] Before Shape \n"; - outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); - std::cerr << "[ RegexSplit ] After Shape \n"; - outputs[2].set_shape(Shape{num_chars}); outputs[3].set_shape(Shape{num_chars}); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp index 3983c3f50..301ee6c0e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp @@ -99,9 +99,9 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); std::vector results = m_tokenizer->Tokenize(str); - for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { - std::cerr << static_cast (chars[i]) << " "; - } +// for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { +// std::cerr << static_cast (chars[i]) << " "; +// } for (const core::Token& token : results) { OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); new_elems[ragged_offset++] = token.id_; From 99c603f534ea4137653ba5c229c15f98e1d93b51 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 6 Sep 2023 17:48:56 +0100 Subject: [PATCH 046/116] Refactor CombineSegments and Padding --- .../tokenizer/tokenizer_pipeline.py | 81 ++++++------------- 1 file changed, 24 insertions(+), 57 deletions(-) diff --git 
a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py index 933c4e7d8..4e86ce698 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py @@ -1,12 +1,10 @@ # -*- coding: utf-8 -*- # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os from dataclasses import dataclass, field from functools import singledispatchmethod -from itertools import chain +from itertools import chain, islice from typing import List, Optional, Any, Dict -from unittest.mock import MagicMock import weakref import numpy as np @@ -39,7 +37,7 @@ def pack_string(s): factory = NodeFactory() # TODO: Use relative path -factory.add_extension("/home/apaniuko/python/openvino/bin/intel64/Debug/libuser_ov_extensions.so") +factory.add_extension("/home/apaniuko/python/openvino/bin/intel64/Release/libuser_ov_extensions.so") class BasePipelineStep: @@ -536,6 +534,9 @@ def from_hf_json_bert_postprocessor( def from_hf_json_roberta_processor( cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1 ) -> "CombineSegmentsStep": + if number_of_inputs == 2: + raise UserInputError("Two inputs not supported for RoBERTa processor") + post_processor_dict = tokenizer_json["post_processor"] inputs: List[TokenWithTypeId] = [Sequence(token_type_id=0)] @@ -547,61 +548,33 @@ def from_hf_json_roberta_processor( 0, AddToken(token=post_processor_dict["cls"][0], token_type_id=0) ) inputs.append(AddToken(token=post_processor_dict["sep"][0], token_type_id=0)) - - if number_of_inputs == 2: - print("WARNING: Pair of inputs not supported for RoBERTa postprocessor") - return cls(inputs) - def get_ov_subgraph(self, input_nodes): + def validate_inputs(self, input_nodes: List[Output]) -> None: number_of_sequence_inputs = sum( 1 for input_ in self.inputs if isinstance(input_, Sequence) ) - #print('number_of_sequence_inputs:', number_of_sequence_inputs) - if number_of_sequence_inputs != len(input_nodes)/3: + if number_of_sequence_inputs != len(input_nodes) / 3: raise UserInputError( f"Number of input nodes: {len(input_nodes)}, must be equal to {number_of_sequence_inputs}" ) - op_inputs = [] - i = 0 + def get_ov_subgraph(self, input_nodes): + self.validate_inputs(input_nodes) + op_inputs = [] + input_nodes_iter = iter(input_nodes) for node in self.inputs: if isinstance(node, Sequence): - op_inputs += input_nodes[3*i:3*(i+1)] - i += 1 + op_inputs.extend(islice(input_nodes_iter, 3)) else: # Put a scalar as a ragged tensor with scalar shape and a single element - op_inputs += make_constant_node(0, Type.i32).outputs() - op_inputs += make_constant_node(1, Type.i32).outputs() - #print('Should be scalar:', op_inputs[-1]) - #print('token', node._token_id) + op_inputs.extend(make_constant_node(0, Type.i32).outputs()) + op_inputs.extend(make_constant_node(1, Type.i32).outputs()) op_inputs.append(make_constant_node(np.array([node._token_id]), Type.i32).output(0)) op_inputs.append(make_constant_node(self.segment_ids, Type.i32).output(0)) - - #print('op_inputs:', op_inputs) - - # FIXME: Disabled for now, no implementation - # operation = string_ops.CombineSegments( - # *op_inputs, - # self.segment_ids, - # self.axis, - # ) - # operation.configure_mock(**{"outputs.return_value": [MagicMock()]}) - # return operation - - # Decomposed implementation - return factory.create('CombineSegments', op_inputs).outputs() - 
print(input_nodes)
-        assert len(input_nodes) == 3, '[ TOKENIZER PIPELINE CONVERSION ] CombineSegments can be converted for a single ragged input tensor only, this is temporary limitation'
-        print('self.segment_ids:', self.segment_ids)
-        # Make another ragged tensor with identical structure but with all values filled with self.segment_ids[0]
-        segment_ids_output = [input_nodes[0], input_nodes[1], opset10.broadcast(make_constant_node(self.segment_ids[0], Type.i32), opset10.shape_of(input_nodes[2])).output(0)]
-        print('[ TOKENIZER PIPELINE CONVERSION ] [ DEBUG ] CombineSegments outputs:', input_nodes + segment_ids_output)
-        return input_nodes + segment_ids_output
-

 @dataclass
@@ -621,20 +594,19 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "PaddingStep":
             # TODO: Initialize max_length
         )
 
-    def get_ov_subgraph(self, input_nodes):
+    @staticmethod
+    def validate_inputs(input_nodes) -> None:
         # Suppose input_nodes may have multiple tuples each with 3 tensors representing decomposed ragged tensors
         # We suppose that all ragged tensors represent the same structure and produce the mask only once
-        assert len(input_nodes) % 3 == 0
-        assert len(input_nodes) >= 3
+        if len(input_nodes) % 3 != 0 or len(input_nodes) < 3:
+            raise UserInputError(
+                f"Number of input nodes should be divisible by 3 and greater than or equal to 3. Got {len(input_nodes)}"
+            )
+
+    def get_ov_subgraph(self, input_nodes):
+        self.validate_inputs(input_nodes)
 
-        #lens = opset10.subtract(input_nodes[1], input_nodes[2])
-        #max_len = opset10.reduce_max(lens)
-        #padded_len =
         outputs = []
-        #print(self.token)
-        #print('max_length =', self.max_length)
-        #print('ERRROR: SETTING MAX_LENGTH = 100')
-        #print('ERROR: Ignoring pad token and set it to id = 0')
 
         if self.max_length == -1 or self.max_length >= 2 ** 31:
             # Calculate max_length as the maximum ragged length
@@ -642,13 +614,8 @@ def get_ov_subgraph(self, input_nodes):
         else:
             max_length = make_constant_node(self.max_length, Type.i32)
 
-        #if self.token_type_id == -1:
-        #    self.token_type_id = 0
         names = ["input_ids", "token_type_ids"]
-        for i, name in zip(range(len(input_nodes)//3), names):
-            #print(input_nodes[3*i:3*(i+1)])
-            #print(as_node(self.max_length).outputs())
-            #print(as_node(np.array(0, dtype=int)).outputs())
+        for i, name in enumerate(names):
             cur_outputs = factory.create(
                 "RaggedToDense",
                 input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs()

From 6cc9b36a9faa31855dfa2f98b9bffa2037c2c65e Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Thu, 7 Sep 2023 20:48:23 +0100
Subject: [PATCH 047/116] Refactoring

---
 .../tokenizer/convert_tokenizer.py            | 109 +++----
 .../user_ie_extensions/tokenizer/hf_parser.py |  20 +-
 .../user_ie_extensions/tokenizer/str_pack.py  |  67 +++--
 .../tokenizer/tokenizer_pipeline.py           | 269 ++++++++----------
 4 files changed, 243 insertions(+), 222 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py
index db2e4b30b..749183297 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py
@@ -3,14 +3,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import sys
-from typing import Any, Tuple, Union
+import logging
+from typing import Any, Tuple, Union, Optional, Sequence
 
-from openvino.runtime.exceptions import OVTypeError
+from openvino.runtime.exceptions import OVTypeError, UserInputError
 from
openvino.runtime import Model +logger = logging.getLogger(__name__) + + def convert_tokenizer( - tokenizer_object: Any, number_of_inputs: int = 1, with_decoder=False + tokenizer_object: Any, number_of_inputs: int = 1, with_decoder=False ) -> Union[Model, Tuple[Model, Model]]: if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase @@ -18,9 +22,7 @@ def convert_tokenizer( # TODO: Remove this check if isinstance(tokenizer_object, PreTrainedTokenizerBase): - pipeline = TransformersTokenizerPipelineParser(tokenizer_object).parse( - number_of_inputs=number_of_inputs - ) + pipeline = TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs) ov_tokenizer = pipeline.get_encoder_ov_subgraph() if with_decoder: ov_detokenizer = pipeline.get_decoder_ov_subgraph() @@ -33,7 +35,8 @@ def convert_tokenizer( filtered_outputs = [] for i, output_name in enumerate(ov_tokenizer_output_names): current_output = next( - (output for output in ov_tokenizer.outputs if output.any_name == output_name), False + (output for output in ov_tokenizer.outputs if output.any_name == output_name), + False, ) if current_output: filtered_outputs.append(current_output) @@ -44,54 +47,64 @@ def convert_tokenizer( filtered_outputs.append(ov_tokenizer.output(i)) if with_decoder: - return Model(filtered_outputs, ov_tokenizer.get_parameters()), ov_detokenizer + return ( + Model(filtered_outputs, ov_tokenizer.get_parameters()), + ov_detokenizer, + ) return Model(filtered_outputs, ov_tokenizer.get_parameters()) raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") -def connect_models(model1: Model, model2: Model, name_map=None, *, by_indices=None, by_names=None) -> Model: - # TODO: Relax this limitation by not connecting some inputs/outputs together - #print(len(model2.inputs)) - #print(len(model1.outputs)) - #assert len(model2.inputs) == len(model1.outputs) - - if by_indices is None and by_names is None: - by_names = True - - if name_map is not None: - by_names = True - - # TODO: Check only one of by_indices and by_names is set - +def connect_models( + first: Model, + second: Model, + name_map: Optional[Sequence[Tuple[str, str]]] = None, + by_indices: bool = False, + keep_unaligned_inputs: bool = True, + keep_unaligned_outputs: bool = False, +) -> Model: if by_indices: - aligned_model1_outputs = model1.outputs - aligned_model2_inputs = model2.inputs - elif by_names: - if name_map is None: - aligned_model1_outputs = model1.outputs - aligned_model2_inputs = [model2.input(model1_output.get_any_name()) for model1_output in aligned_model1_outputs] - - ''' - aligned_model1_outputs = [] - aligned_model2_inputs = [] - for model2_input in model2.inputs: - # Search for corresponding model1 output by all possible names - for model1_output in model2.outputs - ''' - - else: - aligned_model1_outputs = [model1.output(name1) for name1, _ in name_map] - aligned_model2_inputs = [model2.input(name2) for _, name2 in name_map] - - for model2_input, model1_output in zip(aligned_model2_inputs, aligned_model1_outputs): - #print(f'Connecting: {model1_output.get_any_name()} -> {model2_input.get_any_name()}') - for target in model2_input.get_target_inputs(): - target.replace_source_output(model1_output.get_node().input_value(0)) - #target.replace_source_output(model1_output) # TODO: Produces incorrect topology - - connected_model = Model(model2.outputs, model1.get_parameters()) + min_len = min(len(first.outputs), len(second.inputs)) + aligned_first_outputs = 
first.outputs[:min_len]
+        aligned_second_inputs = second.inputs[:min_len]
+    elif name_map is None:
+        aligned_first_outputs = first.outputs
+        aligned_second_inputs = [second.input(model1_output.get_any_name()) for model1_output in aligned_first_outputs]
+    else:
+        aligned_first_outputs = [first.output(name1) for name1, _ in name_map]
+        aligned_second_inputs = [second.input(name2) for _, name2 in name_map]
+
+    for second_input, first_output in zip(aligned_second_inputs, aligned_first_outputs):
+        logger.debug(f"Connecting: {first_output.get_any_name()} -> {second_input.get_any_name()}")
+        for target in second_input.get_target_inputs():
+            target.replace_source_output(first_output.get_node().input_value(0))
+            # target.replace_source_output(model1_output)  # TODO: Produces incorrect topology
+
+    new_inputs = first.get_parameters()
+    remaining_inputs = [input_ for input_ in second.inputs if input_ not in aligned_second_inputs]
+    if keep_unaligned_inputs:
+        new_inputs.extend(remaining_inputs)
+    elif remaining_inputs:
+        logger.info(
+            "Some inputs of the second model were left uncovered and not included in the connected model: "
+            + ", ".join(input_.name for input_ in remaining_inputs)
+            + ". To add them set `keep_unaligned_inputs` to `True`"
+        )
+
+    new_outputs = second.outputs
+    remaining_outputs = [output for output in first.outputs if output not in aligned_first_outputs]
+    if keep_unaligned_outputs:
+        new_outputs.extend(remaining_outputs)
+    elif remaining_outputs:
+        logger.info(
+            "Some outputs of the first model were left uncovered and not included in the connected model: "
+            + ", ".join(output.name for output in remaining_outputs)
+            + ". To add them set `keep_unaligned_outputs` to `True`"
+        )
+
+    connected_model = Model(new_outputs, new_inputs, f"{first.get_name()}_{second.get_name()}")
     # TODO: Cleanup model1 and model2 to avoid using them, they are ill-formed after the reconnection
     connected_model.validate_nodes_and_infer_types()
     return connected_model
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py
index 1b2772bde..ff800ac0b 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py
@@ -69,12 +69,12 @@ def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep:
     return RegexSplitStep(
         split_pattern=split_pattern,
         invert=pretokenizer_dict["invert"],
-        behaviour=pretokenizer_dict["behavior"].lower().rstrip("d")
+        behaviour=pretokenizer_dict["behavior"].lower().rstrip("d"),
     )
 
 
 def parse_byte_level_pretokenization_step(
-    pretokenizer_dict: Dict[str, Any]
+    pretokenizer_dict: Dict[str, Any]
 ) -> List[Union[NormalizationStep, PreTokenizatinStep]]:
     steps = []
     if pretokenizer_dict.get("add_prefix_space"):
@@ -117,7 +117,10 @@ def parse(self, number_of_inputs: Optional[int] = None) -> TokenizerPipeline:
 
         return self.pipeline
 
-    normalizers_map: Dict[str, Callable[[Dict[str, Any]], Union[NormalizationStep, List[NormalizationStep]]]] = {
+    normalizers_map: Dict[
+        str,
+        Callable[[Dict[str, Any]], Union[NormalizationStep, List[NormalizationStep]]],
+    ] = {
         "NFC": lambda step_dict: NormalizeUnicode("NFC"),
         "NFD": lambda step_dict: NormalizeUnicode("NFD"),
         "NFKC": lambda step_dict: NormalizeUnicode("NFKC"),
@@ -146,7 +149,10 @@ def normalization(self) -> None:
         else:
             self.parse_normalizer_step(self.tokenizer_json["normalizer"])
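A note on the packed string-tensor layout this series keeps unpacking by hand: the C++ constructors above read vocab and merges through `4 + 4 + 4 * size` offsets into a packed buffer, and the str_pack.py rewrite below writes the same layout from Python: an i32 element count, then count + 1 i32 end offsets (the first is always 0), then the concatenated UTF-8 bytes of all strings. A minimal decoding sketch, an editor's illustration rather than part of the patches (the helper name decode_packed_strings is hypothetical):

import numpy as np

def decode_packed_strings(u8: np.ndarray) -> list:
    # Layout: [batch_size:i32][end_0:i32]...[end_N:i32][utf-8 bytes]
    batch_size = int(np.frombuffer(u8[:4].tobytes(), dtype=np.int32)[0])
    # batch_size + 1 end offsets; string i spans ends[i]..ends[i + 1]
    ends = np.frombuffer(u8[4:4 + 4 * (batch_size + 1)].tobytes(), dtype=np.int32)
    chars = u8[4 * (batch_size + 2):].tobytes()
    return [chars[ends[i]:ends[i + 1]].decode("utf-8") for i in range(batch_size)]

# Round trip against a buffer built with the same layout pack_strings produces:
strings = ["hello", "мир"]
payload = b"".join(s.encode("utf-8") for s in strings)
ends = [0]
for s in strings:
    ends.append(ends[-1] + len(s.encode("utf-8")))
packed = np.frombuffer(np.int32([len(strings)] + ends).tobytes() + payload, dtype=np.uint8)
assert decode_packed_strings(packed) == strings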
-    pre_tokenization_map: Dict[str, Callable[[Dict[str, Any]], Union[PreTokenizatinStep, List[PreTokenizatinStep]]]] = {
+    pre_tokenization_map: Dict[
+        str,
+        Callable[[Dict[str, Any]], Union[PreTokenizatinStep, List[PreTokenizatinStep]]],
+    ] = {
         "BertPreTokenizer": lambda step_dict: RegexSplitStep.bert_splitter(),
         "Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(),
         "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(),
@@ -155,7 +161,7 @@ def normalization(self) -> None:
         "ByteLevel": parse_byte_level_pretokenization_step,
         "Digits": lambda step_dict: RegexSplitStep.digits_splitter(
             "isolate" if step_dict["individual_digits"] else "contiguous"
-        )
+        ),
     }
 
     def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None:
@@ -186,8 +192,8 @@ def tokenization_model(self) -> None:
     def post_tokenization(self) -> None:
         if (
-            self.tokenizer_json["post_processor"] is None
-            or self.tokenizer_json["post_processor"]["type"] == "ByteLevel"
+            self.tokenizer_json["post_processor"] is None
+            or self.tokenizer_json["post_processor"]["type"] == "ByteLevel"
         ):
             self.add_truncation()
             self.add_padding()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py b/modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py
index 01d739661..1f09a992e 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py
@@ -1,28 +1,61 @@
+from typing import List
+from io import BytesIO
+
 import numpy as np
+from numpy.typing import NDArray
+from openvino.runtime.exceptions import UserInputError
+
+
+def to_bytes(number: int) -> bytes:
+    return number.to_bytes(4, "little")
+
+
+def pack_string(string: str) -> NDArray:
+    return np.frombuffer(bytes(string + " ", "utf-8"), dtype=np.uint8)  # + ' ' is WA for CPU bug
+
+
+def pack_strings(strings: List[str]) -> NDArray:
+    """
+    Convert any list of string to U8/1D numpy array compatible with converted OV model input
+    """
+    if not isinstance(strings, list):
+        raise UserInputError(f"Expected a list of strings, got {type(strings)}")
 
-# Convert any list of string to U8/1D numpy array compatible with converted OV model input
-def pack_strings(strings):
-    to_bytes = lambda x: x.to_bytes(4, 'little')
     batch_size = len(strings)
     if batch_size == 0:
         return to_bytes(0)
-    offsets = to_bytes(0)
-    symbols = bytes()
-    for s in strings:
-        symbols += bytes(s, 'utf-8')
-        offsets += to_bytes(len(symbols))
-    return np.frombuffer(bytearray(to_bytes(batch_size) + offsets + symbols), np.uint8)
-
-# Convert an array of uint8 elements to a list of strings; reverse to pack_strings
+
+    buffer = BytesIO()
+    buffer.write(to_bytes(batch_size))
+    symbols = BytesIO()
+    offset = 0
+    buffer.write(to_bytes(offset))
+    for string in strings:
+        byte_string = string.encode("utf-8")
+        offset += len(byte_string)
+
+        buffer.write(to_bytes(offset))
+        symbols.write(byte_string)
+
+    buffer.write(symbols.getvalue())
+    return np.frombuffer(buffer.getvalue(), np.uint8)
+
+
+# TODO: handle possible signed values in batch size and offsets
-def unpack_strings(u8_tensor):
-    from_bytes = lambda offset, size: int.from_bytes(u8_tensor[offset:offset+size], 'little')
+def unpack_strings(u8_tensor: NDArray, decoding_errors: str = "replace") -> List[str]:
+    """
+    Convert an array of uint8 elements to a list of strings; reverse to pack_strings
+    """
+
+    def from_bytes(offset: int, size: int) -> int:
+        return int.from_bytes(u8_tensor[offset : offset + size], "little")
+
     batch_size = from_bytes(0, 4)
     strings = []
     for i in range(batch_size):
-        begin = from_bytes(4 + i*4, 4)
-        end = from_bytes(4 +
(i+1)*4, 4) + begin = from_bytes(4 + i * 4, 4) + end = from_bytes(4 + (i + 1) * 4, 4) length = end - begin - begin += 4*(batch_size + 2) - strings.append(bytes(u8_tensor[begin:begin+length]).decode('utf-8')) + begin += 4 * (batch_size + 2) + strings.append(bytes(u8_tensor[begin : begin + length]).decode("utf-8", errors=decoding_errors)) return strings diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py index 4e86ce698..114a369ac 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py @@ -4,36 +4,17 @@ from dataclasses import dataclass, field from functools import singledispatchmethod from itertools import chain, islice -from typing import List, Optional, Any, Dict +from typing import List, Optional, Any, Dict, Union import weakref import numpy as np from openvino.runtime.exceptions import UserInputError, OVTypeError -from openvino.runtime import Type, PartialShape, op, Model, Output, Node, opset10 +from openvino.runtime import Type, PartialShape, op, Model, Output, opset10 from openvino.runtime.utils.node_factory import NodeFactory from openvino.runtime.utils.types import as_node, make_constant_node - -def pack_strings(strings): - assert isinstance(strings, list) - to_bytes = lambda x: x.to_bytes(4, "little") - batch_size = len(strings) - if batch_size == 0: - return to_bytes(0) - offsets = to_bytes(0) - symbols = bytes() - for s in strings: - symbols += bytes(s, "utf-8") - offsets += to_bytes(len(symbols)) - return np.frombuffer(bytearray(to_bytes(batch_size) + offsets + symbols), np.uint8) - - -def pack_string(s): - return np.frombuffer( - bytes(s + " ", "utf-8"), dtype=np.uint8 - ) # + ' ' is WA for CPU bug - +from str_pack import pack_string, pack_strings factory = NodeFactory() # TODO: Use relative path @@ -63,11 +44,11 @@ def get_pipeline(self) -> Optional["TokenizerPipeline"]: def set_pipeline(self, pipeline: "TokenizerPipeline") -> None: self._pipeline = weakref.ref(pipeline) - def get_ov_subgraph(self, *input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, *input_nodes: List[Output]) -> List[Output]: raise NotImplementedError @staticmethod - def create_string_constant_node(value: str) -> op.Constant: + def create_string_constant_node(value: Union[str, List[str]]) -> op.Constant: if isinstance(value, str): # string scalar ps = pack_string(value) @@ -87,15 +68,17 @@ class NormalizationStep(BasePipelineStep): class NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return factory.create( - "NormalizeUnicode", input_nodes, {"normalization_form": self.normalization_form} + "NormalizeUnicode", + input_nodes, + {"normalization_form": self.normalization_form}, ).outputs() @dataclass class CaseFoldStep(NormalizationStep): - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return factory.create("CaseFold", input_nodes).outputs() @@ -115,23 +98,26 @@ def add_prefix_whitespace_regex(cls) -> "RegexNormalizationStep": @classmethod def del_control_chars_regex(cls) -> "RegexNormalizationStep": # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L17 - 
return cls(regex_search_pattern=r"((?=[^\n\t\r])\p{Cc})|((?=[^\n\t\r])\p{Cf})", replace_term=" ")
+        return cls(
+            regex_search_pattern=r"((?=[^\n\t\r])\p{Cc})|((?=[^\n\t\r])\p{Cf})",
+            replace_term=" ",
+        )
 
     @classmethod
     def clean_up_tokenization_spaces(cls) -> "RegexNormalizationStep":
-        return cls(regex_search_pattern=r" ([\.\?\!\,])| ('[ms])| (') | ('[rv]e)", replace_term="\1")
+        return cls(
+            regex_search_pattern=r" ([\.\?\!\,])| ('[ms])| (') | ('[rv]e)",
+            replace_term="\1",
+        )
 
-    def get_ov_subgraph(self, input_nodes: List[Output]) -> Node:
+    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.regex_search_pattern).outputs(),
-                *self.create_string_constant_node(self.replace_term).outputs(),
+                self.create_string_constant_node(self.regex_search_pattern),
+                self.create_string_constant_node(self.replace_term),
             )
         )
-        return factory.create(
-            "RegexNormalization",
-            input_nodes
-        ).outputs()
+        return factory.create("RegexNormalization", input_nodes).outputs()
 
 
 @dataclass
@@ -144,14 +130,14 @@ class NMTNormalizationStep(NormalizationStep):
 
 @dataclass
 class StripAccentsStep(NormalizationStep):
-    def get_ov_subgraph(self, input_nodes: List[Output]) -> Node:
+    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         return RegexNormalizationStep.strip_accents_regex().get_ov_subgraph(input_nodes)
 
 
 @dataclass
 class DelControlCharsStep(NormalizationStep):
-    def get_ov_subgraph(self, input_nodes: List[Output]) -> Node:
+    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         return RegexNormalizationStep.del_control_chars_regex().get_ov_subgraph(input_nodes)
 
 
 @dataclass
@@ -209,7 +195,7 @@ def bert_keep_delimeters_splitter(cls) -> "RegexSplitStep":
             ],
         ),
         invert=False,
-        behaviour="isolate"
+        behaviour="isolate",
     )
 
     @classmethod
@@ -230,11 +216,7 @@ def byte_level_splitter(cls) -> "RegexSplitStep":
 
     @classmethod
     def add_whitespace_to_the_next_word(cls):
-        return cls(
-            r"\s\S",
-            invert=False,
-            behaviour="merge_with_next"
-        )
+        return cls(r"\s\S", invert=False, behaviour="merge_with_next")
 
     @classmethod
     def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
@@ -244,10 +226,8 @@ def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
             behaviour=behaviour,
         )
 
-    def get_ov_subgraph(self, input_nodes: List[Output]) -> Node:
-        input_nodes.extend(
-            self.create_string_constant_node(self.split_pattern).outputs()
-        )
+    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
+        input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
         return factory.create(
             "RegexSplit",
             input_nodes,
@@ -261,20 +241,23 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node:
 
 @dataclass
 class WhitespaceSplitStep(PreTokenizatinStep):
     """Works like python `str.split`."""
+
-    def get_ov_subgraph(self, input_nodes: List[Output]) -> Node:
+    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes)
 
 
 @dataclass
 class PunctuationSplitStep(PreTokenizatinStep):
     """Splits string on punctuation chars."""
+
     # behaviour: str = "Isolated"
 
 
 @dataclass
 class BytesToCharsStep(PreTokenizatinStep):
     """Maps chars to other chars for Byte-level BPE Tokenizer"""
-    def get_ov_subgraph(self,
input_nodes: List[Output]) -> Node: + + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return factory.create( "BytesToChars", input_nodes, @@ -312,7 +295,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "WordPieceTokenizationS vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])], ) - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes.extend( ( *self.create_string_constant_node(self.vocab).outputs(), @@ -325,7 +308,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: { "suffix_indicator": self.suffix_indicator, "max_bytes_per_word": self.max_bytes_per_word, - } + }, ).outputs() @@ -350,7 +333,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep": merges=tokenizer_json["model"]["merges"], ) - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: pipeline = self.get_pipeline() pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs() input_nodes.extend( @@ -368,7 +351,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: "suffix_indicator": self.suffix_indicator, "end_suffix": self.end_suffix, "byte_fallback": self.byte_fallback, - } + }, ).outputs() @@ -398,31 +381,32 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any], num_of_added_tokens: int = def from_hf_object(cls, tokenizer: Any, num_of_added_tokens: int = 0) -> "TruncationStep": max_length = min( tokenizer.model_max_length - num_of_added_tokens, - 2 ** 31 - 1 - num_of_added_tokens, + 2**31 - 1 - num_of_added_tokens, ) return cls( max_length=max_length, truncate_right=tokenizer.truncation_side == "right", ) + @staticmethod + def validate_inputs(input_nodes): + if len(input_nodes) != 3: + raise UserInputError("Only one input ragged tensor is supported as an input for TruncationStep") + def get_ov_subgraph(self, input_nodes: List[Output]): - # FIXME: disabled for now - # operation = string_ops.Truncation( - # *input_nodes, - # as_node(self.max_length), - # self.truncate_right, - # self.axis, - # ) - # operation.configure_mock(**{"outputs.return_value": [MagicMock() for _ in range(len(input_nodes))]}) - # return operation - #print('[ TOKENIZER PIPELINE CONVERSION ] WARNING: Truncation is not applied because it is not implemented') - #print('Trunc max_length:', self.max_length) # FIXME: Truncation side (truncate_right) is ignored # TODO: Check if axis is the right-most dimension - assert len(input_nodes) == 3, 'Only one input ragged tensor is supported as an input for TruncationStep' + self.validate_inputs(input_nodes) - max_length = opset10.minimum(opset10.subtract(input_nodes[1], input_nodes[0]), make_constant_node(self.max_length, Type.i32)) - return [input_nodes[0], opset10.add(input_nodes[0], max_length).output(0), input_nodes[2]] + max_length = opset10.minimum( + opset10.subtract(input_nodes[1], input_nodes[0]), + make_constant_node(self.max_length, Type.i32), + ) + return [ + input_nodes[0], + opset10.add(input_nodes[0], max_length).output(0), + input_nodes[2], + ] @dataclass @@ -434,10 +418,6 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None: if vocab is not None and self.token in vocab: self._token_id = vocab.index(self.token) - @property - def token_id(self) -> Optional[int]: - return self._token_id - @dataclass class TokenWithTypeId: @@ -481,7 +461,7 @@ def 
number_of_added_tokens(self) -> int:
 
     @classmethod
     def from_hf_json_template_postprocessor(
-            cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
+        cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
     ) -> "CombineSegmentsStep":
         inputs: List[TokenWithTypeId] = []
         if number_of_inputs == 1:
@@ -498,12 +478,11 @@ def from_hf_json_template_postprocessor(
                 inputs.append(step)
             else:
                 inputs.append(Sequence(token_type_id=template_dict["Sequence"]["type_id"]))
-
         return cls(inputs)
 
     @classmethod
     def from_hf_json_bert_postprocessor(
-            cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
+        cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
     ) -> "CombineSegmentsStep":
         post_processor_dict = tokenizer_json["post_processor"]
         inputs: List[TokenWithTypeId] = [
@@ -517,7 +496,6 @@ def from_hf_json_bert_postprocessor(
                 token_type_id=0,
             ),
         ]
-
         if number_of_inputs == 2:
             inputs.extend(
                 [
@@ -532,7 +510,7 @@ def from_hf_json_bert_postprocessor(
 
     @classmethod
     def from_hf_json_roberta_processor(
-            cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
+        cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
     ) -> "CombineSegmentsStep":
         if number_of_inputs == 2:
             raise UserInputError("Two inputs not supported for RoBERTa processor")
@@ -544,16 +522,12 @@ def from_hf_json_roberta_processor(
         if not post_processor_dict.get("add_special_tokens", True):
             return cls(inputs)
 
-        inputs.insert(
-            0, AddToken(token=post_processor_dict["cls"][0], token_type_id=0)
-        )
+        inputs.insert(0, AddToken(token=post_processor_dict["cls"][0], token_type_id=0))
         inputs.append(AddToken(token=post_processor_dict["sep"][0], token_type_id=0))
         return cls(inputs)
 
     def validate_inputs(self, input_nodes: List[Output]) -> None:
-        number_of_sequence_inputs = sum(
-            1 for input_ in self.inputs if isinstance(input_, Sequence)
-        )
+        number_of_sequence_inputs = sum(1 for input_ in self.inputs if isinstance(input_, Sequence))
         if number_of_sequence_inputs != len(input_nodes) / 3:
             raise UserInputError(
                 f"Number of input nodes: {len(input_nodes)}, must be equal to {number_of_sequence_inputs}"
@@ -567,14 +541,16 @@ def get_ov_subgraph(self, input_nodes):
         for node in self.inputs:
             if isinstance(node, Sequence):
                 op_inputs.extend(islice(input_nodes_iter, 3))
-            else:
+            elif isinstance(node, AddToken):
                 # Put a scalar as a ragged tensor with scalar shape and a single element
                 op_inputs.extend(make_constant_node(0, Type.i32).outputs())
                 op_inputs.extend(make_constant_node(1, Type.i32).outputs())
                 op_inputs.append(make_constant_node(np.array([node._token_id]), Type.i32).output(0))
+            else:
+                raise UserInputError(f"Unexpected node type in CombineSegments: {type(node)}")
 
         op_inputs.append(make_constant_node(self.segment_ids, Type.i32).output(0))
-        return factory.create('CombineSegments', op_inputs).outputs()
+        return factory.create("CombineSegments", op_inputs).outputs()
 
 
 @dataclass
@@ -595,7 +571,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "PaddingStep":
         )
 
     @staticmethod
-    def validate_inputs(input_nodes) -> None:
+    def validate_inputs(input_nodes: List[Output]) -> None:
         # Suppose input_nodes may have multiple tuples each with 3 tensors representing decomposed ragged tensors
         # We suppose that all ragged tensors represent the same structure and produce the mask only once
         if len(input_nodes) % 3 != 0 or len(input_nodes) < 3:
             raise UserInputError(
                 f"Number of input nodes should be divisible by 3 and greater than or equal to 3. 
Got {len(input_nodes)}" ) - def get_ov_subgraph(self, input_nodes): + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: self.validate_inputs(input_nodes) outputs = [] - if self.max_length == -1 or self.max_length >= 2 ** 31: + if self.max_length == -1 or self.max_length >= 2**31: # Calculate max_length as the maximum ragged length - max_length = opset10.reduce_max(opset10.subtract(input_nodes[1], input_nodes[0]), make_constant_node(0, Type.i32)) + max_length = opset10.reduce_max( + opset10.subtract(input_nodes[1], input_nodes[0]), + make_constant_node(0, Type.i32), + ) else: max_length = make_constant_node(self.max_length, Type.i32) - names = ["input_ids", "token_type_ids"] + names = ["input_ids", "token_type_ids"][: len(input_nodes) // 3] for i, name in enumerate(names): cur_outputs = factory.create( "RaggedToDense", - input_nodes[3*i:3*(i+1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs() + input_nodes[3 * i : 3 * (i + 1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs(), ).outputs() cur_outputs[0].tensor.add_names({name}) outputs.append(cur_outputs[0]) if i == 0: - mask = opset10.convert(cur_outputs[1], "i32").output(0) # TODO: Change RaggedToDense to generate mask of any type + mask = opset10.convert(cur_outputs[1], "i32").output( + 0 + ) # TODO: Change RaggedToDense to generate mask of any type mask.tensor.add_names({"attention_mask"}) outputs.append(mask) @@ -642,14 +623,14 @@ class VocabDecoderStep(DecodingStep): def get_vocab_node_outputs(self) -> Optional[List[Output]]: return self.get_pipeline().vocab_node_outputs - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes.extend(self.get_vocab_node_outputs()) return factory.create("VocabDecoder", input_nodes, {}).outputs() @dataclass class CharsToBytesStep(DecodingStep): - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return factory.create("CharsToBytes", input_nodes, {}).outputs() @@ -665,17 +646,14 @@ def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep": replace_term=r"\1", ) - def get_ov_subgraph(self, input_nodes: List[Output]) -> Node: + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes.extend( ( *self.create_string_constant_node(self.regex_search_pattern).outputs(), *self.create_string_constant_node(self.replace_term).outputs(), ) ) - return factory.create( - "RegexNormalization", - input_nodes - ).outputs() + return factory.create("RegexNormalization", input_nodes).outputs() @dataclass @@ -706,6 +684,26 @@ def _(self, steps: list) -> None: def __getitem__(self, item: int) -> BasePipelineStep: return self.steps[item] + def get_encoder_ov_subgraph(self) -> Model: + input_nodes = [self.create_string_input() for _ in range(self.number_of_inputs)] + + processing_outputs = [] + for input_node in input_nodes: + input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs() + for step in self.normalization_steps: + input_node = step.get_ov_subgraph(input_node) + input_node = self.add_ragged_dimention(input_node) + + for step in chain(self.pretokenization_steps, self.tokenization_steps): + input_node = step.get_ov_subgraph(input_node) + + processing_outputs.extend(input_node) + + for step in self.post_tokenization_steps: + processing_outputs = step.get_ov_subgraph(processing_outputs) + + return Model(processing_outputs, input_nodes, 
name="tokenizer_encoder") + @property def normalization_steps(self) -> List[NormalizationStep]: return [step for step in self.steps if isinstance(step, NormalizationStep)] @@ -726,51 +724,29 @@ def post_tokenization_steps(self) -> List[PostTokenizationStep]: def decoding_steps(self) -> List[DecodingStep]: return [step for step in self.steps if isinstance(step, DecodingStep)] - def create_string_input(self) -> Node: + def create_string_input(self) -> op.Parameter: return op.Parameter(Type.u8, PartialShape(["?"])) - def create_int_input(self, input_type=Type.i32) -> Node: + def create_int_input(self, input_type=Type.i32) -> op.Parameter: return op.Parameter(input_type, PartialShape(["?", "?", "?"])) - def create_processing_pipeline(self, input_nodes: List[op.Parameter]) -> List[Node]: - processing_pipelines_outputs = [] - - for input_node in input_nodes: - input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs() - for step in self.normalization_steps: - input_node = step.get_ov_subgraph(input_node) - - shape = opset10.shape_of(input_node[0]) - batch_size = opset10.gather(shape, as_node(0), as_node(0)) - - # FIXME: Cannot create range with specific data type from python - ragged_begins = opset10.convert( - opset10.range(as_node(0), batch_size, as_node(1)), - "i32", - ).outputs() - ragged_ends = opset10.convert( - opset10.range( - as_node(1), - opset10.add(batch_size, as_node(1)), - as_node(1), - ), - "i32", - ).outputs() - input_node = ragged_begins + ragged_ends + input_node - - for step in chain(self.pretokenization_steps, self.tokenization_steps): - input_node = step.get_ov_subgraph(input_node) - - processing_pipelines_outputs += input_node - - return processing_pipelines_outputs - - def create_post_tokenization_pipeline(self, input_nodes: List[op.Parameter]) -> List[Output]: - for step in self.post_tokenization_steps: - pipeline_step = step.get_ov_subgraph(input_nodes) - input_nodes = pipeline_step - - return input_nodes + def add_ragged_dimention(self, input_node): + shape = opset10.shape_of(input_node[0]) + batch_size = opset10.gather(shape, as_node(0), as_node(0)) + # FIXME: Cannot create range with specific data type from python + ragged_begins = opset10.convert( + opset10.range(as_node(0), batch_size, as_node(1)), + "i32", + ).outputs() + ragged_ends = opset10.convert( + opset10.range( + as_node(1), + opset10.add(batch_size, as_node(1)), + as_node(1), + ), + "i32", + ).outputs() + return ragged_begins + ragged_ends + input_node def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: for step in self.decoding_steps: @@ -779,12 +755,6 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: return factory.create("StringTensorPack", input_nodes).outputs() - def get_encoder_ov_subgraph(self) -> Model: - input_nodes = [self.create_string_input() for _ in range(self.number_of_inputs)] - processing_outputs = self.create_processing_pipeline(input_nodes) - outputs = self.create_post_tokenization_pipeline(processing_outputs) - return Model(outputs, input_nodes, name="tokenizer_encoder") - def get_greedy_decoding_ov_subgraph(self, input_node: op.Parameter) -> List[Output]: argmax = opset10.topk( data=input_node, @@ -806,4 +776,3 @@ def get_decoder_ov_subgraph(self) -> Model: model = Model(outputs, [input_node], name="tokenizer_decoder") model.output().tensor.add_names({"string_output"}) return model - From 973c52d7fbd3ccfdf10837fd454ceba25a20b431 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 8 Sep 2023 15:31:33 
+0100 Subject: [PATCH 048/116] Clean-up commented code --- .../user_ie_extensions/tokenizer/bytes_to_chars.cpp | 1 - .../user_ie_extensions/tokenizer/bytes_to_chars.hpp | 8 +++----- .../user_ie_extensions/tokenizer/chars_to_bytes.cpp | 7 +------ .../user_ie_extensions/tokenizer/chars_to_bytes.hpp | 3 +++ .../user_ie_extensions/tokenizer/combine_segments.cpp | 7 ------- .../user_ie_extensions/tokenizer/convert_tokenizer.py | 2 +- .../user_ie_extensions/tokenizer/ragged_tensor_pack.cpp | 7 ------- .../user_ie_extensions/tokenizer/regex_split.cpp | 2 -- .../user_ie_extensions/tokenizer/regex_split.hpp | 3 --- .../user_ie_extensions/tokenizer/string_tensor_unpack.cpp | 6 +++--- .../user_ie_extensions/tokenizer/utils.cpp | 4 ---- .../user_ie_extensions/tokenizer/vocab_decoder.cpp | 1 - .../user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp | 3 --- 13 files changed, 11 insertions(+), 43 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp index 2919098a3..b9b4f5338 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp @@ -274,7 +274,6 @@ const std::array, 256> create_bytes_to_chars_map() { void BytesToChars::validate_and_infer_types() { check_ragged_string_input(this, 0); -// check_string_input(this, 5); set_ragged_string_output(this, 0, get_input_partial_shape(0)); } diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp index 2b7598c50..d064467a0 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp @@ -26,11 +26,9 @@ class OPENVINO_API BytesToChars : public ov::op::Op { return std::make_shared(inputs); } -// bool visit_attributes(ov::AttributeVisitor& visitor) override { -// visitor.on_attribute("suffix_indicator", m_suffix_indicator); -// visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); -// return true; -// } + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp index d87645d4b..75e980dd3 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp @@ -39,11 +39,6 @@ bool CharsToBytes::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i auto ends = inputs[3].data(); auto chars = inputs[4].data(); - OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to CharsToBytes, it means it is not converted properly or it is not used in the supported pattern"); - - // Set output shapes -// outputs[0] = inputs[0]; -// outputs[1] = inputs[1]; outputs[0].set_shape(inputs[0].get_shape()); outputs[1].set_shape(inputs[1].get_shape()); outputs[2].set_shape(Shape({inputs[4].get_size()})); @@ -66,7 +61,7 @@ bool CharsToBytes::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i new_chars[char_pointer++] = first_byte; } else { const auto second_byte = chars[begins[col] + (++k)]; - new_chars[char_pointer++] = m_pair_map[first_byte - 
194][second_byte - 128]; + new_chars[char_pointer++] = m_pair_map[first_byte - m_first_byte_offset][second_byte - m_second_byte_offset]; } } }; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp index 428788610..25dd91dc3 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp @@ -38,4 +38,7 @@ class OPENVINO_API CharsToBytes : public ov::op::Op { private: const std::array, 4> m_pair_map = create_pair_map(); const uint8_t m_one_byte_border = 128; // if char > 128 => it is two byte char + // + const uint8_t m_first_byte_offset = 194; + const uint8_t m_second_byte_offset = 128; }; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp index 7d8ca05a6..17a3a6e98 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp @@ -28,8 +28,6 @@ void CombineSegments::validate_and_infer_types() { OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i + 1))); } - //std::cerr << ps << '\n'; - set_ragged_output(this, 0, ps, et); // TODO: Avoid emitting ragged indices for the second ragged tensor, they should be identical to the first output ragged tensor set_ragged_output(this, 3, ps, get_input_element_type(get_input_size() - 1)); @@ -54,15 +52,11 @@ bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector begins.push_back(inputs[3*i + 0].data()); ends.push_back(inputs[3*i + 1].data()); nelems.push_back(inputs[3*i + 0].get_size()); - //std::cerr << "inputs[3*i + 0].get_size() = " << inputs[3*i + 0].get_size() << "\n"; elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); // TODO: Get rank from a tensor instead of partial_shape. This is a WA for CPU bug that gives 1D tensors instead of 0D tensors. 
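// Layout note: each ragged input occupies three consecutive operands: begins at index 3*i, ends at 3*i + 1 and the flat element buffer at 3*i + 2.
// A begins/ends pair holding a single element acts as a scalar and is broadcast over max_nelems when the flat output size is estimated below.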
if(get_input_partial_shape(3*i + 0).rank().get_length() > 0) { ps = inputs[3*i + 0].get_shape(); - //std::cerr << "updated\n"; } - //std::cerr << "ps = " << ps << "\nget_input_partial_shape(3*i) = " << get_input_partial_shape(3*i) << "\n"; - //OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); max_nelems = std::max(max_nelems, nelems.back()); } @@ -70,7 +64,6 @@ bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector // This is only an estimation, not the exact output size, because ragged tensor may have gaps in the representation for(size_t i = 0; i < num_of_ragged; ++i) { - //std::cerr << "max_nelems = " << max_nelems << "\n"; if(nelems[i] == 1) { flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast } else { diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py index 749183297..2890e155b 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py @@ -85,7 +85,7 @@ def connect_models( new_inputs = first.get_parameters() remaining_inputs = [input_ for input_ in second.inputs if input_ not in aligned_second_inputs] if keep_unaligned_inputs: - new_inputs.expend(remaining_inputs) + new_inputs.extend(remaining_inputs) elif remaining_inputs: logger.info( "Some inputs of the second model were left uncovered and not included in the connected model: " diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp index aaf2ad78f..6276d13df 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp @@ -21,18 +21,11 @@ void RaggedTensorPack::validate_and_infer_types() { bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - // Implementation for debuggin purposes: directly print ragged indices to std::cout and pass the base tensor with elements throug. 
- auto input_shape = inputs[0].get_shape(); - //std::cout << "[ DEBUG ] RaggedTensorPack: shape = " << input_shape << "\n"; auto begins = inputs[0].data(); auto ends = inputs[1].data(); auto num_elements = shape_size(input_shape); - //for(size_t i = 0; i < num_elements; ++i) { - //std::cout << "[ DEBUG ] [" << i << "] " << begins[i] << ":" << ends[i] << " with size = " << ends[i] - begins[i] << "\n"; - //} - inputs[2].copy_to(outputs[0]); return true; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp index 103e02049..235a54cce 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -4,7 +4,6 @@ #include "openvino/op/util/framework_node.hpp" #include "openvino/opsets/opset10.hpp" -// #include "regex_split.hpp" #include "utils.hpp" @@ -65,7 +64,6 @@ void RegexSplit::validate_and_infer_types() { } bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - auto ragged_begins = inputs[0].data(); auto ragged_ends = inputs[1].data(); auto begins = inputs[2].data(); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp index a7d9b2851..e2729cce0 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp @@ -4,11 +4,8 @@ #pragma once -#include "normalizer.h" // for absl::string_view - #include #include "openvino/opsets/opset10.hpp" -#include "fast_tokenizer/normalizers/normalizers.h" // for re2::RE2 #include "fast_tokenizer/pretokenizers/pretokenizers.h" using namespace ov; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp index 35854e685..119bbf9b8 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp @@ -16,9 +16,9 @@ void StringTensorUnpack::validate_and_infer_types() { auto output_shape = PartialShape::dynamic(); // In case of explicit string tensors the shape is carried by input tensor itself - // OPENVINO_ASSERT( - // input_shape == PartialShape::dynamic(), - // "Excplicitly set shape for a string tensor in the unpacking is not supported"); +// OPENVINO_ASSERT( +// input_shape == PartialShape::dynamic(), +// "Excplicitly set shape for a string tensor in the unpacking is not supported"); // There are three cases that affect expected element type of the input tensor: // - when string tensor is passed and we are before the hack is applied (element::string) and diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp index a3e12be73..0509438fd 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp @@ -156,14 +156,10 @@ OutputVector pre_translate_ragged_tensor_input(ov::Output input) { } OutputVector pre_translate_ragged_string_tensor_input(ov::Output input) { - // auto ragged_pack = dynamic_cast(node.get_input(input_index).get_node()); - // OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it"); auto 
ragged_inputs = pre_translate_ragged_tensor_input(input); auto string_inputs = pre_translate_string_tensor_input(ragged_inputs[2]); ragged_inputs.pop_back(); ragged_inputs.insert(ragged_inputs.end(), string_inputs.begin(), string_inputs.end()); - // auto string_pack = dynamic_cast(ragged_pack->get_input_node_ptr(2)); - // OPENVINO_ASSERT(string_pack, "Expected StringTensorPack as a base for RaggedTensorPack but didn't find it"); return ragged_inputs; } diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp index e5284babd..81e398bb0 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp @@ -12,7 +12,6 @@ using namespace ov; void VocabDecoder::validate_and_infer_types() { -// check_ragged_string_input(this, 0); check_string_input(this, 1); const auto shape = get_input_partial_shape(0); set_ragged_string_output(this, 0, {shape[0]}); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp index 301ee6c0e..a4e853ec7 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp @@ -99,9 +99,6 @@ bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVec auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); std::vector results = m_tokenizer->Tokenize(str); -// for (auto i = begins[ragged_col]; i < ends[ragged_col]; ++i) { -// std::cerr << static_cast (chars[i]) << " "; -// } for (const core::Token& token : results) { OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); new_elems[ragged_offset++] = token.id_; From 1fa02b2cce3d1993df1ee2dbe610dec8e614370e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 27 Sep 2023 16:43:07 +0100 Subject: [PATCH 049/116] Sentencepiece Model Encoder from Transformers Tokenizer --- .../tokenizer/ov_tokenizer/__init__.py | 5 + .../{ => ov_tokenizer}/convert_tokenizer.py | 4 +- .../tokenizer/{ => ov_tokenizer}/hf_parser.py | 91 ++++++++++++++++++- .../tokenizer/ov_tokenizer/node_factory.py | 21 +++++ .../tokenizer/{ => ov_tokenizer}/str_pack.py | 0 .../{ => ov_tokenizer}/tokenizer_pipeline.py | 44 ++++----- .../tokenizer/sentence_piece.cpp | 36 +++++++- 7 files changed, 168 insertions(+), 33 deletions(-) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py rename modules/custom_operations/user_ie_extensions/tokenizer/{ => ov_tokenizer}/convert_tokenizer.py (97%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => ov_tokenizer}/hf_parser.py (76%) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py rename modules/custom_operations/user_ie_extensions/tokenizer/{ => ov_tokenizer}/str_pack.py (100%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => ov_tokenizer}/tokenizer_pipeline.py (95%) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py new file mode 100644 index 000000000..6e0daafb1 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py @@ -0,0 +1,5 @@ +from .node_factory import 
init_extension + +from .convert_tokenizer import convert_tokenizer, connect_models +from .hf_parser import convert_sentencepiece_model_tokenizer +from .str_pack import pack_strings, unpack_strings diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py similarity index 97% rename from modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py rename to modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py index 2890e155b..75348b9b3 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py @@ -6,7 +6,7 @@ import logging from typing import Any, Tuple, Union, Optional, Sequence -from openvino.runtime.exceptions import OVTypeError, UserInputError +from openvino.runtime.exceptions import OVTypeError from openvino.runtime import Model @@ -18,7 +18,7 @@ def convert_tokenizer( ) -> Union[Model, Tuple[Model, Model]]: if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase - from hf_parser import TransformersTokenizerPipelineParser + from .hf_parser import TransformersTokenizerPipelineParser # TODO: Remove this check if isinstance(tokenizer_object, PreTrainedTokenizerBase): diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py similarity index 76% rename from modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py rename to modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py index ff800ac0b..5f0178ac0 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py @@ -3,12 +3,18 @@ # SPDX-License-Identifier: Apache-2.0 import json +import tempfile from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Optional, Dict, Callable, Union, List +import numpy as np +import openvino.runtime.opset12 as opset from openvino.runtime.exceptions import OVTypeError -from tokenizer_pipeline import ( +from openvino.runtime import op, Type, Model, PartialShape +from openvino.runtime.utils.types import make_constant_node, as_node + +from .tokenizer_pipeline import ( TokenizerPipeline, NormalizationStep, NormalizeUnicode, @@ -30,6 +36,7 @@ CharsToBytesStep, RegexDecodingStep, ) +from .node_factory import factory def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep: @@ -83,7 +90,8 @@ def parse_byte_level_pretokenization_step( # regex is used by default, but it does not appear in config yet if pretokenizer_dict.get("use_regex", True): # re2 does not support negative lookahead, so there is two steps replicate the behaviour - steps.append(RegexSplitStep.add_whitespace_to_the_next_word()) + # this WA causes segfault for CLIP tokenizer + # steps.append(RegexSplitStep.add_whitespace_to_the_next_word()) steps.append(RegexSplitStep.byte_level_splitter()) steps.append(BytesToCharsStep()) @@ -249,3 +257,82 @@ def decoding(self) -> None: if self.original_tokenizer.clean_up_tokenization_spaces: self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces()) return + + +def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizer"): + return 
hf_tokenizer.vocab_files_names.get("vocab_file", "").endswith(".model") + + +def convert_sentencepiece_model_tokenizer( + hf_tokenizer: "PreTrainedTokenizer", add_attention_mask: bool = True +) -> Model: + if not is_sentencepiece_model(hf_tokenizer): + raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.") + + fairseq_offset = getattr(hf_tokenizer, "fairseq_offset", None) + + with tempfile.TemporaryDirectory() as tmp: + hf_tokenizer.save_pretrained(tmp) + sp_model = np.fromfile(Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"], dtype=np.uint8) + + if hf_tokenizer.is_fast: + hf_slow_tokenizer = hf_tokenizer.slow_tokenizer_class.from_pretrained(tmp) + fairseq_offset = getattr(hf_slow_tokenizer, "fairseq_offset", None) + + input_node = op.Parameter(Type.u8, PartialShape(["?"])) + input_node.set_friendly_name("string_input") + + if hasattr(hf_tokenizer, "add_eos_token"): + add_eos_token = hf_tokenizer.add_eos_token + else: + add_eos_token = ( + getattr(hf_tokenizer, "truncation_side", "") == "right" + or getattr(hf_tokenizer, "padding_side", "") == "right" + ) + if hasattr(hf_tokenizer, "add_bos_token"): + add_bos_token = hf_tokenizer.add_bos_token + else: + add_bos_token = add_eos_token + + tokenizer_node = factory.create( + "SentencepieceTokenizer", + [as_node(sp_model), input_node], + { + "add_bos": add_bos_token, + "add_eos": add_eos_token, + "reverse": False, + "alpha": 0.0, + }, + ) + + indices, values, dense_shape = tokenizer_node.outputs() + + if fairseq_offset: + values = opset.add(values, make_constant_node(fairseq_offset, values.element_type)).output(0) + + default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type) + broadcast = opset.broadcast(default_value, dense_shape) + scatternd_input_ids = factory.create( + "ScatterNDUpdate", + [broadcast, indices, values], # FIXME: pad left side instead of right + ) + scatternd_input_ids.output(0).tensor.add_names({"input_ids"}) + + outputs = scatternd_input_ids.outputs() + + if add_attention_mask: + attention_mask = factory.create( + "ScatterNDUpdate", + [ + broadcast, + indices, + opset.broadcast( + make_constant_node(1, values.element_type), + opset.shape_of(values), + ), + ], + ) + attention_mask.output(0).tensor.add_names({"attention_mask"}) + outputs.append(attention_mask.output(0)) + + return Model(outputs, [input_node], "sp_tokenizer_encoder") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py new file mode 100644 index 000000000..7e2929465 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py @@ -0,0 +1,21 @@ +import os +from typing import Union +from pathlib import Path + +from openvino.runtime.utils.node_factory import NodeFactory + + +factory = NodeFactory() + + +def init_extension(extension_path: Union[str, Path]) -> None: + """ + Initialize factory with compiled tokenizer extension. + + :param extension_path: path to prebuilt C++ tokenizer library. 
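+ Setting the OV_TOKENIZER_PREBUILD_EXTENSION_PATH environment variable to the same path before the package is imported has the same effect; the extension is then loaded automatically at import time.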
+ """ + factory.add_extension(extension_path) + + +if _extension_path := os.environ.get("OV_TOKENIZER_PREBUILD_EXTENSION_PATH"): + init_extension(_extension_path) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/str_pack.py similarity index 100% rename from modules/custom_operations/user_ie_extensions/tokenizer/str_pack.py rename to modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/str_pack.py diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py similarity index 95% rename from modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py rename to modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py index 114a369ac..ead328f7c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py @@ -10,15 +10,11 @@ import numpy as np from openvino.runtime.exceptions import UserInputError, OVTypeError -from openvino.runtime import Type, PartialShape, op, Model, Output, opset10 -from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime import Type, PartialShape, op, Model, Output, opset10 as opset from openvino.runtime.utils.types import as_node, make_constant_node -from str_pack import pack_string, pack_strings - -factory = NodeFactory() -# TODO: Use relative path -factory.add_extension("/home/apaniuko/python/openvino/bin/intel64/Release/libuser_ov_extensions.so") +from .str_pack import pack_string, pack_strings +from .node_factory import factory class BasePipelineStep: @@ -398,13 +394,13 @@ def get_ov_subgraph(self, input_nodes: List[Output]): # TODO: Check if axis is the right-most dimension self.validate_inputs(input_nodes) - max_length = opset10.minimum( - opset10.subtract(input_nodes[1], input_nodes[0]), + max_length = opset.minimum( + opset.subtract(input_nodes[1], input_nodes[0]), make_constant_node(self.max_length, Type.i32), ) return [ input_nodes[0], - opset10.add(input_nodes[0], max_length).output(0), + opset.add(input_nodes[0], max_length).output(0), input_nodes[2], ] @@ -586,8 +582,8 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: if self.max_length == -1 or self.max_length >= 2**31: # Calculate max_length as the maximum ragged length - max_length = opset10.reduce_max( - opset10.subtract(input_nodes[1], input_nodes[0]), + max_length = opset.reduce_max( + opset.subtract(input_nodes[1], input_nodes[0]), make_constant_node(0, Type.i32), ) else: @@ -603,7 +599,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: outputs.append(cur_outputs[0]) if i == 0: - mask = opset10.convert(cur_outputs[1], "i32").output( + mask = opset.convert(cur_outputs[1], "i32").output( 0 ) # TODO: Change RaggedToDense to generate mask of any type @@ -694,7 +690,7 @@ def get_encoder_ov_subgraph(self) -> Model: input_node = step.get_ov_subgraph(input_node) input_node = self.add_ragged_dimention(input_node) - for step in chain(self.pretokenization_steps, self.tokenization_steps): + for step in chain(self.pre_tokenization_steps, self.tokenization_steps): input_node = step.get_ov_subgraph(input_node) processing_outputs.extend(input_node) @@ -709,7 +705,7 @@ def normalization_steps(self) -> List[NormalizationStep]: return [step for 
step in self.steps if isinstance(step, NormalizationStep)] @property - def pretokenization_steps(self) -> List[PreTokenizatinStep]: + def pre_tokenization_steps(self) -> List[PreTokenizatinStep]: return [step for step in self.steps if isinstance(step, PreTokenizatinStep)] @property @@ -731,17 +727,17 @@ def create_int_input(self, input_type=Type.i32) -> op.Parameter: return op.Parameter(input_type, PartialShape(["?", "?", "?"])) def add_ragged_dimention(self, input_node): - shape = opset10.shape_of(input_node[0]) - batch_size = opset10.gather(shape, as_node(0), as_node(0)) + shape = opset.shape_of(input_node[0]) + batch_size = opset.gather(shape, as_node(0), as_node(0)) # FIXME: Cannot create range with specific data type from python - ragged_begins = opset10.convert( - opset10.range(as_node(0), batch_size, as_node(1)), + ragged_begins = opset.convert( + opset.range(as_node(0), batch_size, as_node(1)), "i32", ).outputs() - ragged_ends = opset10.convert( - opset10.range( + ragged_ends = opset.convert( + opset.range( as_node(1), - opset10.add(batch_size, as_node(1)), + opset.add(batch_size, as_node(1)), as_node(1), ), "i32", @@ -756,7 +752,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: return factory.create("StringTensorPack", input_nodes).outputs() def get_greedy_decoding_ov_subgraph(self, input_node: op.Parameter) -> List[Output]: - argmax = opset10.topk( + argmax = opset.topk( data=input_node, k=1, axis=-1, @@ -764,7 +760,7 @@ def get_greedy_decoding_ov_subgraph(self, input_node: op.Parameter) -> List[Outp sort="none", name="ArgMax", ) - return opset10.squeeze( + return opset.squeeze( data=argmax.output(1), axes=-1, ).outputs() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp index f13ad3eed..9dfe42bfd 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp @@ -13,6 +13,7 @@ #include "utils.hpp" using sentencepiece::SentencePieceProcessor; +using sentencepiece::util::Status; using namespace TemplateExtension; using namespace ov; using namespace ov::frontend; @@ -54,9 +55,37 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t } SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const std::shared_ptr& sp, - int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse) : m_sp(sp), + int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse) : + m_sp((sp == nullptr) ? 
std::make_shared(): sp), m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), m_reverse(reverse), Op(args) { + // constructor above without sp argument never called when the node is created with python factory, so need to init and cache m_sp here + if (!m_sp->status().ok()) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + + // form extra options to configure SentencePieceProcessor + std::string extra_options = ""; + if (m_add_bos) { + extra_options += "bos"; + } + if (m_add_eos) { + extra_options = extra_options.empty() ? extra_options : extra_options + ":"; + extra_options += "eos"; + } + if (m_reverse) { + extra_options = extra_options.empty() ? extra_options : extra_options + ":"; + extra_options += "reverse"; + } + // example of extra_options, if "bos:eos:reverse" + CHECK_OK(m_sp->SetEncodeExtraOptions(extra_options)); + }; constructor_validate_and_infer_types(); } @@ -164,17 +193,13 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& #endif #endif - //std::cerr << " Batch size: " << batch_size << "\n"; - size_t max_token_id = 0; for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { #if USE_STRING_TENSORS && !SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS const std::string& sentence = strings[batch_ind]; - //std::cerr << " sentence: " << sentence << "\n"; #else auto begin_ind = begin_ids[batch_ind]; auto end_ind = end_ids[batch_ind]; - //std::string sentence(data + begin_ind, data + end_ind); absl::string_view sentence((const char*)data + begin_ind, end_ind - begin_ind); //std::cerr << "string: " << sentence << "\n"; #endif @@ -197,6 +222,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& memcpy(outputs[1].data(), sparse_values.data(), sizeof(int32_t) * sparse_values.size()); outputs[2].set_shape({ 2 }); memcpy(outputs[2].data(), sparse_dense_shape.data(), sizeof(int64_t) * sparse_dense_shape.size()); + return true; } From e37f89df22026f6d67d0fc1e080bba53c3f7f491 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 27 Sep 2023 19:38:55 +0100 Subject: [PATCH 050/116] Add tests for tokenizers --- .../custom_operations/tests/requirements.txt | 3 + .../tests/tokenizers_test.py | 207 ++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 modules/custom_operations/tests/tokenizers_test.py diff --git a/modules/custom_operations/tests/requirements.txt b/modules/custom_operations/tests/requirements.txt index d7282db88..7007d5d4b 100644 --- a/modules/custom_operations/tests/requirements.txt +++ b/modules/custom_operations/tests/requirements.txt @@ -2,3 +2,6 @@ torch==1.13.1 # open3d==0.16.0 - need to update with new release tensorboard pytest + +# tokenizers requirements +transformers[sentencepiece] diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/tests/tokenizers_test.py new file mode 100644 index 000000000..6e7375322 --- /dev/null +++ b/modules/custom_operations/tests/tokenizers_test.py @@ -0,0 +1,207 @@ +import sys + +sys.path.append("../user_ie_extensions/tokenizer") +# import os +# 
os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so" + +import pytest +import numpy as np +from openvino import Core +from transformers import AutoTokenizer +from ov_tokenizer import ( + # init_extension, + convert_tokenizer, + connect_models, + pack_strings, + unpack_strings, + convert_sentencepiece_model_tokenizer +) + + +# use `init_extension` function to be able to convert HF tokenizers: +# init_extension("path/to/libuser_ov_extensions.so") # or alternatively: +# set the OV_TOKENIZER_PREBUILD_EXTENSION_PATH env variable BEFORE importing ov_tokenizers +core = Core() + +eng_test_strings = [ + "Eng... test, string?!", + "Multiline\nstring!\nWow!", + "A lot\t w!", + "A lot\t\tof whitespaces!", + "\n\n\n\t\t A lot\t\tof\twhitespaces\n!\n\n\n\t\n\n", + "Eng, but with d1gits: 123; 0987654321, stop." + "0987654321 - eng, but with d1gits: 123" +] +multilingual_test_strings = [ + "Тестовая строка!", + "Testzeichenfolge?", + "Tester, la chaîne...", + "測試字符串", + "سلسلة الاختبار", + "מחרוזת בדיקה", + "Сынақ жолы", + "رشته تست", +] +emoji_test_strings = [ + "😀", + "😁😁", + "🤣🤣🤣😁😁😁😁", + "🫠", # melting face + "🤷‍♂️", + "🤦🏼‍♂️", +] + +wordpiece_models = [ + "bert-base-multilingual-cased", + "bert-large-cased", + "cointegrated/rubert-tiny2", + "distilbert-base-uncased-finetuned-sst-2-english", + "sentence-transformers/all-MiniLM-L6-v2", + "rajiv003/ernie-finetuned-qqp", # ernie model with fast tokenizer + "google/electra-base-discriminator", + "google/mobilebert-uncased", + "jhgan/ko-sbert-sts", + "squeezebert/squeezebert-uncased", + "prajjwal1/bert-mini", + "ProsusAI/finbert", + "rasa/LaBSE", +] +bpe_models = [ + "stabilityai/stablecode-completion-alpha-3b-4k", + "EleutherAI/gpt-neo-125m", + "EleutherAI/gpt-j-6b", + "roberta-base", + "sentence-transformers/all-roberta-large-v1", # standin for setfit + "facebook/bart-large-mnli", + "facebook/opt-66b", + "gpt2", + "EleutherAI/gpt-neox-20b", + "ai-forever/rugpt3large_based_on_gpt2", + "KoboldAI/fairseq-dense-13B", + "facebook/galactica-120b", + "EleutherAI/pythia-12b-deduped", + "Salesforce/codegen-16B-multi", + "microsoft/deberta-base", + "bigscience/bloom", # pack_strings for vocab is taking long time + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", + # "google/flan-t5-xxl", # needs Precompiled/CharsMap + # "decapoda-research/llama-65b-hf", # not importable from hub + # "jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer + # "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work +] +sentencepiece_models = [ + "codellama/CodeLlama-7b-hf", + "camembert-base", + "NousResearch/Llama-2-13b-hf", + "xlm-roberta-base", + "microsoft/deberta-v3-base", + "xlnet-base-cased", + # "t5-base", # crashes tests +] + + +@pytest.fixture(scope="session", params=wordpiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_and_ov_wordpiece_tokenizers(request): + hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=True) + ov_tokenizer = convert_tokenizer(hf_tokenizer) + compiled_tokenizer = core.compile_model(ov_tokenizer) + return hf_tokenizer, compiled_tokenizer + + +@pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_and_ov_bpe_tokenizers(request): + hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=True) + ov_tokenizer = convert_tokenizer(hf_tokenizer) + compiled_tokenizer = core.compile_model(ov_tokenizer) + return hf_tokenizer, compiled_tokenizer + + +@pytest.fixture(scope="session", 
params=[True, False], ids=lambda is_fast: "Fast" if is_fast else "Slow") +def fast_tokenzier(request): + return request.param + + +@pytest.fixture(scope="session", params=sentencepiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def sentencepice_model_tokenizers(request, fast_tokenzier): + hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenzier) + ov_tokenizer = convert_sentencepiece_model_tokenizer(hf_tokenizer) + compiled_tokenizer = core.compile_model(ov_tokenizer) + return hf_tokenizer, compiled_tokenizer + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + ] +) +def test_hf_wordpiece_tokenizers_outputs(hf_and_ov_wordpiece_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers + packed_strings = pack_strings([test_string]) + + hf_tokenized = hf_tokenizer([test_string], return_tensors="np") + ov_tokenized = ov_tokenizer(packed_strings) + + for output_name, hf_result in hf_tokenized.items(): + assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + eng_test_strings, + multilingual_test_strings, + emoji_test_strings, + ] +) +def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers + packed_strings = pack_strings(test_string) + + hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True) + ov_tokenized = ov_tokenizer(packed_strings) + + for output_name, hf_result in hf_tokenized.items(): + assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + ] +) +def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = hf_and_ov_bpe_tokenizers + packed_strings = pack_strings([test_string]) + + hf_tokenized = hf_tokenizer([test_string], return_tensors="np") + ov_tokenized = ov_tokenizer(packed_strings) + + for output_name, hf_result in hf_tokenized.items(): + ov_result = ov_tokenized.get(output_name) + # galactica tokenizer has 3 output, but model has 2 inputs + if ov_result is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + ] +) +def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = sentencepice_model_tokenizers + + hf_tokenized = hf_tokenizer(test_string, return_tensors="np") + ov_tokenized = ov_tokenizer(pack_strings([test_string])) + + for output_name, hf_result in hf_tokenized.items(): + assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" From 88bf7c65baceab8f9cbe9cd55ce0742318d69517 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 2 Oct 2023 15:18:03 +0100 Subject: [PATCH 051/116] Add detokenizer for Sentencepiece models --- .../tests/tokenizers_test.py | 25 +++++- .../user_ie_extensions/ov_extension.cpp | 1 + .../ov_tokenizer/common_pipelines.py | 18 ++++ .../ov_tokenizer/convert_tokenizer.py | 2 +- .../tokenizer/ov_tokenizer/hf_parser.py | 31 +++++-- .../ov_tokenizer/tokenizer_pipeline.py | 33 ++----- .../tokenizer/sentence_piece.cpp | 85 
+++++++++++++++++++ .../tokenizer/sentence_piece.hpp | 24 ++++++ 8 files changed, 184 insertions(+), 35 deletions(-) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/tests/tokenizers_test.py index 6e7375322..8f86523ce 100644 --- a/modules/custom_operations/tests/tokenizers_test.py +++ b/modules/custom_operations/tests/tokenizers_test.py @@ -124,9 +124,10 @@ def fast_tokenzier(request): @pytest.fixture(scope="session", params=sentencepiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def sentencepice_model_tokenizers(request, fast_tokenzier): hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenzier) - ov_tokenizer = convert_sentencepiece_model_tokenizer(hf_tokenizer) + ov_tokenizer, ov_detokenizer = convert_sentencepiece_model_tokenizer(hf_tokenizer, with_decoder=True) compiled_tokenizer = core.compile_model(ov_tokenizer) - return hf_tokenizer, compiled_tokenizer + compiled_detokenizer = core.compile_model(ov_detokenizer) + return hf_tokenizer, compiled_tokenizer, compiled_detokenizer @pytest.mark.parametrize( @@ -198,10 +199,28 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): ] ) def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): - hf_tokenizer, ov_tokenizer = sentencepice_model_tokenizers + hf_tokenizer, ov_tokenizer, _ = sentencepice_model_tokenizers hf_tokenized = hf_tokenizer(test_string, return_tensors="np") ov_tokenized = ov_tokenizer(pack_strings([test_string])) for output_name, hf_result in hf_tokenized.items(): assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + ] +) +def test_sentencepiece_detokenizer(sentencepice_model_tokenizers, test_string): + hf_tokenizer, _, ov_detokenizer = sentencepice_model_tokenizers + + token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids + hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) + ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) + + assert hf_output == ov_output diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 1fec891c9..2518f0ffd 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -78,6 +78,7 @@ std::make_shared("Reshape", translate_reshape), \ std::make_shared("Const", translate_const), \ std::make_shared>(), \ + std::make_shared>(), \ std::make_shared("SentencepieceOp", translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), #else diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py new file mode 100644 index 000000000..0198f2443 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py @@ -0,0 +1,18 @@ +from typing import List + +from openvino.runtime import opset12 as opset, Output, op + + +def get_greedy_decoding_ov_subgraph(logits_node: op.Parameter) -> List[Output]: + argmax = opset.topk( + data=logits_node, + k=1, + 
axis=-1, + mode="max", + sort="none", + name="ArgMax", + ) + return opset.squeeze( + data=argmax.output(1), + axes=-1, + ).outputs() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py index 75348b9b3..e719e1a3e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py @@ -14,7 +14,7 @@ def convert_tokenizer( - tokenizer_object: Any, number_of_inputs: int = 1, with_decoder=False + tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False ) -> Union[Model, Tuple[Model, Model]]: if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py index 5f0178ac0..e4e37d7c6 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py @@ -6,13 +6,14 @@ import tempfile from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any, Optional, Dict, Callable, Union, List +from typing import Any, Optional, Dict, Callable, Union, List, Tuple import numpy as np import openvino.runtime.opset12 as opset from openvino.runtime.exceptions import OVTypeError from openvino.runtime import op, Type, Model, PartialShape from openvino.runtime.utils.types import make_constant_node, as_node +from openvino import save_model from .tokenizer_pipeline import ( TokenizerPipeline, @@ -264,8 +265,10 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizer"): def convert_sentencepiece_model_tokenizer( - hf_tokenizer: "PreTrainedTokenizer", add_attention_mask: bool = True -) -> Model: + hf_tokenizer: "PreTrainedTokenizer", + add_attention_mask: bool = True, + with_decoder: bool = False +) -> Union[Model, Tuple[Model, Model]]: if not is_sentencepiece_model(hf_tokenizer): raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.") @@ -274,6 +277,7 @@ def convert_sentencepiece_model_tokenizer( with tempfile.TemporaryDirectory() as tmp: hf_tokenizer.save_pretrained(tmp) sp_model = np.fromfile(Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"], dtype=np.uint8) + sp_model_node = as_node(sp_model) if hf_tokenizer.is_fast: hf_slow_tokenizer = hf_tokenizer.slow_tokenizer_class.from_pretrained(tmp) @@ -296,7 +300,7 @@ def convert_sentencepiece_model_tokenizer( tokenizer_node = factory.create( "SentencepieceTokenizer", - [as_node(sp_model), input_node], + [sp_model_node, input_node], { "add_bos": add_bos_token, "add_eos": add_eos_token, @@ -335,4 +339,21 @@ def convert_sentencepiece_model_tokenizer( attention_mask.output(0).tensor.add_names({"attention_mask"}) outputs.append(attention_mask.output(0)) - return Model(outputs, [input_node], "sp_tokenizer_encoder") + tokenizer_encoder = Model(outputs, [input_node], "sp_tokenizer_encoder") + tokenizer_encoder.validate_nodes_and_infer_types() + + if with_decoder: + decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) + decoder = factory.create( + "SentencepieceDetokenizer", + [sp_model_node, decoder_input], + ) + string_output = factory.create("StringTensorPack", decoder.outputs()).outputs() + 
string_output[0].tensor.add_names({"string_output"}) + tokenizer_decoder = Model(string_output, [decoder_input], "sp_tokenizer_decoder") + tokenizer_decoder.validate_nodes_and_infer_types() + + save_model(tokenizer_decoder, "detokenizer.xml") + return tokenizer_encoder, tokenizer_decoder + + return tokenizer_encoder diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py index ead328f7c..ff6ab3e79 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py @@ -13,6 +13,7 @@ from openvino.runtime import Type, PartialShape, op, Model, Output, opset10 as opset from openvino.runtime.utils.types import as_node, make_constant_node +from .common_pipelines import get_greedy_decoding_ov_subgraph from .str_pack import pack_string, pack_strings from .node_factory import factory @@ -681,10 +682,10 @@ def __getitem__(self, item: int) -> BasePipelineStep: return self.steps[item] def get_encoder_ov_subgraph(self) -> Model: - input_nodes = [self.create_string_input() for _ in range(self.number_of_inputs)] + string_inputs = [op.Parameter(Type.u8, PartialShape(["?"])) for _ in range(self.number_of_inputs)] processing_outputs = [] - for input_node in input_nodes: + for input_node in string_inputs: input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs() for step in self.normalization_steps: input_node = step.get_ov_subgraph(input_node) @@ -698,7 +699,7 @@ def get_encoder_ov_subgraph(self) -> Model: for step in self.post_tokenization_steps: processing_outputs = step.get_ov_subgraph(processing_outputs) - return Model(processing_outputs, input_nodes, name="tokenizer_encoder") + return Model(processing_outputs, string_inputs, name="tokenizer_encoder") @property def normalization_steps(self) -> List[NormalizationStep]: @@ -720,13 +721,7 @@ def post_tokenization_steps(self) -> List[PostTokenizationStep]: def decoding_steps(self) -> List[DecodingStep]: return [step for step in self.steps if isinstance(step, DecodingStep)] - def create_string_input(self) -> op.Parameter: - return op.Parameter(Type.u8, PartialShape(["?"])) - - def create_int_input(self, input_type=Type.i32) -> op.Parameter: - return op.Parameter(input_type, PartialShape(["?", "?", "?"])) - - def add_ragged_dimention(self, input_node): + def add_ragged_dimention(self, input_node: List[Output]) -> List[Output]: shape = opset.shape_of(input_node[0]) batch_size = opset.gather(shape, as_node(0), as_node(0)) # FIXME: Cannot create range with specific data type from python @@ -751,23 +746,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: return factory.create("StringTensorPack", input_nodes).outputs() - def get_greedy_decoding_ov_subgraph(self, input_node: op.Parameter) -> List[Output]: - argmax = opset.topk( - data=input_node, - k=1, - axis=-1, - mode="max", - sort="none", - name="ArgMax", - ) - return opset.squeeze( - data=argmax.output(1), - axes=-1, - ).outputs() - def get_decoder_ov_subgraph(self) -> Model: - input_node = self.create_int_input() - argmax = self.get_greedy_decoding_ov_subgraph(input_node) + input_node = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) + argmax = get_greedy_decoding_ov_subgraph(input_node) outputs = self.create_decoding_pipeline(argmax) model = Model(outputs, [input_node], 
name="tokenizer_decoder") model.output().tensor.add_names({"string_output"}) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp index 9dfe42bfd..e45a13399 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp @@ -232,4 +232,89 @@ bool SentencepieceTokenizer::has_evaluate() const { std::shared_ptr SentencepieceTokenizer::clone_with_new_inputs(const OutputVector& new_args) const { return std::make_shared(new_args, m_sp, m_nbest_size, m_alpha, m_add_bos, m_add_eos, m_reverse); +} + + +// Detokenizer + +SentencepieceDetokenizer::SentencepieceDetokenizer(const OutputVector& args) : + m_sp(std::make_shared()), Op(args) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + constructor_validate_and_infer_types(); +} + +SentencepieceDetokenizer::SentencepieceDetokenizer(const OutputVector& args, const std::shared_ptr& sp) : + m_sp((sp == nullptr) ? std::make_shared(): sp), Op(args) { + // constructor above without sp argument never called when the node is created with python factory, so need to init and cache m_sp here + if (!m_sp->status().ok()) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + }; + constructor_validate_and_infer_types(); +} + +void SentencepieceDetokenizer::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 2, "SentencepieceDetokenizer expects two inputs: sp model and token ids"); + OPENVINO_ASSERT(get_input_element_type(0) == element::u8, "SentencepieceDetokenizer accepts sp model as the first input and it should be of type u8 tensor"); + OPENVINO_ASSERT(get_input_partial_shape(1).size() == 2, "SentencepieceDetokenizer expects 2D tensor as second input"); + + auto batch_size = PartialShape({get_input_partial_shape(1)[0]}); + set_string_output(this, 0, batch_size); +} + +bool SentencepieceDetokenizer::visit_attributes(AttributeVisitor& visitor) { + return true; +} + +bool SentencepieceDetokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + auto batch_size = inputs[1].get_shape()[0]; + auto seq_len = inputs[1].get_shape()[1]; + auto input_data = inputs[1].data(); + + outputs[0].set_shape({batch_size}); + outputs[1].set_shape({batch_size}); + outputs[2].set_shape({batch_size * seq_len * 100}); // 100 chars - max token length + + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); + uint32_t char_offset = 0; + + for(size_t batch = 0; batch < batch_size; ++batch) { + auto start = batch * seq_len; + + std::vector token_ids(seq_len); + std::memcpy(&token_ids[0], &input_data[start], 
sizeof(int32_t) * seq_len); + + std::string detokenized; + CHECK_OK(m_sp->Decode(token_ids, &detokenized)); + std::copy(detokenized.begin(), detokenized.end(), &chars[char_offset]); + + begins[batch] = char_offset; + char_offset += detokenized.size(); + ends[batch] = char_offset; + } + outputs[2].set_shape({char_offset}); + return true; +} + +bool SentencepieceDetokenizer::has_evaluate() const { + return true; +} + +std::shared_ptr SentencepieceDetokenizer::clone_with_new_inputs(const OutputVector& new_args) const { + return std::make_shared(new_args, m_sp); } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp index cec0a9532..0efd60966 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp @@ -38,4 +38,28 @@ namespace TemplateExtension { bool m_add_eos; bool m_reverse; }; + + + class SentencepieceDetokenizer : public ov::op::Op { + public: + OPENVINO_OP("SentencepieceDetokenizer"); + + SentencepieceDetokenizer() = default; + SentencepieceDetokenizer(const ov::OutputVector& args); + SentencepieceDetokenizer(const ov::OutputVector& args, + const std::shared_ptr& sp); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const override; + + private: + std::shared_ptr m_sp; + }; } // namespace TemplateExtension From 79c3e0991c505606c549f58c7fdd95be2e6d128d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 4 Oct 2023 14:34:30 +0100 Subject: [PATCH 052/116] OVTokenizer as python package --- .../tests/tokenizers_test.py | 6 +- .../tokenizer/ov_tokenizer/__init__.py | 5 - .../tokenizer/python/README.md | 127 ++++++++++++++++++ .../tokenizer/python/ov_tokenizer/__init__.py | 8 ++ .../ov_tokenizer/common_pipelines.py | 7 +- .../ov_tokenizer/convert_tokenizer.py | 11 +- .../{ => python}/ov_tokenizer/hf_parser.py | 78 ++++++----- .../{ => python}/ov_tokenizer/node_factory.py | 6 +- .../{ => python}/ov_tokenizer/str_pack.py | 6 +- .../ov_tokenizer/tokenizer_pipeline.py | 25 ++-- .../tokenizer/python/pyproject.toml | 45 +++++++ 11 files changed, 265 insertions(+), 59 deletions(-) delete mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/README.md create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/common_pipelines.py (64%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/convert_tokenizer.py (97%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/hf_parser.py (90%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/node_factory.py (82%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/str_pack.py (94%) rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/tokenizer_pipeline.py (97%) create mode 100644 
From 79c3e0991c505606c549f58c7fdd95be2e6d128d Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 4 Oct 2023 14:34:30 +0100
Subject: [PATCH 052/116] OVTokenizer as python package

---
 .../tests/tokenizers_test.py                  |   6 +-
 .../tokenizer/ov_tokenizer/__init__.py        |   5 -
 .../tokenizer/python/README.md                | 127 ++++++++++++++++++
 .../tokenizer/python/ov_tokenizer/__init__.py |   8 ++
 .../ov_tokenizer/common_pipelines.py          |   7 +-
 .../ov_tokenizer/convert_tokenizer.py         |  11 +-
 .../{ => python}/ov_tokenizer/hf_parser.py    |  78 ++++++-----
 .../{ => python}/ov_tokenizer/node_factory.py |   6 +-
 .../{ => python}/ov_tokenizer/str_pack.py     |   6 +-
 .../ov_tokenizer/tokenizer_pipeline.py        |  25 ++--
 .../tokenizer/python/pyproject.toml           |  45 +++++++
 11 files changed, 265 insertions(+), 59 deletions(-)
 delete mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
 rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/common_pipelines.py (64%)
 rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/convert_tokenizer.py (97%)
 rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/hf_parser.py (90%)
 rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/node_factory.py (82%)
 rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/str_pack.py (94%)
 rename modules/custom_operations/user_ie_extensions/tokenizer/{ => python}/ov_tokenizer/tokenizer_pipeline.py (97%)
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml

diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/tests/tokenizers_test.py
index 8f86523ce..d01cfb24f 100644
--- a/modules/custom_operations/tests/tokenizers_test.py
+++ b/modules/custom_operations/tests/tokenizers_test.py
@@ -1,6 +1,8 @@
-import sys
+# -*- coding: utf-8 -*-
+# Copyright (C) 2018-2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
 
-sys.path.append("../user_ie_extensions/tokenizer")
+import sys
 
 # import os
 # os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so"
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py
deleted file mode 100644
index 6e0daafb1..000000000
--- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .node_factory import init_extension
-
-from .convert_tokenizer import convert_tokenizer, connect_models
-from .hf_parser import convert_sentencepiece_model_tokenizer
-from .str_pack import pack_strings, unpack_strings
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
new file mode 100644
index 000000000..cc54d531c
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -0,0 +1,127 @@
+# OpenVINO Tokenizers
+
+## Features
+
+- Convert a HuggingFace tokenizer into an OpenVINO tokenizer and detokenizer model:
+  - Fast tokenizers based on Wordpiece and BPE models
+  - Slow tokenizers based on SentencePiece model file
+- Combine OpenVINO models into a single model
+
+## Installation
+
+1. Build the extension with the `-DCUSTOM_OPERATIONS="tokenizer"` flag: [instruction](modules/custom_operations/README.md)
+2. (Recommended) Create and activate virtual env:
+```bash
+python3 -m venv venv
+source venv/bin/activate
+```
+3. Go to `modules/custom_operations/user_ie_extensions/tokenizer/python` and run:
+```bash
+pip install .[transformers]
+# or install all dependencies for development
+pip install -e .[all]
+```
+
+## Usage
+
+Set `OV_TOKENIZER_PREBUILD_EXTENSION_PATH` environment variable to `libuser_ov_extensions.so` file path
+or use `init_extension` function.
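+
+Both loading paths register the same prebuilt extension library; a minimal sketch of the two options (the extension path is a placeholder, and the environment variable must be set before the package builds its node factory):
+
+```python
+import os
+
+# Option 1: point the package at the prebuilt extension via the environment
+os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so"
+
+# Option 2: register the extension explicitly
+from ov_tokenizer import init_extension
+
+init_extension("path/to/libuser_ov_extensions.so")
+```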
+
+### Convert HuggingFace tokenizer
+
+```python
+from transformers import AutoTokenizer
+from ov_tokenizer import init_extension, convert_tokenizer, pack_strings
+from openvino import compile_model
+
+
+init_extension("path/to/libuser_ov_extensions.so")
+
+hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+ov_tokenizer = convert_tokenizer(hf_tokenizer)
+
+compiled_tokenizer = compile_model(ov_tokenizer)
+text_input = "Test string"
+
+hf_output = hf_tokenizer([text_input], return_tensors="np")
+ov_output = compiled_tokenizer(pack_strings([text_input]))
+
+for output_name in hf_output:
+    print(f"OpenVINO {output_name} = {ov_output[output_name]}")
+    print(f"HuggingFace {output_name} = {hf_output[output_name]}")
+# OpenVINO input_ids = [[ 101 3231 5164  102]]
+# HuggingFace input_ids = [[ 101 3231 5164  102]]
+# OpenVINO token_type_ids = [[0 0 0 0]]
+# HuggingFace token_type_ids = [[0 0 0 0]]
+# OpenVINO attention_mask = [[1 1 1 1]]
+# HuggingFace attention_mask = [[1 1 1 1]]
+```
+
+### Connect Tokenizer to a Model
+
+
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from ov_tokenizer import init_extension, convert_tokenizer, pack_strings, connect_models
+from openvino import compile_model, convert_model
+
+
+init_extension("path/to/libuser_ov_extensions.so")
+
+checkpoint = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
+hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+hf_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+
+text_input = ["Free money!!!"]
+hf_input = hf_tokenizer(text_input, return_tensors="pt")
+hf_output = hf_model(**hf_input)
+
+ov_tokenizer = convert_tokenizer(hf_tokenizer)
+ov_model = convert_model(hf_model, example_input=hf_input.data)
+combined_model = connect_models(ov_tokenizer, ov_model)
+compiled_combined_model = compile_model(combined_model)
+
+openvino_output = compiled_combined_model(pack_strings(text_input))
+
+print(f"OpenVINO logits: {openvino_output['logits']}")
+# OpenVINO logits: [[ 1.2007061 -1.4698029]]
+print(f"HuggingFace logits {hf_output.logits}")
+# HuggingFace logits tensor([[ 1.2007, -1.4698]], grad_fn=<AddmmBackward0>)
+```
+
+### Convert SentencePiece Model Tokenizer
+
+```python
+from transformers import AutoTokenizer
+from ov_tokenizer import init_extension, convert_sentencepiece_model_tokenizer, pack_strings, unpack_strings
+from openvino import compile_model
+
+
+init_extension("path/to/libuser_ov_extensions.so")
+
+checkpoint = "codellama/CodeLlama-7b-hf"
+hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+text_input = ["def fibonnaci(n):"]
+hf_input = hf_tokenizer(text_input, return_tensors="np")
+
+ov_tokenizer, ov_detokenizer = convert_sentencepiece_model_tokenizer(hf_tokenizer, with_decoder=True)
+compiled_tokenizer = compile_model(ov_tokenizer)
+compiled_detokenizer = compile_model(ov_detokenizer)
+ov_input = compiled_tokenizer(pack_strings(text_input))
+
+for model_input_name in hf_input:
+    print(f"OpenVINO {model_input_name} = {ov_input[model_input_name]}")
+    print(f"HuggingFace {model_input_name} = {hf_input[model_input_name]}")
+# OpenVINO input_ids = [[    1   822 18755 11586   455 29898 29876  1125]]
+# HuggingFace input_ids = [[    1   822 18755 11586   455 29898 29876  1125]]
+# OpenVINO attention_mask = [[1 1 1 1 1 1 1 1]]
+# HuggingFace attention_mask = [[1 1 1 1 1 1 1 1]]
+
+ov_output = unpack_strings(compiled_detokenizer(hf_input.input_ids)["string_output"])
+hf_output = hf_tokenizer.batch_decode(hf_input.input_ids, skip_special_tokens=True)
+print(f"OpenVINO 
output string: `{ov_output}`") +# OpenVINO output string: ['def fibonnaci(n):'] +print(f"HuggingFace output string: `{hf_output}`") +# HuggingFace output string: ['def fibonnaci(n):'] +``` diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py new file mode 100644 index 000000000..f549dae88 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .convert_tokenizer import connect_models, convert_tokenizer +from .hf_parser import convert_sentencepiece_model_tokenizer +from .node_factory import init_extension +from .str_pack import pack_strings, unpack_strings diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py similarity index 64% rename from modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py rename to modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py index 0198f2443..dc5e41ddd 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/common_pipelines.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py @@ -1,6 +1,11 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from typing import List -from openvino.runtime import opset12 as opset, Output, op +from openvino.runtime import Output, op +from openvino.runtime import opset12 as opset def get_greedy_decoding_ov_subgraph(logits_node: op.Parameter) -> List[Output]: diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py similarity index 97% rename from modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py rename to modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py index e719e1a3e..d338925d2 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -2,22 +2,23 @@ # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import sys import logging -from typing import Any, Tuple, Union, Optional, Sequence +import sys +from typing import Any, Optional, Sequence, Tuple, Union -from openvino.runtime.exceptions import OVTypeError from openvino.runtime import Model +from openvino.runtime.exceptions import OVTypeError logger = logging.getLogger(__name__) def convert_tokenizer( - tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False + tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False, greedy_decoder=False ) -> Union[Model, Tuple[Model, Model]]: if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase + from .hf_parser import TransformersTokenizerPipelineParser # TODO: Remove this check @@ -25,7 +26,7 @@ def convert_tokenizer( pipeline = 
TransformersTokenizerPipelineParser(tokenizer_object).parse(number_of_inputs=number_of_inputs) ov_tokenizer = pipeline.get_encoder_ov_subgraph() if with_decoder: - ov_detokenizer = pipeline.get_decoder_ov_subgraph() + ov_detokenizer = pipeline.get_decoder_ov_subgraph(greedy_decoder) output_names = tokenizer_object.model_input_names ov_tokenizer_output_names = ["input_ids", "attention_mask"] diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py similarity index 90% rename from modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py rename to modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index e4e37d7c6..315cf1274 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -6,38 +6,39 @@ import tempfile from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any, Optional, Dict, Callable, Union, List, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import openvino.runtime.opset12 as opset -from openvino.runtime.exceptions import OVTypeError -from openvino.runtime import op, Type, Model, PartialShape -from openvino.runtime.utils.types import make_constant_node, as_node from openvino import save_model +from openvino.runtime import Model, PartialShape, Type, op +from openvino.runtime.exceptions import OVTypeError +from openvino.runtime.utils.types import as_node, make_constant_node +from .common_pipelines import get_greedy_decoding_ov_subgraph +from .node_factory import factory from .tokenizer_pipeline import ( - TokenizerPipeline, + BPETokenizationStep, + BytesToCharsStep, + CaseFoldStep, + CharsToBytesStep, + CombineSegmentsStep, + NMTNormalizationStep, NormalizationStep, NormalizeUnicode, - NMTNormalizationStep, - CaseFoldStep, - RegexNormalizationStep, - StripStringStep, + PaddingStep, PreTokenizatinStep, PunctuationSplitStep, + RegexDecodingStep, + RegexNormalizationStep, RegexSplitStep, - WhitespaceSplitStep, - BytesToCharsStep, - WordPieceTokenizationStep, - BPETokenizationStep, + StripStringStep, + TokenizerPipeline, TruncationStep, - PaddingStep, - CombineSegmentsStep, VocabDecoderStep, - CharsToBytesStep, - RegexDecodingStep, + WhitespaceSplitStep, + WordPieceTokenizationStep, ) -from .node_factory import factory def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep: @@ -221,7 +222,9 @@ def post_tokenization(self) -> None: self.tokenizer_json, self.number_of_inputs ) else: - raise OVTypeError(f"Post-processor type '{self.tokenizer_json['post_processor']['type']}' is not supported") + raise OVTypeError( + f"Post-processor type '{self.tokenizer_json['post_processor']['type']}' is not supported" + ) self.num_of_added_tokens += combine_segments_step.number_of_added_tokens combine_segments_step.set_tokens_ids(self.pipeline.vocab) @@ -260,14 +263,15 @@ def decoding(self) -> None: return -def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizer"): +def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase"): return hf_tokenizer.vocab_files_names.get("vocab_file", "").endswith(".model") def convert_sentencepiece_model_tokenizer( - hf_tokenizer: "PreTrainedTokenizer", + hf_tokenizer: "PreTrainedTokenizerBase", add_attention_mask: bool = True, - 
with_decoder: bool = False + with_decoder: bool = False, + greedy_decoder: bool = False, ) -> Union[Model, Tuple[Model, Model]]: if not is_sentencepiece_model(hf_tokenizer): raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.") @@ -342,18 +346,24 @@ def convert_sentencepiece_model_tokenizer( tokenizer_encoder = Model(outputs, [input_node], "sp_tokenizer_encoder") tokenizer_encoder.validate_nodes_and_infer_types() - if with_decoder: + if not with_decoder: + return tokenizer_encoder + + if greedy_decoder: + decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) # (batch, sequence, logits) + token_ids = get_greedy_decoding_ov_subgraph(decoder_input)[0] # (batch, sequence) + else: decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) - decoder = factory.create( - "SentencepieceDetokenizer", - [sp_model_node, decoder_input], - ) - string_output = factory.create("StringTensorPack", decoder.outputs()).outputs() - string_output[0].tensor.add_names({"string_output"}) - tokenizer_decoder = Model(string_output, [decoder_input], "sp_tokenizer_decoder") - tokenizer_decoder.validate_nodes_and_infer_types() + token_ids = decoder_input - save_model(tokenizer_decoder, "detokenizer.xml") - return tokenizer_encoder, tokenizer_decoder + decoder = factory.create( + "SentencepieceDetokenizer", + [sp_model_node, token_ids], + ) + string_output = factory.create("StringTensorPack", decoder.outputs()).outputs() + string_output[0].tensor.add_names({"string_output"}) + tokenizer_decoder = Model(string_output, [decoder_input], "sp_tokenizer_decoder") + tokenizer_decoder.validate_nodes_and_infer_types() - return tokenizer_encoder + save_model(tokenizer_decoder, "detokenizer.xml") + return tokenizer_encoder, tokenizer_decoder diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py similarity index 82% rename from modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py rename to modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py index 7e2929465..e2b6ed63c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/node_factory.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py @@ -1,6 +1,10 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os -from typing import Union from pathlib import Path +from typing import Union from openvino.runtime.utils.node_factory import NodeFactory diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/str_pack.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py similarity index 94% rename from modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/str_pack.py rename to modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py index 1f09a992e..a0edea098 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/str_pack.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py @@ -1,5 +1,9 @@ -from typing import List +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from io import BytesIO +from typing import List import numpy as np from numpy.typing import 
NDArray
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
similarity index 97%
rename from modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py
rename to modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
index ff6ab3e79..57ba5f035 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/ov_tokenizer/tokenizer_pipeline.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
@@ -1,21 +1,22 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2018-2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
+
+import weakref
 from dataclasses import dataclass, field
 from functools import singledispatchmethod
 from itertools import chain, islice
-from typing import List, Optional, Any, Dict, Union
-import weakref
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
-
-from openvino.runtime.exceptions import UserInputError, OVTypeError
-from openvino.runtime import Type, PartialShape, op, Model, Output, opset10 as opset
+from openvino.runtime import Model, Output, PartialShape, Type, op
+from openvino.runtime import opset10 as opset
+from openvino.runtime.exceptions import OVTypeError, UserInputError
 from openvino.runtime.utils.types import as_node, make_constant_node
 
 from .common_pipelines import get_greedy_decoding_ov_subgraph
-from .str_pack import pack_string, pack_strings
 from .node_factory import factory
+from .str_pack import pack_string, pack_strings
 
 
 class BasePipelineStep:
@@ -746,10 +747,14 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
 
         return factory.create("StringTensorPack", input_nodes).outputs()
 
-    def get_decoder_ov_subgraph(self) -> Model:
-        input_node = op.Parameter(Type.i32, PartialShape(["?", "?", "?"]))
-        argmax = get_greedy_decoding_ov_subgraph(input_node)
-        outputs = self.create_decoding_pipeline(argmax)
+    def get_decoder_ov_subgraph(self, greedy_decoder: bool = False) -> Model:
+        if greedy_decoder:
+            input_node = op.Parameter(Type.i32, PartialShape(["?", "?", "?"]))
+            token_ids = get_greedy_decoding_ov_subgraph(input_node)
+        else:
+            input_node = op.Parameter(Type.i32, PartialShape(["?", "?"]))
+            token_ids = input_node
+        outputs = self.create_decoding_pipeline(token_ids)
         model = Model(outputs, [input_node], name="tokenizer_decoder")
         model.output().tensor.add_names({"string_output"})
         return model
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml
new file mode 100644
index 000000000..125defd29
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml
@@ -0,0 +1,45 @@
+[project]
+name = "ov_tokenizer"
+version = "0.0.1"
+description = "Convert tokenizers into OpenVINO models"
+requires-python = ">=3.8"
+authors = [
+    { name = "Artur Paniukov", email = "artur.paniukov@intel.com" },
+    { name = "Sergey Lyalin", email = "sergey.lyalin@intel.com" },
+]
+
+dependencies = [
+    "openvino>=2023.1",
+    "numpy"
+]
+
+[project.optional-dependencies]
+dev = [
+    "black",
+    "ruff",
+    "pytest",
+]
+transformers = [
+    "transformers[sentencepiece]"
+]
+all = [
+    "ov_tokenizer[dev,transformers]"
+]
+
+
+[tool.black]
+line-length = 119
+target-version = ["py38", "py39", "py310", "py311"]
+
+
+[tool.ruff]
+ignore = ["C901", "E501", "E741", "W605"]
+select = ["C", "E", "F", "I", "W"]
+line-length = 119
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["F401"]
+"ov_tokenizer/hf_parser.py" = ["F821"]
+
+[tool.ruff.isort]
+lines-after-imports = 2
From bb1b57aa37079b687beb9faed60581c64fff4def Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 4 Oct 2023 12:35:15 +0100
Subject: [PATCH 053/116] Update README.md

---
 .../user_ie_extensions/tokenizer/python/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index cc54d531c..60bff67ab 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -9,7 +9,7 @@
 
 ## Installation
 
-1. Build the extension with the `-DCUSTOM_OPERATIONS="tokenizer"` flag: [instruction](modules/custom_operations/README.md)
+1. Build the extension with the `-DCUSTOM_OPERATIONS="tokenizer"` flag: [instruction](../../../README.md#build-custom-openvino-operation-extension-library)
 2. (Recommended) Create and activate virtual env:
 ```bash
 python3 -m venv venv
From 6b4be05aa0ac791f7bfa0b5b22b0c47f47d07812 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 4 Oct 2023 12:54:17 +0100
Subject: [PATCH 054/116] Update README.md

---
 .../user_ie_extensions/tokenizer/python/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index 60bff67ab..a0af8212b 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -91,6 +91,8 @@
 
 ### Convert SentencePiece Model Tokenizer
 
+To connect a detokenizer to a `logits` model output, set `greedy_decoder=True` when using the `convert_tokenizer` or `convert_sentencepiece_model_tokenizer` function. This adds a greedy decoding pipeline in front of the detokenizer, so it can consume the `logits` output directly.
+
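The note added by this patch is easiest to see in code; a minimal sketch of the idea (the checkpoint is the one used in the README example above, and the extension path is a placeholder):

```python
from openvino import compile_model
from transformers import AutoTokenizer
from ov_tokenizer import convert_tokenizer, init_extension

init_extension("path/to/libuser_ov_extensions.so")

hf_tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
# with greedy_decoder=True the detokenizer input becomes a (batch, seq_len, vocab)
# logits tensor; an argmax over the last axis is inserted before decoding
_, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True, greedy_decoder=True)
compiled_detokenizer = compile_model(ov_detokenizer)
# feeding a logits array then yields decoded strings via unpack_strings(...)
```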
From 539797fc2f9eb4f47351e19d3f1e3a747f7d8fcf Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 4 Oct 2023 13:02:16 +0100
Subject: [PATCH 055/116] Update README.md

---
 .../user_ie_extensions/tokenizer/python/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index a0af8212b..56a494a3f 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -91,8 +91,6 @@ print(f"HuggingFace logits {hf_output.logits}")
 
 ### Convert SentencePiece Model Tokenizer
 
-To connect a detokenizer to a `logits` model output, set `greedy_decoder=True` when using the `convert_tokenizer` or `convert_sentencepiece_model_tokenizer` function. This adds a greedy decoding pipeline in front of the detokenizer, so it can consume the `logits` output directly.
-
 ```python
 from transformers import AutoTokenizer
 from openvino import compile_model
@@ -127,3 +125,5 @@ print(f"OpenVINO output string: `{ov_output}`")
 print(f"HuggingFace output string: `{hf_output}`")
 # HuggingFace output string: ['def fibonnaci(n):']
 ```
+
+To connect a detokenizer to a `logits` model output, set `greedy_decoder=True` when using the `convert_tokenizer` or `convert_sentencepiece_model_tokenizer` function. This adds a greedy decoding pipeline in front of the detokenizer, so it can consume the `logits` output directly.
From 45c0068b55da64ee246103ac3b4b894ab04c3c11 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Thu, 5 Oct 2023 18:46:56 +0100
Subject: [PATCH 056/116] Update README.md

---
 .../tokenizer/python/README.md                | 33 ++++++++++++++++---
 .../tokenizer/python/pyproject.toml           |  2 +-
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index 56a494a3f..37c7f6121 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -17,8 +17,11 @@ source venv/bin/activate
 ```
 3. Go to `modules/custom_operations/user_ie_extensions/tokenizer/python` and run:
 ```bash
+# to use converted tokenizers or models combined with tokenizers
+pip install .
+# to convert tokenizers from transformers library
 pip install .[transformers]
-# or install all dependencies for development
+# for development and testing the library
 pip install -e .[all]
 ```
 
@@ -31,8 +34,8 @@ or use `init_extension` function.
 
 ```python
 from transformers import AutoTokenizer
-from ov_tokenizer import init_extension, convert_tokenizer, pack_strings
 from openvino import compile_model
+from ov_tokenizer import init_extension, convert_tokenizer, pack_strings
 
 
 init_extension("path/to/libuser_ov_extensions.so")
@@ -62,8 +65,8 @@ for output_name in hf_output:
 
 ```python
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from ov_tokenizer import init_extension, convert_tokenizer, pack_strings, connect_models
 from openvino import compile_model, convert_model
+from ov_tokenizer import init_extension, convert_tokenizer, pack_strings, connect_models
 
 
 init_extension("path/to/libuser_ov_extensions.so")
@@ -93,8 +96,8 @@ print(f"HuggingFace logits {hf_output.logits}")
 
 ### Convert SentencePiece Model Tokenizer
 
 ```python
 from transformers import AutoTokenizer
-from ov_tokenizer import init_extension, convert_sentencepiece_model_tokenizer, pack_strings, unpack_strings
 from openvino import compile_model
+from ov_tokenizer import init_extension, convert_sentencepiece_model_tokenizer, pack_strings, unpack_strings
 
 
 init_extension("path/to/libuser_ov_extensions.so")
@@ -127,3 +130,25 @@ print(f"OpenVINO output string: `{ov_output}`")
 print(f"HuggingFace output string: `{hf_output}`")
 # HuggingFace output string: ['def fibonnaci(n):']
 ```
+
+### Use Extension With Converted (De)Tokenizer or Model combined with (De)Tokenizer
+
+To work with a converted tokenizer you need the `pack_strings`/`unpack_strings` functions.
+
+```python
+import numpy as np
+from openvino import Core
+from ov_tokenizer import unpack_strings
+
+
+core = Core()
+core.add_extension("path/to/libuser_ov_extensions.so")
+# detokenizer from codellama sentencepiece model
+compiled_detokenizer = core.compile_model("detokenizer.xml")
+
+token_ids = np.random.randint(100, 1000, size=(3, 5))
+openvino_output = compiled_detokenizer(token_ids)
+
+print(unpack_strings(openvino_output["string_output"]))
+# ['sc�ouition�', 'intvenord hasient', 'g shouldwer M more']
+```
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml
index 125defd29..a4ecc6a91 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml
@@ -29,7 +29,7 @@ all = [
 
 [tool.black]
 line-length = 119
-target-version = ["py38", "py39", "py310", "py311"]
+target-version = ["py38", "py39", "py310", "py311", "py312"]
 
 
 [tool.ruff]
From 64567eacb697cbdc07fdd5a2bcb1df97a77b6d65 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 6 Oct 2023 16:54:29 +0100
Subject: [PATCH 057/116] Add sentencepiece detokenizer test

---
 .../tests/tokenizers_test.py                  | 31 +++++++++++++++++--
 .../tokenizer/case_fold.cpp                   |  2 +-
 .../python/ov_tokenizer/hf_parser.py          |  7 ++---
 .../python/ov_tokenizer/tokenizer_pipeline.py |  2 +-
 .../tokenizer/sentence_piece.cpp              | 14 +--------
 .../user_ie_extensions/tokenizer/utils.cpp    |  2 +-
 .../tokenizer/vocab_decoder.cpp               |  1 -
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/tests/tokenizers_test.py
index d01cfb24f..f15ff00ff 100644
--- a/modules/custom_operations/tests/tokenizers_test.py
+++ b/modules/custom_operations/tests/tokenizers_test.py
@@ -87,7 +87,6 @@
     "bigscience/bloom",  # pack_strings for vocab is taking long time
     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
     # "google/flan-t5-xxl",  # needs Precompiled/CharsMap
-    # "decapoda-research/llama-65b-hf",  # not importable from hub
     # "jinmang2/textcnn-ko-dialect-classifier",  # Needs Metaspace Pretokenizer
     # "hyunwoongko/blenderbot-9B",  # hf script to get fast tokenizer doesn't work
 ]
@@ -118,6 +117,14 @@ def hf_and_ov_bpe_tokenizers(request):
     return hf_tokenizer, compiled_tokenizer
 
 
+@pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1])
+def hf_and_ov_bpe_detokenizer(request):
+    hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=True)
+    _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True)
+    compiled_detokenizer = core.compile_model(ov_detokenizer)
+    return hf_tokenizer, compiled_detokenizer
+
+
 @pytest.fixture(scope="session", params=[True, False], ids=lambda is_fast: "Fast" if is_fast else "Slow")
 def fast_tokenzier(request):
     return request.param
@@ -192,6 +199,24 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string):
     assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}"
 
 
+@pytest.mark.parametrize(
+    "test_string",
+    [
+        *eng_test_strings,
+        *multilingual_test_strings,
+        *emoji_test_strings,
+    ]
+)
+def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string):
+    hf_tokenizer, ov_detokenizer = hf_and_ov_bpe_detokenizer
+
+    token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids
+    hf_output = hf_tokenizer.batch_decode(token_ids)
+    ov_output = 
unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) + + assert ov_output == hf_output + + @pytest.mark.parametrize( "test_string", [ @@ -218,11 +243,11 @@ def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_strin *emoji_test_strings, ] ) -def test_sentencepiece_detokenizer(sentencepice_model_tokenizers, test_string): +def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_string): hf_tokenizer, _, ov_detokenizer = sentencepice_model_tokenizers token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) - assert hf_output == ov_output + assert ov_output == hf_output diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp index 7a8cff580..8c5fd681b 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp @@ -22,4 +22,4 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input using namespace paddlenlp::fast_tokenizer; return normalizers::NormalizedString(str).Lowercase().GetStr(); }); -} \ No newline at end of file +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 315cf1274..6c1017340 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -291,16 +291,13 @@ def convert_sentencepiece_model_tokenizer( input_node.set_friendly_name("string_input") if hasattr(hf_tokenizer, "add_eos_token"): - add_eos_token = hf_tokenizer.add_eos_token + add_eos_token = hf_tokenizer.add_eos_token or False else: add_eos_token = ( getattr(hf_tokenizer, "truncation_side", "") == "right" or getattr(hf_tokenizer, "padding_side", "") == "right" ) - if hasattr(hf_tokenizer, "add_bos_token"): - add_bos_token = hf_tokenizer.add_bos_token - else: - add_bos_token = add_eos_token + add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False tokenizer_node = factory.create( "SentencepieceTokenizer", diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 57ba5f035..8fe88b65c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -754,7 +754,7 @@ def get_decoder_ov_subgraph(self, greedy_decoder: bool = False) -> Model: else: input_node = op.Parameter(Type.i32, PartialShape(["?", "?"])) token_ids = input_node - outputs = self.create_decoding_pipeline(token_ids) + outputs = self.create_decoding_pipeline([token_ids]) model = Model(outputs, [input_node], name="tokenizer_decoder") model.output().tensor.add_names({"string_output"}) return model diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp index e45a13399..7e3c5e05a 100644 --- 
a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
@@ -173,17 +173,6 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
 
 #else
 
-    // const uint8_t* strings = inputs[1].data<const uint8_t>();
-    // auto bitstream_size = inputs[1].get_byte_size();
-
-    // // check the format of the input bitstream representing the string tensor
-    // FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
-    // auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
-    // FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size,
-    //     "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
-    // auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
-    // auto end_ids = begin_ids + 1;
-    // auto data = strings + 4 + 4 + 4 * batch_size;
     int32_t batch_size;
     const int32_t* begin_ids;
     const int32_t* end_ids;
@@ -201,7 +190,6 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
         auto begin_ind = begin_ids[batch_ind];
         auto end_ind = end_ids[batch_ind];
         absl::string_view sentence((const char*)data + begin_ind, end_ind - begin_ind);
-        //std::cerr << "string: " << sentence << "\n";
 #endif
         std::vector<int32_t> ids;
         CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
@@ -317,4 +305,4 @@ bool SentencepieceDetokenizer::has_evaluate() const {
 
 std::shared_ptr<Node> SentencepieceDetokenizer::clone_with_new_inputs(const OutputVector& new_args) const {
     return std::make_shared<SentencepieceDetokenizer>(new_args, m_sp);
-}
\ No newline at end of file
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
index 0509438fd..e503ee8d6 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
@@ -221,4 +221,4 @@ std::shared_ptr<Constant> string_attribute_to_constant (const ov::frontend::NodeCont
 #else
     return std::make_shared<Constant>(element::u8, Shape{value.length()}, (const void*)value.data());
 #endif
-}
\ No newline at end of file
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
index 81e398bb0..310bd99fa 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
@@ -70,4 +70,3 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     outputs[4].set_shape({char_offset});
     return true;
 }
-
From c42d1bd5e8e9e101094f6c112826481b0c619979 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Mon, 9 Oct 2023 18:16:53 +0100
Subject: [PATCH 058/116] Unified interface for fast and sentencepiece
 tokenizers

---
 .../tests/tokenizers_test.py                  | 45 +++++++++--------
 .../tokenizer/python/README.md                |  4 +-
 .../tokenizer/python/ov_tokenizer/__init__.py |  1 -
 .../python/ov_tokenizer/convert_tokenizer.py  | 50 +++++++------------
 .../python/ov_tokenizer/hf_parser.py          | 42 +++++++++++++++-
 5 files changed, 83 insertions(+), 59 deletions(-)

diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/tests/tokenizers_test.py
index f15ff00ff..131097c40 100644
--- a/modules/custom_operations/tests/tokenizers_test.py
+++ b/modules/custom_operations/tests/tokenizers_test.py
@@ -16,7
+16,6 @@ connect_models, pack_strings, unpack_strings, - convert_sentencepiece_model_tokenizer ) @@ -101,42 +100,44 @@ ] -@pytest.fixture(scope="session", params=wordpiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) -def hf_and_ov_wordpiece_tokenizers(request): - hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=True) - ov_tokenizer = convert_tokenizer(hf_tokenizer) +def get_tokenizer(request, fast_tokenizer=True): + hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenizer) + ov_tokenizer = convert_tokenizer(hf_tokenizer, with_decoder=False) compiled_tokenizer = core.compile_model(ov_tokenizer) return hf_tokenizer, compiled_tokenizer +def get_tokenizer_detokenizer(request, fast_tokenizer=True): + hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenizer) + ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True) + compiled_tokenizer = core.compile_model(ov_tokenizer) + compiled_detokenizer = core.compile_model(ov_detokenizer) + return hf_tokenizer, compiled_tokenizer, compiled_detokenizer + + +@pytest.fixture(scope="session", params=wordpiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_and_ov_wordpiece_tokenizers(request): + return get_tokenizer(request) + + @pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def hf_and_ov_bpe_tokenizers(request): - hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=True) - ov_tokenizer = convert_tokenizer(hf_tokenizer) - compiled_tokenizer = core.compile_model(ov_tokenizer) - return hf_tokenizer, compiled_tokenizer + return get_tokenizer_detokenizer(request) @pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def hf_and_ov_bpe_detokenizer(request): - hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=True) - _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True) - compiled_detokenizer = core.compile_model(ov_detokenizer) - return hf_tokenizer, compiled_detokenizer + return get_tokenizer_detokenizer(request) @pytest.fixture(scope="session", params=[True, False], ids=lambda is_fast: "Fast" if is_fast else "Slow") -def fast_tokenzier(request): +def fast_tokenizer(request): return request.param @pytest.fixture(scope="session", params=sentencepiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) -def sentencepice_model_tokenizers(request, fast_tokenzier): - hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenzier) - ov_tokenizer, ov_detokenizer = convert_sentencepiece_model_tokenizer(hf_tokenizer, with_decoder=True) - compiled_tokenizer = core.compile_model(ov_tokenizer) - compiled_detokenizer = core.compile_model(ov_detokenizer) - return hf_tokenizer, compiled_tokenizer, compiled_detokenizer +def sentencepice_model_tokenizers(request, fast_tokenizer): + return get_tokenizer_detokenizer(request, fast_tokenizer) @pytest.mark.parametrize( @@ -186,7 +187,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers ] ) def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): - hf_tokenizer, ov_tokenizer = hf_and_ov_bpe_tokenizers + hf_tokenizer, ov_tokenizer, _ = hf_and_ov_bpe_tokenizers packed_strings = pack_strings([test_string]) hf_tokenized = hf_tokenizer([test_string], return_tensors="np") @@ -208,7 +209,7 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): ] ) def 
test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string):
-    hf_tokenizer, ov_detokenizer = hf_and_ov_bpe_detokenizer
+    hf_tokenizer, _, ov_detokenizer = hf_and_ov_bpe_detokenizer
 
     token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids
     hf_output = hf_tokenizer.batch_decode(token_ids)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index 37c7f6121..134030a4e 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -97,7 +97,7 @@ print(f"HuggingFace logits {hf_output.logits}")
 
 ### Convert SentencePiece Model Tokenizer
 
 ```python
 from transformers import AutoTokenizer
 from openvino import compile_model
-from ov_tokenizer import init_extension, convert_sentencepiece_model_tokenizer, pack_strings, unpack_strings
+from ov_tokenizer import init_extension, convert_tokenizer, pack_strings, unpack_strings
 
 
 init_extension("path/to/libuser_ov_extensions.so")
@@ -108,7 +108,7 @@ hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 
 text_input = ["def fibonnaci(n):"]
 hf_input = hf_tokenizer(text_input, return_tensors="np")
 
-ov_tokenizer, ov_detokenizer = convert_sentencepiece_model_tokenizer(hf_tokenizer, with_decoder=True)
+ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True)
 compiled_tokenizer = compile_model(ov_tokenizer)
 compiled_detokenizer = compile_model(ov_detokenizer)
 ov_input = compiled_tokenizer(pack_strings(text_input))
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
index f549dae88..bcee3adac 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
@@ -3,6 +3,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .convert_tokenizer import connect_models, convert_tokenizer
-from .hf_parser import convert_sentencepiece_model_tokenizer
 from .node_factory import init_extension
 from .str_pack import pack_strings, unpack_strings
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
index d338925d2..5114902f0 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
@@ -16,45 +16,31 @@ def convert_tokenizer(
     tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False, greedy_decoder=False
 ) -> Union[Model, Tuple[Model, Model]]:
+    # todo: add support for more than 1 input
+    if number_of_inputs > 1:
+        raise ValueError("Tokenizers with more than one input are not supported yet.")
+
     if "transformers" in sys.modules:
-        from transformers import PreTrainedTokenizerBase
+        from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
 
-        from .hf_parser import TransformersTokenizerPipelineParser
+        from .hf_parser import is_sentencepiece_model, convert_sentencepiece_model_tokenizer, convert_fast_tokenizer
 
-        # TODO: Remove this check
         if isinstance(tokenizer_object, PreTrainedTokenizerBase):
-            pipeline = 
- ov_tokenizer = pipeline.get_encoder_ov_subgraph() - if with_decoder: - ov_detokenizer = pipeline.get_decoder_ov_subgraph(greedy_decoder) - output_names = tokenizer_object.model_input_names - - ov_tokenizer_output_names = ["input_ids", "attention_mask"] - if len(output_names) == 3 and len(ov_tokenizer.outputs) == 3: - ov_tokenizer_output_names.insert(1, "token_type_ids") - - filtered_outputs = [] - for i, output_name in enumerate(ov_tokenizer_output_names): - current_output = next( - (output for output in ov_tokenizer.outputs if output.any_name == output_name), - False, + if is_sentencepiece_model(tokenizer_object): + return convert_sentencepiece_model_tokenizer( + tokenizer_object, + add_attention_mask=True, + with_decoder=with_decoder, + greedy_decoder=greedy_decoder, ) - if current_output: - filtered_outputs.append(current_output) - continue - - if output_name in output_names: - ov_tokenizer.output(i).tensor.add_names({output_name}) - filtered_outputs.append(ov_tokenizer.output(i)) - - if with_decoder: - return ( - Model(filtered_outputs, ov_tokenizer.get_parameters()), - ov_detokenizer, + elif isinstance(tokenizer_object, PreTrainedTokenizerFast): + return convert_fast_tokenizer( + tokenizer_object, + number_of_inputs=number_of_inputs, + with_decoder=with_decoder, + greedy_decoder=greedy_decoder, ) - return Model(filtered_outputs, ov_tokenizer.get_parameters()) - raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 6c1017340..49ab10602 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -263,8 +263,46 @@ def decoding(self) -> None: return -def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase"): - return hf_tokenizer.vocab_files_names.get("vocab_file", "").endswith(".model") +def convert_fast_tokenizer( + hf_tokenizer: "PreTrainedTokenizerBase", + number_of_inputs: int = 1, + with_decoder: bool = False, + greedy_decoder: bool = False, +) -> Union[Model, Tuple[Model, Model]]: + pipeline = TransformersTokenizerPipelineParser(hf_tokenizer).parse(number_of_inputs=number_of_inputs) + ov_tokenizer = pipeline.get_encoder_ov_subgraph() + output_names = hf_tokenizer.model_input_names + + ov_tokenizer_output_names = ["input_ids", "attention_mask"] + if len(output_names) == 3 and len(ov_tokenizer.outputs) == 3: + ov_tokenizer_output_names.insert(1, "token_type_ids") + + filtered_outputs = [] + for i, output_name in enumerate(ov_tokenizer_output_names): + current_output = next( + (output for output in ov_tokenizer.outputs if output.any_name == output_name), + False, + ) + if current_output: + filtered_outputs.append(current_output) + continue + + if output_name in output_names: + ov_tokenizer.output(i).tensor.add_names({output_name}) + filtered_outputs.append(ov_tokenizer.output(i)) + + if with_decoder: + ov_detokenizer = pipeline.get_decoder_ov_subgraph(greedy_decoder) + return ( + Model(filtered_outputs, ov_tokenizer.get_parameters()), + ov_detokenizer, + ) + + return Model(filtered_outputs, ov_tokenizer.get_parameters()) + + +def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: + return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model") def 
convert_sentencepiece_model_tokenizer( From 8b2944392044c35fc8fb14f6b91e61fe2e55feeb Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 11 Oct 2023 12:13:37 +0100 Subject: [PATCH 059/116] Add Full Pipeline example for Sentencepiece Move greedy decoding pipeline from detokenizer to model --- .../tests/tokenizers_test.py | 40 +++---- .../tokenizer/python/README.md | 111 +++++++++++------- .../tokenizer/python/ov_tokenizer/__init__.py | 3 +- .../python/ov_tokenizer/common_pipelines.py | 23 ---- .../python/ov_tokenizer/convert_tokenizer.py | 63 +--------- .../python/ov_tokenizer/hf_parser.py | 19 +-- .../python/ov_tokenizer/tokenizer_pipeline.py | 30 ++--- .../tokenizer/python/ov_tokenizer/utils.py | 97 +++++++++++++++ 8 files changed, 209 insertions(+), 177 deletions(-) delete mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/tests/tokenizers_test.py index 131097c40..87f9154d7 100644 --- a/modules/custom_operations/tests/tokenizers_test.py +++ b/modules/custom_operations/tests/tokenizers_test.py @@ -186,18 +186,14 @@ def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers *emoji_test_strings, ] ) -def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): - hf_tokenizer, ov_tokenizer, _ = hf_and_ov_bpe_tokenizers - packed_strings = pack_strings([test_string]) +def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): + hf_tokenizer, ov_tokenizer, _ = sentencepice_model_tokenizers - hf_tokenized = hf_tokenizer([test_string], return_tensors="np") - ov_tokenized = ov_tokenizer(packed_strings) + hf_tokenized = hf_tokenizer(test_string, return_tensors="np") + ov_tokenized = ov_tokenizer(pack_strings([test_string])) for output_name, hf_result in hf_tokenized.items(): - ov_result = ov_tokenized.get(output_name) - # galactica tokenizer has 3 output, but model has 2 inputs - if ov_result is not None: - assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" + assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" @pytest.mark.parametrize( @@ -208,11 +204,11 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): *emoji_test_strings, ] ) -def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): - hf_tokenizer, _, ov_detokenizer = hf_and_ov_bpe_detokenizer +def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_string): + hf_tokenizer, _, ov_detokenizer = sentencepice_model_tokenizers token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids - hf_output = hf_tokenizer.batch_decode(token_ids) + hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) assert ov_output == hf_output @@ -226,14 +222,18 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): *emoji_test_strings, ] ) -def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): - hf_tokenizer, ov_tokenizer, _ = sentencepice_model_tokenizers +def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): + hf_tokenizer, ov_tokenizer, _ = hf_and_ov_bpe_tokenizers + packed_strings = pack_strings([test_string]) - hf_tokenized = hf_tokenizer(test_string, 
return_tensors="np") - ov_tokenized = ov_tokenizer(pack_strings([test_string])) + hf_tokenized = hf_tokenizer([test_string], return_tensors="np") + ov_tokenized = ov_tokenizer(packed_strings) for output_name, hf_result in hf_tokenized.items(): - assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + ov_result = ov_tokenized.get(output_name) + # galactica tokenizer has 3 output, but model has 2 inputs + if ov_result is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" @pytest.mark.parametrize( @@ -244,11 +244,11 @@ def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_strin *emoji_test_strings, ] ) -def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_string): - hf_tokenizer, _, ov_detokenizer = sentencepice_model_tokenizers +def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): + hf_tokenizer, _, ov_detokenizer = hf_and_ov_bpe_detokenizer token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids - hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) + hf_output = hf_tokenizer.batch_decode(token_ids) ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) assert ov_output == hf_output diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md index 134030a4e..2ca6d83bb 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md @@ -6,6 +6,7 @@ - Fast tokenizers based on Wordpiece and BPE models - Slow tokenizers based on SentencePiece model file - Combine OpenVINO models into a single model +- Add greedy decoding pipeline to text generation model ## Installation @@ -62,7 +63,6 @@ for output_name in hf_output: ### Connect Tokenizer to a Model - ```python from transformers import AutoTokenizer, AutoModelForSequenceClassification from openvino import compile_model, convert_model @@ -92,46 +92,7 @@ print(f"HuggingFace logits {hf_output.logits}") # HuggingFace logits tensor([[ 1.2007, -1.4698]], grad_fn=) ``` -### Convert SentencePiece Model Tokenzier - -```python -from transformers import AutoTokenizer -from openvino import compile_model -from ov_tokenizer import init_extension, convert_tokenizer, pack_strings, unpack_strings - - -init_extension("path/to/libuser_ov_extensions.so") - -checkpoint = "codellama/CodeLlama-7b-hf" -hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint) - -text_input = ["def fibonnaci(n):"] -hf_input = hf_tokenizer(text_input, return_tensors="np") - -ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True) -compiled_tokenizer = compile_model(ov_tokenizer) -compiled_detokenizer = compile_model(ov_detokenizer) -ov_input = compiled_tokenizer(pack_strings(text_input)) - -for model_input_name in hf_input: - print(f"OpenVINO {model_input_name} = {ov_input[model_input_name]}") - print(f"HuggingFace {model_input_name} = {hf_input[model_input_name]}") -# OpenVINO input_ids = [[ 1 822 18755 11586 455 29898 29876 1125]] -# HuggingFace input_ids = [[ 1 822 18755 11586 455 29898 29876 1125]] -# OpenVINO attention_mask = [[1 1 1 1 1 1 1 1]] -# HuggingFace attention_mask = [[1 1 1 1 1 1 1 1]] - -ov_output = unpack_strings(compiled_detokenizer(hf_input.input_ids)["string_output"]) -hf_output = hf_tokenizer.batch_decode(hf_input.input_ids, skip_special_tokens=True) 
-print(f"OpenVINO output string: `{ov_output}`") -# OpenVINO output string: ['def fibonnaci(n):'] -print(f"HuggingFace output string: `{hf_output}`") -# HuggingFace output string: ['def fibonnaci(n):'] -``` - -To connect a detokenizer to a `logits` model output, set `greedy_decoder=True` when using the `convert_tokenizer` or `convert_sentencepiece_model_tokenizer` function, enabling a greedy decoding pipeline before detoknizer. This allows the detokenizer to be connected to the `logits` model output. - -### Use Extension With Converted (De)Tokenizer or Model combined with (De)Tokenizer +### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer To work with converted tokenizer you need `pack_strings`/`unpack_strings` functions. @@ -152,3 +113,71 @@ openvino_output = compiled_detokenizer(token_ids) print(unpack_strings(openvino_output["string_output"])) # ['sc�ouition�', 'intvenord hasient', 'g shouldwer M more'] ``` + +### Text generation pipeline + +```python +import numpy as np +from openvino import compile_model, convert_model +from transformers import AutoModelForCausalLM, AutoTokenizer +from ov_tokenizer import ( + add_greedy_decoding, + convert_tokenizer, + init_extension, + pack_strings, + unpack_strings, +) + + +init_extension("path/to/libuser_ov_extensions.so") + +# Use different repo for the tokenizer because the original repo doesn't have .model file +# Sentencepiece(Unigram) tokenizer supported only with .model file +tokenizer_checkpoint = "microsoft/Llama2-7b-WhoIsHarryPotter" +model_checkpoint = "nickypro/tinyllama-15M" +hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) +hf_model = AutoModelForCausalLM.from_pretrained(model_checkpoint, use_cache=False) + +# convert hf tokenizer +text_input = ["Quick brown fox was"] +ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True) +compiled_tokenizer = compile_model(ov_tokenizer) + +# transform input text into tokens +ov_input = compiled_tokenizer(pack_strings(text_input)) +hf_input = hf_tokenizer(text_input, return_tensors="pt") + +# convert Pytorch model to OpenVINO IR and add greedy decoding pipeline to it +ov_model = convert_model(hf_model, example_input=hf_input.data) +ov_model_with_greedy_decoding = add_greedy_decoding(ov_model) +compiled_model = compile_model(ov_model_with_greedy_decoding) + +# generate new tokens +new_tokens_size = 10 +prompt_size = ov_input["input_ids"].shape[-1] +input_dict = { + output.any_name: np.hstack([tensor, np.zeros(shape=(1, new_tokens_size), dtype=np.int_)]) + for output, tensor in ov_input.items() +} +for idx in range(prompt_size, prompt_size + new_tokens_size): + output = compiled_model(input_dict)["token_ids"] + input_dict["input_ids"][:, idx] = output[:, idx - 1] + input_dict["attention_mask"][:, idx] = 1 +ov_token_ids = input_dict["input_ids"] + +hf_token_ids = hf_model.generate( + **hf_input, + min_new_tokens=new_tokens_size, + max_new_tokens=new_tokens_size, + temperature=0, # greedy decoding +) + +# decode model output +compiled_detokenizer = compile_model(ov_detokenizer) +ov_output = unpack_strings(compiled_detokenizer(ov_token_ids)["string_output"]) +hf_output = hf_tokenizer.batch_decode(hf_token_ids, skip_special_tokens=True) +print(f"OpenVINO output string: `{ov_output}`") +# OpenVINO output string: `['Quick brown fox was walking through the forest. He was looking for something']` +print(f"HuggingFace output string: `{hf_output}`") +# HuggingFace output string: `['Quick brown fox was walking through the forest. 
He was looking for something']` +``` diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py index bcee3adac..ce757b861 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py @@ -2,6 +2,7 @@ # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from .convert_tokenizer import connect_models, convert_tokenizer +from .convert_tokenizer import convert_tokenizer from .node_factory import init_extension from .str_pack import pack_strings, unpack_strings +from .utils import add_greedy_decoding, connect_models diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py deleted file mode 100644 index dc5e41ddd..000000000 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/common_pipelines.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from typing import List - -from openvino.runtime import Output, op -from openvino.runtime import opset12 as opset - - -def get_greedy_decoding_ov_subgraph(logits_node: op.Parameter) -> List[Output]: - argmax = opset.topk( - data=logits_node, - k=1, - axis=-1, - mode="max", - sort="none", - name="ArgMax", - ) - return opset.squeeze( - data=argmax.output(1), - axes=-1, - ).outputs() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py index 5114902f0..402d35bb4 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -4,7 +4,7 @@ import logging import sys -from typing import Any, Optional, Sequence, Tuple, Union +from typing import Any, Tuple, Union from openvino.runtime import Model from openvino.runtime.exceptions import OVTypeError @@ -14,7 +14,7 @@ def convert_tokenizer( - tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False, greedy_decoder=False + tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False ) -> Union[Model, Tuple[Model, Model]]: # todo: add support for more then 1 input if number_of_inputs > 1: @@ -23,75 +23,22 @@ def convert_tokenizer( if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast - from .hf_parser import is_sentencepiece_model, convert_sentencepiece_model_tokenizer, convert_fast_tokenizer + from .hf_parser import convert_fast_tokenizer, convert_sentencepiece_model_tokenizer, is_sentencepiece_model if isinstance(tokenizer_object, PreTrainedTokenizerBase): if is_sentencepiece_model(tokenizer_object): + logger.info("Convert tokenizer using SentencePiece .model file.") return convert_sentencepiece_model_tokenizer( tokenizer_object, add_attention_mask=True, with_decoder=with_decoder, - greedy_decoder=greedy_decoder, ) elif isinstance(tokenizer_object, PreTrainedTokenizerFast): + logger.info("Convert Huggingface Fast tokenizer pipeline.") return convert_fast_tokenizer( 
tokenizer_object, number_of_inputs=number_of_inputs, with_decoder=with_decoder, - greedy_decoder=greedy_decoder, ) raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") - - -def connect_models( - first: Model, - second: Model, - name_map: Optional[Sequence[Tuple[str, str]]] = None, - by_indices: bool = False, - keep_unaligned_inputs: bool = True, - keep_unaligned_outputs: bool = False, -) -> Model: - if by_indices: - min_len = min(len(first.outputs), len(second.inputs)) - aligned_first_outputs = first.outputs[:min_len] - aligned_second_inputs = second.inputs[:min_len] - elif name_map is None: - aligned_first_outputs = first.outputs - aligned_second_inputs = [second.input(model1_output.get_any_name()) for model1_output in aligned_first_outputs] - else: - aligned_first_outputs = [first.output(name1) for name1, _ in name_map] - aligned_second_inputs = [second.input(name2) for _, name2 in name_map] - - for second_input, first_output in zip(aligned_second_inputs, aligned_first_outputs): - logger.debug(f"Connecting: {first_output.get_any_name()} -> {second_input.get_any_name()}") - for target in second_input.get_target_inputs(): - target.replace_source_output(first_output.get_node().input_value(0)) - # target.replace_source_output(model1_output) # TODO: Produces incorrect topology - - new_inputs = first.get_parameters() - remaining_inputs = [input_ for input_ in second.inputs if input_ not in aligned_second_inputs] - if keep_unaligned_inputs: - new_inputs.extend(remaining_inputs) - elif remaining_inputs: - logger.info( - "Some inputs of the second model were left uncovered and not included in the connected model: " - + ", ".join(input_.name for input_ in remaining_inputs) - + ". To add them set `keep_unaligned_inputs` to `True`" - ) - - new_outputs = second.outputs - remaining_outputs = [output for output in first.outputs if output not in aligned_first_outputs] - if keep_unaligned_outputs: - new_outputs.extend(remaining_outputs) - elif remaining_outputs: - logger.info( - "Some outputs of the first model were left uncovered and not included in the connected model: " - + ", ".join(output.name for output in remaining_outputs) - + ". 
To add them set `keep_unaligned_outputs` to `True`" - ) - - connected_model = Model(new_outputs, new_inputs, f"{first.get_name()}_{second.get_name()}") - # TODO: Cleanup model1 and mode2 to avoid using them, they are ill-formed after the reconnection - connected_model.validate_nodes_and_infer_types() - return connected_model diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 49ab10602..06f0bea41 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -15,7 +15,6 @@ from openvino.runtime.exceptions import OVTypeError from openvino.runtime.utils.types import as_node, make_constant_node -from .common_pipelines import get_greedy_decoding_ov_subgraph from .node_factory import factory from .tokenizer_pipeline import ( BPETokenizationStep, @@ -291,14 +290,11 @@ def convert_fast_tokenizer( ov_tokenizer.output(i).tensor.add_names({output_name}) filtered_outputs.append(ov_tokenizer.output(i)) + tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters()) if with_decoder: - ov_detokenizer = pipeline.get_decoder_ov_subgraph(greedy_decoder) - return ( - Model(filtered_outputs, ov_tokenizer.get_parameters()), - ov_detokenizer, - ) + return tokenizer_model, pipeline.get_decoder_ov_subgraph() - return Model(filtered_outputs, ov_tokenizer.get_parameters()) + return tokenizer_model def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: @@ -309,7 +305,6 @@ def convert_sentencepiece_model_tokenizer( hf_tokenizer: "PreTrainedTokenizerBase", add_attention_mask: bool = True, with_decoder: bool = False, - greedy_decoder: bool = False, ) -> Union[Model, Tuple[Model, Model]]: if not is_sentencepiece_model(hf_tokenizer): raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.") @@ -384,12 +379,8 @@ def convert_sentencepiece_model_tokenizer( if not with_decoder: return tokenizer_encoder - if greedy_decoder: - decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) # (batch, sequence, logits) - token_ids = get_greedy_decoding_ov_subgraph(decoder_input)[0] # (batch, sequence) - else: - decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) - token_ids = decoder_input + decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) + token_ids = decoder_input decoder = factory.create( "SentencepieceDetokenizer", diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 8fe88b65c..7d262cfb4 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -10,11 +10,10 @@ import numpy as np from openvino.runtime import Model, Output, PartialShape, Type, op -from openvino.runtime import opset10 as opset +from openvino.runtime import opset12 as opset from openvino.runtime.exceptions import OVTypeError, UserInputError from openvino.runtime.utils.types import as_node, make_constant_node -from .common_pipelines import get_greedy_decoding_ov_subgraph from .node_factory import factory from .str_pack import pack_string, 
pack_strings @@ -690,7 +689,7 @@ def get_encoder_ov_subgraph(self) -> Model: input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs() for step in self.normalization_steps: input_node = step.get_ov_subgraph(input_node) - input_node = self.add_ragged_dimention(input_node) + input_node = self.add_ragged_dimension(input_node) for step in chain(self.pre_tokenization_steps, self.tokenization_steps): input_node = step.get_ov_subgraph(input_node) @@ -722,21 +721,16 @@ def post_tokenization_steps(self) -> List[PostTokenizationStep]: def decoding_steps(self) -> List[DecodingStep]: return [step for step in self.steps if isinstance(step, DecodingStep)] - def add_ragged_dimention(self, input_node: List[Output]) -> List[Output]: + @staticmethod + def add_ragged_dimension(input_node: List[Output]) -> List[Output]: shape = opset.shape_of(input_node[0]) batch_size = opset.gather(shape, as_node(0), as_node(0)) - # FIXME: Cannot create range with specific data type from python - ragged_begins = opset.convert( - opset.range(as_node(0), batch_size, as_node(1)), - "i32", - ).outputs() - ragged_ends = opset.convert( - opset.range( + ragged_begins = opset.range(as_node(0), batch_size, as_node(1), output_type="i32").outputs() + ragged_ends = opset.range( as_node(1), opset.add(batch_size, as_node(1)), as_node(1), - ), - "i32", + output_type="i32" ).outputs() return ragged_begins + ragged_ends + input_node @@ -747,13 +741,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: return factory.create("StringTensorPack", input_nodes).outputs() - def get_decoder_ov_subgraph(self, greedy_decoder: bool = False) -> Model: - if greedy_decoder: - input_node = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) - token_ids = get_greedy_decoding_ov_subgraph(input_node) - else: - input_node = op.Parameter(Type.i32, PartialShape(["?", "?"])) - token_ids = input_node + def get_decoder_ov_subgraph(self) -> Model: + input_node = op.Parameter(Type.i32, PartialShape(["?", "?"])) + token_ids = input_node outputs = self.create_decoding_pipeline([token_ids]) model = Model(outputs, [input_node], name="tokenizer_decoder") model.output().tensor.add_names({"string_output"}) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py new file mode 100644 index 000000000..bdcff7378 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging +from typing import Dict, Optional, Sequence, Tuple, Union + +from openvino import Model, PartialShape, Type +from openvino.runtime import op +from openvino.runtime import opset12 as opset + + +logger = logging.getLogger(__name__) + + +def get_greedy_decoding_ov_model() -> Model: + logits = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) + logits.set_friendly_name("logits") + argmax = opset.topk( + data=logits, + k=1, + axis=-1, + mode="max", + sort="none", + name="ArgMax", + ) + token_ids = opset.squeeze( + data=argmax.output(1), + axes=-1, + ) + token_ids.output(0).tensor.add_names({"token_ids"}) + return Model(token_ids.outputs(), [logits], name="greedy_decoder") + + +def connect_models( + first: Model, + second: Model, + name_map: Optional[Union[Sequence[Tuple[str, str]], Dict[str, str]]] = None, + by_indices: bool = False, + 
keep_second_model_unaligned_inputs: bool = True,
+    keep_remaining_first_model_outputs: bool = False,
+) -> Model:
+    if by_indices:
+        min_len = min(len(first.outputs), len(second.inputs))
+        aligned_first_outputs = first.outputs[:min_len]
+        aligned_second_inputs = second.inputs[:min_len]
+    elif name_map is None:
+        aligned_first_outputs = first.outputs
+        aligned_second_inputs = [second.input(model1_output.get_any_name()) for model1_output in aligned_first_outputs]
+    else:
+        if isinstance(name_map, dict):
+            name_map = list(name_map.items())
+        aligned_first_outputs = [first.output(name1) for name1, _ in name_map]
+        aligned_second_inputs = [second.input(name2) for _, name2 in name_map]
+
+    for second_input, first_output in zip(aligned_second_inputs, aligned_first_outputs):
+        logger.debug(f"Connecting: {first_output.get_any_name()} -> {second_input.get_any_name()}")
+        for target in second_input.get_target_inputs():
+            target.replace_source_output(first_output.get_node().input_value(0))
+            # target.replace_source_output(model1_output)  # TODO: Produces incorrect topology
+
+    new_inputs = first.get_parameters()
+    remaining_inputs = [input_ for input_ in second.inputs if input_ not in aligned_second_inputs]
+    if keep_second_model_unaligned_inputs:
+        new_inputs.extend(remaining_inputs)
+    elif remaining_inputs:
+        logger.info(
+            "Some inputs of the second model were left uncovered and not included in the connected model: "
+            + ", ".join(input_.name for input_ in remaining_inputs)
+            + ". To add them set `keep_second_model_unaligned_inputs` to `True`"
+        )
+
+    new_outputs = second.outputs
+    remaining_outputs = [output for output in first.outputs if output not in aligned_first_outputs]
+    if keep_remaining_first_model_outputs:
+        new_outputs.extend(remaining_outputs)
+    elif remaining_outputs:
+        logger.info(
+            "Some outputs of the first model were left uncovered and not included in the connected model: "
+            + ", ".join(output.name for output in remaining_outputs)
+            + ". To add them set `keep_remaining_first_model_outputs` to `True`"
+        )
+
+    connected_model = Model(new_outputs, new_inputs, f"{first.get_name()}_with_{second.get_name()}")
+    # TODO: Clean up model1 and model2 to avoid using them, they are ill-formed after the reconnection
+    connected_model.validate_nodes_and_infer_types()
+    return connected_model
+
+
+def add_greedy_decoding(text_generation_model: Model, logits_output: str = "logits") -> Model:
+    return connect_models(
+        first=text_generation_model,
+        second=get_greedy_decoding_ov_model(),
+        name_map={logits_output: "logits"},
+        keep_second_model_unaligned_inputs=True,
+        keep_remaining_first_model_outputs=True,
+    )

From 2ee3707d0dbe4ae8db8e8714004e29b12310225b Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 11 Oct 2023 17:04:29 +0100
Subject: [PATCH 060/116] Update third-party-programs.txt

---
 third-party-programs.txt | 495 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 495 insertions(+)

diff --git a/third-party-programs.txt b/third-party-programs.txt
index dde3d200c..f6c85bed0 100644
--- a/third-party-programs.txt
+++ b/third-party-programs.txt
@@ -3269,3 +3269,498 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
+ +------------------------------------------------------------- + +fast_tokenizer + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. 
Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +------------------------------------------------------------- + +re2 + +// Copyright (c) 2009 The RE2 Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------- + +icu4c + +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. 
+ +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +------------------------------------------------------------- + +sentencepiece + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. 
You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. 
+ +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file From 803d831a55e37387cd8f6d107f916b9348803e22 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 13 Oct 2023 16:58:12 +0100 Subject: [PATCH 061/116] Add Constants --- .../python/ov_tokenizer/constants.py | 19 +++++++++++++++ .../python/ov_tokenizer/hf_parser.py | 23 ++++++++++++------- .../python/ov_tokenizer/tokenizer_pipeline.py | 23 +++++++++++-------- .../tokenizer/python/ov_tokenizer/utils.py | 16 +++++++++---- 4 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py new file mode 100644 index 000000000..72db7ca10 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py @@ -0,0 +1,19 @@ +from enum import StrEnum, auto + + +class DecodingType(StrEnum): + greedy = auto() + + +ATTENTION_MASK_INPUT_NAME = "attention_mask" +TOKEN_IDS_INPUT_NAME = "input_ids" +TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids" + +LOGITS_OUTPUT_NAME = "logits" +TOKEN_IDS_OUTPUT_NAME = "token_ids" +STRING_OUTPUT_NAME = "string_output" + +GREEDY_DECODER_NAME = "greedy_decoder" + +TOKENIZER_ENCODER_NAME = "tokenizer_encoder" +TOKENIZER_DECODER_NAME = "tokenizer_decoder" diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 06f0bea41..c22429ed7 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -10,6 +10,14 @@ import numpy as np import openvino.runtime.opset12 as opset +from constants import ( + ATTENTION_MASK_INPUT_NAME, + STRING_OUTPUT_NAME, + TOKEN_IDS_INPUT_NAME, + TOKEN_TYPE_IDS_INPUT_NAME, + TOKENIZER_DECODER_NAME, + TOKENIZER_ENCODER_NAME, +) from openvino import save_model from openvino.runtime import Model, PartialShape, Type, op from openvino.runtime.exceptions import OVTypeError @@ -272,9 +280,9 @@ def convert_fast_tokenizer( ov_tokenizer = pipeline.get_encoder_ov_subgraph() output_names = hf_tokenizer.model_input_names - ov_tokenizer_output_names = ["input_ids", "attention_mask"] + ov_tokenizer_output_names = 
[TOKEN_IDS_INPUT_NAME, ATTENTION_MASK_INPUT_NAME] if len(output_names) == 3 and len(ov_tokenizer.outputs) == 3: - ov_tokenizer_output_names.insert(1, "token_type_ids") + ov_tokenizer_output_names.insert(1, TOKEN_TYPE_IDS_INPUT_NAME) filtered_outputs = [] for i, output_name in enumerate(ov_tokenizer_output_names): @@ -354,7 +362,7 @@ def convert_sentencepiece_model_tokenizer( "ScatterNDUpdate", [broadcast, indices, values], # FIXME: pad left side instead of right ) - scatternd_input_ids.output(0).tensor.add_names({"input_ids"}) + scatternd_input_ids.output(0).tensor.add_names({TOKEN_IDS_INPUT_NAME}) outputs = scatternd_input_ids.outputs() @@ -370,10 +378,10 @@ def convert_sentencepiece_model_tokenizer( ), ], ) - attention_mask.output(0).tensor.add_names({"attention_mask"}) + attention_mask.output(0).tensor.add_names({ATTENTION_MASK_INPUT_NAME}) outputs.append(attention_mask.output(0)) - tokenizer_encoder = Model(outputs, [input_node], "sp_tokenizer_encoder") + tokenizer_encoder = Model(outputs, [input_node], TOKENIZER_ENCODER_NAME) tokenizer_encoder.validate_nodes_and_infer_types() if not with_decoder: @@ -387,9 +395,8 @@ def convert_sentencepiece_model_tokenizer( [sp_model_node, token_ids], ) string_output = factory.create("StringTensorPack", decoder.outputs()).outputs() - string_output[0].tensor.add_names({"string_output"}) - tokenizer_decoder = Model(string_output, [decoder_input], "sp_tokenizer_decoder") + string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) + tokenizer_decoder = Model(string_output, [decoder_input], TOKENIZER_DECODER_NAME) tokenizer_decoder.validate_nodes_and_infer_types() - save_model(tokenizer_decoder, "detokenizer.xml") return tokenizer_encoder, tokenizer_decoder diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 7d262cfb4..488a3ce34 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -9,6 +9,14 @@ from typing import Any, Dict, List, Optional, Union import numpy as np +from constants import ( + ATTENTION_MASK_INPUT_NAME, + STRING_OUTPUT_NAME, + TOKEN_IDS_INPUT_NAME, + TOKEN_TYPE_IDS_INPUT_NAME, + TOKENIZER_DECODER_NAME, + TOKENIZER_ENCODER_NAME, +) from openvino.runtime import Model, Output, PartialShape, Type, op from openvino.runtime import opset12 as opset from openvino.runtime.exceptions import OVTypeError, UserInputError @@ -590,7 +598,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: else: max_length = make_constant_node(self.max_length, Type.i32) - names = ["input_ids", "token_type_ids"][: len(input_nodes) // 3] + names = [TOKEN_IDS_INPUT_NAME, TOKEN_TYPE_IDS_INPUT_NAME][: len(input_nodes) // 3] for i, name in enumerate(names): cur_outputs = factory.create( "RaggedToDense", @@ -604,7 +612,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: 0 ) # TODO: Change RaggedToDense to generate mask of any type - mask.tensor.add_names({"attention_mask"}) + mask.tensor.add_names({ATTENTION_MASK_INPUT_NAME}) outputs.append(mask) return outputs @@ -699,7 +707,7 @@ def get_encoder_ov_subgraph(self) -> Model: for step in self.post_tokenization_steps: processing_outputs = step.get_ov_subgraph(processing_outputs) - return Model(processing_outputs, string_inputs, name="tokenizer_encoder") + return 
Model(processing_outputs, string_inputs, name=TOKENIZER_ENCODER_NAME) @property def normalization_steps(self) -> List[NormalizationStep]: @@ -727,10 +735,7 @@ def add_ragged_dimension(input_node: List[Output]) -> List[Output]: batch_size = opset.gather(shape, as_node(0), as_node(0)) ragged_begins = opset.range(as_node(0), batch_size, as_node(1), output_type="i32").outputs() ragged_ends = opset.range( - as_node(1), - opset.add(batch_size, as_node(1)), - as_node(1), - output_type="i32" + as_node(1), opset.add(batch_size, as_node(1)), as_node(1), output_type="i32" ).outputs() return ragged_begins + ragged_ends + input_node @@ -745,6 +750,6 @@ def get_decoder_ov_subgraph(self) -> Model: input_node = op.Parameter(Type.i32, PartialShape(["?", "?"])) token_ids = input_node outputs = self.create_decoding_pipeline([token_ids]) - model = Model(outputs, [input_node], name="tokenizer_decoder") - model.output().tensor.add_names({"string_output"}) + model = Model(outputs, [input_node], name=TOKENIZER_DECODER_NAME) + model.output().tensor.add_names({STRING_OUTPUT_NAME}) return model diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py index bdcff7378..bdee5bcab 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py @@ -5,6 +5,7 @@ import logging from typing import Dict, Optional, Sequence, Tuple, Union +from constants import GREEDY_DECODER_NAME, LOGITS_OUTPUT_NAME, TOKEN_IDS_OUTPUT_NAME, DecodingType from openvino import Model, PartialShape, Type from openvino.runtime import op from openvino.runtime import opset12 as opset @@ -15,7 +16,7 @@ def get_greedy_decoding_ov_model() -> Model: logits = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) - logits.set_friendly_name("logits") + logits.set_friendly_name(LOGITS_OUTPUT_NAME) argmax = opset.topk( data=logits, k=1, @@ -28,8 +29,8 @@ def get_greedy_decoding_ov_model() -> Model: data=argmax.output(1), axes=-1, ) - token_ids.output(0).tensor.add_names({"token_ids"}) - return Model(token_ids.outputs(), [logits], name="greedy_decoder") + token_ids.output(0).tensor.add_names({TOKEN_IDS_OUTPUT_NAME}) + return Model(token_ids.outputs(), [logits], name=GREEDY_DECODER_NAME) def connect_models( @@ -87,11 +88,16 @@ def connect_models( return connected_model -def add_greedy_decoding(text_generation_model: Model, logits_output: str = "logits") -> Model: +def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME) -> Model: return connect_models( first=text_generation_model, second=get_greedy_decoding_ov_model(), - name_map={logits_output: "logits"}, + name_map={logits_output: LOGITS_OUTPUT_NAME}, keep_second_model_unaligned_inputs=True, keep_remaining_first_model_outputs=True, ) + + +class Generator: + def __init__(self, generation_model: Model, decoding_type: DecodingType = DecodingType.greedy) -> None: + pass From 72f6d9f896c8eea991b799bd7a58d5ecca9bc722 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 16 Oct 2023 19:37:13 +0100 Subject: [PATCH 062/116] Add CPP pack/unpack_strings functions Refactor greedy decoding --- .../python/ov_tokenizer/constants.py | 7 --- .../python/ov_tokenizer/hf_parser.py | 1 - .../tokenizer/python/ov_tokenizer/utils.py | 54 +++++++----------- .../tokenizer/string_tensor_unpack.cpp | 2 +- .../tokenizer/tensorflow_translators.cpp | 2 +- 
.../user_ie_extensions/tokenizer/utils.cpp | 57 ++++++++++++++++++- .../user_ie_extensions/tokenizer/utils.hpp | 7 ++- 7 files changed, 86 insertions(+), 44 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py index 72db7ca10..46d2626a9 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py @@ -1,10 +1,3 @@ -from enum import StrEnum, auto - - -class DecodingType(StrEnum): - greedy = auto() - - ATTENTION_MASK_INPUT_NAME = "attention_mask" TOKEN_IDS_INPUT_NAME = "input_ids" TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids" diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index c22429ed7..c46fdac70 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -18,7 +18,6 @@ TOKENIZER_DECODER_NAME, TOKENIZER_ENCODER_NAME, ) -from openvino import save_model from openvino.runtime import Model, PartialShape, Type, op from openvino.runtime.exceptions import OVTypeError from openvino.runtime.utils.types import as_node, make_constant_node diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py index bdee5bcab..76390d8f5 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py @@ -5,34 +5,15 @@ import logging from typing import Dict, Optional, Sequence, Tuple, Union -from constants import GREEDY_DECODER_NAME, LOGITS_OUTPUT_NAME, TOKEN_IDS_OUTPUT_NAME, DecodingType -from openvino import Model, PartialShape, Type -from openvino.runtime import op +from constants import LOGITS_OUTPUT_NAME, TOKEN_IDS_OUTPUT_NAME +from openvino import Model +from openvino.preprocess import PrePostProcessor from openvino.runtime import opset12 as opset logger = logging.getLogger(__name__) -def get_greedy_decoding_ov_model() -> Model: - logits = op.Parameter(Type.i32, PartialShape(["?", "?", "?"])) - logits.set_friendly_name(LOGITS_OUTPUT_NAME) - argmax = opset.topk( - data=logits, - k=1, - axis=-1, - mode="max", - sort="none", - name="ArgMax", - ) - token_ids = opset.squeeze( - data=argmax.output(1), - axes=-1, - ) - token_ids.output(0).tensor.add_names({TOKEN_IDS_OUTPUT_NAME}) - return Model(token_ids.outputs(), [logits], name=GREEDY_DECODER_NAME) - - def connect_models( first: Model, second: Model, @@ -88,16 +69,25 @@ def connect_models( return connected_model -def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME) -> Model: - return connect_models( - first=text_generation_model, - second=get_greedy_decoding_ov_model(), - name_map={logits_output: LOGITS_OUTPUT_NAME}, - keep_second_model_unaligned_inputs=True, - keep_remaining_first_model_outputs=True, +def greedy_decoder(input) -> Model: + argmax = opset.topk( + data=input, + k=1, + axis=-1, + mode="max", + sort="none", + name="ArgMax", + ) + token_ids = opset.squeeze( + data=argmax.output(1), + axes=-1, ) + return 
token_ids.output(0)
 
 
-class Generator:
-    def __init__(self, generation_model: Model, decoding_type: DecodingType = DecodingType.greedy) -> None:
-        pass
+def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME) -> Model:
+    ppp = PrePostProcessor(text_generation_model)
+    ppp.output(logits_output).postprocess().custom(greedy_decoder)
+    model = ppp.build()
+    model.output(logits_output).tensor.set_names({TOKEN_IDS_OUTPUT_NAME})
+    return model
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
index 119bbf9b8..ec9e22bb5 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
@@ -97,7 +97,7 @@ bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVec
     if(tensor.get_element_type() == element::string) {
         Shape input_shape = tensor.get_shape();
         const std::string* input_strings = tensor.data<std::string>();
-        unpack_strings(input_strings, input_shape, outputs[0], outputs[1], outputs[2]);
+        unpack_strings_to_tensors(input_strings, input_shape, outputs[0], outputs[1], outputs[2]);
         return true;
     } else {
 #endif
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
index 6eea48158..51179dcac 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
@@ -222,7 +222,7 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
     auto value_as_any = node.get_attribute_as_any("value");
     const auto& values = value_as_any.as<std::vector<std::string>>();
     ov::Tensor begins(element::i32, {}), ends(element::i32, {}), chars(element::u8, {});
-    unpack_strings(&values[0], {values.size()}, begins, ends, chars);
+    unpack_strings_to_tensors(&values[0], {values.size()}, begins, ends, chars);
     const_node = std::make_shared<StringTensorPack>(OutputVector{
         std::make_shared<Constant>(begins),
         std::make_shared<Constant>(ends),
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
index e503ee8d6..f9cbc8c7a 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
@@ -90,7 +90,7 @@ void set_ragged_output(Node* node, size_t output_index, const PartialShape& shap
 }
 
 
-void unpack_strings (const std::string* strings, const Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars) { // TODO: no need for a reference to a ov::Tensor?
+void unpack_strings_to_tensors (const std::string* strings, const Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars) { // TODO: no need for a reference to a ov::Tensor?
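+    // Editorial note, not part of the original patch: this helper decomposes an array of
+    // std::string into the three plain tensors used by StringTensorUnpack --
+    // begins/ends hold i32 per-string byte offsets, chars holds all u8 bytes.
+    // For example, {"Hi", "!"} would yield begins = [0, 2], ends = [2, 3], chars = "Hi!".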
auto nelements = shape_size(shape); size_t total = 0; @@ -222,3 +222,58 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data()); #endif } + + +// Pack any container with string to ov::Tensor with element type u8 +// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() +// so basically any STL container with std::string is compatible +// Tensor destination will be reshaped according tha input data +template +void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) { + auto batch_size = strings.size(); + + // First run over all elements: calculate total memory required to hold all strings + auto symbols_size = std::accumulate( + strings.begin(), strings.end(), size_t(0), + [](size_t accum, typename BatchOfStrings::const_reference s) + { return accum + s.length(); }); + + auto total_size = 4*(1 + 1 + batch_size) + symbols_size; + destination.set_shape({total_size}); + + auto data = destination.data(); + auto pbatch_size = reinterpret_cast(data); + auto pindices = pbatch_size + 1; + auto psymbols = reinterpret_cast(pindices + 1 + batch_size); + size_t current_symbols_pos = 0; + + *pbatch_size = batch_size; + *pindices = 0; + + for(auto s: strings) { + psymbols = std::copy(s.begin(), s.end(), psymbols); + current_symbols_pos += s.length(); + *++pindices = current_symbols_pos; + } +} + + +std::vector unpack_strings (const ov::Tensor& source) { + auto strings = source.data(); + auto length = source.get_byte_size(); + // check the format of the input bitstream representing the string tensor + OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); + auto batch_size = *reinterpret_cast(strings + 0); + OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size, + "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); + auto begin_ids = reinterpret_cast(strings + 4); + auto end_ids = begin_ids + 1; + auto symbols = strings + 4 + 4 + 4 * batch_size; + + std::vector result; + result.reserve(batch_size); + for(size_t i = 0; i < batch_size; ++i) { + result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i])); + } + return result; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp index f44333553..a0d72b5fc 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp @@ -48,7 +48,7 @@ void set_ragged_string_output(ov::Node* node, size_t output_index, const ov::Par void set_ragged_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape, ov::element::Type type); -void unpack_strings (const std::string* strings, const ov::Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars); +void unpack_strings_to_tensors(const std::string* strings, const ov::Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars); void override_parameter (std::shared_ptr node, ov::element::Type type, const ov::PartialShape& shape); @@ -68,3 +68,8 @@ bool evaluate_normalization_helper ( std::function normalizer); std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name); + +template +void pack_strings (const BatchOfStrings& strings, 
ov::Tensor& destination); + +std::vector unpack_strings(const ov::Tensor& source); From 79bd05f8c2a0b500fc0737afd34316d18039adc9 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 17 Oct 2023 15:03:30 +0100 Subject: [PATCH 063/116] Move tests to tokenizer dir --- modules/custom_operations/tests/requirements.txt | 3 --- .../tokenizer/python}/tests/tokenizers_test.py | 0 2 files changed, 3 deletions(-) rename modules/custom_operations/{ => user_ie_extensions/tokenizer/python}/tests/tokenizers_test.py (100%) diff --git a/modules/custom_operations/tests/requirements.txt b/modules/custom_operations/tests/requirements.txt index 7007d5d4b..d7282db88 100644 --- a/modules/custom_operations/tests/requirements.txt +++ b/modules/custom_operations/tests/requirements.txt @@ -2,6 +2,3 @@ torch==1.13.1 # open3d==0.16.0 - need to update with new release tensorboard pytest - -# tokenizers requirements -transformers[sentencepiece] diff --git a/modules/custom_operations/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py similarity index 100% rename from modules/custom_operations/tests/tokenizers_test.py rename to modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py From 24a60b3ec4d7211c72e473f822b310b7ab16ab36 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 18 Oct 2023 12:45:51 +0100 Subject: [PATCH 064/116] Fix import --- .../tokenizer/python/ov_tokenizer/utils.py | 3 ++- .../tokenizer/python/tests/tokenizers_test.py | 21 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py index 76390d8f5..057977e19 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py @@ -5,11 +5,12 @@ import logging from typing import Dict, Optional, Sequence, Tuple, Union -from constants import LOGITS_OUTPUT_NAME, TOKEN_IDS_OUTPUT_NAME from openvino import Model from openvino.preprocess import PrePostProcessor from openvino.runtime import opset12 as opset +from .constants import LOGITS_OUTPUT_NAME, TOKEN_IDS_OUTPUT_NAME + logger = logging.getLogger(__name__) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index 87f9154d7..d538eb090 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -2,18 +2,18 @@ # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import sys + # import os # os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so" -import pytest import numpy as np +import pytest from openvino import Core from transformers import AutoTokenizer + from ov_tokenizer import ( # init_extension, convert_tokenizer, - connect_models, pack_strings, unpack_strings, ) @@ -30,8 +30,7 @@ "A lot\t w!", "A lot\t\tof whitespaces!", "\n\n\n\t\t A lot\t\tof\twhitespaces\n!\n\n\n\t\n\n", - "Eng, but with d1gits: 123; 0987654321, stop." - "0987654321 - eng, but with d1gits: 123" + "Eng, but with d1gits: 123; 0987654321, stop." 
"0987654321 - eng, but with d1gits: 123", ] multilingual_test_strings = [ "Тестовая строка!", @@ -146,7 +145,7 @@ def sentencepice_model_tokenizers(request, fast_tokenizer): *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, - ] + ], ) def test_hf_wordpiece_tokenizers_outputs(hf_and_ov_wordpiece_tokenizers, test_string): hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers @@ -165,7 +164,7 @@ def test_hf_wordpiece_tokenizers_outputs(hf_and_ov_wordpiece_tokenizers, test_st eng_test_strings, multilingual_test_strings, emoji_test_strings, - ] + ], ) def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers, test_string): hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers @@ -184,7 +183,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, - ] + ], ) def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): hf_tokenizer, ov_tokenizer, _ = sentencepice_model_tokenizers @@ -202,7 +201,7 @@ def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_strin *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, - ] + ], ) def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_string): hf_tokenizer, _, ov_detokenizer = sentencepice_model_tokenizers @@ -220,7 +219,7 @@ def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_str *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, - ] + ], ) def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): hf_tokenizer, ov_tokenizer, _ = hf_and_ov_bpe_tokenizers @@ -242,7 +241,7 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, - ] + ], ) def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): hf_tokenizer, _, ov_detokenizer = hf_and_ov_bpe_detokenizer From b22569fe7893b69c7507cc2e22c6c5a38e060826 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 18 Oct 2023 12:57:00 +0100 Subject: [PATCH 065/116] Fix imports --- .../tokenizer/python/ov_tokenizer/hf_parser.py | 2 +- .../tokenizer/python/ov_tokenizer/tokenizer_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index c46fdac70..2d3c1ba54 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -10,7 +10,7 @@ import numpy as np import openvino.runtime.opset12 as opset -from constants import ( +from .constants import ( ATTENTION_MASK_INPUT_NAME, STRING_OUTPUT_NAME, TOKEN_IDS_INPUT_NAME, diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 488a3ce34..4e0af7451 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional, Union import numpy as np -from constants import ( +from .constants import ( 
ATTENTION_MASK_INPUT_NAME, STRING_OUTPUT_NAME, TOKEN_IDS_INPUT_NAME, From 96673f5d3745d48378fee5ed19a7f49b0a566214 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 18 Oct 2023 15:20:01 +0100 Subject: [PATCH 066/116] Sort Imports --- .../tokenizer/python/ov_tokenizer/hf_parser.py | 8 ++++---- .../python/ov_tokenizer/tokenizer_pipeline.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 2d3c1ba54..0bcf535f5 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -10,6 +10,10 @@ import numpy as np import openvino.runtime.opset12 as opset +from openvino.runtime import Model, PartialShape, Type, op +from openvino.runtime.exceptions import OVTypeError +from openvino.runtime.utils.types import as_node, make_constant_node + from .constants import ( ATTENTION_MASK_INPUT_NAME, STRING_OUTPUT_NAME, @@ -18,10 +22,6 @@ TOKENIZER_DECODER_NAME, TOKENIZER_ENCODER_NAME, ) -from openvino.runtime import Model, PartialShape, Type, op -from openvino.runtime.exceptions import OVTypeError -from openvino.runtime.utils.types import as_node, make_constant_node - from .node_factory import factory from .tokenizer_pipeline import ( BPETokenizationStep, diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 4e0af7451..c1c1ad412 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -9,6 +9,11 @@ from typing import Any, Dict, List, Optional, Union import numpy as np +from openvino.runtime import Model, Output, PartialShape, Type, op +from openvino.runtime import opset12 as opset +from openvino.runtime.exceptions import OVTypeError, UserInputError +from openvino.runtime.utils.types import as_node, make_constant_node + from .constants import ( ATTENTION_MASK_INPUT_NAME, STRING_OUTPUT_NAME, @@ -17,11 +22,6 @@ TOKENIZER_DECODER_NAME, TOKENIZER_ENCODER_NAME, ) -from openvino.runtime import Model, Output, PartialShape, Type, op -from openvino.runtime import opset12 as opset -from openvino.runtime.exceptions import OVTypeError, UserInputError -from openvino.runtime.utils.types import as_node, make_constant_node - from .node_factory import factory from .str_pack import pack_string, pack_strings From 0e7ae87b2c65d092daedc9a659448353f81b9476 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 19 Oct 2023 16:44:15 +0100 Subject: [PATCH 067/116] Add Streaming Sentencepiece Decoder --- .../user_ie_extensions/ov_extension.cpp | 1 + .../python/ov_tokenizer/convert_tokenizer.py | 3 +- .../python/ov_tokenizer/hf_parser.py | 28 ++++--- .../python/ov_tokenizer/tokenizer_pipeline.py | 14 ++++ .../tokenizer/sentence_piece.cpp | 84 +++++++++++++++++++ .../tokenizer/sentence_piece.hpp | 24 ++++++ 6 files changed, 143 insertions(+), 11 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 2518f0ffd..360fc6f67 100644 --- 
a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -79,6 +79,7 @@ std::make_shared("Const", translate_const), \ std::make_shared>(), \ std::make_shared>(), \ + std::make_shared>(), \ std::make_shared("SentencepieceOp", translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), #else diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py index 402d35bb4..302077766 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -14,7 +14,7 @@ def convert_tokenizer( - tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False + tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False, streaming_decoder: bool = False ) -> Union[Model, Tuple[Model, Model]]: # todo: add support for more then 1 input if number_of_inputs > 1: @@ -32,6 +32,7 @@ def convert_tokenizer( tokenizer_object, add_attention_mask=True, with_decoder=with_decoder, + streaming_decoder=streaming_decoder, ) elif isinstance(tokenizer_object, PreTrainedTokenizerFast): logger.info("Convert Huggingface Fast tokenizer pipeline.") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 0bcf535f5..86b8469cb 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -10,7 +10,8 @@ import numpy as np import openvino.runtime.opset12 as opset -from openvino.runtime import Model, PartialShape, Type, op +from openvino import Model, PartialShape, Type +from openvino.runtime import Node, op from openvino.runtime.exceptions import OVTypeError from openvino.runtime.utils.types import as_node, make_constant_node @@ -273,7 +274,6 @@ def convert_fast_tokenizer( hf_tokenizer: "PreTrainedTokenizerBase", number_of_inputs: int = 1, with_decoder: bool = False, - greedy_decoder: bool = False, ) -> Union[Model, Tuple[Model, Model]]: pipeline = TransformersTokenizerPipelineParser(hf_tokenizer).parse(number_of_inputs=number_of_inputs) ov_tokenizer = pipeline.get_encoder_ov_subgraph() @@ -312,6 +312,7 @@ def convert_sentencepiece_model_tokenizer( hf_tokenizer: "PreTrainedTokenizerBase", add_attention_mask: bool = True, with_decoder: bool = False, + streaming_decoder: bool = False, ) -> Union[Model, Tuple[Model, Model]]: if not is_sentencepiece_model(hf_tokenizer): raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.") @@ -386,16 +387,23 @@ def convert_sentencepiece_model_tokenizer( if not with_decoder: return tokenizer_encoder - decoder_input = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) - token_ids = decoder_input + return tokenizer_encoder, get_sp_decoder(sp_model_node, streaming_decoder=streaming_decoder) + + +def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model: + token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) decoder = factory.create( - "SentencepieceDetokenizer", + "SentencepieceStreamDetokenizer" if 
streaming_decoder else "SentencepieceDetokenizer", [sp_model_node, token_ids], - ) - string_output = factory.create("StringTensorPack", decoder.outputs()).outputs() + ).outputs() + + if streaming_decoder: + decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder) + decoder = RegexDecodingStep.replace_sp_newlines().get_ov_subgraph(decoder) + + string_output = factory.create("StringTensorPack", decoder).outputs() string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) - tokenizer_decoder = Model(string_output, [decoder_input], TOKENIZER_DECODER_NAME) + tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME) tokenizer_decoder.validate_nodes_and_infer_types() - - return tokenizer_encoder, tokenizer_decoder + return tokenizer_decoder diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index c1c1ad412..469e6c7bf 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -651,6 +651,20 @@ def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep": replace_term=r"\1", ) + @classmethod + def replace_sp_spaces(cls) -> "RegexDecodingStep": + return cls( + regex_search_pattern="▁", + replace_term=" ", + ) + + @classmethod + def replace_sp_newlines(cls) -> "RegexDecodingStep": + return cls( + regex_search_pattern="<0x0A>", + replace_term="\n", + ) + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes.extend( ( diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp index 7e3c5e05a..58bf4821e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp @@ -306,3 +306,87 @@ bool SentencepieceDetokenizer::has_evaluate() const { std::shared_ptr SentencepieceDetokenizer::clone_with_new_inputs(const OutputVector& new_args) const { return std::make_shared(new_args, m_sp); } + + +// Stream Detokenizer + +SentencepieceStreamDetokenizer::SentencepieceStreamDetokenizer(const OutputVector& args) : + m_sp(std::make_shared()), Op(args) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + constructor_validate_and_infer_types(); +} + +SentencepieceStreamDetokenizer::SentencepieceStreamDetokenizer(const OutputVector& args, const std::shared_ptr& sp) : + m_sp((sp == nullptr) ? 
std::make_shared(): sp), Op(args) { + // constructor above without sp argument never called when the node is created with python factory, so need to init and cache m_sp here + if (!m_sp->status().ok()) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + }; + constructor_validate_and_infer_types(); +} + +void SentencepieceStreamDetokenizer::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 2, "SentencepieceDetokenizer expects two inputs: sp model and token ids"); + OPENVINO_ASSERT(get_input_element_type(0) == element::u8, "SentencepieceDetokenizer accepts sp model as the first input and it should be of type u8 tensor"); + OPENVINO_ASSERT(get_input_partial_shape(1).size() == 2, "SentencepieceDetokenizer expects 2D tensor as second input"); + + auto batch_size = PartialShape({get_input_partial_shape(1)[0]}); + set_string_output(this, 0, batch_size); +} + +bool SentencepieceStreamDetokenizer::visit_attributes(AttributeVisitor& visitor) { + return true; +} + +bool SentencepieceStreamDetokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + auto batch_size = inputs[1].get_shape()[0]; + auto seq_len = inputs[1].get_shape()[1]; + auto input_data = inputs[1].data(); + + outputs[0].set_shape({batch_size}); + outputs[1].set_shape({batch_size}); + outputs[2].set_shape({batch_size * seq_len * 100}); // 100 chars - max token length + + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); + uint32_t char_offset = 0; + + for(size_t batch = 0; batch < batch_size; ++batch) { + const auto start = batch * seq_len; + + begins[batch] = char_offset; + for(size_t seq = start; seq < start + seq_len; ++seq) { + const auto token_id = input_data[seq]; + const auto token = m_sp->IdToPiece(token_id); + + std::copy(token.begin(), token.end(), &chars[char_offset]); + char_offset += token.size(); + }; + ends[batch] = char_offset; + } + outputs[2].set_shape({char_offset}); + return true; +} + +bool SentencepieceStreamDetokenizer::has_evaluate() const { + return true; +} + +std::shared_ptr SentencepieceStreamDetokenizer::clone_with_new_inputs(const OutputVector& new_args) const { + return std::make_shared(new_args, m_sp); +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp index 0efd60966..fbc0394aa 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp @@ -62,4 +62,28 @@ namespace TemplateExtension { private: std::shared_ptr m_sp; }; + + + class SentencepieceStreamDetokenizer : public ov::op::Op { + public: + OPENVINO_OP("SentencepieceStreamDetokenizer"); + + SentencepieceStreamDetokenizer() = default; + SentencepieceStreamDetokenizer(const ov::OutputVector& args); + SentencepieceStreamDetokenizer(const ov::OutputVector& args, + const std::shared_ptr& sp); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const 
ov::OutputVector& new_args) const override; + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const override; + + private: + std::shared_ptr m_sp; + }; } // namespace TemplateExtension From 5ebdb1f0c7ca3d65e5b77b2a760b720bfc2c99a6 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 19 Oct 2023 17:09:19 +0100 Subject: [PATCH 068/116] Change Authors --- .../user_ie_extensions/tokenizer/python/pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml index a4ecc6a91..04463f4cf 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml @@ -4,8 +4,7 @@ version = "0.0.1" description = "Convert tokenizers into OpenVINO models" requires-python = ">=3.8" authors = [ - { name = "Artur Paniukov", email = "artur.paniukov@intel.com" }, - { name = "Sergey Lyalin", email = "sergey.lyalin@intel.com" }, + { name = "OpenVINO Developers", email = "openvino@intel.com" }, ] dependencies = [ From 6a55877b4a4551cca1aab8515c3151b2f71de6f9 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 23 Oct 2023 12:09:06 +0100 Subject: [PATCH 069/116] Update modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp Co-authored-by: Zlobin Vladimir --- .../custom_operations/user_ie_extensions/tokenizer/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp index f9cbc8c7a..9ee3e15ba 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp @@ -227,7 +227,7 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont // Pack any container with string to ov::Tensor with element type u8 // Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() // so basically any STL container with std::string is compatible -// Tensor destination will be reshaped according tha input data +// Tensor destination will be reshaped according the input data template void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) { auto batch_size = strings.size(); From 06d515900cdb9bed56a737d66ee6a29b19889fb3 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 23 Oct 2023 18:57:14 +0100 Subject: [PATCH 070/116] Configure tests --- .../tokenizer/python/tests/conftest.py | 49 +++++++++++++++++++ .../tokenizer/python/tests/pass_rates.json | 9 ++++ .../tokenizer/python/tests/tokenizers_test.py | 8 +-- 3 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py new file mode 100644 index 000000000..d8810e63f --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py @@ -0,0 +1,49 @@ +import json +import os +from math import isclose + 
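+# NOTE on pytest_sessionfinish below: it implements a soft regression gate.
+# A sketch of the arithmetic (illustrative numbers, not from any real run):
+# 3 failures out of 10 collected tests give pass_rate = 1 - 3/10 = 0.7; the
+# failed session is re-marked as OK unless 0.7 is below the rate recorded in
+# pass_rates.json for the same test-id prefix.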
+import pytest
+
+
+def prebuild_extenson_path():
+    ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
+    if not ext_path:
+        raise EnvironmentError(
+            "No extension path found in the environment. "
+            "Export the path to libuser_ov_extensions.so via the CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable."
+        )
+    return ext_path
+
+
+os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path()
+PASS_RATES_FILE = "pass_rates.json"
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_sessionfinish(session, exitstatus) -> None:
+    """
+    Fail the session if the test pass rate decreases
+    """
+    if exitstatus != pytest.ExitCode.TESTS_FAILED:
+        return
+
+    parent = os.path.commonprefix([item.nodeid for item in session.items]).strip("[]")
+
+    with open(PASS_RATES_FILE) as f:
+        previous_rates = json.load(f)
+
+    pass_rate = 1 - session.testsfailed / session.testscollected
+    previous = previous_rates.get(parent, 0)
+
+    if isclose(pass_rate, previous):
+        return
+
+    if pass_rate > previous:
+        session.exitstatus = pytest.ExitCode.OK
+        previous_rates[parent] = pass_rate
+
+        with open(PASS_RATES_FILE, "w") as f:
+            json.dump(previous_rates, f, indent=4)
+    else:
+        reporter = session.config.pluginmanager.get_plugin("terminalreporter")
+        reporter.write_line(f"Pass rate is lower! Current: {pass_rate}, previous: {previous}")
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
new file mode 100644
index 000000000..82dc40df8
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
@@ -0,0 +1,9 @@
+{
+    "tokenizers_test.py::test_hf_wordpiece_tokenizers_outputs": 0.9423076923076923,
+    "tokenizers_test.py::test_hf_wordpiece_tokenizers_multiple_strings": 0.641025641025641,
+    "tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.4,
+    "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5458333333333334,
+    "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875,
+    "tokenizers_test.py::test_bpe_detokenizer": 0.93125,
+    "tokenizers_test.py::test_": 0.7512332628611699
+}
\ No newline at end of file
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
index d538eb090..108b41e99 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
@@ -12,16 +12,12 @@
 from transformers import AutoTokenizer
 
 from ov_tokenizer import (
-    # init_extension,
     convert_tokenizer,
     pack_strings,
     unpack_strings,
 )
 
 
-# use `init_extension` function to be able to convert HF tokenizers:
-# init_extension("path/to/libuser_ov_extensions.so")
-# or alternatively:
-# set the OV_TOKENIZER_PREBUILD_EXTENSION_PATH env variable BEFORE importing ov_tokenizers
 core = Core()
 
 eng_test_strings = [
@@ -80,10 +76,10 @@
     "KoboldAI/fairseq-dense-13B",
     "facebook/galactica-120b",
     "EleutherAI/pythia-12b-deduped",
-    "Salesforce/codegen-16B-multi",
     "microsoft/deberta-base",
-    "bigscience/bloom",  # pack_strings for vocab is taking long time
+    "bigscience/bloom",
     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+    # "Salesforce/codegen-16B-multi", # Segfaults on ""A lot\t\tof whitespaces!""
     # "google/flan-t5-xxl",  # needs Precompiled/CharsMap
    # 
"jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer # "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work From fa5360d0378699726b7151c3f5106e9bf75221a1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 13:15:41 +0100 Subject: [PATCH 071/116] Skip Java Tests --- .ci/azure/linux.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 6aeaf27c1..00afba734 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -154,15 +154,15 @@ jobs: - script: ls -alR $(INSTALL_DIR) displayName: 'List install files' - - script: | - set -e - export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH} - . $(SETUPVARS) gradle clean build --info - for d in CPU HETERO:CPU; do - gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; - done - workingDirectory: $(REPO_DIR)/modules/java_api - displayName: 'Java tests' +# - script: | +# set -e +# export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH} +# . $(SETUPVARS) gradle clean build --info +# for d in CPU HETERO:CPU; do +# gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; +# done +# workingDirectory: $(REPO_DIR)/modules/java_api +# displayName: 'Java tests' - script: | python3 -m pip install --user virtualenv From e855193b6d7dc46e7ffeb4cfa5ba5205c10e2130 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 13:47:53 +0100 Subject: [PATCH 072/116] Add Regression Test --- .ci/azure/linux.yml | 8 ++++++++ .../user_ie_extensions/tokenizer/python/tests/conftest.py | 4 +++- .../tokenizer/python/tests/pass_rates.json | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 00afba734..c696add5f 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -170,6 +170,7 @@ jobs: source .env3/bin/activate python -m pip install --upgrade pip python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt + python -m pip install -r $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all] cd ${OPENVINO_REPO_DIR}/tools && python -m pip install mo/ workingDirectory: $(WORK_DIR) displayName: 'Create user custom operations env' @@ -181,3 +182,10 @@ jobs: python -m pytest -k "not sparse_conv" tests/run_tests.py workingDirectory: $(REPO_DIR)/modules/custom_operations displayName: 'Custom user operation tests' + + - script: | + . 
$(SETUPVARS) + source $(WORK_DIR)/.env3/bin/activate + python -m pytest user_ie_extensions/tokenizer/python/tests/tokenizers_test.py + workingDirectory: $(WORK_DIR) + displayName: 'Tokenizers extension regression test' diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py index d8810e63f..918684c80 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py @@ -1,6 +1,7 @@ import json import os from math import isclose +from pathlib import Path import pytest @@ -16,7 +17,7 @@ def prebuild_extenson_path(): os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path() -PASS_RATES_FILE = "pass_rates.json" +PASS_RATES_FILE = Path(__file__).parent / "pass_rates.json" @pytest.hookimpl(trylast=True) @@ -36,6 +37,7 @@ def pytest_sessionfinish(session, exitstatus) -> None: previous = previous_rates.get(parent, 0) if isclose(pass_rate, previous): + session.exitstatus = pytest.ExitCode.OK return if pass_rate > previous: diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index 82dc40df8..eae786455 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -5,5 +5,6 @@ "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5458333333333334, "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, "tokenizers_test.py::test_bpe_detokenizer": 0.93125, - "tokenizers_test.py::test_": 0.7512332628611699 + "tokenizers_test.py::test_": 0.7512332628611699, + "user_ie_extensions/tokenizer/python/tests/tokenizers_test.py::test_": 0.7512332628611699 } \ No newline at end of file From d495d3ba5967a38f1d1cb52f5edd122ed96fc169 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 13:50:22 +0100 Subject: [PATCH 073/116] Skip traceback --- .ci/azure/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index c696add5f..3d760cb5b 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -186,6 +186,6 @@ jobs: - script: | . 
$(SETUPVARS) source $(WORK_DIR)/.env3/bin/activate - python -m pytest user_ie_extensions/tokenizer/python/tests/tokenizers_test.py + python -m pytest --tb=no user_ie_extensions/tokenizer/python/tests/tokenizers_test.py workingDirectory: $(WORK_DIR) displayName: 'Tokenizers extension regression test' From d7bebd0bab5de5f8b5e922fa8c3a299f7676530e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 14:45:57 +0100 Subject: [PATCH 074/116] Add Win64 Fast Tokenizer lib --- .../tokenizer/CMakeLists.txt | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 1a8edc33e..910b340e8 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -20,11 +20,20 @@ FetchContent_Declare( URL_HASH SHA256=a7c105aca0131b4a899155a6c44ea9728e63514edaa8d71fa92e7a5de53b6ca0 ) -FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz - URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc -) +if(WIN32) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip + URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d + ) +else() + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz + URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc + ) +endif() + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") set(cxx_flags "-Wno-undef") From b2e35eda96903f38c72cd6a6ffdb5a227a072851 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 14:48:56 +0100 Subject: [PATCH 075/116] Fix WorkingDir --- .ci/azure/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 3d760cb5b..05a372095 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -187,5 +187,5 @@ jobs: . $(SETUPVARS) source $(WORK_DIR)/.env3/bin/activate python -m pytest --tb=no user_ie_extensions/tokenizer/python/tests/tokenizers_test.py - workingDirectory: $(WORK_DIR) + workingDirectory: $(REPO_DIR)/modules/custom_operations displayName: 'Tokenizers extension regression test' From f81bd18b63b463bfc9b78092624d48c1a0836d4f Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 16:07:37 +0100 Subject: [PATCH 076/116] Return TB --- .ci/azure/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 05a372095..8c6ef37c1 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -186,6 +186,6 @@ jobs: - script: | . 
$(SETUPVARS) source $(WORK_DIR)/.env3/bin/activate - python -m pytest --tb=no user_ie_extensions/tokenizer/python/tests/tokenizers_test.py + python -m pytest user_ie_extensions/tokenizer/python/tests/tokenizers_test.py workingDirectory: $(REPO_DIR)/modules/custom_operations displayName: 'Tokenizers extension regression test' From 0bd23b5df72201b43288b3d3a5515caff14526ca Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 17:12:13 +0100 Subject: [PATCH 077/116] Fix dependencies install --- .ci/azure/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 8c6ef37c1..b10a8d3fc 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -170,7 +170,7 @@ jobs: source .env3/bin/activate python -m pip install --upgrade pip python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt - python -m pip install -r $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all] + python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all] cd ${OPENVINO_REPO_DIR}/tools && python -m pip install mo/ workingDirectory: $(WORK_DIR) displayName: 'Create user custom operations env' From 12ac9f835506a55575967a9882e894985b3cf494 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 24 Oct 2023 19:33:28 +0100 Subject: [PATCH 078/116] Add byte tokens handling for sentencepiece --- .ci/azure/linux.yml | 2 +- .../tokenizer/python/ov_tokenizer/hf_parser.py | 1 - .../user_ie_extensions/tokenizer/sentence_piece.cpp | 11 +++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index b10a8d3fc..7365a849d 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -186,6 +186,6 @@ jobs: - script: | . 
$(SETUPVARS) source $(WORK_DIR)/.env3/bin/activate - python -m pytest user_ie_extensions/tokenizer/python/tests/tokenizers_test.py + python -m pytest --tb=no user_ie_extensions/tokenizer/python/tests/tokenizers_test.py workingDirectory: $(REPO_DIR)/modules/custom_operations displayName: 'Tokenizers extension regression test' diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 86b8469cb..c18eaeded 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -400,7 +400,6 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode if streaming_decoder: decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder) - decoder = RegexDecodingStep.replace_sp_newlines().get_ov_subgraph(decoder) string_output = factory.create("StringTensorPack", decoder).outputs() string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp index 58bf4821e..c46472a2d 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp @@ -5,6 +5,7 @@ #include #include "normalizer.h" +#include "model_interface.h" #include "openvino/op/util/framework_node.hpp" #include "openvino/opsets/opset10.hpp" @@ -374,8 +375,14 @@ bool SentencepieceStreamDetokenizer::evaluate(TensorVector& outputs, const Tenso const auto token_id = input_data[seq]; const auto token = m_sp->IdToPiece(token_id); - std::copy(token.begin(), token.end(), &chars[char_offset]); - char_offset += token.size(); + if(token.rfind("<") == 0 && token.rfind(">") == 5) { + // convert "byte tokens" into bytes + int ch = sentencepiece::PieceToByte(token); + chars[char_offset++] = ch; + } else { + std::copy(token.begin(), token.end(), &chars[char_offset]); + char_offset += token.size(); + }; }; ends[batch] = char_offset; } From 9e6ae6f8bc642a6e049900f343f7ee278fb123b3 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 25 Oct 2023 12:25:51 +0100 Subject: [PATCH 079/116] Drop black, use ruff format instead --- .../user_ie_extensions/tokenizer/python/pyproject.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml index 04463f4cf..5cad40674 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml @@ -14,7 +14,6 @@ dependencies = [ [project.optional-dependencies] dev = [ - "black", "ruff", "pytest", ] @@ -25,12 +24,6 @@ all = [ "ov_tokenizer[dev,transformers]" ] - -[tool.black] -line-length = 119 -target-versions = ["py38", "py39", "py310", "py311", "py312"] - - [tool.ruff] ignore = ["C901", "E501", "E741", "W605"] select = ["C", "E", "F", "I", "W"] From f5d2d4caa2775809e27e0e7ff11efead496a600e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 26 Oct 2023 11:18:59 +0100 Subject: [PATCH 080/116] Temp remove tokenizers from windows CI --- .ci/azure/windows.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 4abffac6d..8e05b089a 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -128,6 +128,7 @@ jobs: -DENABLE_PYTHON=ON ^ -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" ^ -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" ^ + -DCUSTOM_OPERATIONS="calculate_grid;complex_mul;fft;sparse_conv_transpose;sparse_conv" ^ $(OPENVINO_REPO_DIR) workingDirectory: $(BUILD_DIR) displayName: 'CMake OpenVINO Contrib' From cf039b925de9ae73dee131a84a2102a68a512b81 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 26 Oct 2023 12:52:30 +0100 Subject: [PATCH 081/116] CI check --- .ci/azure/windows.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 8e05b089a..c3fc72246 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -161,6 +161,7 @@ jobs: - script: | call C:\tools\opencv\build\setup_vars_opencv4.cmd call $(SETUPVARS) + python -m pip list python -m pytest -k "not sparse_conv" tests\run_tests.py workingDirectory: $(REPO_DIR)\modules\custom_operations displayName: 'Custom user operation tests' From 795306d37bdd7958afb81c8d8d78c55d91cd2c48 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 28 Oct 2023 16:15:55 +0400 Subject: [PATCH 082/116] Compile fast_tokenizers from source code --- .../user_ie_extensions/CMakeLists.txt | 4 +- .../tokenizer/CMakeLists.txt | 42 +++++++++---------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt index 6c6a9fcbe..97d332ae7 100644 --- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt @@ -4,7 +4,9 @@ set(TARGET_NAME "user_ov_extensions") -set(CMAKE_CXX_STANDARD 11) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) +endif() find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB COMPONENTS tbb tbbmalloc) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 910b340e8..eeb642d04 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -16,24 +16,24 @@ include(FetchContent) FetchContent_Declare( sentencepiece - URL https://github.com/google/sentencepiece/archive/87721596842ab099c603b23357d948906813e853.tar.gz - URL_HASH SHA256=a7c105aca0131b4a899155a6c44ea9728e63514edaa8d71fa92e7a5de53b6ca0 + URL https://github.com/google/sentencepiece/archive/refs/tags/v0.1.99.tar.gz + URL_HASH SHA256=63617eaf56c7a3857597dcd8780461f57dd21381b56a27716ef7d7e02e14ced4 ) -if(WIN32) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip - URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d - ) -else() - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz - URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc - ) -endif() +FetchContent_Declare( + fast_tokenizer + URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz + URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a +) +FetchContent_GetProperties(fast_tokenizer) +if(NOT fast_tokenizer_POPULATED) + FetchContent_Populate( 
+ fast_tokenizer + ) + set(WITH_PYTHON OFF CACHE BOOL "Disable Python API for fast_tokenizer") + add_subdirectory(${fast_tokenizer_SOURCE_DIR}/fast_tokenizer ${CMAKE_CURRENT_BINARY_DIR}/fast_tokenizer) +endif() if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") set(cxx_flags "-Wno-undef") @@ -51,26 +51,26 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}") FetchContent_MakeAvailable(sentencepiece) -FetchContent_MakeAvailable(fast_tokenizer) - -include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake") # set include dirs for specific source files target_include_directories(${TARGET_NAME} PRIVATE + # sentensepiece "${sentencepiece_SOURCE_DIR}/src/builtin_pb" "${sentencepiece_SOURCE_DIR}/src" "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite" "${sentencepiece_SOURCE_DIR}" "${sentencepiece_SOURCE_DIR}" "${sentencepiece_BINARY_DIR}" - "${FAST_TOKENIZER_INCS}") - + # fast_tokenizer + "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer" + "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/" + "${CMAKE_BINARY_DIR}/third_party/install/re2/include/") if(CMAKE_CL_64) target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) endif() -target_link_libraries(${TARGET_NAME} PRIVATE sentencepiece-static ${FAST_TOKENIZER_LIBS}) +target_link_libraries(${TARGET_NAME} PRIVATE core_tokenizers sentencepiece-static) # string_view is used from cxx17 string(REPLACE " " ";" cxx_flags "${cxx_flags}") From 9c200c2d09d139860e974296ddee130d989dae60 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 30 Oct 2023 15:27:20 +0400 Subject: [PATCH 083/116] Export pack_strings() and unpack_strings() --- .../tokenizer/CMakeLists.txt | 1 + .../user_ie_extensions/tokenizer/utils.cpp | 35 ------------------- .../user_ie_extensions/tokenizer/utils.hpp | 32 ++++++++++++++++- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 910b340e8..d2790e155 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -65,6 +65,7 @@ target_include_directories(${TARGET_NAME} PRIVATE "${sentencepiece_BINARY_DIR}" "${FAST_TOKENIZER_INCS}") +target_include_directories(${TARGET_NAME} PUBLIC .) 
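+# NOTE: PUBLIC is assumed to be the point of this change, matching the commit
+# intent of exporting pack_strings()/unpack_strings(): targets that link
+# ${TARGET_NAME} inherit this include directory, so a consumer can simply
+# #include "utils.hpp" and call pack_strings()/unpack_strings() on packed
+# u8 string tensors.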
if(CMAKE_CL_64) target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp index 9ee3e15ba..199cd314e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp @@ -223,41 +223,6 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont #endif } - -// Pack any container with string to ov::Tensor with element type u8 -// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() -// so basically any STL container with std::string is compatible -// Tensor destination will be reshaped according the input data -template -void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) { - auto batch_size = strings.size(); - - // First run over all elements: calculate total memory required to hold all strings - auto symbols_size = std::accumulate( - strings.begin(), strings.end(), size_t(0), - [](size_t accum, typename BatchOfStrings::const_reference s) - { return accum + s.length(); }); - - auto total_size = 4*(1 + 1 + batch_size) + symbols_size; - destination.set_shape({total_size}); - - auto data = destination.data(); - auto pbatch_size = reinterpret_cast(data); - auto pindices = pbatch_size + 1; - auto psymbols = reinterpret_cast(pindices + 1 + batch_size); - size_t current_symbols_pos = 0; - - *pbatch_size = batch_size; - *pindices = 0; - - for(auto s: strings) { - psymbols = std::copy(s.begin(), s.end(), psymbols); - current_symbols_pos += s.length(); - *++pindices = current_symbols_pos; - } -} - - std::vector unpack_strings (const ov::Tensor& source) { auto strings = source.data(); auto length = source.get_byte_size(); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp index a0d72b5fc..32bb34110 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp @@ -69,7 +69,37 @@ bool evaluate_normalization_helper ( std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name); +// Pack any container with string to ov::Tensor with element type u8 +// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() +// so basically any STL container with std::string is compatible +// Tensor destination will be reshaped according the input data template -void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination); +void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) { + auto batch_size = strings.size(); + + // First run over all elements: calculate total memory required to hold all strings + auto symbols_size = std::accumulate( + strings.begin(), strings.end(), size_t(0), + [](size_t accum, typename BatchOfStrings::const_reference s) + { return accum + s.length(); }); + + auto total_size = 4*(1 + 1 + batch_size) + symbols_size; + destination.set_shape({total_size}); + + auto data = destination.data(); + auto pbatch_size = reinterpret_cast(data); + auto pindices = pbatch_size + 1; + auto psymbols = reinterpret_cast(pindices + 1 + batch_size); + size_t current_symbols_pos = 0; + + *pbatch_size 
= batch_size;
+    *pindices = 0;
+
+    for(auto s: strings) {
+        psymbols = std::copy(s.begin(), s.end(), psymbols);
+        current_symbols_pos += s.length();
+        *++pindices = current_symbols_pos;
+    }
+}
 
 std::vector<std::string> unpack_strings(const ov::Tensor& source);

From f23e59bfdc10cdf35675656d07f698f58e6a8cff Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Tue, 31 Oct 2023 16:19:14 +0000
Subject: [PATCH 084/116] Build tokenizer target on windows

---
 .ci/azure/windows.yml                                     | 2 --
 .../user_ie_extensions/tokenizer/python/tests/conftest.py | 4 +++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml
index c3fc72246..4abffac6d 100644
--- a/.ci/azure/windows.yml
+++ b/.ci/azure/windows.yml
@@ -128,7 +128,6 @@ jobs:
       -DENABLE_PYTHON=ON ^
       -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" ^
       -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" ^
-      -DCUSTOM_OPERATIONS="calculate_grid;complex_mul;fft;sparse_conv_transpose;sparse_conv" ^
       $(OPENVINO_REPO_DIR)
     workingDirectory: $(BUILD_DIR)
     displayName: 'CMake OpenVINO Contrib'
@@ -161,7 +160,6 @@ jobs:
   - script: |
       call C:\tools\opencv\build\setup_vars_opencv4.cmd
       call $(SETUPVARS)
-      python -m pip list
       python -m pytest -k "not sparse_conv" tests\run_tests.py
     workingDirectory: $(REPO_DIR)\modules\custom_operations
     displayName: 'Custom user operation tests'
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
index 918684c80..054388410 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
@@ -36,16 +36,18 @@ def pytest_sessionfinish(session, exitstatus) -> None:
     pass_rate = 1 - session.testsfailed / session.testscollected
     previous = previous_rates.get(parent, 0)
+    reporter = session.config.pluginmanager.get_plugin("terminalreporter")
 
     if isclose(pass_rate, previous):
         session.exitstatus = pytest.ExitCode.OK
+        reporter.write_line(f"New pass rate isclose to previous: {pass_rate}")
         return
 
     if pass_rate > previous:
+        reporter.write_line(f"New pass rate {pass_rate} is bigger than previous: {previous}")
         session.exitstatus = pytest.ExitCode.OK
         previous_rates[parent] = pass_rate
 
         with open(PASS_RATES_FILE, "w") as f:
             json.dump(previous_rates, f, indent=4)
     else:
-        reporter = session.config.pluginmanager.get_plugin("terminalreporter")
        reporter.write_line(f"Pass rate is lower! 
Current: {pass_rate}, previous: {previous}") From ce25397662946d8048c8db3c942dd38232507007 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 3 Nov 2023 19:58:02 +0000 Subject: [PATCH 085/116] Add icu4c patch --- .../user_ie_extensions/tokenizer/CMakeLists.txt | 1 + .../user_ie_extensions/tokenizer/icu4c.patch | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index eeb642d04..5b13dcabc 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -24,6 +24,7 @@ FetchContent_Declare( fast_tokenizer URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a + PATCH_COMMAND git apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/icu4c.patch" ) FetchContent_GetProperties(fast_tokenizer) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch b/modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch new file mode 100644 index 000000000..2c7d9e82a --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch @@ -0,0 +1,13 @@ +diff --git a/fast_tokenizer/cmake/external/icu.cmake b/fast_tokenizer/cmake/external/icu.cmake +index cd604d38..6be44bdb 100644 +--- a/fast_tokenizer/cmake/external/icu.cmake ++++ b/fast_tokenizer/cmake/external/icu.cmake +@@ -113,7 +113,7 @@ ExternalProject_Add( + GIT_PROGRESS 1 + PREFIX ${ICU_PREFIX_DIR} + UPDATE_COMMAND "" +- CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux/gcc" --enable-static --disable-shared --enable-rpath ++ CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux/gcc" --enable-static --enable-rpath + BUILD_COMMAND make -j4 + INSTALL_COMMAND make install prefix="" DESTDIR=${ICU_INSTALL_DIR} install + BUILD_BYPRODUCTS ${ICU_LIBRARIES} From d46f594800a8bf79c407644222b0e6203eb9512c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 8 Nov 2023 20:37:13 +0800 Subject: [PATCH 086/116] Added include dir to nlohmann headers --- .../user_ie_extensions/tokenizer/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 5b13dcabc..0d0cb7053 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -65,6 +65,7 @@ target_include_directories(${TARGET_NAME} PRIVATE # fast_tokenizer "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer" "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/" + "${CMAKE_BINARY_DIR}/third_party/json/src/extern_json/single_include/" "${CMAKE_BINARY_DIR}/third_party/install/re2/include/") if(CMAKE_CL_64) From 6f213ab7a8253611b5458ee9ae7dfe9448d5325b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 8 Nov 2023 16:53:18 +0400 Subject: [PATCH 087/116] Fixed compilation on ubuntu 18.04 arm64 --- .../user_ie_extensions/tokenizer/vocab_decoder.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp 
b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
index 310bd99fa..1d173abce 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
@@ -33,8 +33,7 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern");

     for(size_t id = 0; id < vocab_size; ++id) {
-        std::vector token = std::vector(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]);
-        vocab[id] = token;
+        vocab[id] = std::vector(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]);
     }
     // Set output shapes
     outputs[0].set_shape({batch_size});

From 6ed52e471541f97e4fd7c4cdae79c3fca1fb9167 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Wed, 8 Nov 2023 17:22:17 +0400
Subject: [PATCH 088/116] Fixed Windows

---
 .../user_ie_extensions/tokenizer/string_tensor_unpack.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
index ec9e22bb5..bd1320f44 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp
@@ -25,15 +25,18 @@ void StringTensorUnpack::validate_and_infer_types() {
     // - when string tensor is passed and we are after the hack in CPU (element::u8) and
     // - when string tensor is not really used, and we expect a packed string tensor in this case (element::u8)

-    OPENVINO_ASSERT(
 #if OPENVINO_ELEMENT_STRING_SUPPORTED
+    OPENVINO_ASSERT(
         get_input_element_type(0) == element::string ||
+        get_input_element_type(0) == element::dynamic,
+        "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation or when element::string is not supported");
 #endif
 #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS
+    OPENVINO_ASSERT(
         get_input_element_type(0) == element::u8 ||
-#endif
         get_input_element_type(0) == element::dynamic,
         "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation or when element::string is not supported");
+#endif

 #if OPENVINO_ELEMENT_STRING_SUPPORTED
     if(get_input_element_type(0) == element::string) {

From 52bfe5a59fcd3fe5ba00b74f72643ea2879e8e22 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Wed, 8 Nov 2023 21:30:47 +0400
Subject: [PATCH 089/116] Supported prebuilt Fast Tokenizers on all platforms

---
 .../user_ie_extensions/CMakeLists.txt         |   3 +
 .../user_ie_extensions/cmake/platforms.cmake  |  89 +++++++++++++++
 .../tokenizer/CMakeLists.txt                  | 103 ++++++++++++++----
 3 files changed, 173 insertions(+), 22 deletions(-)
 create mode 100644 modules/custom_operations/user_ie_extensions/cmake/platforms.cmake

diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
index 97d332ae7..88a9266a5 100644
--- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -8,6 +8,8 @@ if(NOT CMAKE_CXX_STANDARD)
     set(CMAKE_CXX_STANDARD 11)
 endif()

+include(cmake/platforms.cmake)
+
 find_package(OpenVINO REQUIRED
COMPONENTS Runtime) find_package(TBB COMPONENTS tbb tbbmalloc) find_package(OpenCV COMPONENTS core) @@ -21,6 +23,7 @@ set(OP_REQ_TBB "complex_mul" "fft") if(NOT CUSTOM_OPERATIONS) file(GLOB op_src "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") file(GLOB op_dirs LIST_DIRECTORIES true "${CMAKE_CURRENT_SOURCE_DIR}/*") + list(REMOVE_ITEM op_dirs "${CMAKE_CURRENT_SOURCE_DIR}/cmake") foreach(op IN LISTS op_src) get_filename_component(op_name ${op} NAME_WE) diff --git a/modules/custom_operations/user_ie_extensions/cmake/platforms.cmake b/modules/custom_operations/user_ie_extensions/cmake/platforms.cmake new file mode 100644 index 000000000..67c7f3c82 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/cmake/platforms.cmake @@ -0,0 +1,89 @@ + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +if(CMAKE_CL_64) + set(MSVC64 ON) +endif() + +if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine + OUTPUT_VARIABLE OPENVINO_GCC_TARGET_MACHINE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(OPENVINO_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + set(MINGW64 ON) + endif() +endif() + +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(OV_HOST_ARCH X86_64) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") + set(OV_HOST_ARCH X86) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + set(OV_HOST_ARCH AARCH64) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(OV_HOST_ARCH ARM) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^riscv64$") + set(OV_HOST_ARCH RISCV64) +endif() + +macro(_ov_user_ext_detect_arch_by_processor_type) + if(CMAKE_OSX_ARCHITECTURES AND APPLE) + if(CMAKE_OSX_ARCHITECTURES STREQUAL "arm64") + set(OV_ARCH AARCH64) + elseif(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(OV_ARCH X86_64) + elseif(CMAKE_OSX_ARCHITECTURES MATCHES ".*x86_64.*" AND CMAKE_OSX_ARCHITECTURES MATCHES ".*arm64.*") + set(OV_ARCH UNIVERSAL2) + else() + message(FATAL_ERROR "Unsupported value: CMAKE_OSX_ARCHITECTURES = ${CMAKE_OSX_ARCHITECTURES}") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(OV_ARCH X86_64) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*|wasm") + set(OV_ARCH X86) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*|armv8)") + set(OV_ARCH AARCH64) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(OV_ARCH ARM) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64$") + set(OV_ARCH RISCV64) + endif() +endmacro() + +macro(_ov_user_ext_process_msvc_generator_platform) + # if cmake -A is passed + if(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") + set(OV_ARCH AARCH64) + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") + set(OV_ARCH ARM) + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "x64") + set(OV_ARCH X86_64) + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "Win32") + set(OV_ARCH X86) + else() + _ov_user_ext_detect_arch_by_processor_type() + endif() +endmacro() + +if(MSVC64 OR MINGW64) + _ov_user_ext_process_msvc_generator_platform() +elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) + _ov_user_ext_process_msvc_generator_platform() +else() + _ov_user_ext_detect_arch_by_processor_type() +endif() + +set(HOST_${OV_HOST_ARCH} ON) +set(${OV_ARCH} ON) + +unset(OV_ARCH) + +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(EMSCRIPTEN ON) +endif() + +if(UNIX AND NOT (APPLE OR ANDROID OR EMSCRIPTEN OR CYGWIN)) + set(LINUX ON) +endif() diff 
--git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 0d0cb7053..ee443916e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -6,11 +6,14 @@ if(POLICY CMP0135) cmake_policy(SET CMP0135 NEW) endif() -include(CheckCXXCompilerFlag) +option(BUILD_FAST_TOKENIZERS OFF) # to build only sentencepiece-static target -set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - PROPERTY EXCLUDE_FROM_ALL ON) +set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL ON) + +# +# Dependencies +# include(FetchContent) @@ -19,23 +22,80 @@ FetchContent_Declare( URL https://github.com/google/sentencepiece/archive/refs/tags/v0.1.99.tar.gz URL_HASH SHA256=63617eaf56c7a3857597dcd8780461f57dd21381b56a27716ef7d7e02e14ced4 ) +FetchContent_MakeAvailable(sentencepiece) -FetchContent_Declare( - fast_tokenizer - URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz - URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a - PATCH_COMMAND git apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/icu4c.patch" -) - -FetchContent_GetProperties(fast_tokenizer) -if(NOT fast_tokenizer_POPULATED) - FetchContent_Populate( +if(BUILD_FAST_TOKENIZERS) + FetchContent_Declare( fast_tokenizer + URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz + URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a + PATCH_COMMAND git apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/icu4c.patch" ) - set(WITH_PYTHON OFF CACHE BOOL "Disable Python API for fast_tokenizer") - add_subdirectory(${fast_tokenizer_SOURCE_DIR}/fast_tokenizer ${CMAKE_CURRENT_BINARY_DIR}/fast_tokenizer) + + FetchContent_GetProperties(fast_tokenizer) + if(NOT fast_tokenizer_POPULATED) + FetchContent_Populate( + fast_tokenizer + ) + set(WITH_PYTHON OFF CACHE BOOL "Disable Python API for fast_tokenizer") + add_subdirectory(${fast_tokenizer_SOURCE_DIR}/fast_tokenizer ${CMAKE_CURRENT_BINARY_DIR}/fast_tokenizer) + endif() + + # variables used later + set(FAST_TOKENIZER_INCS + "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer" + "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/" + "${CMAKE_BINARY_DIR}/third_party/json/src/extern_json/single_include/" + "${CMAKE_BINARY_DIR}/third_party/install/re2/include/") + set(FAST_TOKENIZER_LIBS core_tokenizers) +else() + if(WIN32 AND X86_64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip + URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d + ) + elseif(LINUX AND X86_64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz + URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc + ) + elseif(LINUX AND AARCH64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-1.0.2.tgz + URL_HASH SHA256=fc16c51b24a954ae3d659e1b233ce15349eafc1e4c72710b51a4f12fb2c03033 + ) + elseif(APPLE AND X86_64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz + URL_HASH SHA256=5ef2e389cee985b5cef5ebf8d375704cf63030d8ec66a0b5c7bcd8771a250109 + ) + elseif(APPLE AND AARCH64) + 
FetchContent_Declare(
+            fast_tokenizer
+            URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-1.0.2.tgz
+            URL_HASH SHA256=ffb0f16ec96b2f5dbdb681d00d74e932e273ec1c2108196d13f2fd28abc4d266
+        )
+    else()
+        message(FATAL_ERROR "Platform ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR} does not have prebuilt Fast Tokenizer. "
+                            "Please use the -DBUILD_FAST_TOKENIZERS=ON cmake option to enable build from sources")
+    endif()
+    FetchContent_MakeAvailable(fast_tokenizer)
+    include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake")
+
+    # we use re2 library in regex_normalization operation, so have to add to this list
+    if(WIN32 AND X86_64)
+        list(APPEND FAST_TOKENIZER_LIBS "${CMAKE_BINARY_DIR}/third_party/install/re2/lib/re2.lib")
+    endif()
 endif()

+#
+# Compile flags
+#
+
 if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
     set(cxx_flags "-Wno-undef")
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
@@ -44,6 +104,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     set(cxx_flags "/wd4244 /wd4267")
 endif()

+include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-Wsuggest-override" SUGGEST_OVERRIDE_SUPPORTED)
 if(SUGGEST_OVERRIDE_SUPPORTED)
     set(cxx_flags "${cxx_flags} -Wno-suggest-override")
@@ -51,7 +112,9 @@ endif()

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}")

-FetchContent_MakeAvailable(sentencepiece)
+#
+# Target include dirs, link libraries and other properties
+#

 # set include dirs for specific source files
 target_include_directories(${TARGET_NAME} PRIVATE
@@ -60,19 +123,15 @@ target_include_directories(${TARGET_NAME} PRIVATE
     "${sentencepiece_SOURCE_DIR}/src"
     "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite"
     "${sentencepiece_SOURCE_DIR}"
-    "${sentencepiece_SOURCE_DIR}"
     "${sentencepiece_BINARY_DIR}"
     # fast_tokenizer
-    "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer"
-    "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/"
-    "${CMAKE_BINARY_DIR}/third_party/json/src/extern_json/single_include/"
-    "${CMAKE_BINARY_DIR}/third_party/install/re2/include/")
+    ${FAST_TOKENIZER_INCS})

 if(CMAKE_CL_64)
     target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS)
 endif()

-target_link_libraries(${TARGET_NAME} PRIVATE core_tokenizers sentencepiece-static)
+target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} sentencepiece-static)

 # string_view is used from cxx17
 string(REPLACE " " ";" cxx_flags "${cxx_flags}")

From cc663dcddcadeb7bcb20df853617d66b2c34388e Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Thu, 9 Nov 2023 17:13:35 +0000
Subject: [PATCH 090/116] Add tiktoken support WIP

---
 .../python/ov_tokenizer/convert_tokenizer.py  | 16 ++++-
 .../python/ov_tokenizer/hf_parser.py          | 29 ++++++++
 .../python/ov_tokenizer/tiktoken_parser.py    | 67 +++++++++++++++++++
 .../python/ov_tokenizer/tokenizer_pipeline.py | 14 ++++
 .../tokenizer/python/pyproject.toml           |  7 +-
 .../tokenizer/python/tests/pass_rates.json    |  2 +-
 .../tokenizer/python/tests/tokenizers_test.py | 37 ++++++++--
 7 files changed, 164 insertions(+), 8 deletions(-)
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
index 302077766..cb9d66c2c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
+++
b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -23,7 +23,13 @@ def convert_tokenizer( if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast - from .hf_parser import convert_fast_tokenizer, convert_sentencepiece_model_tokenizer, is_sentencepiece_model + from .hf_parser import ( + convert_fast_tokenizer, + convert_sentencepiece_model_tokenizer, + convert_tiktoken_model_tokenizer, + is_sentencepiece_model, + is_tiktoken_model, + ) if isinstance(tokenizer_object, PreTrainedTokenizerBase): if is_sentencepiece_model(tokenizer_object): @@ -34,6 +40,14 @@ def convert_tokenizer( with_decoder=with_decoder, streaming_decoder=streaming_decoder, ) + elif is_tiktoken_model(tokenizer_object): + logger.info("Convert tiktoken-based tokenizer") + return convert_tiktoken_model_tokenizer( + tokenizer_object, + add_attention_mask=True, + with_decoder=with_decoder, + streaming_decoder=streaming_decoder, + ) elif isinstance(tokenizer_object, PreTrainedTokenizerFast): logger.info("Convert Huggingface Fast tokenizer pipeline.") return convert_fast_tokenizer( diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index c18eaeded..ccf459f2e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -406,3 +406,32 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME) tokenizer_decoder.validate_nodes_and_infer_types() return tokenizer_decoder + + +def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: + return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken") + + +def convert_tiktoken_model_tokenizer( + hf_tokenizer: "PreTrainedTokenizerBase", + add_attention_mask: bool = True, + with_decoder: bool = False, + streaming_decoder: bool = False, +) -> Union[Model, Tuple[Model, Model]]: + encoding = hf_tokenizer.tokenizer + split_pattern = encoding._pat_str + + pipeline = TokenizerPipeline() + pipeline.add_steps( + [ + NormalizeUnicode("NFC"), + RegexSplitStep(split_pattern), + BPETokenizationStep.from_tiktoken_encoding(encoding), + TruncationStep( + max_length=hf_tokenizer.model_max_length, truncate_right=(hf_tokenizer.truncation_side == "right") + ), + PaddingStep(pad_right=(hf_tokenizer.padding_side == "right")), + ] + ) + + return pipeline.get_encoder_ov_subgraph() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py new file mode 100644 index 000000000..a9f2cc78b --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py @@ -0,0 +1,67 @@ +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from tiktoken import Encoding + + +# from transformers.models.gpt2.tokenization_gpt2 +@lru_cache() +def bytes_to_unicode() -> Dict[bytes, str]: + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 
+ n) + n += 1 + cs = (chr(n) for n in cs) + return dict(zip(bs, cs)) + + +# https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee +byte_encoder = bytes_to_unicode() + + +def token_bytes_to_string(b: bytes) -> str: + return "".join(byte_encoder[ord(char)] for char in b.decode("latin-1")) + + +def bpe(mergeable_ranks: Dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> List[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :] + return parts + + +def generate_vocab_and_merges(encoding: Encoding) -> Tuple[Dict[str, int], List[str]]: + mergeable_ranks = encoding._mergeable_ranks + + merges = [] + vocab = {} + for token, rank in mergeable_ranks.items(): + vocab[token_bytes_to_string(token)] = rank + + if len(token) == 1: + continue + merged = tuple(bpe(mergeable_ranks, token, max_rank=rank)) + assert len(merged) == 2 + + merges.append(" ".join(map(token_bytes_to_string, merged))) + + # Also add special tokens + vocab.update(encoding._special_tokens) + + return vocab, merges diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 469e6c7bf..8b7c476c9 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -338,6 +338,20 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep": merges=tokenizer_json["model"]["merges"], ) + @classmethod + def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": # noqa + from .tiktoken_parser import generate_vocab_and_merges + + vocab, merges = generate_vocab_and_merges(encoding) + return cls( + unk_token="", + fuse_unk=False, + suffix_indicator="", + end_suffix="", + vocab=[token for token, idx in sorted(vocab.items(), key=lambda x: x[1])], + merges=merges, + ) + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: pipeline = self.get_pipeline() pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml index 5cad40674..b89e14158 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml @@ -18,10 +18,13 @@ dev = [ "pytest", ] transformers = [ - "transformers[sentencepiece]" + "transformers[sentencepiece,tiktoken]" +] +tiktoken = [ + "tiktoken" ] all = [ - "ov_tokenizer[dev,transformers]" + "ov_tokenizer[dev,transformers,tiktoken]" ] [tool.ruff] diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index eae786455..1840a37cf 100644 --- 
a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
@@ -5,6 +5,6 @@
     "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5458333333333334,
     "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875,
     "tokenizers_test.py::test_bpe_detokenizer": 0.93125,
-    "tokenizers_test.py::test_": 0.7512332628611699,
+    "tokenizers_test.py::test_": 0.7580534612748457,
     "user_ie_extensions/tokenizer/python/tests/tokenizers_test.py::test_": 0.7512332628611699
 }
\ No newline at end of file
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
index 108b41e99..0c78ebcf2 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
@@ -93,17 +93,22 @@
     "xlnet-base-cased",
     # "t5-base",  # crashes tests
 ]

+tiktiken_models = ["Qwen/Qwen-14B-Chat"]

-def get_tokenizer(request, fast_tokenizer=True):
-    hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenizer)
+def get_tokenizer(request, fast_tokenizer=True, trust_remote_code=False):
+    hf_tokenizer = AutoTokenizer.from_pretrained(
+        request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code
+    )
     ov_tokenizer = convert_tokenizer(hf_tokenizer, with_decoder=False)
     compiled_tokenizer = core.compile_model(ov_tokenizer)
     return hf_tokenizer, compiled_tokenizer

-def get_tokenizer_detokenizer(request, fast_tokenizer=True):
-    hf_tokenizer = AutoTokenizer.from_pretrained(request.param, use_fast=fast_tokenizer)
+def get_tokenizer_detokenizer(request, fast_tokenizer=True, trust_remote_code=False):
+    hf_tokenizer = AutoTokenizer.from_pretrained(
+        request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code
+    )
     ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True)
     compiled_tokenizer = core.compile_model(ov_tokenizer)
     compiled_detokenizer = core.compile_model(ov_detokenizer)
@@ -135,6 +140,11 @@ def sentencepice_model_tokenizers(request, fast_tokenizer):
     return get_tokenizer_detokenizer(request, fast_tokenizer)

+@pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1])
+def tiktoken_model_tokenizers(request, fast_tokenizer):
+    return get_tokenizer(request, trust_remote_code=True)
+
+
 @pytest.mark.parametrize(
     "test_string",
     [
@@ -247,3 +257,22 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string):

     ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"])

     assert ov_output == hf_output
+
+
+@pytest.mark.skip(reason="tiktoken tokenizer is WIP")
+@pytest.mark.parametrize(
+    "test_string",
+    [
+        *eng_test_strings,
+        *multilingual_test_strings,
+        *emoji_test_strings,
+    ],
+)
+def test_tiktoken_tokenizers_output(tiktoken_model_tokenizers, test_string):
+    hf_tokenizer, ov_tokenizer = tiktoken_model_tokenizers
+
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+    ov_tokenized = ov_tokenizer(pack_strings([test_string]))
+
+    for output_name, hf_result in hf_tokenized.items():
+        assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}"
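A note on the merge recovery introduced in tiktoken_parser.py above: bpe() re-splits each multi-byte token while only allowing merges of strictly lower rank, so the parts it returns are exactly the pair whose merge produced that token. The following is a minimal, self-contained sketch of the same search; the five-entry rank table is hypothetical (a real table comes from tiktoken's Encoding._mergeable_ranks), so read it as an illustration of the algorithm rather than the patch's API.

from typing import Dict, List, Optional

def bpe(ranks: Dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> List[bytes]:
    # same pairwise-merge loop as tiktoken_parser.bpe(), restated for clarity
    parts = [bytes([b]) for b in token]
    while True:
        best = None  # (rank, index) of the lowest-ranked adjacent pair
        for i in range(len(parts) - 1):
            rank = ranks.get(parts[i] + parts[i + 1])
            if rank is not None and (best is None or rank < best[0]):
                best = (rank, i)
        if best is None or (max_rank is not None and best[0] >= max_rank):
            break
        i = best[1]
        parts = parts[:i] + [parts[i] + parts[i + 1]] + parts[i + 2:]
    return parts

# hypothetical rank table: single bytes first, learned merges after them
ranks = {b"l": 0, b"o": 1, b"w": 2, b"lo": 3, b"low": 4}
for token, rank in ranks.items():
    if len(token) > 1:
        left, right = bpe(ranks, token, max_rank=rank)
        print(token, "=", left, "+", right)  # each line is one recovered merge rule

Each recovered pair is what generate_vocab_and_merges() joins with a space and appends to the merges list.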
From 745e969d3eaf3d89e8d44228a6e35a2c1ff77101 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Thu, 9 Nov 2023 17:19:36 +0000
Subject: [PATCH 091/116] Unskip java tests

---
 .ci/azure/linux.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml
index 7365a849d..75568971d 100644
--- a/.ci/azure/linux.yml
+++ b/.ci/azure/linux.yml
@@ -154,15 +154,15 @@ jobs:
   - script: ls -alR $(INSTALL_DIR)
     displayName: 'List install files'

-#  - script: |
-#      set -e
-#      export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH}
-#      . $(SETUPVARS) gradle clean build --info
-#      for d in CPU HETERO:CPU; do
-#        gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info;
-#      done
-#    workingDirectory: $(REPO_DIR)/modules/java_api
-#    displayName: 'Java tests'
+  - script: |
+      set -e
+      export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH}
+      . $(SETUPVARS) gradle clean build --info
+      for d in CPU HETERO:CPU; do
+        gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info;
+      done
+    workingDirectory: $(REPO_DIR)/modules/java_api
+    displayName: 'Java tests'

   - script: |
       python3 -m pip install --user virtualenv

From 056eb9f6ead66100d377f2111c37fa6167b4d66b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Fri, 10 Nov 2023 13:18:52 +0400
Subject: [PATCH 092/116] Fixed compilation with re2 on Windows

---
 .../user_ie_extensions/tokenizer/CMakeLists.txt | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
index ee443916e..fcb5df69b 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -55,6 +55,12 @@ else()
             URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip
             URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d
         )
+        FetchContent_Declare(
+            re2
+            URL https://github.com/google/re2/archive/refs/tags/2022-04-01.tar.gz
+            URL_HASH SHA256=1ae8ccfdb1066a731bba6ee0881baad5efd2cd661acd9569b689f2586e1a50e9
+        )
+        FetchContent_MakeAvailable(re2)
     elseif(LINUX AND X86_64)
         FetchContent_Declare(
             fast_tokenizer
@@ -86,9 +92,10 @@ else()
     FetchContent_MakeAvailable(fast_tokenizer)
     include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake")

-    # we use re2 library in regex_normalization operation, so have to add to this list
     if(WIN32 AND X86_64)
-        list(APPEND FAST_TOKENIZER_LIBS "${CMAKE_BINARY_DIR}/third_party/install/re2/lib/re2.lib")
+        # we use re2 library in regex_normalization operation, so have to add to this list
+        # because prebuilt fast_tokenizers package does not provide this library
+        list(APPEND FAST_TOKENIZER_LIBS re2)
     endif()
 endif()

From debcb5d36990891f9312b78ec3d9c8851e4da785 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Fri, 10 Nov 2023 15:17:39 +0400
Subject: [PATCH 093/116] Move unpack_strings(), create separate include dir

---
 .../tokenizer/CMakeLists.txt                  |  2 +-
 .../tokenizer/include/tokenizer/strings.hpp   | 60 +++++++++++++++++++
 .../user_ie_extensions/tokenizer/utils.cpp    | 20 -------
 .../user_ie_extensions/tokenizer/utils.hpp    | 35 -----------
 4 files changed, 61 insertions(+), 56 deletions(-)
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
index d94ce04ec..5b799dd44 100644
---
a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -134,7 +134,7 @@ target_include_directories(${TARGET_NAME} PRIVATE # fast_tokenizer ${FAST_TOKENIZER_INCS}) -target_include_directories(${TARGET_NAME} PUBLIC .) +target_include_directories(${TARGET_NAME} PUBLIC ./include/) if(CMAKE_CL_64) target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp new file mode 100644 index 000000000..bb1c85a8f --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Pack any container with string to ov::Tensor with element type u8 +// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() +// so basically any STL container with std::string is compatible +// Tensor destination will be reshaped according the input data +template +void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) { + auto batch_size = strings.size(); + + // First run over all elements: calculate total memory required to hold all strings + auto symbols_size = std::accumulate( + strings.begin(), strings.end(), size_t(0), + [](size_t accum, typename BatchOfStrings::const_reference s) + { return accum + s.length(); }); + + auto total_size = 4*(1 + 1 + batch_size) + symbols_size; + destination.set_shape({total_size}); + + auto data = destination.data(); + auto pbatch_size = reinterpret_cast(data); + auto pindices = pbatch_size + 1; + auto psymbols = reinterpret_cast(pindices + 1 + batch_size); + size_t current_symbols_pos = 0; + + *pbatch_size = batch_size; + *pindices = 0; + + for(auto s: strings) { + psymbols = std::copy(s.begin(), s.end(), psymbols); + current_symbols_pos += s.length(); + *++pindices = current_symbols_pos; + } +} + +std::vector unpack_strings(const ov::Tensor& source) { + auto strings = source.data(); + auto length = source.get_byte_size(); + // check the format of the input bitstream representing the string tensor + OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); + auto batch_size = *reinterpret_cast(strings + 0); + OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size, + "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); + auto begin_ids = reinterpret_cast(strings + 4); + auto end_ids = begin_ids + 1; + auto symbols = strings + 4 + 4 + 4 * batch_size; + + std::vector result; + result.reserve(batch_size); + for(size_t i = 0; i < batch_size; ++i) { + result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i])); + } + return result; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp index 199cd314e..3aaf6989e 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp @@ -222,23 +222,3 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont return std::make_shared(element::u8, 
Shape{value.length()}, (const void*)value.data()); #endif } - -std::vector unpack_strings (const ov::Tensor& source) { - auto strings = source.data(); - auto length = source.get_byte_size(); - // check the format of the input bitstream representing the string tensor - OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); - auto batch_size = *reinterpret_cast(strings + 0); - OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size, - "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); - auto begin_ids = reinterpret_cast(strings + 4); - auto end_ids = begin_ids + 1; - auto symbols = strings + 4 + 4 + 4 * batch_size; - - std::vector result; - result.reserve(batch_size); - for(size_t i = 0; i < batch_size; ++i) { - result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i])); - } - return result; -} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp index 32bb34110..8ffbc9e04 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp @@ -68,38 +68,3 @@ bool evaluate_normalization_helper ( std::function normalizer); std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name); - -// Pack any container with string to ov::Tensor with element type u8 -// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() -// so basically any STL container with std::string is compatible -// Tensor destination will be reshaped according the input data -template -void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) { - auto batch_size = strings.size(); - - // First run over all elements: calculate total memory required to hold all strings - auto symbols_size = std::accumulate( - strings.begin(), strings.end(), size_t(0), - [](size_t accum, typename BatchOfStrings::const_reference s) - { return accum + s.length(); }); - - auto total_size = 4*(1 + 1 + batch_size) + symbols_size; - destination.set_shape({total_size}); - - auto data = destination.data(); - auto pbatch_size = reinterpret_cast(data); - auto pindices = pbatch_size + 1; - auto psymbols = reinterpret_cast(pindices + 1 + batch_size); - size_t current_symbols_pos = 0; - - *pbatch_size = batch_size; - *pindices = 0; - - for(auto s: strings) { - psymbols = std::copy(s.begin(), s.end(), psymbols); - current_symbols_pos += s.length(); - *++pindices = current_symbols_pos; - } -} - -std::vector unpack_strings(const ov::Tensor& source); From b739ffd0bf097f6d54e29b10090b9265d072ee30 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Sat, 11 Nov 2023 03:24:20 +0400 Subject: [PATCH 094/116] openvino_extensions --- .../user_ie_extensions/CMakeLists.txt | 1 + .../openvino_extensions}/strings.hpp | 49 ++++++++++--------- .../tokenizer/CMakeLists.txt | 2 - 3 files changed, 26 insertions(+), 26 deletions(-) rename modules/custom_operations/user_ie_extensions/{tokenizer/include/tokenizer => include/openvino_extensions}/strings.hpp (54%) diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt index 26f438b07..c830c0a21 100644 --- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt +++ 
b/modules/custom_operations/user_ie_extensions/CMakeLists.txt @@ -101,3 +101,4 @@ endif() target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS}) +target_include_directories(${TARGET_NAME} PUBLIC ./include/) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp similarity index 54% rename from modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp rename to modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp index bb1c85a8f..5bfe85e5a 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp +++ b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp @@ -6,8 +6,9 @@ #include +namespace openvino_extensions { // Pack any container with string to ov::Tensor with element type u8 -// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length() +// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .size() // so basically any STL container with std::string is compatible // Tensor destination will be reshaped according the input data template @@ -15,46 +16,46 @@ void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) { auto batch_size = strings.size(); // First run over all elements: calculate total memory required to hold all strings - auto symbols_size = std::accumulate( + size_t symbols_size = std::accumulate( strings.begin(), strings.end(), size_t(0), - [](size_t accum, typename BatchOfStrings::const_reference s) - { return accum + s.length(); }); + [](size_t accum, typename BatchOfStrings::const_reference str) + { return accum + str.size(); }); - auto total_size = 4*(1 + 1 + batch_size) + symbols_size; + size_t total_size = 4 * (1 + 1 + batch_size) + symbols_size; destination.set_shape({total_size}); - auto data = destination.data(); - auto pbatch_size = reinterpret_cast(data); - auto pindices = pbatch_size + 1; - auto psymbols = reinterpret_cast(pindices + 1 + batch_size); + int32_t* pindices = reinterpret_cast(destination.data()); + pindices[0] = batch_size; + pindices[1] = 0; + pindices += 2; + char* psymbols = reinterpret_cast(pindices + batch_size); size_t current_symbols_pos = 0; - *pbatch_size = batch_size; - *pindices = 0; - - for(auto s: strings) { - psymbols = std::copy(s.begin(), s.end(), psymbols); - current_symbols_pos += s.length(); - *++pindices = current_symbols_pos; + for (const auto& str: strings) { + psymbols = std::copy(str.begin(), str.end(), psymbols); + current_symbols_pos += str.size(); + *pindices = current_symbols_pos; + ++pindices; } } std::vector unpack_strings(const ov::Tensor& source) { - auto strings = source.data(); - auto length = source.get_byte_size(); + int32_t length = source.get_byte_size(); // check the format of the input bitstream representing the string tensor OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); - auto batch_size = *reinterpret_cast(strings + 0); + const int32_t* pindices = reinterpret_cast(source.data()); + int32_t batch_size = pindices[0]; OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size, "Incorrect packed string tensor format: the packed string tensor must contain first string 
offset and end indices"); - auto begin_ids = reinterpret_cast(strings + 4); - auto end_ids = begin_ids + 1; - auto symbols = strings + 4 + 4 + 4 * batch_size; + const int32_t* begin_ids = pindices + 1; + const int32_t* end_ids = pindices + 2; + const char* symbols = reinterpret_cast(pindices + 2 + batch_size); std::vector result; result.reserve(batch_size); - for(size_t i = 0; i < batch_size; ++i) { - result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i])); + for (int32_t idx = 0; idx < batch_size; ++idx) { + result.emplace_back(symbols + begin_ids[idx], symbols + end_ids[idx]); } return result; } +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 5b799dd44..fcb5df69b 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -134,8 +134,6 @@ target_include_directories(${TARGET_NAME} PRIVATE # fast_tokenizer ${FAST_TOKENIZER_INCS}) -target_include_directories(${TARGET_NAME} PUBLIC ./include/) - if(CMAKE_CL_64) target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) endif() From e70a3f2c1c81f43b29bd23658b76f9b7de90f04c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 13 Nov 2023 15:12:07 +0400 Subject: [PATCH 095/116] Fixed link stage on Windows --- .../user_ie_extensions/tokenizer/bpe_tokenizer.hpp | 2 +- .../user_ie_extensions/tokenizer/bytes_to_chars.hpp | 2 +- .../user_ie_extensions/tokenizer/case_fold.hpp | 2 +- .../user_ie_extensions/tokenizer/chars_to_bytes.hpp | 2 +- .../user_ie_extensions/tokenizer/combine_segments.hpp | 2 +- .../user_ie_extensions/tokenizer/normalize_unicode.hpp | 2 +- .../user_ie_extensions/tokenizer/ragged_to_dense.hpp | 2 +- .../user_ie_extensions/tokenizer/regex_normalization.hpp | 2 +- .../user_ie_extensions/tokenizer/regex_split.hpp | 2 +- .../user_ie_extensions/tokenizer/vocab_decoder.hpp | 2 +- .../user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp index 99073b1b2..97fb2db03 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp @@ -13,7 +13,7 @@ using namespace paddlenlp::fast_tokenizer; #undef tokenizer #undef m_tokenizer -class OPENVINO_API BPETokenizer : public ov::op::Op { +class BPETokenizer : public ov::op::Op { public: OPENVINO_OP("BPETokenizer"); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp index d064467a0..77b30b0c1 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp @@ -9,7 +9,7 @@ const std::array, 256> create_bytes_to_chars_map(); -class OPENVINO_API BytesToChars : public ov::op::Op { +class BytesToChars : public ov::op::Op { public: OPENVINO_OP("BytesToChars"); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp index 3f6e86e65..6c273ad82 100644 --- 
From e70a3f2c1c81f43b29bd23658b76f9b7de90f04c Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 13 Nov 2023 15:12:07 +0400
Subject: [PATCH 095/116] Fixed link stage on Windows

---
 .../user_ie_extensions/tokenizer/bpe_tokenizer.hpp        | 2 +-
 .../user_ie_extensions/tokenizer/bytes_to_chars.hpp       | 2 +-
 .../user_ie_extensions/tokenizer/case_fold.hpp            | 2 +-
 .../user_ie_extensions/tokenizer/chars_to_bytes.hpp       | 2 +-
 .../user_ie_extensions/tokenizer/combine_segments.hpp     | 2 +-
 .../user_ie_extensions/tokenizer/normalize_unicode.hpp    | 2 +-
 .../user_ie_extensions/tokenizer/ragged_to_dense.hpp      | 2 +-
 .../user_ie_extensions/tokenizer/regex_normalization.hpp  | 2 +-
 .../user_ie_extensions/tokenizer/regex_split.hpp          | 2 +-
 .../user_ie_extensions/tokenizer/vocab_decoder.hpp        | 2 +-
 .../user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp  | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp
index 99073b1b2..97fb2db03 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp
@@ -13,7 +13,7 @@ using namespace paddlenlp::fast_tokenizer;
 #undef tokenizer
 #undef m_tokenizer

-class OPENVINO_API BPETokenizer : public ov::op::Op {
+class BPETokenizer : public ov::op::Op {
 public:
     OPENVINO_OP("BPETokenizer");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp
index d064467a0..77b30b0c1 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp
@@ -9,7 +9,7 @@

 const std::array, 256> create_bytes_to_chars_map();

-class OPENVINO_API BytesToChars : public ov::op::Op {
+class BytesToChars : public ov::op::Op {
 public:
     OPENVINO_OP("BytesToChars");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp
index 3f6e86e65..6c273ad82 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp
@@ -6,7 +6,7 @@

 #include

-class OPENVINO_API CaseFold : public ov::op::Op {
+class CaseFold : public ov::op::Op {
 public:
     OPENVINO_OP("CaseFold");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp
index 25dd91dc3..4a79a72b3 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp
@@ -6,7 +6,7 @@

 #include

-class OPENVINO_API CharsToBytes : public ov::op::Op {
+class CharsToBytes : public ov::op::Op {
 public:
     OPENVINO_OP("CharsToBytes");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp
index 9399f959a..a4d904a55 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp
@@ -6,7 +6,7 @@

 #include

-class OPENVINO_API CombineSegments : public ov::op::Op {
+class CombineSegments : public ov::op::Op {
 public:
     OPENVINO_OP("CombineSegments");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp
index 0d2e27d89..cacdec18c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp
@@ -6,7 +6,7 @@

 #include

-class OPENVINO_API NormalizeUnicode : public ov::op::Op {
+class NormalizeUnicode : public ov::op::Op {
 public:
     OPENVINO_OP("NormalizeUnicode");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp
index 698b16157..2d543fdb2 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp
@@ -7,7 +7,7 @@
 #include

 // Takes a ragged tensor with one ragged right-most dimension and produces a normal tensor
-class OPENVINO_API RaggedToDense : public ov::op::Op {
+class RaggedToDense : public ov::op::Op {
 public:
     OPENVINO_OP("RaggedToDense");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp
index a3ec22397..2f3924ec7 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp
@@ -13,7 +13,7 @@ using namespace ov;
 using namespace ov::opset10;

-class OPENVINO_API RegexNormalization : public ov::op::Op {
+class RegexNormalization : public ov::op::Op {
 public:
     OPENVINO_OP("RegexNormalization");
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp
index e2729cce0..a1f001f64 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp
@@ -12,7 +12,7 @@ using namespace ov;
 using namespace paddlenlp::fast_tokenizer;

-class OPENVINO_API RegexSplit :
public ov::op::Op { +class RegexSplit : public ov::op::Op { public: OPENVINO_OP("RegexSplit"); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp index 1479dcb5a..14d91032c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp @@ -6,7 +6,7 @@ #include -class OPENVINO_API VocabDecoder : public ov::op::Op { +class VocabDecoder : public ov::op::Op { public: OPENVINO_OP("VocabDecoder"); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp index cbfe664a8..c6e785e55 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp @@ -11,7 +11,7 @@ using namespace paddlenlp::fast_tokenizer; #undef tokenizer -class OPENVINO_API WordpieceTokenizer : public ov::op::Op { +class WordpieceTokenizer : public ov::op::Op { public: OPENVINO_OP("WordpieceTokenizer"); From 3022a5a0a99e7303a8db7ae43a9aec060c7787bb Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 14 Nov 2023 18:05:09 +0000 Subject: [PATCH 096/116] i64 is default tokenizer output type --- .../python/ov_tokenizer/convert_tokenizer.py | 29 +++++++++++++++---- .../tokenizer/python/ov_tokenizer/utils.py | 9 +++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py index cb9d66c2c..ca789ce4b 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -6,20 +6,28 @@ import sys from typing import Any, Tuple, Union -from openvino.runtime import Model +from openvino.runtime import Model, Type from openvino.runtime.exceptions import OVTypeError +from .utils import change_outputs_type + logger = logging.getLogger(__name__) def convert_tokenizer( - tokenizer_object: Any, number_of_inputs: int = 1, with_decoder: bool = False, streaming_decoder: bool = False + tokenizer_object: Any, + number_of_inputs: int = 1, + with_decoder: bool = False, + streaming_decoder: bool = False, + tokenizer_output_type: Type = Type.i64, ) -> Union[Model, Tuple[Model, Model]]: # todo: add support for more then 1 input if number_of_inputs > 1: raise ValueError("Tokenizers with more then one input are not supported yet.") + ov_tokenizers = None + if "transformers" in sys.modules: from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast @@ -34,7 +42,7 @@ def convert_tokenizer( if isinstance(tokenizer_object, PreTrainedTokenizerBase): if is_sentencepiece_model(tokenizer_object): logger.info("Convert tokenizer using SentencePiece .model file.") - return convert_sentencepiece_model_tokenizer( + ov_tokenizers = convert_sentencepiece_model_tokenizer( tokenizer_object, add_attention_mask=True, with_decoder=with_decoder, @@ -42,7 +50,7 @@ def convert_tokenizer( ) elif is_tiktoken_model(tokenizer_object): logger.info("Convert tiktoken-based tokenizer") - return convert_tiktoken_model_tokenizer( + ov_tokenizers = convert_tiktoken_model_tokenizer( tokenizer_object, 
add_attention_mask=True,
                 with_decoder=with_decoder,
@@ -50,10 +58,19 @@
             )
     elif isinstance(tokenizer_object, PreTrainedTokenizerFast):
         logger.info("Convert Huggingface Fast tokenizer pipeline.")
-        return convert_fast_tokenizer(
+        ov_tokenizers = convert_fast_tokenizer(
             tokenizer_object,
             number_of_inputs=number_of_inputs,
             with_decoder=with_decoder,
         )

-    raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")
+    if ov_tokenizers is None:
+        raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")
+
+    if tokenizer_output_type == Type.i32:
+        return ov_tokenizers
+
+    if isinstance(ov_tokenizers, tuple):
+        return change_outputs_type(ov_tokenizers[0], tokenizer_output_type), ov_tokenizers[1]
+
+    return change_outputs_type(ov_tokenizers, tokenizer_output_type)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py
index 057977e19..c083b6f99 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py
@@ -5,7 +5,7 @@
 import logging
 from typing import Dict, Optional, Sequence, Tuple, Union

-from openvino import Model
+from openvino import Model, Type
 from openvino.preprocess import PrePostProcessor
 from openvino.runtime import opset12 as opset
@@ -92,3 +92,10 @@ def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGIT
     model = ppp.build()
     model.output(logits_output).tensor.set_names({TOKEN_IDS_OUTPUT_NAME})
     return model
+
+
+def change_outputs_type(model: Model, output_type: Type) -> Model:
+    ppp = PrePostProcessor(model)
+    for idx, _ in enumerate(model.outputs):
+        ppp.output(idx).tensor().set_element_type(output_type)
+    return ppp.build()
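change_outputs_type() above leans on OpenVINO's PrePostProcessor, which appends a Convert step to every listed output; the tokenizer graph itself is untouched, only its boundary type changes. A minimal sketch of the same mechanism on a standalone toy model follows; the parameter name, shape, and the add operation are placeholders, not taken from the tokenizer.

import numpy as np
from openvino import Model, Type
from openvino.preprocess import PrePostProcessor
from openvino.runtime import opset12 as opset

# toy model with a single i32 output
param = opset.parameter([2, 3], Type.i32, name="token_ids")
model = Model(opset.add(param, opset.constant(np.int32(1))), [param])

ppp = PrePostProcessor(model)
for idx, _ in enumerate(model.outputs):
    ppp.output(idx).tensor().set_element_type(Type.i64)  # cast i32 -> i64 at the boundary
model = ppp.build()

assert model.output(0).get_element_type() == Type.i64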
From c467a8c07408483c8acd7f60d85927174fad3cb1 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Tue, 14 Nov 2023 18:35:12 +0000
Subject: [PATCH 097/116] Add support for more tiktoken tokenizers

---
 .../python/ov_tokenizer/hf_parser.py          | 13 ++++++--
 .../python/ov_tokenizer/tiktoken_parser.py    | 11 +++++--
 .../tokenizer/python/tests/pass_rates.json    |  7 ++--
 .../tokenizer/python/tests/tokenizers_test.py | 33 ++++++++++++++-----
 4 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
index ccf459f2e..12432a1d1 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
@@ -409,7 +409,14 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode

 def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
-    return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken")
+    try:
+        from tiktoken import Encoding
+    except ImportError:
+        return False
+
+    return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken") or isinstance(
+        getattr(hf_tokenizer, "encoder", None), Encoding
+    )

 def convert_tiktoken_model_tokenizer(
@@ -418,7 +425,7 @@ def convert_tiktoken_model_tokenizer(
     streaming_decoder: bool = False,
 ) -> Union[Model, Tuple[Model, Model]]:
-    encoding = hf_tokenizer.tokenizer
+    encoding = getattr(hf_tokenizer, "tokenizer", None) or hf_tokenizer.encoder
     split_pattern = encoding._pat_str

     pipeline = TokenizerPipeline()
@@ -426,6 +433,7 @@ def convert_tiktoken_model_tokenizer(
         [
             NormalizeUnicode("NFC"),
             RegexSplitStep(split_pattern),
+            BytesToCharsStep(),
             BPETokenizationStep.from_tiktoken_encoding(encoding),
             TruncationStep(
                 max_length=hf_tokenizer.model_max_length, truncate_right=(hf_tokenizer.truncation_side == "right")
@@ -433,5 +441,4 @@ def convert_tiktoken_model_tokenizer(
             PaddingStep(pad_right=(hf_tokenizer.padding_side == "right")),
         ]
     )
-
     return pipeline.get_encoder_ov_subgraph()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py
index a9f2cc78b..270124b57 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py
@@ -1,3 +1,4 @@
+import logging
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple

@@ -57,9 +58,15 @@ def generate_vocab_and_merges(encoding: Encoding) -> Tuple[Dict[str, int], List[
         if len(token) == 1:
             continue
         merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))
-        assert len(merged) == 2
-        merges.append(" ".join(map(token_bytes_to_string, merged)))

+        # if special tokens are added to the tokenizer, the bpe split might produce more than 2 tokens:
+        # if "\t" is in the vocab and a special token "\t\t\t" was added before "\t\t", it will
+        # be tokenized into 3 tokens: bpe("\t\t\t") -> ["\t", "\t", "\t"], which cannot be included
+        # in merges
+        if len(merged) == 2:
+            merges.append(" ".join(map(token_bytes_to_string, merged)))
+        else:
+            logging.warning("Skip merges for added tokens. 
Tokenization results might be different.") # Also add special tokens vocab.update(encoding._special_tokens) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index 1840a37cf..6728e5572 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -5,6 +5,9 @@ "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5458333333333334, "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, "tokenizers_test.py::test_bpe_detokenizer": 0.93125, - "tokenizers_test.py::test_": 0.7580534612748457, - "user_ie_extensions/tokenizer/python/tests/tokenizers_test.py::test_": 0.7512332628611699 + "tokenizers_test.py::test_": 0.746031746031746, + "user_ie_extensions/tokenizer/python/tests/tokenizers_test.py::test_": 0.746031746031746, + "tokenizers_test.py::test_tiktoken_tokenizers[Qwen-14B-Chat-": 0.9166666666666666, + "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, + "tokenizers_test.py::test_tiktoken_tokenizers[xgen-7b-8k-base-": 0.9166666666666666 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index 0c78ebcf2..8461772e2 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -46,6 +46,13 @@ "🤷‍♂️", "🤦🏼‍♂️", ] +misc_strings = [ + "", + " ", + " " * 10, + "\n", + " \t\n", +] wordpiece_models = [ "bert-base-multilingual-cased", @@ -93,7 +100,10 @@ "xlnet-base-cased", # "t5-base", # crashes tests ] -tiktiken_models = ["Qwen/Qwen-14B-Chat"] +tiktiken_models = [ + "Qwen/Qwen-14B-Chat", + "Salesforce/xgen-7b-8k-base", +] def get_tokenizer(request, fast_tokenizer=True, trust_remote_code=False): @@ -141,7 +151,7 @@ def sentencepice_model_tokenizers(request, fast_tokenizer): @pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) -def tiktoken_model_tokenizers(request, fast_tokenizer): +def tiktoken_model_tokenizers(request): return get_tokenizer(request, trust_remote_code=True) @@ -151,9 +161,10 @@ def tiktoken_model_tokenizers(request, fast_tokenizer): *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, + *misc_strings, ], ) -def test_hf_wordpiece_tokenizers_outputs(hf_and_ov_wordpiece_tokenizers, test_string): +def test_hf_wordpiece_tokenizers(hf_and_ov_wordpiece_tokenizers, test_string): hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers packed_strings = pack_strings([test_string]) @@ -170,6 +181,7 @@ def test_hf_wordpiece_tokenizers_outputs(hf_and_ov_wordpiece_tokenizers, test_st eng_test_strings, multilingual_test_strings, emoji_test_strings, + misc_strings, ], ) def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers, test_string): @@ -189,6 +201,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, + *misc_strings, ], ) def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): @@ -207,6 +220,7 @@ def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_strin *eng_test_strings, 
*multilingual_test_strings, *emoji_test_strings, + *misc_strings, ], ) def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_string): @@ -225,6 +239,7 @@ def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_str *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, + *misc_strings, ], ) def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): @@ -235,9 +250,8 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): ov_tokenized = ov_tokenizer(packed_strings) for output_name, hf_result in hf_tokenized.items(): - ov_result = ov_tokenized.get(output_name) # galactica tokenizer has 3 output, but model has 2 inputs - if ov_result is not None: + if (ov_result := ov_tokenized.get(output_name)) is not None: assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" @@ -247,6 +261,7 @@ def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, + *misc_strings, ], ) def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): @@ -259,20 +274,22 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): assert ov_output == hf_output -@pytest.mark.skip(reason="tiktoken tokenizer is WIP") +# @pytest.mark.skip(reason="tiktoken tokenizer is WIP") @pytest.mark.parametrize( "test_string", [ *eng_test_strings, *multilingual_test_strings, *emoji_test_strings, + *misc_strings, ], ) -def test_tiktoken_tokenizers_output(tiktoken_model_tokenizers, test_string): +def test_tiktoken_tokenizers(tiktoken_model_tokenizers, test_string): hf_tokenizer, ov_tokenizer = tiktoken_model_tokenizers hf_tokenized = hf_tokenizer(test_string, return_tensors="np") ov_tokenized = ov_tokenizer(pack_strings([test_string])) for output_name, hf_result in hf_tokenized.items(): - assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + if (ov_result := ov_tokenized.get(output_name)) is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" From 8505b51ac7987f87366aed1b611b68b0b67e8dbb Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 15 Nov 2023 11:52:17 +0000 Subject: [PATCH 098/116] Check Azure CI --- .ci/azure/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 4abffac6d..3bfb0e6d2 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -159,7 +159,7 @@ jobs: - script: | call C:\tools\opencv\build\setup_vars_opencv4.cmd - call $(SETUPVARS) + call $(SETUPVARS) && ^ python -m pytest -k "not sparse_conv" tests\run_tests.py workingDirectory: $(REPO_DIR)\modules\custom_operations displayName: 'Custom user operation tests' From 82639e6cceab9f40225b0c7be147da02748f6f8b Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 15 Nov 2023 13:59:46 +0000 Subject: [PATCH 099/116] Fix Azure Win CI --- .ci/azure/windows.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 3bfb0e6d2..57ca2eefb 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -54,14 +54,13 @@ jobs: SETUPVARS: $(INSTALL_DIR)\setupvars.bat CUSTOM_OP_LIB: $(BIN_DIR)\user_ov_extensions.dll GRADLE_VER: 7.1.1 + PYTHON_EXE: C:\hostedtoolcache\windows\Python\3.8.2\x64\python.exe steps: - script: | powershell -command "Invoke-RestMethod -Headers @{\"Metadata\"=\"true\"} -Method GET -Uri 
http://169.254.169.254/metadata/instance/compute?api-version=2019-06-01 | format-custom" - where python3 - python3 --version - where python - python --version + where $(PYTHON_EXE) + $(PYTHON_EXE) --version where java java -version wmic computersystem get TotalPhysicalMemory @@ -99,11 +98,11 @@ jobs: powershell -command "Expand-Archive -Force ninja-win.zip" powershell -command "Invoke-WebRequest https://services.gradle.org/distributions/gradle-$(GRADLE_VER)-bin.zip -OutFile gradle-$(GRADLE_VER)-bin.zip" powershell -command "Expand-Archive -Force gradle-$(GRADLE_VER)-bin.zip" - python -m pip install --upgrade pip - python -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\src\compatibility\openvino\requirements-dev.txt - python -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\requirements.txt - python -m pip install -r $(REPO_DIR)\modules\custom_operations\tests\requirements.txt - python -m pip install $(OPENVINO_REPO_DIR)\tools\mo + $(PYTHON_EXE) -m pip install --upgrade pip + $(PYTHON_EXE) -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\src\compatibility\openvino\requirements-dev.txt + $(PYTHON_EXE) -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\requirements.txt + $(PYTHON_EXE) -m pip install -r $(REPO_DIR)\modules\custom_operations\tests\requirements.txt + $(PYTHON_EXE) -m pip install $(OPENVINO_REPO_DIR)\tools\mo powershell -command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" choco install opencv -y workingDirectory: $(WORK_DIR) @@ -160,6 +159,6 @@ jobs: - script: | call C:\tools\opencv\build\setup_vars_opencv4.cmd call $(SETUPVARS) && ^ - python -m pytest -k "not sparse_conv" tests\run_tests.py + $(PYTHON_EXE) -m pytest -k "not sparse_conv" tests\run_tests.py workingDirectory: $(REPO_DIR)\modules\custom_operations displayName: 'Custom user operation tests' From a45b826495b80be38f62eb909ea44eef081a8f23 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 15 Nov 2023 17:26:05 +0000 Subject: [PATCH 100/116] Define python version for setupvars.bat --- .ci/azure/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 57ca2eefb..9bd7ee5a7 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -158,7 +158,7 @@ jobs: - script: | call C:\tools\opencv\build\setup_vars_opencv4.cmd - call $(SETUPVARS) && ^ + call $(SETUPVARS) -pyver 3.8 && ^ $(PYTHON_EXE) -m pytest -k "not sparse_conv" tests\run_tests.py workingDirectory: $(REPO_DIR)\modules\custom_operations displayName: 'Custom user operation tests' From 244a593c9c3fca9d5411b1b7b376d45d73c612a1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 15 Nov 2023 20:03:11 +0000 Subject: [PATCH 101/116] Add support for tiktoken detokenizers --- .ci/azure/linux.yml | 4 +-- .../python/ov_tokenizer/hf_parser.py | 9 +++-- .../tokenizer/python/tests/pass_rates.json | 7 ++-- .../tokenizer/python/tests/tokenizers_test.py | 35 +++++++++++++++---- 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 75568971d..d1d66355d 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -186,6 +186,6 @@ jobs: - script: | . 
$(SETUPVARS) source $(WORK_DIR)/.env3/bin/activate - python -m pytest --tb=no user_ie_extensions/tokenizer/python/tests/tokenizers_test.py - workingDirectory: $(REPO_DIR)/modules/custom_operations + python -m pytest --tb=no tokenizers_test.py + workingDirectory: $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/ displayName: 'Tokenizers extension regression test' diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 12432a1d1..a6726e344 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -297,7 +297,7 @@ def convert_fast_tokenizer( ov_tokenizer.output(i).tensor.add_names({output_name}) filtered_outputs.append(ov_tokenizer.output(i)) - tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters()) + tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_ENCODER_NAME) if with_decoder: return tokenizer_model, pipeline.get_decoder_ov_subgraph() @@ -439,6 +439,11 @@ def convert_tiktoken_model_tokenizer( max_length=hf_tokenizer.model_max_length, truncate_right=(hf_tokenizer.truncation_side == "right") ), PaddingStep(pad_right=(hf_tokenizer.padding_side == "right")), + VocabDecoderStep(), + CharsToBytesStep(), ] ) - return pipeline.get_encoder_ov_subgraph() + if not with_decoder: + return pipeline.get_encoder_ov_subgraph() + + return pipeline.get_encoder_ov_subgraph(), pipeline.get_decoder_ov_subgraph() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index 6728e5572..e3a86b128 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -5,9 +5,6 @@ "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5458333333333334, "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, "tokenizers_test.py::test_bpe_detokenizer": 0.93125, - "tokenizers_test.py::test_": 0.746031746031746, - "user_ie_extensions/tokenizer/python/tests/tokenizers_test.py::test_": 0.746031746031746, - "tokenizers_test.py::test_tiktoken_tokenizers[Qwen-14B-Chat-": 0.9166666666666666, - "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, - "tokenizers_test.py::test_tiktoken_tokenizers[xgen-7b-8k-base-": 0.9166666666666666 + "tokenizers_test.py::test_": 0.7527970165157165, + "tokenizers_test.py::test_tiktoken_tokenizers": 0.9 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index 8461772e2..1796d5a0a 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -141,19 +141,23 @@ def hf_and_ov_bpe_detokenizer(request): @pytest.fixture(scope="session", params=[True, False], ids=lambda is_fast: "Fast" if is_fast else "Slow") -def fast_tokenizer(request): +def is_fast_tokenizer(request): return request.param @pytest.fixture(scope="session", params=sentencepiece_models, ids=lambda checkpoint: 
checkpoint.split("/")[-1]) -def sentencepice_model_tokenizers(request, fast_tokenizer): - return get_tokenizer_detokenizer(request, fast_tokenizer) +def sentencepice_model_tokenizers(request, is_fast_tokenizer): + return get_tokenizer_detokenizer(request, is_fast_tokenizer) @pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) -def tiktoken_model_tokenizers(request): +def tiktoken_tokenizers(request): return get_tokenizer(request, trust_remote_code=True) +@pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def tiktoken_detokenizers(request): + return get_tokenizer_detokenizer(request, trust_remote_code=True) + @pytest.mark.parametrize( "test_string", @@ -284,8 +288,8 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): *misc_strings, ], ) -def test_tiktoken_tokenizers(tiktoken_model_tokenizers, test_string): - hf_tokenizer, ov_tokenizer = tiktoken_model_tokenizers +def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = tiktoken_tokenizers hf_tokenized = hf_tokenizer(test_string, return_tensors="np") ov_tokenized = ov_tokenizer(pack_strings([test_string])) @@ -293,3 +297,22 @@ def test_tiktoken_tokenizers(tiktoken_model_tokenizers, test_string): for output_name, hf_result in hf_tokenized.items(): if (ov_result := ov_tokenized.get(output_name)) is not None: assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_tiktoken_detokenizer(tiktoken_detokenizers, test_string): + hf_tokenizer, _, ov_detokenizer = tiktoken_detokenizers + + token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids + hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) + ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) + + assert ov_output == hf_output From ad1c589206f0f0c862707146645e4e358fd55ef9 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 16 Nov 2023 18:32:03 +0000 Subject: [PATCH 102/116] Add ChatGLM tokenization support. 
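
ChatGLM's GLMTokenizer prepends a fixed set of prefix tokens (taken from
hf_tokenizer.get_prefix_tokens()) to every encoded sequence and adds no EOS
token. For reference, a rough NumPy sketch of the tensor layout this change
produces; the id values below are made up for illustration and are not the
real ChatGLM ids:

    import numpy as np

    prefix_tokens = np.array([[64790, 64792]])   # stand-in for get_prefix_tokens()
    encoded_ids = np.array([[318, 1175, 9061]])  # stand-in for the SP tokenizer output

    # the converted graph concatenates the constant prefix in front of the ids
    input_ids = np.concatenate([prefix_tokens, encoded_ids], axis=-1)
    # ...and extends the attention mask with ones of the same prefix length
    attention_mask = np.concatenate(
        [np.ones_like(prefix_tokens), np.ones_like(encoded_ids)], axis=-1
    )

    assert input_ids.shape == attention_mask.shape == (1, 5)
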
--- .../python/ov_tokenizer/hf_parser.py | 22 +++++++++++++++---- .../python/ov_tokenizer/tokenizer_pipeline.py | 14 ------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index a6726e344..86fb102e4 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -331,7 +331,10 @@ def convert_sentencepiece_model_tokenizer( input_node = op.Parameter(Type.u8, PartialShape(["?"])) input_node.set_friendly_name("string_input") - if hasattr(hf_tokenizer, "add_eos_token"): + # for ChatGLM tokenizer support + if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer": + add_eos_token = False + elif hasattr(hf_tokenizer, "add_eos_token"): add_eos_token = hf_tokenizer.add_eos_token or False else: add_eos_token = ( @@ -362,6 +365,13 @@ def convert_sentencepiece_model_tokenizer( "ScatterNDUpdate", [broadcast, indices, values], # FIXME: pad left side instead of right ) + + if is_chatglm: + prefix_tokens = make_constant_node( + np.array([hf_tokenizer.get_prefix_tokens()]), dtype=scatternd_input_ids.output(0).element_type + ) + scatternd_input_ids = opset.concat([prefix_tokens, scatternd_input_ids], axis=-1) + scatternd_input_ids.output(0).tensor.add_names({TOKEN_IDS_INPUT_NAME}) outputs = scatternd_input_ids.outputs() @@ -378,6 +388,13 @@ def convert_sentencepiece_model_tokenizer( ), ], ) + + if is_chatglm: + attention_prefix = make_constant_node( + np.array([[1 for _ in hf_tokenizer.get_prefix_tokens()]]), dtype=attention_mask.output(0).element_type + ) + attention_mask = opset.concat([attention_prefix, attention_mask], axis=-1) + attention_mask.output(0).tensor.add_names({ATTENTION_MASK_INPUT_NAME}) outputs.append(attention_mask.output(0)) @@ -398,9 +415,6 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode [sp_model_node, token_ids], ).outputs() - if streaming_decoder: - decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder) - string_output = factory.create("StringTensorPack", decoder).outputs() string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index 8b7c476c9..dc2867797 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -665,20 +665,6 @@ def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep": replace_term=r"\1", ) - @classmethod - def replace_sp_spaces(cls) -> "RegexDecodingStep": - return cls( - regex_search_pattern="▁", - replace_term=" ", - ) - - @classmethod - def replace_sp_newlines(cls) -> "RegexDecodingStep": - return cls( - regex_search_pattern="<0x0A>", - replace_term="\n", - ) - def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes.extend( ( From 0f63c3daf0e250cf39825839fdd9bf1b8267b115 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 16 Nov 2023 20:02:20 +0000 Subject: [PATCH 103/116] Add ChatGLM detokenization 
and tests --- .../python/ov_tokenizer/convert_tokenizer.py | 2 -- .../python/ov_tokenizer/hf_parser.py | 30 +++++++++++++++---- .../tokenizer/python/tests/pass_rates.json | 8 ++--- .../tokenizer/python/tests/tokenizers_test.py | 10 +++++-- 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py index ca789ce4b..fed5ba4f7 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -52,9 +52,7 @@ def convert_tokenizer( logger.info("Convert tiktoken-based tokenizer") ov_tokenizers = convert_tiktoken_model_tokenizer( tokenizer_object, - add_attention_mask=True, with_decoder=with_decoder, - streaming_decoder=streaming_decoder, ) elif isinstance(tokenizer_object, PreTrainedTokenizerFast): logger.info("Convert Huggingface Fast tokenizer pipeline.") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 86fb102e4..71f8d8e89 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -4,6 +4,7 @@ import json import tempfile +from copy import deepcopy from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -14,6 +15,7 @@ from openvino.runtime import Node, op from openvino.runtime.exceptions import OVTypeError from openvino.runtime.utils.types import as_node, make_constant_node +from transformers.convert_slow_tokenizer import import_protobuf from .constants import ( ATTENTION_MASK_INPUT_NAME, @@ -308,6 +310,22 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model") +def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None: + model_pb = import_protobuf() + model = model_pb.ModelProto() + with open(sp_model_path, "rb") as model_file: + model.ParseFromString(model_file.read()) + + add_token_dict = hf_tokenizer.tokenizer.index_special_tokens + for idx, token in sorted(add_token_dict.items()): + new_piece = deepcopy(model.pieces[-1]) + new_piece.piece = token + model.pieces.append(new_piece) + + with open(sp_model_path, "wb") as model_file: + model_file.write(model.SerializeToString()) + + def convert_sentencepiece_model_tokenizer( hf_tokenizer: "PreTrainedTokenizerBase", add_attention_mask: bool = True, @@ -321,7 +339,12 @@ def convert_sentencepiece_model_tokenizer( with tempfile.TemporaryDirectory() as tmp: hf_tokenizer.save_pretrained(tmp) - sp_model = np.fromfile(Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"], dtype=np.uint8) + vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"] + + if (is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer"): + add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer) + + sp_model = np.fromfile(vocab_file, dtype=np.uint8) sp_model_node = as_node(sp_model) if hf_tokenizer.is_fast: @@ -331,8 +354,7 @@ def convert_sentencepiece_model_tokenizer( input_node = 
op.Parameter(Type.u8, PartialShape(["?"])) input_node.set_friendly_name("string_input") - # for ChatGLM tokenizer support - if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer": + if is_chatglm: add_eos_token = False elif hasattr(hf_tokenizer, "add_eos_token"): add_eos_token = hf_tokenizer.add_eos_token or False @@ -435,9 +457,7 @@ def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: def convert_tiktoken_model_tokenizer( hf_tokenizer: "PreTrainedTokenizerBase", - add_attention_mask: bool = True, with_decoder: bool = False, - streaming_decoder: bool = False, ) -> Union[Model, Tuple[Model, Model]]: encoding = getattr(hf_tokenizer, "tokenizer", None) or hf_tokenizer.encoder split_pattern = encoding._pat_str diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index e3a86b128..ee5917ce2 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -1,10 +1,10 @@ { "tokenizers_test.py::test_hf_wordpiece_tokenizers_outputs": 0.9423076923076923, "tokenizers_test.py::test_hf_wordpiece_tokenizers_multiple_strings": 0.641025641025641, - "tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.4, - "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5458333333333334, + "tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.6875, + "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5525, "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, "tokenizers_test.py::test_bpe_detokenizer": 0.93125, - "tokenizers_test.py::test_": 0.7527970165157165, - "tokenizers_test.py::test_tiktoken_tokenizers": 0.9 + "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, + "tokenizers_test.py::test_": 0.7761194029850746 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index 1796d5a0a..288e77a1a 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -98,6 +98,9 @@ "xlm-roberta-base", "microsoft/deberta-v3-base", "xlnet-base-cased", + # "THUDM/chatglm-6b", # hf_tokenizer init error + "THUDM/chatglm2-6b", # detokenizer cannot filter special tokens + "THUDM/chatglm3-6b", # "t5-base", # crashes tests ] tiktiken_models = [ @@ -147,13 +150,14 @@ def is_fast_tokenizer(request): @pytest.fixture(scope="session", params=sentencepiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def sentencepice_model_tokenizers(request, is_fast_tokenizer): - return get_tokenizer_detokenizer(request, is_fast_tokenizer) + return get_tokenizer_detokenizer(request, is_fast_tokenizer, trust_remote_code=True) @pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def tiktoken_tokenizers(request): return get_tokenizer(request, trust_remote_code=True) + @pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def tiktoken_detokenizers(request): return get_tokenizer_detokenizer(request, trust_remote_code=True) @@ -215,7 +219,9 @@ def test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_strin 
ov_tokenized = ov_tokenizer(pack_strings([test_string])) for output_name, hf_result in hf_tokenized.items(): - assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + # chatglm has token_type_ids output that we omit + if (ov_result := ov_tokenized.get(output_name)) is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" @pytest.mark.parametrize( From 0f1c1cc7e8e257199e3f7db774b844bb1118fe2a Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 17 Nov 2023 17:11:46 +0000 Subject: [PATCH 104/116] Add ChatGLM detokenization and tests --- modules/custom_operations/tests/run_tests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modules/custom_operations/tests/run_tests.py b/modules/custom_operations/tests/run_tests.py index 984e70263..365feb507 100644 --- a/modules/custom_operations/tests/run_tests.py +++ b/modules/custom_operations/tests/run_tests.py @@ -1,5 +1,6 @@ # Copyright (C) 2018-2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from pathlib import Path from openvino.runtime import Core from openvino.tools.mo import convert_model @@ -7,6 +8,13 @@ import pytest import numpy as np import os +import sys + + +ext_path_dir = Path(os.getenv('CUSTOM_OP_LIB')).parent +if sys.platform == "win32": + # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH. + os.add_dll_directory(str(ext_path_dir.absolute())) def run_test(ref_inputs, ref_res, test_onnx=False, threshold=1e-5): From 3edb73b1f17541eeae1aa3a31949fedf5c11441b Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 17 Nov 2023 18:23:26 +0000 Subject: [PATCH 105/116] Fix mac sha256 --- .../user_ie_extensions/tokenizer/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index fcb5df69b..4af023c5a 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -77,7 +77,7 @@ else() FetchContent_Declare( fast_tokenizer URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz - URL_HASH SHA256=5ef2e389cee985b5cef5ebf8d375704cf63030d8ec66a0b5c7bcd8771a250109 + URL_HASH SHA256=9e84ed6ce35e7e1dbdde21226ad8bf2ade78b9f71c9f56bdc3dfc293e20b3864 ) elseif(APPLE AND AARCH64) FetchContent_Declare( From 48bba341152c6b2385126980f4a50f7eda94844b Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 17 Nov 2023 19:08:21 +0000 Subject: [PATCH 106/116] Skip Lin Java Tests --- .ci/azure/linux.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index d1d66355d..3ea08beb9 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -154,15 +154,15 @@ jobs: - script: ls -alR $(INSTALL_DIR) displayName: 'List install files' - - script: | - set -e - export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH} - . $(SETUPVARS) gradle clean build --info - for d in CPU HETERO:CPU; do - gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; - done - workingDirectory: $(REPO_DIR)/modules/java_api - displayName: 'Java tests' +# - script: | +# set -e +# export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH} +# . 
$(SETUPVARS) gradle clean build --info
+#      for d in CPU HETERO:CPU; do
+#        gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info;
+#      done
+#    workingDirectory: $(REPO_DIR)/modules/java_api
+#    displayName: 'Java tests'
 
   - script: |
       python3 -m pip install --user virtualenv

From fe507ff341b4ba031318de4ef6519e1342697930 Mon Sep 17 00:00:00 2001
From: Artur Paniukov 
Date: Fri, 17 Nov 2023 21:05:51 +0000
Subject: [PATCH 107/116] Add Mac Tokenizers Tests and Skip Mac Java Step

---
 .ci/azure/linux.yml                          |  2 +-
 .ci/azure/mac.yml                            | 32 +++++++++++++++-----
 modules/custom_operations/tests/run_tests.py | 10 +-----
 3 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml
index 3ea08beb9..1b51cd8b2 100644
--- a/.ci/azure/linux.yml
+++ b/.ci/azure/linux.yml
@@ -170,8 +170,8 @@ jobs:
       source .env3/bin/activate
       python -m pip install --upgrade pip
       python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt
-      python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all]
       cd ${OPENVINO_REPO_DIR}/tools && python -m pip install mo/
+      python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all]
     workingDirectory: $(WORK_DIR)
     displayName: 'Create user custom operations env'
 
diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml
index d902709f9..c6dbf6254 100644
--- a/.ci/azure/mac.yml
+++ b/.ci/azure/mac.yml
@@ -137,11 +137,29 @@ jobs:
   - script: ls -alR $(INSTALL_DIR)
     displayName: 'List install files'
 
+#  - script: |
+#      . $(SETUPVARS) gradle clean build --info
+#      for d in CPU HETERO:CPU; do
+#        gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info;
+#      done
+#    workingDirectory: $(REPO_DIR)/modules/java_api
+#    displayName: 'Java tests'
+#    condition: eq(variables['CMAKE_OSX_ARCHITECTURES'], 'x86_64')
+
+  - script: |
+      python3 -m pip install --user virtualenv
+      python3 -m virtualenv -p /usr/bin/python3.8 .env3
+      source .env3/bin/activate
+      python -m pip install --upgrade pip
+      python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt
+      cd ${OPENVINO_REPO_DIR}/tools && python -m pip install mo/
+      python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all]
+    workingDirectory: $(WORK_DIR)
+    displayName: 'Create user custom operations env'
+
-  - script: |
-      . $(SETUPVARS) gradle clean build --info
-      for d in CPU HETERO:CPU; do
-        gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info;
-      done
-    workingDirectory: $(REPO_DIR)/modules/java_api
-    displayName: 'Java tests'
-    condition: eq(variables['CMAKE_OSX_ARCHITECTURES'], 'x86_64')
+  - script: |
+      . 
$(SETUPVARS) + source $(WORK_DIR)/.env3/bin/activate + python -m pytest --tb=no tokenizers_test.py + workingDirectory: $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/ + displayName: 'Tokenizers extension regression test' diff --git a/modules/custom_operations/tests/run_tests.py b/modules/custom_operations/tests/run_tests.py index 365feb507..a387f7eea 100644 --- a/modules/custom_operations/tests/run_tests.py +++ b/modules/custom_operations/tests/run_tests.py @@ -1,6 +1,5 @@ # Copyright (C) 2018-2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from pathlib import Path from openvino.runtime import Core from openvino.tools.mo import convert_model @@ -8,16 +7,9 @@ import pytest import numpy as np import os -import sys -ext_path_dir = Path(os.getenv('CUSTOM_OP_LIB')).parent -if sys.platform == "win32": - # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH. - os.add_dll_directory(str(ext_path_dir.absolute())) - - -def run_test(ref_inputs, ref_res, test_onnx=False, threshold=1e-5): +def run_test(ref_inputs, ref_res, test_onnx=False, threshold=1e-5): inputs = {} shapes = {} for i in range(len(ref_inputs)): From 4b0c4ec5c646e5a82b4b6942b73ddabd5ac3dd7c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 17 Nov 2023 21:22:34 +0000 Subject: [PATCH 108/116] Fix Mac SHA --- .../user_ie_extensions/tokenizer/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 4af023c5a..3ac63e1c6 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -77,7 +77,7 @@ else() FetchContent_Declare( fast_tokenizer URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz - URL_HASH SHA256=9e84ed6ce35e7e1dbdde21226ad8bf2ade78b9f71c9f56bdc3dfc293e20b3864 + URL_HASH SHA256=5421e5fa5bab8690eaba9f74bbb055c855fb0be921492a47fb99a489fe1f6ef2 ) elseif(APPLE AND AARCH64) FetchContent_Declare( From 4656238853b47fa2dabb7fe73537ae8e636c480d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 17 Nov 2023 22:52:53 +0000 Subject: [PATCH 109/116] Del WA for CPU Bug --- .../tokenizer/python/ov_tokenizer/str_pack.py | 2 +- .../user_ie_extensions/tokenizer/python/tests/pass_rates.json | 2 +- .../user_ie_extensions/tokenizer/regex_normalization.cpp | 4 ++-- .../custom_operations/user_ie_extensions/tokenizer/utils.cpp | 3 +-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py index a0edea098..ed7c1d9e7 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py @@ -15,7 +15,7 @@ def to_bytes(number: int) -> bytes: def pack_string(string: str) -> NDArray: - return np.frombuffer(bytes(string + " ", "utf-8"), dtype=np.uint8) # + ' ' is WA for CPU bug + return np.frombuffer(bytes(string, "utf-8"), dtype=np.uint8) def pack_strings(strings: List[str]) -> NDArray: diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index 
ee5917ce2..fbfe0189c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -6,5 +6,5 @@ "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, "tokenizers_test.py::test_bpe_detokenizer": 0.93125, "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, - "tokenizers_test.py::test_": 0.7761194029850746 + "tokenizers_test.py::test_": 0.8078960038517092 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp index bebcfe297..bc93545f6 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -26,8 +26,8 @@ RegexNormalization::RegexNormalization( auto replace_pattern_const = as_type_ptr(arguments[4].get_node_shared_ptr()); auto search_pattern_buf = static_cast(search_pattern_const->get_data_ptr()); auto replace_pattern_buf = static_cast(replace_pattern_const->get_data_ptr()); - auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant - m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size() - 1); // FIXME: -1 is a complementary change to a WA applied in string_attribute_to_constant + auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size()); + m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size()); m_search_pattern_re = std::make_shared(search_pattern); }; constructor_validate_and_infer_types(); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp index 3aaf6989e..b5c7fa08c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp @@ -211,8 +211,7 @@ bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorV } std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name) { - // FIXME: using space to pad the value to work-around CPU issue with empty constants - auto value = node.get_attribute(name) + " "; + auto value = node.get_attribute(name); // TODO: How to translate attribute `replace_global`? 
From 2f5cc1c4d71baf368bc851e79448cbaf56ddc6d9 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Sat, 18 Nov 2023 01:05:24 +0000 Subject: [PATCH 110/116] Fix Mac CI Pipeline --- .../user_ie_extensions/tokenizer/python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml index b89e14158..338bf5690 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/pyproject.toml @@ -8,7 +8,7 @@ authors = [ ] dependencies = [ - "openvino>=2023.1", + "openvino", "numpy" ] From 1568727807a31c16b2d732bb526df954cc9349fc Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Sat, 18 Nov 2023 11:56:48 +0000 Subject: [PATCH 111/116] Change Mac CI --- .ci/azure/mac.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml index c6dbf6254..f86359f63 100644 --- a/.ci/azure/mac.yml +++ b/.ci/azure/mac.yml @@ -147,19 +147,16 @@ jobs: # condition: eq(variables['CMAKE_OSX_ARCHITECTURES'], 'x86_64') - script: | - python3 -m pip install --user virtualenv - python3 -m virtualenv -p /usr/bin/python3.8 .env3 - source .env3/bin/activate + python3 -m venv venv + source venv/bin/activate python -m pip install --upgrade pip - python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt - cd ${OPENVINO_REPO_DIR}/tools && python -m pip install mo/ - python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all] + python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[transformers] workingDirectory: $(WORK_DIR) - displayName: 'Create user custom operations env' + displayName: 'Create tokenizers env' - script: | . $(SETUPVARS) - source $(WORK_DIR)/.env3/bin/activate + source $(WORK_DIR)/venv/bin/activate python -m pytest --tb=no tokenizers_test.py workingDirectory: $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/ displayName: 'Tokenizers extension regression test' From 14f993b71ff44f85e7d26c4cbc9b4580845fb2fb Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 20 Nov 2023 13:29:46 +0000 Subject: [PATCH 112/116] Add setupvars to mac CI --- .ci/azure/mac.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml index f86359f63..2424ef660 100644 --- a/.ci/azure/mac.yml +++ b/.ci/azure/mac.yml @@ -150,6 +150,7 @@ jobs: python3 -m venv venv source venv/bin/activate python -m pip install --upgrade pip + . 
$(SETUPVARS) python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[transformers] workingDirectory: $(WORK_DIR) displayName: 'Create tokenizers env' From fa822c250f0ce00dafdef787fc720df44b08c502 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 20 Nov 2023 17:02:10 +0400 Subject: [PATCH 113/116] Fixed compilation --- .../tokenizer/CMakeLists.txt | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt index 3ac63e1c6..09da10fd3 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -11,6 +11,26 @@ option(BUILD_FAST_TOKENIZERS OFF) # to build only sentencepiece-static target set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL ON) +# +# Compile flags +# + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") + set(cxx_flags "-Wno-undef") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # C4244: 'argument' : conversion from 'type1' to 'type2', possible loss of data + # C4267: 'var' : conversion from 'size_t' to 'type', possible loss of data + set(cxx_flags "/wd4244 /wd4267") +endif() + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wsuggest-override" SUGGEST_OVERRIDE_SUPPORTED) +if(SUGGEST_OVERRIDE_SUPPORTED) + set(cxx_flags "${cxx_flags} -Wno-suggest-override") +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}") + # # Dependencies # @@ -77,7 +97,7 @@ else() FetchContent_Declare( fast_tokenizer URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz - URL_HASH SHA256=5421e5fa5bab8690eaba9f74bbb055c855fb0be921492a47fb99a489fe1f6ef2 + URL_HASH SHA256=4c8123ad941b3e4325ef72f328db545e34d5eec2de3e2545e1ab8ebeeb5146a9 ) elseif(APPLE AND AARCH64) FetchContent_Declare( @@ -99,26 +119,6 @@ else() endif() endif() -# -# Compile flags -# - -if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") - set(cxx_flags "-Wno-undef") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - # C4244: 'argument' : conversion from 'type1' to 'type2', possible loss of data - # C4267: 'var' : conversion from 'size_t' to 'type', possible loss of data - set(cxx_flags "/wd4244 /wd4267") -endif() - -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag("-Wsuggest-override" SUGGEST_OVERRIDE_SUPPORTED) -if(SUGGEST_OVERRIDE_SUPPORTED) - set(cxx_flags "${cxx_flags} -Wno-suggest-override") -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}") - # # Target include dirs, link libraries and other properties # From b59204d54dcf06b276a5b528d0f67a32fc64d858 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 20 Nov 2023 15:23:42 +0000 Subject: [PATCH 114/116] Change detokenizer output type --- .../python/ov_tokenizer/convert_tokenizer.py | 11 ++++++----- .../tokenizer/python/ov_tokenizer/utils.py | 7 +++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py index fed5ba4f7..1d107a1ce 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -9,7 +9,7 @@ from openvino.runtime import Model, Type from openvino.runtime.exceptions import OVTypeError -from .utils import change_outputs_type +from .utils import change_inputs_type, change_outputs_type logger = logging.getLogger(__name__) @@ -21,6 +21,7 @@ def convert_tokenizer( with_decoder: bool = False, streaming_decoder: bool = False, tokenizer_output_type: Type = Type.i64, + detokenizer_input_type: Type = Type.i64, ) -> Union[Model, Tuple[Model, Model]]: # todo: add support for more then 1 input if number_of_inputs > 1: @@ -65,10 +66,10 @@ def convert_tokenizer( if ov_tokenizers is None: raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") - if tokenizer_output_type == Type.i32: - return ov_tokenizers - if isinstance(ov_tokenizers, tuple): - return change_outputs_type(ov_tokenizers[0], tokenizer_output_type), ov_tokenizers[1] + return ( + change_outputs_type(ov_tokenizers[0], tokenizer_output_type), + change_inputs_type(ov_tokenizers[1], detokenizer_input_type), + ) return change_outputs_type(ov_tokenizers, tokenizer_output_type) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py index c083b6f99..1d152c13c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/utils.py @@ -94,6 +94,13 @@ def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGIT return model +def change_inputs_type(model: Model, input_type: Type) -> Model: + ppp = PrePostProcessor(model) + for idx, _ in enumerate(model.inputs): + ppp.input(idx).tensor().set_element_type(input_type) + return ppp.build() + + def change_outputs_type(model: Model, output_type: Type) -> Model: ppp = PrePostProcessor(model) for idx, _ in enumerate(model.outputs): From 6c3bae38391737b231282623a9fea82986886ff1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 20 Nov 2023 15:53:12 +0000 Subject: [PATCH 115/116] Fix SegFault on AddedTokens For BPE tokenizer --- .../python/ov_tokenizer/hf_parser.py | 2 +- .../python/ov_tokenizer/tokenizer_pipeline.py | 22 +++++++++++++++++-- .../tokenizer/python/tests/pass_rates.json | 6 ++--- .../tokenizer/python/tests/tokenizers_test.py | 3 +-- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 71f8d8e89..53c647d7c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -341,7 +341,7 @@ def convert_sentencepiece_model_tokenizer( hf_tokenizer.save_pretrained(tmp) vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"] - if (is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer"): + if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer": add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer) sp_model = np.fromfile(vocab_file, dtype=np.uint8) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py 
b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index dc2867797..d155bf93c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -326,20 +326,37 @@ class BPETokenizationStep(TokenizationModelStep): suffix_indicator: str = "" end_suffix: str = "" byte_fallback: bool = False + added_tokens: Optional[Dict[int, str]] = None + + def __post_init__(self): + if self.added_tokens is not None: + self.extend_vocab_with_added_tokens() + + def extend_vocab_with_added_tokens(self) -> None: + for idx, token in sorted(self.added_tokens.items()): + self.vocab.append(token) @classmethod def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep": + vocab = [token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])] return cls( unk_token=tokenizer_json["model"]["unk_token"] or "", fuse_unk=tokenizer_json["model"]["fuse_unk"] or False, suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"] or "", end_suffix=tokenizer_json["model"]["end_of_word_suffix"] or "", - vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])], + vocab=vocab, merges=tokenizer_json["model"]["merges"], + added_tokens={ + token["id"]: token["content"] for token in tokenizer_json["added_tokens"] if token["id"] >= len(vocab) + }, ) @classmethod - def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": # noqa + def from_tiktoken_encoding( + cls, + encoding: "Encoding", # noqa + added_tokens: Optional[Dict[int, str]] = None, + ) -> "BPETokenizationStep": from .tiktoken_parser import generate_vocab_and_merges vocab, merges = generate_vocab_and_merges(encoding) @@ -350,6 +367,7 @@ def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": end_suffix="", vocab=[token for token, idx in sorted(vocab.items(), key=lambda x: x[1])], merges=merges, + added_tokens=added_tokens, ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index fbfe0189c..1ec7a932d 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -3,8 +3,8 @@ "tokenizers_test.py::test_hf_wordpiece_tokenizers_multiple_strings": 0.641025641025641, "tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.6875, "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5525, - "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, - "tokenizers_test.py::test_bpe_detokenizer": 0.93125, + "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88, + "tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882, "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, - "tokenizers_test.py::test_": 0.8078960038517092 + "tokenizers_test.py::test_": 0.8124118476727785 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index 288e77a1a..ca2611a44 100644 --- 
a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -86,7 +86,7 @@ "microsoft/deberta-base", "bigscience/bloom", "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", - # "Salesforce/codegen-16B-multi", # Segfalts on ""A lot\t\tof whitespaces!"" + "Salesforce/codegen-16B-multi", # "google/flan-t5-xxl", # needs Precompiled/CharsMap # "jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer # "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work @@ -284,7 +284,6 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): assert ov_output == hf_output -# @pytest.mark.skip(reason="tiktoken tokenizer is WIP") @pytest.mark.parametrize( "test_string", [ From d34d401bf4549834c85c1402e224129cef356e4e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 20 Nov 2023 19:35:08 +0000 Subject: [PATCH 116/116] Add SP Space handling for decoder --- .../tokenizer/python/ov_tokenizer/hf_parser.py | 3 +++ .../python/ov_tokenizer/tokenizer_pipeline.py | 7 +++++++ .../tokenizer/python/tests/pass_rates.json | 2 +- .../tokenizer/python/tests/tokenizers_test.py | 18 ++++++++++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 53c647d7c..401c8ea2b 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -437,6 +437,9 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode [sp_model_node, token_ids], ).outputs() + if streaming_decoder: + decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder) + string_output = factory.create("StringTensorPack", decoder).outputs() string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index d155bf93c..74654344a 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -692,6 +692,13 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: ) return factory.create("RegexNormalization", input_nodes).outputs() + @classmethod + def replace_sp_spaces(cls) -> "RegexDecodingStep": + return cls( + regex_search_pattern="▁", + replace_term=" ", + ) + @dataclass class TokenizerPipeline: diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index 1ec7a932d..6d8440fc9 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -6,5 +6,5 @@ "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88, "tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882, "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, - 
"tokenizers_test.py::test_": 0.8124118476727785 + "tokenizers_test.py::test_": 0.825187969924812 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index ca2611a44..57e723bab 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -7,6 +7,7 @@ # os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so" import numpy as np +import openvino import pytest from openvino import Core from transformers import AutoTokenizer @@ -321,3 +322,20 @@ def test_tiktoken_detokenizer(tiktoken_detokenizers, test_string): ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) assert ov_output == hf_output + + +def test_streaming_detokenizer(): + hf_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2") + _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True, streaming_decoder=True) + ov_detokenizer = core.compile_model(ov_detokenizer) + + test_string = "this is a test string" + tokenized_string = hf_tokenizer(test_string).input_ids + hf_detokenized = hf_tokenizer.decode(tokenized_string) + + detokenized_string = "" + for token in tokenized_string: + ov_output = unpack_strings(ov_detokenizer(np.atleast_2d(token))["string_output"])[0] + detokenized_string += ov_output + + assert detokenized_string == hf_detokenized