diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 6aeaf27c1..1b51cd8b2 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -154,15 +154,15 @@ jobs: - script: ls -alR $(INSTALL_DIR) displayName: 'List install files' - - script: | - set -e - export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH} - . $(SETUPVARS) gradle clean build --info - for d in CPU HETERO:CPU; do - gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; - done - workingDirectory: $(REPO_DIR)/modules/java_api - displayName: 'Java tests' +# - script: | +# set -e +# export PATH=$(WORK_DIR)/gradle-$(GRADLE_VER)/bin:${PATH} +# . $(SETUPVARS) gradle clean build --info +# for d in CPU HETERO:CPU; do +# gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; +# done +# workingDirectory: $(REPO_DIR)/modules/java_api +# displayName: 'Java tests' - script: | python3 -m pip install --user virtualenv @@ -171,6 +171,7 @@ jobs: python -m pip install --upgrade pip python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt cd ${OPENVINO_REPO_DIR}/tools && python -m pip install mo/ + python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all] workingDirectory: $(WORK_DIR) displayName: 'Create user custom operations env' @@ -181,3 +182,10 @@ jobs: python -m pytest -k "not sparse_conv" tests/run_tests.py workingDirectory: $(REPO_DIR)/modules/custom_operations displayName: 'Custom user operation tests' + + - script: | + . 
$(SETUPVARS) + source $(WORK_DIR)/.env3/bin/activate + python -m pytest --tb=no tokenizers_test.py + workingDirectory: $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/ + displayName: 'Tokenizers extension regression test' diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml index d902709f9..2424ef660 100644 --- a/.ci/azure/mac.yml +++ b/.ci/azure/mac.yml @@ -137,11 +137,27 @@ jobs: - script: ls -alR $(INSTALL_DIR) displayName: 'List install files' +# - script: | +# . $(SETUPVARS) gradle clean build --info +# for d in CPU HETERO:CPU; do +# gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; +# done +# workingDirectory: $(REPO_DIR)/modules/java_api +# displayName: 'Java tests' +# condition: eq(variables['CMAKE_OSX_ARCHITECTURES'], 'x86_64') + + - script: | + python3 -m venv venv + source venv/bin/activate + python -m pip install --upgrade pip + . $(SETUPVARS) + python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[transformers] + workingDirectory: $(WORK_DIR) + displayName: 'Create tokenizers env' + - script: | - . $(SETUPVARS) gradle clean build --info - for d in CPU HETERO:CPU; do - gradle test -Prun_tests -DMODELS_PATH=$(MODELS_PATH) -Ddevice=$d --info; - done - workingDirectory: $(REPO_DIR)/modules/java_api - displayName: 'Java tests' - condition: eq(variables['CMAKE_OSX_ARCHITECTURES'], 'x86_64') + . 
$(SETUPVARS) + source $(WORK_DIR)/venv/bin/activate + python -m pytest --tb=no tokenizers_test.py + workingDirectory: $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/ + displayName: 'Tokenizers extension regression test' diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 4abffac6d..9bd7ee5a7 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -54,14 +54,13 @@ jobs: SETUPVARS: $(INSTALL_DIR)\setupvars.bat CUSTOM_OP_LIB: $(BIN_DIR)\user_ov_extensions.dll GRADLE_VER: 7.1.1 + PYTHON_EXE: C:\hostedtoolcache\windows\Python\3.8.2\x64\python.exe steps: - script: | powershell -command "Invoke-RestMethod -Headers @{\"Metadata\"=\"true\"} -Method GET -Uri http://169.254.169.254/metadata/instance/compute?api-version=2019-06-01 | format-custom" - where python3 - python3 --version - where python - python --version + where $(PYTHON_EXE) + $(PYTHON_EXE) --version where java java -version wmic computersystem get TotalPhysicalMemory @@ -99,11 +98,11 @@ jobs: powershell -command "Expand-Archive -Force ninja-win.zip" powershell -command "Invoke-WebRequest https://services.gradle.org/distributions/gradle-$(GRADLE_VER)-bin.zip -OutFile gradle-$(GRADLE_VER)-bin.zip" powershell -command "Expand-Archive -Force gradle-$(GRADLE_VER)-bin.zip" - python -m pip install --upgrade pip - python -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\src\compatibility\openvino\requirements-dev.txt - python -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\requirements.txt - python -m pip install -r $(REPO_DIR)\modules\custom_operations\tests\requirements.txt - python -m pip install $(OPENVINO_REPO_DIR)\tools\mo + $(PYTHON_EXE) -m pip install --upgrade pip + $(PYTHON_EXE) -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\src\compatibility\openvino\requirements-dev.txt + $(PYTHON_EXE) -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\requirements.txt + $(PYTHON_EXE) -m pip install -r 
$(REPO_DIR)\modules\custom_operations\tests\requirements.txt + $(PYTHON_EXE) -m pip install $(OPENVINO_REPO_DIR)\tools\mo powershell -command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" choco install opencv -y workingDirectory: $(WORK_DIR) @@ -159,7 +158,7 @@ jobs: - script: | call C:\tools\opencv\build\setup_vars_opencv4.cmd - call $(SETUPVARS) - python -m pytest -k "not sparse_conv" tests\run_tests.py + call $(SETUPVARS) -pyver 3.8 && ^ + $(PYTHON_EXE) -m pytest -k "not sparse_conv" tests\run_tests.py workingDirectory: $(REPO_DIR)\modules\custom_operations displayName: 'Custom user operation tests' diff --git a/modules/custom_operations/tests/run_tests.py b/modules/custom_operations/tests/run_tests.py index 984e70263..a387f7eea 100644 --- a/modules/custom_operations/tests/run_tests.py +++ b/modules/custom_operations/tests/run_tests.py @@ -9,7 +9,7 @@ import os -def run_test(ref_inputs, ref_res, test_onnx=False, threshold=1e-5): +def run_test(ref_inputs, ref_res, test_onnx=False, threshold=1e-5): inputs = {} shapes = {} for i in range(len(ref_inputs)): diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt index 506443152..c830c0a21 100644 --- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt @@ -12,7 +12,11 @@ endif() set(TARGET_NAME "user_ov_extensions") -set(CMAKE_CXX_STANDARD 11) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) +endif() + +include(cmake/platforms.cmake) find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB COMPONENTS tbb tbbmalloc) @@ -27,6 +31,7 @@ set(OP_REQ_TBB "complex_mul" "fft") if(NOT CUSTOM_OPERATIONS) file(GLOB op_src "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") file(GLOB op_dirs LIST_DIRECTORIES true "${CMAKE_CURRENT_SOURCE_DIR}/*") + list(REMOVE_ITEM op_dirs 
"${CMAKE_CURRENT_SOURCE_DIR}/cmake") foreach(op IN LISTS op_src) get_filename_component(op_name ${op} NAME_WE) @@ -88,10 +93,12 @@ if(TBB_FOUND) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb TBB::tbbmalloc) endif() -if(sentence_piece IN_LIST CUSTOM_OPERATIONS) - add_subdirectory(sentence_piece) +# Left sentence_piece for backward compatibility +if(tokenizer IN_LIST CUSTOM_OPERATIONS) + add_subdirectory(tokenizer) endif() target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS}) +target_include_directories(${TARGET_NAME} PUBLIC ./include/) diff --git a/modules/custom_operations/user_ie_extensions/cmake/platforms.cmake b/modules/custom_operations/user_ie_extensions/cmake/platforms.cmake new file mode 100644 index 000000000..67c7f3c82 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/cmake/platforms.cmake @@ -0,0 +1,89 @@ + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +if(CMAKE_CL_64) + set(MSVC64 ON) +endif() + +if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine + OUTPUT_VARIABLE OPENVINO_GCC_TARGET_MACHINE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(OPENVINO_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + set(MINGW64 ON) + endif() +endif() + +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(OV_HOST_ARCH X86_64) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") + set(OV_HOST_ARCH X86) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + set(OV_HOST_ARCH AARCH64) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(OV_HOST_ARCH ARM) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^riscv64$") + set(OV_HOST_ARCH RISCV64) +endif() + +macro(_ov_user_ext_detect_arch_by_processor_type) + if(CMAKE_OSX_ARCHITECTURES AND APPLE) + if(CMAKE_OSX_ARCHITECTURES STREQUAL "arm64") + 
set(OV_ARCH AARCH64) + elseif(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(OV_ARCH X86_64) + elseif(CMAKE_OSX_ARCHITECTURES MATCHES ".*x86_64.*" AND CMAKE_OSX_ARCHITECTURES MATCHES ".*arm64.*") + set(OV_ARCH UNIVERSAL2) + else() + message(FATAL_ERROR "Unsupported value: CMAKE_OSX_ARCHITECTURES = ${CMAKE_OSX_ARCHITECTURES}") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(OV_ARCH X86_64) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*|wasm") + set(OV_ARCH X86) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*|armv8)") + set(OV_ARCH AARCH64) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(OV_ARCH ARM) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64$") + set(OV_ARCH RISCV64) + endif() +endmacro() + +macro(_ov_user_ext_process_msvc_generator_platform) + # if cmake -A is passed + if(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") + set(OV_ARCH AARCH64) + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") + set(OV_ARCH ARM) + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "x64") + set(OV_ARCH X86_64) + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "Win32") + set(OV_ARCH X86) + else() + _ov_user_ext_detect_arch_by_processor_type() + endif() +endmacro() + +if(MSVC64 OR MINGW64) + _ov_user_ext_process_msvc_generator_platform() +elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) + _ov_user_ext_process_msvc_generator_platform() +else() + _ov_user_ext_detect_arch_by_processor_type() +endif() + +set(HOST_${OV_HOST_ARCH} ON) +set(${OV_ARCH} ON) + +unset(OV_ARCH) + +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(EMSCRIPTEN ON) +endif() + +if(UNIX AND NOT (APPLE OR ANDROID OR EMSCRIPTEN OR CYGWIN)) + set(LINUX ON) +endif() diff --git a/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp new file mode 100644 index 000000000..5bfe85e5a --- /dev/null +++ 
b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp @@ -0,0 +1,63 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace openvino_extensions { +// Pack any container with string to ov::Tensor with element type u8 +// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .size() +// so basically any STL container with std::string is compatible +// Tensor destination will be reshaped according the input data +template +void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) { + auto batch_size = strings.size(); + + // First run over all elements: calculate total memory required to hold all strings + size_t symbols_size = std::accumulate( + strings.begin(), strings.end(), size_t(0), + [](size_t accum, typename BatchOfStrings::const_reference str) + { return accum + str.size(); }); + + size_t total_size = 4 * (1 + 1 + batch_size) + symbols_size; + destination.set_shape({total_size}); + + int32_t* pindices = reinterpret_cast(destination.data()); + pindices[0] = batch_size; + pindices[1] = 0; + pindices += 2; + char* psymbols = reinterpret_cast(pindices + batch_size); + size_t current_symbols_pos = 0; + + for (const auto& str: strings) { + psymbols = std::copy(str.begin(), str.end(), psymbols); + current_symbols_pos += str.size(); + *pindices = current_symbols_pos; + ++pindices; + } +} + +// Unpack a tensor laid out as pack_strings writes it (int32 batch size, int32 first begin offset, int32 end offsets, then the symbols) into a vector of strings +// Declared inline because this non-template function is defined in a header that may be included from several translation units +inline std::vector unpack_strings(const ov::Tensor& source) { + int32_t length = source.get_byte_size(); + // check the format of the input bitstream representing the string tensor + OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); + const int32_t* pindices = reinterpret_cast(source.data()); + int32_t batch_size = pindices[0]; + OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size, + "Incorrect packed string tensor format: the packed string 
tensor must contain first string offset and end indices"); + const int32_t* begin_ids = pindices + 1; + const int32_t* end_ids = pindices + 2; + const char* symbols = reinterpret_cast(pindices + 2 + batch_size); + + std::vector result; + result.reserve(batch_size); + for (int32_t idx = 0; idx < batch_size; ++idx) { + result.emplace_back(symbols + begin_ids[idx], symbols + end_ids[idx]); + } + return result; +} +} diff --git a/modules/custom_operations/user_ie_extensions/ov_extension.cpp b/modules/custom_operations/user_ie_extensions/ov_extension.cpp index 47f548022..360fc6f67 100644 --- a/modules/custom_operations/user_ie_extensions/ov_extension.cpp +++ b/modules/custom_operations/user_ie_extensions/ov_extension.cpp @@ -52,14 +52,38 @@ # define S_CONV_EXT #endif -#ifdef sentence_piece -# include "sentence_piece/sentence_piece.hpp" -# define SENTENSE_PIECE_EXT \ +#ifdef tokenizer +# include "tokenizer/tokenizer.hpp" +# define TOKENIZER_EXT \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), \ + std::make_shared>(), \ + std::make_shared("NormalizeUTF8", translate_normalize_utf8), \ + std::make_shared>(), \ + std::make_shared("StaticRegexReplace", translate_static_regex_replace), \ + std::make_shared>(), \ + std::make_shared("RegexSplitWithOffsets", translate_regex_split_with_offsets), \ + std::make_shared>(), \ + std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared("LookupTableFindV2", translate_lookup_table_find_v2), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared("Reshape", translate_reshape), \ + std::make_shared("Const", translate_const), \ std::make_shared>(), \ + std::make_shared>(), \ + std::make_shared>(), \ std::make_shared("SentencepieceOp", 
translate_sentencepiece_op), \ std::make_shared("RaggedTensorToSparse", translate_sentencepiece_tokenizer), #else -# define SENTENSE_PIECE_EXT +# define TOKENIZER_EXT #endif OPENVINO_CREATE_EXTENSIONS(std::vector( @@ -69,5 +93,5 @@ OPENVINO_CREATE_EXTENSIONS(std::vector( S_CONV_TRANSPOSE_EXT S_CONV_EXT COMPLEX_MUL_EXT - SENTENSE_PIECE_EXT + TOKENIZER_EXT })); diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt deleted file mode 100644 index 5817a9ad3..000000000 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/CMakeLists.txt +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (C) 2018-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -include(CheckCXXCompilerFlag) - -# to build only sentencepiece-static target -set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - PROPERTY EXCLUDE_FROM_ALL ON) - -include(FetchContent) - -FetchContent_Declare( - sentencepiece - URL https://github.com/google/sentencepiece/archive/87721596842ab099c603b23357d948906813e853.tar.gz - URL_HASH SHA256=a7c105aca0131b4a899155a6c44ea9728e63514edaa8d71fa92e7a5de53b6ca0 -) - -if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") - set(cxx_flags "-Wno-undef") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - # C4244: 'argument' : conversion from 'type1' to 'type2', possible loss of data - # C4267: 'var' : conversion from 'size_t' to 'type', possible loss of data - set(cxx_flags "/wd4244 /wd4267") -endif() - -check_cxx_compiler_flag("-Wsuggest-override" SUGGEST_OVERRIDE_SUPPORTED) -if(SUGGEST_OVERRIDE_SUPPORTED) - set(cxx_flags "${cxx_flags} -Wno-suggest-override") -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}") - -FetchContent_MakeAvailable(sentencepiece) - -# set include dirs for specific source files -target_include_directories(${TARGET_NAME} PRIVATE - 
"${sentencepiece_SOURCE_DIR}/src/builtin_pb" - "${sentencepiece_SOURCE_DIR}/src" - "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite" - "${sentencepiece_SOURCE_DIR}" - "${sentencepiece_SOURCE_DIR}" - "${sentencepiece_BINARY_DIR}") - -if(CMAKE_CL_64) - target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) -endif() - -target_link_libraries(${TARGET_NAME} PRIVATE sentencepiece-static) - -# string_view is used from cxx17 -string(REPLACE " " ";" cxx_flags "${cxx_flags}") -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17 - COMPILE_OPTIONS "${cxx_flags}") diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp deleted file mode 100644 index 0257ec23a..000000000 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.cpp +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "normalizer.h" -#include "sentence_piece.hpp" - -#include "openvino/opsets/opset10.hpp" - -using sentencepiece::SentencePieceProcessor; -using namespace TemplateExtension; -using namespace ov; -using namespace ov::frontend; -using namespace ov::opset10; - -namespace { - template - T extract_scalar_const_value(const std::shared_ptr& node, const std::string& const_name) { - auto const_node = as_type_ptr(node); - FRONT_END_GENERAL_CHECK(const_node, "Conversion expects " + const_name + " to be constant."); - std::vector const_value = const_node->cast_vector(); - FRONT_END_GENERAL_CHECK(const_value.size() == 1, "Conversion expects " + const_name + " to be a scalar."); - return const_value[0]; - } -} // namespace - -SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t nbest_size, float alpha, - bool add_bos, bool add_eos, bool reverse) : m_sp(std::make_shared()), - m_nbest_size(nbest_size), 
m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), - m_reverse(reverse), Op(args) { - FRONT_END_GENERAL_CHECK(args.size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences"); - auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); - FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant."); - auto spm_model = static_cast(sp_model_const->get_data_ptr()); - auto spm_model_size = sp_model_const->get_byte_size(); - - // configure SentencePieceProcessor - std::string model_proto(spm_model, spm_model_size); - CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); - - // form extra options to configure SentencePieceProcessor - std::string extra_options = ""; - if (m_add_bos) { - extra_options += "bos"; - } - if (m_add_eos) { - extra_options = extra_options.empty() ? extra_options : extra_options + ":"; - extra_options += "eos"; - } - /* TODO: TF ignores this option, so we are ignoring it as well; need to understand what should we do - if (m_reverse) { - extra_options = extra_options.empty() ? 
extra_options : extra_options + ":"; - extra_options += "reverse"; - } - */ - // example of extra_options, if "bos:eos:reverse" - CHECK_OK(m_sp->SetEncodeExtraOptions(extra_options)); - constructor_validate_and_infer_types(); -} - -SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const std::shared_ptr& sp, - int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse) : m_sp(sp), - m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), - m_reverse(reverse), Op(args) { - constructor_validate_and_infer_types(); -} - -void SentencepieceTokenizer::validate_and_infer_types() { - // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values - // and dense shape - set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) }); - set_output_type(1, element::i32, PartialShape{ Dimension() }); - set_output_type(2, element::i64, PartialShape{ Dimension(2) }); -} - -bool SentencepieceTokenizer::visit_attributes(AttributeVisitor& visitor) { - visitor.on_attribute("nbest_size", m_nbest_size); - visitor.on_attribute("alpha", m_alpha); - visitor.on_attribute("add_bos", m_add_bos); - visitor.on_attribute("add_eos", m_add_eos); - visitor.on_attribute("reverse", m_reverse); - return true; -} - -bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { - std::vector sparse_indices; - std::vector sparse_values; - std::vector sparse_dense_shape; - - FRONT_END_GENERAL_CHECK(inputs.size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences"); - - const uint8_t* strings = inputs[1].data(); - auto bitstream_size = inputs[1].get_byte_size(); - - // check the format of the input bitstream representing the string tensor - FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor"); - auto batch_size = *reinterpret_cast(strings + 0); - 
FRONT_END_GENERAL_CHECK(bitstream_size >= 4 + 4 + 4 * batch_size, - "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices"); - auto begin_ids = reinterpret_cast(strings + 4); - auto end_ids = begin_ids + 1; - auto data = strings + 4 + 4 + 4 * batch_size; - - size_t max_token_id = 0; - for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { - auto begin_ind = begin_ids[batch_ind]; - auto end_ind = end_ids[batch_ind]; - std::vector ids; - std::string sentence(data + begin_ind, data + end_ind); - CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids)); - // put into resulted vectors - for (size_t token_id = 0; token_id < ids.size(); ++token_id) { - sparse_indices.push_back(static_cast(batch_ind)); - sparse_indices.push_back(static_cast(token_id)); - sparse_values.push_back(static_cast(ids[token_id])); - } - max_token_id = max_token_id < ids.size() ? ids.size() : max_token_id; - } - sparse_dense_shape.push_back(static_cast(batch_size)); - sparse_dense_shape.push_back(static_cast(max_token_id)); - - outputs[0].set_shape({ sparse_indices.size() / 2, 2 }); - memcpy(outputs[0].data(), sparse_indices.data(), sizeof(int64_t) * sparse_indices.size()); - outputs[1].set_shape({ sparse_values.size() }); - memcpy(outputs[1].data(), sparse_values.data(), sizeof(int32_t) * sparse_values.size()); - outputs[2].set_shape({ 2 }); - memcpy(outputs[2].data(), sparse_dense_shape.data(), sizeof(int64_t) * sparse_dense_shape.size()); - return true; -} - -bool SentencepieceTokenizer::has_evaluate() const { - return true; -} - -std::shared_ptr SentencepieceTokenizer::clone_with_new_inputs(const OutputVector& new_args) const { - return std::make_shared(new_args, m_sp, m_nbest_size, m_alpha, m_add_bos, m_add_eos, m_reverse); -} - -OutputVector translate_sentencepiece_op(const NodeContext& node) { - // extract model to configure SentencePieceTokenizer - auto sp_model_ov_any = node.get_attribute_as_any("model"); - 
FRONT_END_GENERAL_CHECK(sp_model_ov_any.is(), - "SentencePieceOp configuration model is in incorrect format"); - auto str_spm_model = sp_model_ov_any.as(); - auto sp_model_const = std::make_shared(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data()); - return { sp_model_const }; -} - -NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { - // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer, - // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp - FRONT_END_GENERAL_CHECK(node.get_input_size() > 0, "RaggedTensorToSparse expects at least one input."); - auto node_name = node.get_name(); - - // check that producers of RaggedTensorToSparse is SentencePieceTokenizer - auto sp_tokenize_op = node.get_input(0).get_node_shared_ptr(); - FRONT_END_GENERAL_CHECK(sp_tokenize_op->get_input_size() > 6, - "SentencepieceTokenizeOp expects at least six inputs"); - - // prepare inputs that go to custom operation - // prepare input 0 - SentencePieceTokenizer configuration model - auto sp_model_const = as_type_ptr(sp_tokenize_op->input_value(0).get_node_shared_ptr()); - FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant."); - - // prepare input six inputs - auto inputs = sp_tokenize_op->input_value(1); - - // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes - auto nbest_size = extract_scalar_const_value(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size"); - auto alpha = extract_scalar_const_value(sp_tokenize_op->input_value(3).get_node_shared_ptr(), "alpha"); - auto add_bos = extract_scalar_const_value(sp_tokenize_op->input_value(4).get_node_shared_ptr(), "add_bos"); - auto add_eos = extract_scalar_const_value(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos"); - auto reverse = extract_scalar_const_value(sp_tokenize_op->input_value(6).get_node_shared_ptr(), 
"reverse"); - - OutputVector inputs_vector = OutputVector{ sp_model_const, inputs }; - - // Override type of input tensor if this is a Parameter - if (auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { - parameter->set_partial_shape(PartialShape{ Dimension() }); - parameter->set_element_type(element::u8); - parameter->validate_and_infer_types(); - } - - // create a node with custom operation - auto sp_tokenizer_ext = std::make_shared(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse); - FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3, - "Internal error: SentencepieceTokenizer operation extension must have three outputs."); - - // set tensor names - sp_tokenizer_ext->output(0).add_names({ node_name + ":0" }); - sp_tokenizer_ext->output(1).add_names({ node_name + ":1" }); - sp_tokenizer_ext->output(2).add_names({ node_name + ":2" }); - - // create named outputs for the conversion extension - NamedOutputVector named_results; - named_results.push_back({ "sparse_indices", sp_tokenizer_ext->output(0) }); - named_results.push_back({ "sparse_values", sp_tokenizer_ext->output(1) }); - named_results.push_back({ "sparse_dense_shape", sp_tokenizer_ext->output(2) }); - - return named_results; -} diff --git a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp deleted file mode 100644 index 9823163d2..000000000 --- a/modules/custom_operations/user_ie_extensions/sentence_piece/sentence_piece.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include - -namespace sentencepiece { - class SentencePieceProcessor; -} - -namespace TemplateExtension { - class SentencepieceTokenizer : public ov::op::Op { - public: - OPENVINO_OP("SentencepieceTokenizer"); - - SentencepieceTokenizer() = default; - SentencepieceTokenizer(const 
ov::OutputVector& args, int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse); - SentencepieceTokenizer(const ov::OutputVector& args, const std::shared_ptr& sp, int32_t nbest_size, float alpha, - bool add_bos, bool add_eos, bool reverse); - - bool visit_attributes(ov::AttributeVisitor& visitor) override; - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - - bool has_evaluate() const override; - - private: - std::shared_ptr m_sp; - int32_t m_nbest_size; - float m_alpha; - bool m_add_bos; - bool m_add_eos; - bool m_reverse; - }; -} // namespace TemplateExtension - -ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node); - -ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node); diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt new file mode 100644 index 000000000..09da10fd3 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt @@ -0,0 +1,146 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +option(BUILD_FAST_TOKENIZERS "Build fast_tokenizer from sources instead of downloading prebuilt binaries" OFF) + +# to build only sentencepiece-static target +set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL ON) + +# +# Compile flags +# + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") + set(cxx_flags "-Wno-undef") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # C4244: 'argument' : conversion from 'type1' to 'type2', possible loss of data + # C4267: 'var' : conversion from 'size_t' to 'type', possible loss of data + set(cxx_flags "/wd4244 /wd4267") +endif() + 
+include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wsuggest-override" SUGGEST_OVERRIDE_SUPPORTED) +if(SUGGEST_OVERRIDE_SUPPORTED) + set(cxx_flags "${cxx_flags} -Wno-suggest-override") +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags}") + +# +# Dependencies +# + +include(FetchContent) + +FetchContent_Declare( + sentencepiece + URL https://github.com/google/sentencepiece/archive/refs/tags/v0.1.99.tar.gz + URL_HASH SHA256=63617eaf56c7a3857597dcd8780461f57dd21381b56a27716ef7d7e02e14ced4 +) +FetchContent_MakeAvailable(sentencepiece) + +if(BUILD_FAST_TOKENIZERS) + FetchContent_Declare( + fast_tokenizer + URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz + URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a + PATCH_COMMAND git apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/icu4c.patch" + ) + + FetchContent_GetProperties(fast_tokenizer) + if(NOT fast_tokenizer_POPULATED) + FetchContent_Populate( + fast_tokenizer + ) + set(WITH_PYTHON OFF CACHE BOOL "Disable Python API for fast_tokenizer") + add_subdirectory(${fast_tokenizer_SOURCE_DIR}/fast_tokenizer ${CMAKE_CURRENT_BINARY_DIR}/fast_tokenizer) + endif() + + # variables used later + set(FAST_TOKENIZER_INCS + "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer" + "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/" + "${CMAKE_BINARY_DIR}/third_party/json/src/extern_json/single_include/" + "${CMAKE_BINARY_DIR}/third_party/install/re2/include/") + set(FAST_TOKENIZER_LIBS core_tokenizers) +else() + if(WIN32 AND X86_64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip + URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d + ) + FetchContent_Declare( + re2 + URL https://github.com/google/re2/archive/refs/tags/2022-04-01.tar.gz + URL_HASH SHA256=1ae8ccfdb1066a731bba6ee0881baad5efd2cd661acd9569b689f2586e1a50e9 + ) + 
FetchContent_MakeAvailable(re2) + elseif(LINUX AND X86_64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz + URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc + ) + elseif(LINUX AND AARCH64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-1.0.2.tgz + URL_HASH SHA256=fc16c51b24a954ae3d659e1b233ce15349eafc1e4c72710b51a4f12fb2c03033 + ) + elseif(APPLE AND X86_64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz + URL_HASH SHA256=4c8123ad941b3e4325ef72f328db545e34d5eec2de3e2545e1ab8ebeeb5146a9 + ) + elseif(APPLE AND AARCH64) + FetchContent_Declare( + fast_tokenizer + URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-1.0.2.tgz + URL_HASH SHA256=ffb0f16ec96b2f5dbdb681d00d74e932e273ec1c2108196d13f2fd28abc4d266 + ) + else() + message(FATAL_ERROR "Platform ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR} does not have prebuilt Fast Tokenizer. " + "Please, use -DBUILD_FAST_TOKENIZERS=ON cmake option to enable build from sources") + endif() + FetchContent_MakeAvailable(fast_tokenizer) + include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake") + + if(WIN32 AND X86_64) + # we use re2 library in regex_normalization operation, so have to add to this list + # because prebuilt fast_tokenizers package does not provide this library + list(APPEND FAST_TOKENIZER_LIBS re2) + endif() +endif() + +# +# Target include dirs, link libraries and other properties +# + +# set include dirs for specific source files +target_include_directories(${TARGET_NAME} PRIVATE + # sentencepiece + "${sentencepiece_SOURCE_DIR}/src/builtin_pb" + "${sentencepiece_SOURCE_DIR}/src" + "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite" + "${sentencepiece_SOURCE_DIR}" + "${sentencepiece_BINARY_DIR}" + # fast_tokenizer + 
${FAST_TOKENIZER_INCS}) + +if(CMAKE_CL_64) + target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) +endif() + +target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} sentencepiece-static) + +# string_view is used from cxx17 +string(REPLACE " " ";" cxx_flags "${cxx_flags}") +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17 + COMPILE_OPTIONS "${cxx_flags}") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp new file mode 100644 index 000000000..a16d294f9 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "bpe_tokenizer.hpp" +#include "utils.hpp" +#include "openvino/opsets/opset10.hpp" + +using namespace ov; +using namespace ov::opset10; + +#undef tokenizer + + +BPETokenizer::BPETokenizer( + const ov::OutputVector& arguments, + const std::string& unk_token, + bool fuse_unk, + const std::string& suffix_indicator, + const std::string& end_suffix, + bool byte_fallback +) : + ov::op::Op(arguments), + m_unk_token(unk_token), + m_fuse_unk(fuse_unk), + m_suffix_indicator(suffix_indicator), + m_end_suffix(end_suffix), + m_byte_fallback(byte_fallback) { + + constructor_validate_and_infer_types(); +} +BPETokenizer::BPETokenizer( + const ov::OutputVector& arguments, + const std::shared_ptr& tokenizer, + const std::string& unk_token, + bool fuse_unk, + const std::string& suffix_indicator, + const std::string& end_suffix, + bool byte_fallback +) : + ov::op::Op(arguments), + m_tokenizer(tokenizer), + m_unk_token(unk_token), + m_fuse_unk(fuse_unk), + m_suffix_indicator(suffix_indicator), + m_end_suffix(end_suffix), + m_byte_fallback(byte_fallback) { + + if (m_tokenizer == nullptr) { + // vocab constant folding doesn't work, get packed 
constant + auto packed_vocab_const = as_type_ptr(arguments[5].get_node_shared_ptr()->get_input_node_shared_ptr(0)); + auto packed_vocab_buf = static_cast(packed_vocab_const->get_data_ptr()); + auto vocab_size = *reinterpret_cast(packed_vocab_buf + 0); + auto vocab_begins = reinterpret_cast(packed_vocab_buf + 4); + auto vocab_ends = reinterpret_cast(packed_vocab_buf + 4 + 4); + auto vocab_chars = packed_vocab_buf + 4 + 4 + 4 * vocab_size; + + core::Vocab vocab; + for(size_t id = 0; id < vocab_size; ++id) { + auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]); + vocab[token] = int32_t(id); // TODO: Check range + } + + auto packed_merges_const = as_type_ptr(arguments[8].get_node_shared_ptr()->get_input_node_shared_ptr(0)); + auto packed_merges_buf = static_cast(packed_merges_const->get_data_ptr()); + auto merges_size = *reinterpret_cast(packed_merges_buf + 0); + auto merges_begins = reinterpret_cast(packed_merges_buf + 4); + auto merges_ends = reinterpret_cast(packed_merges_buf + 4 + 4); + auto merges_chars = packed_merges_buf + 4 + 4 + 4 * merges_size; + + core::Merges merges; + std::string delim = " "; + for(size_t id = 0; id < merges_size; ++id) { + auto merge = std::string(merges_chars + merges_begins[id], merges_chars + merges_ends[id]); + const int delim_pos = merge.find(delim); + + std::pair merge_pair = { + merge.substr(0, delim_pos), merge.substr(delim_pos + 1) + }; + merges.emplace_back(merge_pair); + } + + std::vector unk_token = {}; + if (m_unk_token.size() > 0) { + unk_token.push_back(m_unk_token); + }; + std::vector suffix_indicator = {}; + if (m_suffix_indicator.size() > 0) { + suffix_indicator.push_back(m_suffix_indicator); + }; + std::vector end_suffix = {}; + if (m_end_suffix.size() > 0) { + end_suffix.push_back(m_end_suffix); + }; + + m_tokenizer = std::make_shared( + vocab, + merges, + 10000 /* default cache size */, + std::vector {} /* dropout - don't use dropout for inference */, + unk_token, + 
suffix_indicator, + end_suffix, + m_fuse_unk + ); + } + + constructor_validate_and_infer_types(); +} + + +void BPETokenizer::validate_and_infer_types() { + check_ragged_string_input(this, 0); + check_string_input(this, 5); + check_string_input(this, 8); + set_ragged_output(this, 0, get_input_partial_shape(0), element::i32); +} + +bool BPETokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + OPENVINO_ASSERT(inputs.size() == 11, "Too few inputs passed to BPETokenizer, it means it is not converted properly or it is not used in the supported pattern"); // check arity before indexing inputs[0..4] + + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + // Set output shapes + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + const size_t num_rows = inputs[0].get_size(); + + // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area + // to represent different elements in ragged tensor + outputs[2].set_shape({inputs[4].get_size()}); + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_elems = outputs[2].data(); + int32_t ragged_offset = 0; + + for(size_t seq = 0; seq < num_rows; ++seq) { + new_begins[seq] = ragged_offset; + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); + std::vector results = m_tokenizer->Tokenize(str); + for (const core::Token& token : results) { + OPENVINO_ASSERT(ragged_offset < outputs[2].get_size()); + new_elems[ragged_offset++] = token.id_; + }; + } + new_ends[seq] = ragged_offset; + } + outputs[2].set_shape({size_t(ragged_offset)}); + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp 
b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp new file mode 100644 index 000000000..97fb2db03 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bpe_tokenizer.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "fast_tokenizer/models/models.h" + + +using namespace paddlenlp::fast_tokenizer; + +#undef tokenizer +#undef m_tokenizer + +class BPETokenizer : public ov::op::Op { +public: + OPENVINO_OP("BPETokenizer"); + + BPETokenizer () = default; + BPETokenizer( + const ov::OutputVector& arguments, + const std::string& unk_token = "", + bool fuse_unk = false, + const std::string& suffix_indicator = "", + const std::string& end_suffix = "", + bool byte_fallback = false + ); + BPETokenizer( + const ov::OutputVector& arguments, + const std::shared_ptr& tokenizer, + const std::string& unk_token = "", + bool fuse_unk = false, + const std::string& suffix_indicator = "", + const std::string& end_suffix = "", + bool byte_fallback = false + ); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_tokenizer, m_unk_token, m_fuse_unk, m_suffix_indicator, m_end_suffix, m_byte_fallback); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("unk_token", m_unk_token); + visitor.on_attribute("fuse_unk", m_fuse_unk); + visitor.on_attribute("suffix_indicator", m_suffix_indicator); + visitor.on_attribute("end_suffix", m_end_suffix); + visitor.on_attribute("byte_fallback", m_byte_fallback); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + std::shared_ptr m_tokenizer; + std::string m_unk_token; + bool m_fuse_unk = false; + std::string m_suffix_indicator; + 
std::string m_end_suffix; + bool m_byte_fallback = false; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp new file mode 100644 index 000000000..b9b4f5338 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.cpp @@ -0,0 +1,320 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "bytes_to_chars.hpp" +#include "utils.hpp" + +using namespace ov; + + +const std::array, 256> create_bytes_to_chars_map() { + return {{ + { 196, 128 }, + { 196, 129 }, + { 196, 130 }, + { 196, 131 }, + { 196, 132 }, + { 196, 133 }, + { 196, 134 }, + { 196, 135 }, + { 196, 136 }, + { 196, 137 }, + { 196, 138 }, + { 196, 139 }, + { 196, 140 }, + { 196, 141 }, + { 196, 142 }, + { 196, 143 }, + { 196, 144 }, + { 196, 145 }, + { 196, 146 }, + { 196, 147 }, + { 196, 148 }, + { 196, 149 }, + { 196, 150 }, + { 196, 151 }, + { 196, 152 }, + { 196, 153 }, + { 196, 154 }, + { 196, 155 }, + { 196, 156 }, + { 196, 157 }, + { 196, 158 }, + { 196, 159 }, + { 196, 160 }, + { 33 }, + { 34 }, + { 35 }, + { 36 }, + { 37 }, + { 38 }, + { 39 }, + { 40 }, + { 41 }, + { 42 }, + { 43 }, + { 44 }, + { 45 }, + { 46 }, + { 47 }, + { 48 }, + { 49 }, + { 50 }, + { 51 }, + { 52 }, + { 53 }, + { 54 }, + { 55 }, + { 56 }, + { 57 }, + { 58 }, + { 59 }, + { 60 }, + { 61 }, + { 62 }, + { 63 }, + { 64 }, + { 65 }, + { 66 }, + { 67 }, + { 68 }, + { 69 }, + { 70 }, + { 71 }, + { 72 }, + { 73 }, + { 74 }, + { 75 }, + { 76 }, + { 77 }, + { 78 }, + { 79 }, + { 80 }, + { 81 }, + { 82 }, + { 83 }, + { 84 }, + { 85 }, + { 86 }, + { 87 }, + { 88 }, + { 89 }, + { 90 }, + { 91 }, + { 92 }, + { 93 }, + { 94 }, + { 95 }, + { 96 }, + { 97 }, + { 98 }, + { 99 }, + { 100 }, + { 101 }, + { 
102 }, + { 103 }, + { 104 }, + { 105 }, + { 106 }, + { 107 }, + { 108 }, + { 109 }, + { 110 }, + { 111 }, + { 112 }, + { 113 }, + { 114 }, + { 115 }, + { 116 }, + { 117 }, + { 118 }, + { 119 }, + { 120 }, + { 121 }, + { 122 }, + { 123 }, + { 124 }, + { 125 }, + { 126 }, + { 196, 161 }, + { 196, 162 }, + { 196, 163 }, + { 196, 164 }, + { 196, 165 }, + { 196, 166 }, + { 196, 167 }, + { 196, 168 }, + { 196, 169 }, + { 196, 170 }, + { 196, 171 }, + { 196, 172 }, + { 196, 173 }, + { 196, 174 }, + { 196, 175 }, + { 196, 176 }, + { 196, 177 }, + { 196, 178 }, + { 196, 179 }, + { 196, 180 }, + { 196, 181 }, + { 196, 182 }, + { 196, 183 }, + { 196, 184 }, + { 196, 185 }, + { 196, 186 }, + { 196, 187 }, + { 196, 188 }, + { 196, 189 }, + { 196, 190 }, + { 196, 191 }, + { 197, 128 }, + { 197, 129 }, + { 197, 130 }, + { 194, 161 }, + { 194, 162 }, + { 194, 163 }, + { 194, 164 }, + { 194, 165 }, + { 194, 166 }, + { 194, 167 }, + { 194, 168 }, + { 194, 169 }, + { 194, 170 }, + { 194, 171 }, + { 194, 172 }, + { 197, 131 }, + { 194, 174 }, + { 194, 175 }, + { 194, 176 }, + { 194, 177 }, + { 194, 178 }, + { 194, 179 }, + { 194, 180 }, + { 194, 181 }, + { 194, 182 }, + { 194, 183 }, + { 194, 184 }, + { 194, 185 }, + { 194, 186 }, + { 194, 187 }, + { 194, 188 }, + { 194, 189 }, + { 194, 190 }, + { 194, 191 }, + { 195, 128 }, + { 195, 129 }, + { 195, 130 }, + { 195, 131 }, + { 195, 132 }, + { 195, 133 }, + { 195, 134 }, + { 195, 135 }, + { 195, 136 }, + { 195, 137 }, + { 195, 138 }, + { 195, 139 }, + { 195, 140 }, + { 195, 141 }, + { 195, 142 }, + { 195, 143 }, + { 195, 144 }, + { 195, 145 }, + { 195, 146 }, + { 195, 147 }, + { 195, 148 }, + { 195, 149 }, + { 195, 150 }, + { 195, 151 }, + { 195, 152 }, + { 195, 153 }, + { 195, 154 }, + { 195, 155 }, + { 195, 156 }, + { 195, 157 }, + { 195, 158 }, + { 195, 159 }, + { 195, 160 }, + { 195, 161 }, + { 195, 162 }, + { 195, 163 }, + { 195, 164 }, + { 195, 165 }, + { 195, 166 }, + { 195, 167 }, + { 195, 168 }, + { 195, 169 }, + { 195, 170 }, 
+ { 195, 171 }, + { 195, 172 }, + { 195, 173 }, + { 195, 174 }, + { 195, 175 }, + { 195, 176 }, + { 195, 177 }, + { 195, 178 }, + { 195, 179 }, + { 195, 180 }, + { 195, 181 }, + { 195, 182 }, + { 195, 183 }, + { 195, 184 }, + { 195, 185 }, + { 195, 186 }, + { 195, 187 }, + { 195, 188 }, + { 195, 189 }, + { 195, 190 }, + { 195, 191 }, + }}; +} + +void BytesToChars::validate_and_infer_types() { + check_ragged_string_input(this, 0); + set_ragged_string_output(this, 0, get_input_partial_shape(0)); +} + +bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + OPENVINO_ASSERT(inputs.size() == 5, "Too few inputs passed to BytesToChars, it means it is not converted properly or it is not used in the supported pattern"); // check arity before indexing inputs[0..4] + + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + // Set output shapes + outputs[0] = inputs[0]; + outputs[1] = inputs[1]; + outputs[2].set_shape(inputs[2].get_shape()); + outputs[3].set_shape(inputs[3].get_shape()); + outputs[4].set_shape(Shape({inputs[4].get_size() * 2})); + const size_t num_elems = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + auto new_chars = outputs[4].data(); + uint32_t char_pointer = 0; + + for(size_t j = 0; j < num_elems; ++j) { + + for(size_t i = ragged_begins[j]; i < ragged_ends[j]; ++i) { + const auto word_len = ends[i] - begins[i]; + new_begins[i] = char_pointer; + + for (size_t k = 0; k < word_len; ++k) { + for (auto byte : m_bytes_to_chars[chars[begins[i] + k]]) { + new_chars[char_pointer++] = byte; + } + } + new_ends[i] = char_pointer; + } + } + outputs[4].set_shape({char_pointer}); + return true; +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp 
b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp new file mode 100644 index 000000000..77b30b0c1 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/bytes_to_chars.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + + +const std::array, 256> create_bytes_to_chars_map(); + +class BytesToChars : public ov::op::Op { +public: + OPENVINO_OP("BytesToChars"); + + BytesToChars () = default; + + BytesToChars(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + const std::array, 256> m_bytes_to_chars = create_bytes_to_chars_map(); +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp new file mode 100644 index 000000000..8c5fd681b --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.cpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "case_fold.hpp" +#include "utils.hpp" + +#include "fast_tokenizer/normalizers/normalizers.h" + +using namespace ov; + + +void CaseFold::validate_and_infer_types() { + check_string_input(this, 0); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + return evaluate_normalization_helper( + outputs, inputs, + [](const std::string& str) { + 
using namespace paddlenlp::fast_tokenizer; + return normalizers::NormalizedString(str).Lowercase().GetStr(); + }); +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp new file mode 100644 index 000000000..6c273ad82 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/case_fold.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class CaseFold : public ov::op::Op { +public: + OPENVINO_OP("CaseFold"); + + CaseFold () = default; + + CaseFold (const ov::OutputVector& arguments) : ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp new file mode 100644 index 000000000..75e980dd3 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "chars_to_bytes.hpp" +#include "bytes_to_chars.hpp" +#include "utils.hpp" + +using namespace ov; + +void CharsToBytes::validate_and_infer_types() { + check_ragged_string_input(this, 0); +// set_ragged_string_output(this, 0, 
get_input_partial_shape(0)); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +std::array, 4> CharsToBytes::create_pair_map() { + auto bytes_to_chars = create_bytes_to_chars_map(); + std::array, 4> pair_map{}; // value-initialize: only mapped two-byte pairs are written below + + for (int i=0; i < bytes_to_chars.size(); ++i) { + std::vector chars = bytes_to_chars[i]; + if (chars.size() == 2) { + pair_map[chars[0] - 194][chars[1] - 128] = i; + }; + }; + + return pair_map; +} + +bool CharsToBytes::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = inputs[3].data(); + auto chars = inputs[4].data(); + + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + outputs[2].set_shape(Shape({inputs[4].get_size()})); + const size_t num_rows = inputs[0].get_size(); + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + auto new_chars = outputs[2].data(); + uint32_t char_pointer = 0; + + for(size_t row = 0; row < num_rows; ++row) { + new_begins[row] = char_pointer; + for(size_t col = ragged_begins[row]; col < ragged_ends[row]; ++col) { + const auto word_len = ends[col] - begins[col]; + + for (size_t k = 0; k < word_len; ++k) { + const auto first_byte = chars[begins[col] + k]; + if (first_byte < m_one_byte_border) { + new_chars[char_pointer++] = first_byte; + } else { + const auto second_byte = chars[begins[col] + (++k)]; + new_chars[char_pointer++] = m_pair_map[first_byte - m_first_byte_offset][second_byte - m_second_byte_offset]; + } + } + }; + new_ends[row] = char_pointer; + } + outputs[2].set_shape({char_pointer}); + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp new file mode 100644 index 000000000..4a79a72b3 --- /dev/null +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/chars_to_bytes.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class CharsToBytes : public ov::op::Op { +public: + OPENVINO_OP("CharsToBytes"); + + CharsToBytes () = default; + + CharsToBytes(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + + std::array, 4> create_pair_map(); + +private: + const std::array, 4> m_pair_map = create_pair_map(); + const uint8_t m_one_byte_border = 128; // if char >= 128 => it is the first byte of a two byte char + // + const uint8_t m_first_byte_offset = 194; + const uint8_t m_second_byte_offset = 128; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp new file mode 100644 index 000000000..17a3a6e98 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "combine_segments.hpp" +#include "utils.hpp" + +using namespace ov; + +void CombineSegments::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() > 0); + OPENVINO_ASSERT((get_input_size() - 1)%3 == 0); + + // First come several ragged tensors each represented as 3 regular tensors + size_t num_inputs = (get_input_size() - 1)/3; + PartialShape ps = PartialShape::dynamic(); + element::Type et = element::dynamic; + 
for (size_t i = 0; i < num_inputs; ++i) { + check_ragged_input(this, 3*i); + // Check limited broadcast + // Limited means that we support only two shapes on inputs: scalar and not scalars, + // and all not-scalars should have the same shape + auto rank = get_input_partial_shape(3*i).rank(); + if(rank.is_static() && rank.get_length()) { + OPENVINO_ASSERT(ps.merge_into(ps, get_input_partial_shape(3*i))); + } + OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i))); + OPENVINO_ASSERT(element::Type::merge(et, et, get_input_element_type(3*i + 1))); + } + + set_ragged_output(this, 0, ps, et); + // TODO: Avoid emitting ragged indices for the second ragged tensor, they should be identical to the first output ragged tensor + set_ragged_output(this, 3, ps, get_input_element_type(get_input_size() - 1)); +} + +bool CombineSegments::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + // FIXME: Works for POD types only (not for strings!) + size_t num_of_ragged = (inputs.size() - 1)/3; + OPENVINO_ASSERT(num_of_ragged == inputs.back().get_size()); + std::vector begins; + std::vector ends; + std::vector nelems; + std::vector elems; + auto element_type = inputs[2].get_element_type(); + auto elem_size = element_type.size(); + size_t max_nelems = 0; + size_t flat_out_size = 0; + Shape ps; + + for(size_t i = 0; i < num_of_ragged; ++i) { + OPENVINO_ASSERT(inputs[3*i + 2].get_element_type() == element_type); + begins.push_back(inputs[3*i + 0].data()); + ends.push_back(inputs[3*i + 1].data()); + nelems.push_back(inputs[3*i + 0].get_size()); + elems.push_back(reinterpret_cast(inputs[3*i + 2].data())); + // TODO: Get rank from a tensor instead of partial_shape. This is a WA for CPU bug that gives 1D tensors instead of 0D tensors. 
+ if(get_input_partial_shape(3*i + 0).rank().get_length() > 0) { + ps = inputs[3*i + 0].get_shape(); + } + max_nelems = std::max(max_nelems, nelems.back()); + } + + // flat_out_size is going to be an estimation of the final size + // This is only an estimation, not the exact output size, because ragged tensor may have gaps in the representation + + for(size_t i = 0; i < num_of_ragged; ++i) { + if(nelems[i] == 1) { + flat_out_size += max_nelems * inputs[3*i + 2].get_size(); // broadcast + } else { + flat_out_size += inputs[3*i + 2].get_size(); // FIXME: doesn't work for overlapped ragged regions + } + } + + auto ids = reinterpret_cast(inputs.back().data()); + size_t id_type_size = inputs.back().get_element_type().size(); + + outputs[3*0 + 0].set_shape(ps); + outputs[3*0 + 1].set_shape(ps); + OPENVINO_ASSERT(max_nelems == outputs[3*0 + 0].get_size()); + OPENVINO_ASSERT(max_nelems == outputs[3*0 + 1].get_size()); + outputs[3*0 + 2].set_shape({flat_out_size}); + + outputs[3*1 + 0].set_shape(ps); + outputs[3*1 + 1].set_shape(ps); + OPENVINO_ASSERT(max_nelems == outputs[3*1 + 0].get_size()); + OPENVINO_ASSERT(max_nelems == outputs[3*1 + 1].get_size()); + outputs[3*1 + 2].set_shape({flat_out_size}); + + auto out_elem_begins = outputs[3*0 + 0].data(); + auto out_elem_ends = outputs[3*0 + 1].data(); + auto out_elems = reinterpret_cast(outputs[3*0 + 2].data()); + auto out_id_begins = outputs[3*1 + 0].data(); + auto out_id_ends = outputs[3*1 + 1].data(); + auto out_ids = reinterpret_cast(outputs[3*1 + 2].data()); + + auto out_elems_orig = out_elems; + auto out_ids_orig = out_ids; + size_t out_offset = 0; + + for(size_t i = 0; i < max_nelems; ++i) { + out_elem_begins[i] = out_offset; + out_id_begins[i] = out_offset; + + for(size_t j = 0; j < num_of_ragged; ++j) { + const char* begin; + size_t len; + if(nelems[j] == 1) { + begin = elems[j] + elem_size*begins[j][0]; + len = ends[j][0] - begins[j][0]; + } else { + begin = elems[j] + elem_size*begins[j][i]; + len = ends[j][i] - 
begins[j][i]; + } + auto end = begin + elem_size*len; + out_elems = std::copy(begin, end, out_elems); + for(size_t k = 0; k < len; ++k) { + out_ids = std::copy(ids + id_type_size*j, ids + id_type_size*(j + 1), out_ids); + } + out_offset += len; + } + + out_elem_ends[i] = out_offset; + out_id_ends[i] = out_offset; + } + + OPENVINO_ASSERT(out_offset <= flat_out_size); + + outputs[3*0 + 2].set_shape({out_offset}); + outputs[3*1 + 2].set_shape({out_offset}); + + OPENVINO_ASSERT(out_elems == out_elems_orig + outputs[3*0 + 2].get_byte_size()); + OPENVINO_ASSERT(out_ids == out_ids_orig + outputs[3*1 + 2].get_byte_size()); + return true; +} + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp new file mode 100644 index 000000000..a4d904a55 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/combine_segments.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class CombineSegments : public ov::op::Op { +public: + OPENVINO_OP("CombineSegments"); + + CombineSegments () = default; + + CombineSegments(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch b/modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch new file mode 100644 index 000000000..2c7d9e82a --- /dev/null +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/icu4c.patch @@ -0,0 +1,13 @@ +diff --git a/fast_tokenizer/cmake/external/icu.cmake b/fast_tokenizer/cmake/external/icu.cmake +index cd604d38..6be44bdb 100644 +--- a/fast_tokenizer/cmake/external/icu.cmake ++++ b/fast_tokenizer/cmake/external/icu.cmake +@@ -113,7 +113,7 @@ ExternalProject_Add( + GIT_PROGRESS 1 + PREFIX ${ICU_PREFIX_DIR} + UPDATE_COMMAND "" +- CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux/gcc" --enable-static --disable-shared --enable-rpath ++ CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux/gcc" --enable-static --enable-rpath + BUILD_COMMAND make -j4 + INSTALL_COMMAND make install prefix="" DESTDIR=${ICU_INSTALL_DIR} install + BUILD_BYPRODUCTS ${ICU_LIBRARIES} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp new file mode 100644 index 000000000..83514b801 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +#include "normalize_unicode.hpp" +#include "utils.hpp" + +using namespace ov; + +namespace { +using namespace paddlenlp::fast_tokenizer::normalizers; +using NormalizersMap = std::map>; + +const NormalizersMap normalizers = { + {"NFD", [](const std::string& str) { return NormalizedString(str).NFD().GetStr(); }}, + {"NFC", [](const std::string& str) { return NormalizedString(str).NFC().GetStr(); }}, + {"NFKD", [](const std::string& str) { return NormalizedString(str).NFKD().GetStr(); }}, + {"NFKC", [](const std::string& str) { return NormalizedString(str).NFKC().GetStr(); }}, +}; + +} + +void 
NormalizeUnicode::validate_and_infer_types() { + check_string_input(this, 0); + OPENVINO_ASSERT(normalizers.find(m_normalization_form) != normalizers.end(), "NormalizeUnicode doesn't know normalization form " + m_normalization_form); + set_string_output(this, 0, get_input_partial_shape(0)); +} + +bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + return evaluate_normalization_helper(outputs, inputs, normalizers.at(m_normalization_form)); +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp new file mode 100644 index 000000000..cacdec18c --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/normalize_unicode.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +class NormalizeUnicode : public ov::op::Op { +public: + OPENVINO_OP("NormalizeUnicode"); + + NormalizeUnicode () = default; + + NormalizeUnicode(const ov::OutputVector& arguments, const std::string& normalization_form = "NFD") : + ov::op::Op(arguments), + m_normalization_form(normalization_form) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_normalization_form); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("normalization_form", m_normalization_form); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + + std::string m_normalization_form = "NFD"; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md 
b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md new file mode 100644 index 000000000..2ca6d83bb --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md @@ -0,0 +1,183 @@ +# OpenVINO Tokenizers + +## Features + +- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer: + - Fast tokenizers based on Wordpiece and BPE models + - Slow tokenizers based on SentencePiece model file +- Combine OpenVINO models into a single model +- Add greedy decoding pipeline to text generation model + +## Installation + +1. Build the extension with the `-DCUSTOM_OPERATIONS="tokenizer"` flag: [instruction](../../../README.md#build-custom-openvino-operation-extension-library) +2. (Recommended) Create and activate virtual env: +```bash +python3 -m venv venv +source venv/bin/activate +``` +3. Go to `modules/custom_operations/user_ie_extensions/tokenizer/python` and run: +```bash +# to use converted tokenizers or models combined with tokenizers +pip install . +# to convert tokenizers from transformers library +pip install .[transformers] +# for development and testing the library +pip install -e .[all] +``` + +## Usage + +Set `OV_TOKENIZER_PREBUILD_EXTENSION_PATH` environment variable to `libuser_ov_extensions.so` file path +or use `init_extension` function. 
+ +### Convert HuggingFace tokenizer + +```python +from transformers import AutoTokenizer +from openvino import compile_model +from ov_tokenizer import init_extension, convert_tokenizer, pack_strings + + +init_extension("path/to/libuser_ov_extensions.so") + +hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +ov_tokenizer = convert_tokenizer(hf_tokenizer) + +compiled_tokenizer = compile_model(ov_tokenizer) +text_input = "Test string" + +hf_output = hf_tokenizer([text_input], return_tensors="np") +ov_output = compiled_tokenizer(pack_strings([text_input])) + +for output_name in hf_output: + print(f"OpenVINO {output_name} = {ov_output[output_name]}") + print(f"HuggingFace {output_name} = {hf_output[output_name]}") +# OpenVINO input_ids = [[ 101 3231 5164 102]] +# HuggingFace input_ids = [[ 101 3231 5164 102]] +# OpenVINO token_type_ids = [[0 0 0 0]] +# HuggingFace token_type_ids = [[0 0 0 0]] +# OpenVINO attention_mask = [[1 1 1 1]] +# HuggingFace attention_mask = [[1 1 1 1]] +``` + +### Connect Tokenizer to a Model + +```python +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from openvino import compile_model, convert_model +from ov_tokenizer import init_extension, convert_tokenizer, pack_strings, connect_models + + +init_extension("path/to/libuser_ov_extensions.so") + +checkpoint = "mrm8488/bert-tiny-finetuned-sms-spam-detection" +hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint) +hf_model = AutoModelForSequenceClassification.from_pretrained(checkpoint) + +text_input = ["Free money!!!"] +hf_input = hf_tokenizer(text_input, return_tensors="pt") +hf_output = hf_model(**hf_input) + +ov_tokenizer = convert_tokenizer(hf_tokenizer) +ov_model = convert_model(hf_model, example_input=hf_input.data) +combined_model = connect_models(ov_tokenizer, ov_model) +compiled_combined_model = compile_model(combined_model) + +openvino_output = compiled_combined_model(pack_strings(text_input)) + +print(f"OpenVINO logits: 
{openvino_output['logits']}") +# OpenVINO logits: [[ 1.2007061 -1.4698029]] +print(f"HuggingFace logits {hf_output.logits}") +# HuggingFace logits tensor([[ 1.2007, -1.4698]], grad_fn=<AddmmBackward0>) +``` + +### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer + +To work with a converted tokenizer you need the `pack_strings`/`unpack_strings` functions. + +```python +import numpy as np +from openvino import Core +from ov_tokenizer import unpack_strings + + +core = Core() +core.add_extension("path/to/libuser_ov_extensions.so") +# detokenizer from codellama sentencepiece model +compiled_detokenizer = core.compile_model("detokenizer.xml") + +token_ids = np.random.randint(100, 1000, size=(3, 5)) +openvino_output = compiled_detokenizer(token_ids) + +print(unpack_strings(openvino_output["string_output"])) +# ['sc�ouition�', 'intvenord hasient', 'g shouldwer M more'] +``` + +### Text generation pipeline + +```python +import numpy as np +from openvino import compile_model, convert_model +from transformers import AutoModelForCausalLM, AutoTokenizer +from ov_tokenizer import ( + add_greedy_decoding, + convert_tokenizer, + init_extension, + pack_strings, + unpack_strings, +) + + +init_extension("path/to/libuser_ov_extensions.so") + +# Use different repo for the tokenizer because the original repo doesn't have .model file +# Sentencepiece(Unigram) tokenizer supported only with .model file +tokenizer_checkpoint = "microsoft/Llama2-7b-WhoIsHarryPotter" +model_checkpoint = "nickypro/tinyllama-15M" +hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) +hf_model = AutoModelForCausalLM.from_pretrained(model_checkpoint, use_cache=False) + +# convert hf tokenizer +text_input = ["Quick brown fox was"] +ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True) +compiled_tokenizer = compile_model(ov_tokenizer) + +# transform input text into tokens +ov_input = compiled_tokenizer(pack_strings(text_input)) +hf_input = hf_tokenizer(text_input, 
return_tensors="pt") + +# convert Pytorch model to OpenVINO IR and add greedy decoding pipeline to it +ov_model = convert_model(hf_model, example_input=hf_input.data) +ov_model_with_greedy_decoding = add_greedy_decoding(ov_model) +compiled_model = compile_model(ov_model_with_greedy_decoding) + +# generate new tokens +new_tokens_size = 10 +prompt_size = ov_input["input_ids"].shape[-1] +input_dict = { + output.any_name: np.hstack([tensor, np.zeros(shape=(1, new_tokens_size), dtype=np.int_)]) + for output, tensor in ov_input.items() +} +for idx in range(prompt_size, prompt_size + new_tokens_size): + output = compiled_model(input_dict)["token_ids"] + input_dict["input_ids"][:, idx] = output[:, idx - 1] + input_dict["attention_mask"][:, idx] = 1 +ov_token_ids = input_dict["input_ids"] + +hf_token_ids = hf_model.generate( + **hf_input, + min_new_tokens=new_tokens_size, + max_new_tokens=new_tokens_size, + temperature=0, # greedy decoding +) + +# decode model output +compiled_detokenizer = compile_model(ov_detokenizer) +ov_output = unpack_strings(compiled_detokenizer(ov_token_ids)["string_output"]) +hf_output = hf_tokenizer.batch_decode(hf_token_ids, skip_special_tokens=True) +print(f"OpenVINO output string: `{ov_output}`") +# OpenVINO output string: `['Quick brown fox was walking through the forest. He was looking for something']` +print(f"HuggingFace output string: `{hf_output}`") +# HuggingFace output string: `['Quick brown fox was walking through the forest. 
He was looking for something']` +``` diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py new file mode 100644 index 000000000..ce757b861 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .convert_tokenizer import convert_tokenizer +from .node_factory import init_extension +from .str_pack import pack_strings, unpack_strings +from .utils import add_greedy_decoding, connect_models diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py new file mode 100644 index 000000000..46d2626a9 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/constants.py @@ -0,0 +1,12 @@ +ATTENTION_MASK_INPUT_NAME = "attention_mask" +TOKEN_IDS_INPUT_NAME = "input_ids" +TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids" + +LOGITS_OUTPUT_NAME = "logits" +TOKEN_IDS_OUTPUT_NAME = "token_ids" +STRING_OUTPUT_NAME = "string_output" + +GREEDY_DECODER_NAME = "greedy_decoder" + +TOKENIZER_ENCODER_NAME = "tokenizer_encoder" +TOKENIZER_DECODER_NAME = "tokenizer_decoder" diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py new file mode 100644 index 000000000..1d107a1ce --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging +import sys +from typing import Any, 
Tuple, Union + +from openvino.runtime import Model, Type +from openvino.runtime.exceptions import OVTypeError + +from .utils import change_inputs_type, change_outputs_type + + +logger = logging.getLogger(__name__) + + +def convert_tokenizer( + tokenizer_object: Any, + number_of_inputs: int = 1, + with_decoder: bool = False, + streaming_decoder: bool = False, + tokenizer_output_type: Type = Type.i64, + detokenizer_input_type: Type = Type.i64, +) -> Union[Model, Tuple[Model, Model]]: + # todo: add support for more then 1 input + if number_of_inputs > 1: + raise ValueError("Tokenizers with more then one input are not supported yet.") + + ov_tokenizers = None + + if "transformers" in sys.modules: + from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast + + from .hf_parser import ( + convert_fast_tokenizer, + convert_sentencepiece_model_tokenizer, + convert_tiktoken_model_tokenizer, + is_sentencepiece_model, + is_tiktoken_model, + ) + + if isinstance(tokenizer_object, PreTrainedTokenizerBase): + if is_sentencepiece_model(tokenizer_object): + logger.info("Convert tokenizer using SentencePiece .model file.") + ov_tokenizers = convert_sentencepiece_model_tokenizer( + tokenizer_object, + add_attention_mask=True, + with_decoder=with_decoder, + streaming_decoder=streaming_decoder, + ) + elif is_tiktoken_model(tokenizer_object): + logger.info("Convert tiktoken-based tokenizer") + ov_tokenizers = convert_tiktoken_model_tokenizer( + tokenizer_object, + with_decoder=with_decoder, + ) + elif isinstance(tokenizer_object, PreTrainedTokenizerFast): + logger.info("Convert Huggingface Fast tokenizer pipeline.") + ov_tokenizers = convert_fast_tokenizer( + tokenizer_object, + number_of_inputs=number_of_inputs, + with_decoder=with_decoder, + ) + + if ov_tokenizers is None: + raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") + + if isinstance(ov_tokenizers, tuple): + return ( + change_outputs_type(ov_tokenizers[0], tokenizer_output_type), + 
change_inputs_type(ov_tokenizers[1], detokenizer_input_type), + ) + + return change_outputs_type(ov_tokenizers, tokenizer_output_type) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py new file mode 100644 index 000000000..401c8ea2b --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -0,0 +1,486 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import tempfile +from copy import deepcopy +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import openvino.runtime.opset12 as opset +from openvino import Model, PartialShape, Type +from openvino.runtime import Node, op +from openvino.runtime.exceptions import OVTypeError +from openvino.runtime.utils.types import as_node, make_constant_node +from transformers.convert_slow_tokenizer import import_protobuf + +from .constants import ( + ATTENTION_MASK_INPUT_NAME, + STRING_OUTPUT_NAME, + TOKEN_IDS_INPUT_NAME, + TOKEN_TYPE_IDS_INPUT_NAME, + TOKENIZER_DECODER_NAME, + TOKENIZER_ENCODER_NAME, +) +from .node_factory import factory +from .tokenizer_pipeline import ( + BPETokenizationStep, + BytesToCharsStep, + CaseFoldStep, + CharsToBytesStep, + CombineSegmentsStep, + NMTNormalizationStep, + NormalizationStep, + NormalizeUnicode, + PaddingStep, + PreTokenizatinStep, + PunctuationSplitStep, + RegexDecodingStep, + RegexNormalizationStep, + RegexSplitStep, + StripStringStep, + TokenizerPipeline, + TruncationStep, + VocabDecoderStep, + WhitespaceSplitStep, + WordPieceTokenizationStep, +) + + +def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep: + regex_search_pattern = normalizer_dict["pattern"].get("String") or 
normalizer_dict["pattern"]["Regex"] + return RegexNormalizationStep( + regex_search_pattern=regex_search_pattern, + replace_term=normalizer_dict["content"], + ) + + +def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]: + steps: List[NormalizationStep] = [] + + if normalizer_dict["clean_text"] is True: + steps.append(RegexNormalizationStep.del_control_chars_regex()) + + # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L127 + if normalizer_dict.get("strip_accents") or normalizer_dict["lowercase"]: + steps.append(NormalizeUnicode("NFD")) + steps.append(RegexNormalizationStep.strip_accents_regex()) + + if normalizer_dict["lowercase"] is True: + steps.append(CaseFoldStep()) + + return steps + + +def parse_strip_step(split_dict: Dict[str, Any]) -> StripStringStep: + return StripStringStep( + left=split_dict["strip_left"], + right=split_dict["strip_right"], + ) + + +def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep: + split_pattern = pretokenizer_dict["pattern"].get("String") or pretokenizer_dict["pattern"]["Regex"] + return RegexSplitStep( + split_pattern=split_pattern, + invert=pretokenizer_dict["invert"], + behaviour=pretokenizer_dict["behavior"].lower().rstrip("d"), + ) + + +def parse_byte_level_pretokenization_step( + pretokenizer_dict: Dict[str, Any] +) -> List[Union[NormalizationStep, PreTokenizatinStep]]: + steps = [] + if pretokenizer_dict.get("add_prefix_space"): + steps.append(RegexNormalizationStep.add_prefix_whitespace_regex()) + + # regex is used by default, but it does not appear in config yet + if pretokenizer_dict.get("use_regex", True): + # re2 does not support negative lookahead, so there is two steps replicate the behaviour + # this WA causes segfault for CLIP tokenizer + # steps.append(RegexSplitStep.add_whitespace_to_the_next_word()) + steps.append(RegexSplitStep.byte_level_splitter()) + + 
steps.append(BytesToCharsStep()) + return steps + + +class TransformersTokenizerPipelineParser: + def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None: + assert tokenizer_object.is_fast + + self.original_tokenizer = tokenizer_object + with TemporaryDirectory() as tmpdir: + tokenizer_object.save_pretrained(tmpdir) + with open(Path(tmpdir) / "tokenizer.json") as tj: + self.tokenizer_json = json.load(tj) + self.pipeline = TokenizerPipeline() + self.number_of_inputs = number_of_inputs + self.num_of_added_tokens = 0 + + def parse(self, number_of_inputs: Optional[int] = None) -> TokenizerPipeline: + self.number_of_inputs = self.number_of_inputs if number_of_inputs is None else number_of_inputs + self.pipeline.number_of_inputs = self.number_of_inputs + for add_steps in [ + self.normalization, + self.pre_tokenization, + self.tokenization_model, + self.post_tokenization, + self.decoding, + ]: + add_steps() + + return self.pipeline + + normalizers_map: Dict[ + str, + Callable[[Dict[str, Any]], Union[NormalizationStep, List[NormalizationStep]]], + ] = { + "NFC": lambda step_dict: NormalizeUnicode("NFC"), + "NFD": lambda step_dict: NormalizeUnicode("NFD"), + "NFKC": lambda step_dict: NormalizeUnicode("NFKC"), + "NFKD": lambda step_dict: NormalizeUnicode("NFKD"), + "Nmt": lambda step_dict: NMTNormalizationStep(), + "Lowercase": lambda step_dict: CaseFoldStep(), + "StripAccents": lambda step_dict: RegexNormalizationStep.strip_accents_regex(), + "BertNormalizer": parse_bert_normalizer, + "Replace": parse_replace_normalizer, + "Strip": parse_strip_step, + } + + def parse_normalizer_step(self, step_dict: Dict[str, Any]) -> None: + try: + self.pipeline.add_steps(self.normalizers_map[step_dict["type"]](step_dict)) + except KeyError: + raise OVTypeError(f"Normalizer type '{step_dict['type']}' is not supported") + + def normalization(self) -> None: + if self.tokenizer_json["normalizer"] is None: + return + + if self.tokenizer_json["normalizer"].get("type") == 
"Sequence": + for normalizer in self.tokenizer_json["normalizer"]["normalizers"]: + self.parse_normalizer_step(normalizer) + else: + self.parse_normalizer_step(self.tokenizer_json["normalizer"]) + + pre_tokenization_map: Dict[ + str, + Callable[[Dict[str, Any]], Union[PreTokenizatinStep, List[PreTokenizatinStep]]], + ] = { + "BertPreTokenizer": lambda step_dict: RegexSplitStep.bert_splitter(), + "Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(), + "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(), + "Split": parse_split_step, + "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]), + "ByteLevel": parse_byte_level_pretokenization_step, + "Digits": lambda step_dict: RegexSplitStep.digits_splitter( + "isolate" if step_dict["individual_digits"] else "contiguous" + ), + } + + def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None: + try: + self.pipeline.add_steps(self.pre_tokenization_map[step_dict["type"]](step_dict)) + except KeyError: + raise OVTypeError(f"Pre-tokenizer type '{step_dict['type']}' is not supported") + + def pre_tokenization(self) -> None: + if self.tokenizer_json["pre_tokenizer"] is None: + return + + if self.tokenizer_json["pre_tokenizer"].get("type") == "Sequence": + for pretokenizer in self.tokenizer_json["pre_tokenizer"]["pretokenizers"]: + self.parse_pre_tokenization_step(pretokenizer) + else: + self.parse_pre_tokenization_step(self.tokenizer_json["pre_tokenizer"]) + + def tokenization_model(self) -> None: + if self.tokenizer_json["model"]["type"] == "WordPiece": + self.pipeline.add_steps(WordPieceTokenizationStep.from_hf_json(self.tokenizer_json)) + self.pipeline.vocab = self.pipeline[-1].vocab + elif self.tokenizer_json["model"]["type"] == "BPE": + self.pipeline.add_steps(BPETokenizationStep.from_hf_json(self.tokenizer_json)) + self.pipeline.vocab = self.pipeline[-1].vocab + else: + raise OVTypeError(f"Tokenizer type '{self.tokenizer_json['model']['type']}' is not supported") 
+ + def post_tokenization(self) -> None: + if ( + self.tokenizer_json["post_processor"] is None + or self.tokenizer_json["post_processor"]["type"] == "ByteLevel" + ): + self.add_truncation() + self.add_padding() + return + + if self.tokenizer_json["post_processor"]["type"] == "TemplateProcessing": + combine_segments_step = CombineSegmentsStep.from_hf_json_template_postprocessor( + self.tokenizer_json, self.number_of_inputs + ) + elif self.tokenizer_json["post_processor"]["type"] == "BertProcessing": + combine_segments_step = CombineSegmentsStep.from_hf_json_bert_postprocessor( + self.tokenizer_json, self.number_of_inputs + ) + elif self.tokenizer_json["post_processor"]["type"] == "RobertaProcessing": + combine_segments_step = CombineSegmentsStep.from_hf_json_roberta_processor( + self.tokenizer_json, self.number_of_inputs + ) + else: + raise OVTypeError( + f"Post-processor type '{self.tokenizer_json['post_processor']['type']}' is not supported" + ) + + self.num_of_added_tokens += combine_segments_step.number_of_added_tokens + combine_segments_step.set_tokens_ids(self.pipeline.vocab) + + self.add_truncation() + self.pipeline.add_steps(combine_segments_step) + + self.add_padding() + + def add_truncation(self) -> None: + if self.tokenizer_json["truncation"] is not None: + self.pipeline.add_steps(TruncationStep.from_hf_json(self.tokenizer_json, self.num_of_added_tokens)) + elif self.original_tokenizer.model_max_length is not None: + self.pipeline.add_steps(TruncationStep.from_hf_object(self.original_tokenizer, self.num_of_added_tokens)) + + def add_padding(self) -> None: + if self.tokenizer_json["padding"] is not None: + self.pipeline.add_steps(PaddingStep.from_hf_json(self.tokenizer_json)) + self.pipeline[-1].set_token_id(self.pipeline.vocab) + elif self.original_tokenizer.pad_token is not None: + self.pipeline.add_steps(PaddingStep(token=self.original_tokenizer.pad_token)) + self.pipeline[-1].set_token_id(self.pipeline.vocab) + else: + 
self.pipeline.add_steps(PaddingStep()) + + def decoding(self) -> None: + if self.tokenizer_json["decoder"] is None: + return + + if self.tokenizer_json["decoder"]["type"] == "ByteLevel": + self.pipeline.add_steps(VocabDecoderStep()) + self.pipeline.add_steps(CharsToBytesStep()) + + if self.original_tokenizer.clean_up_tokenization_spaces: + self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces()) + return + + +def convert_fast_tokenizer( + hf_tokenizer: "PreTrainedTokenizerBase", + number_of_inputs: int = 1, + with_decoder: bool = False, +) -> Union[Model, Tuple[Model, Model]]: + pipeline = TransformersTokenizerPipelineParser(hf_tokenizer).parse(number_of_inputs=number_of_inputs) + ov_tokenizer = pipeline.get_encoder_ov_subgraph() + output_names = hf_tokenizer.model_input_names + + ov_tokenizer_output_names = [TOKEN_IDS_INPUT_NAME, ATTENTION_MASK_INPUT_NAME] + if len(output_names) == 3 and len(ov_tokenizer.outputs) == 3: + ov_tokenizer_output_names.insert(1, TOKEN_TYPE_IDS_INPUT_NAME) + + filtered_outputs = [] + for i, output_name in enumerate(ov_tokenizer_output_names): + current_output = next( + (output for output in ov_tokenizer.outputs if output.any_name == output_name), + False, + ) + if current_output: + filtered_outputs.append(current_output) + continue + + if output_name in output_names: + ov_tokenizer.output(i).tensor.add_names({output_name}) + filtered_outputs.append(ov_tokenizer.output(i)) + + tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_ENCODER_NAME) + if with_decoder: + return tokenizer_model, pipeline.get_decoder_ov_subgraph() + + return tokenizer_model + + +def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: + return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model") + + +def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None: + model_pb = import_protobuf() + model = 
model_pb.ModelProto() + with open(sp_model_path, "rb") as model_file: + model.ParseFromString(model_file.read()) + + add_token_dict = hf_tokenizer.tokenizer.index_special_tokens + for idx, token in sorted(add_token_dict.items()): + new_piece = deepcopy(model.pieces[-1]) + new_piece.piece = token + model.pieces.append(new_piece) + + with open(sp_model_path, "wb") as model_file: + model_file.write(model.SerializeToString()) + + +def convert_sentencepiece_model_tokenizer( + hf_tokenizer: "PreTrainedTokenizerBase", + add_attention_mask: bool = True, + with_decoder: bool = False, + streaming_decoder: bool = False, +) -> Union[Model, Tuple[Model, Model]]: + if not is_sentencepiece_model(hf_tokenizer): + raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.") + + fairseq_offset = getattr(hf_tokenizer, "fairseq_offset", None) + + with tempfile.TemporaryDirectory() as tmp: + hf_tokenizer.save_pretrained(tmp) + vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"] + + if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer": + add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer) + + sp_model = np.fromfile(vocab_file, dtype=np.uint8) + sp_model_node = as_node(sp_model) + + if hf_tokenizer.is_fast: + hf_slow_tokenizer = hf_tokenizer.slow_tokenizer_class.from_pretrained(tmp) + fairseq_offset = getattr(hf_slow_tokenizer, "fairseq_offset", None) + + input_node = op.Parameter(Type.u8, PartialShape(["?"])) + input_node.set_friendly_name("string_input") + + if is_chatglm: + add_eos_token = False + elif hasattr(hf_tokenizer, "add_eos_token"): + add_eos_token = hf_tokenizer.add_eos_token or False + else: + add_eos_token = ( + getattr(hf_tokenizer, "truncation_side", "") == "right" + or getattr(hf_tokenizer, "padding_side", "") == "right" + ) + add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False + + tokenizer_node = factory.create( + "SentencepieceTokenizer", + [sp_model_node, input_node], + { + 
"add_bos": add_bos_token, + "add_eos": add_eos_token, + "reverse": False, + "alpha": 0.0, + }, + ) + + indices, values, dense_shape = tokenizer_node.outputs() + + if fairseq_offset: + values = opset.add(values, make_constant_node(fairseq_offset, values.element_type)).output(0) + + default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type) + broadcast = opset.broadcast(default_value, dense_shape) + scatternd_input_ids = factory.create( + "ScatterNDUpdate", + [broadcast, indices, values], # FIXME: pad left side instead of right + ) + + if is_chatglm: + prefix_tokens = make_constant_node( + np.array([hf_tokenizer.get_prefix_tokens()]), dtype=scatternd_input_ids.output(0).element_type + ) + scatternd_input_ids = opset.concat([prefix_tokens, scatternd_input_ids], axis=-1) + + scatternd_input_ids.output(0).tensor.add_names({TOKEN_IDS_INPUT_NAME}) + + outputs = scatternd_input_ids.outputs() + + if add_attention_mask: + attention_mask = factory.create( + "ScatterNDUpdate", + [ + broadcast, + indices, + opset.broadcast( + make_constant_node(1, values.element_type), + opset.shape_of(values), + ), + ], + ) + + if is_chatglm: + attention_prefix = make_constant_node( + np.array([[1 for _ in hf_tokenizer.get_prefix_tokens()]]), dtype=attention_mask.output(0).element_type + ) + attention_mask = opset.concat([attention_prefix, attention_mask], axis=-1) + + attention_mask.output(0).tensor.add_names({ATTENTION_MASK_INPUT_NAME}) + outputs.append(attention_mask.output(0)) + + tokenizer_encoder = Model(outputs, [input_node], TOKENIZER_ENCODER_NAME) + tokenizer_encoder.validate_nodes_and_infer_types() + + if not with_decoder: + return tokenizer_encoder + + return tokenizer_encoder, get_sp_decoder(sp_model_node, streaming_decoder=streaming_decoder) + + +def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model: + token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) + + decoder = factory.create( + 
"SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer", + [sp_model_node, token_ids], + ).outputs() + + if streaming_decoder: + decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder) + + string_output = factory.create("StringTensorPack", decoder).outputs() + string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) + tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME) + tokenizer_decoder.validate_nodes_and_infer_types() + return tokenizer_decoder + + +def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: + try: + from tiktoken import Encoding + except ImportError: + return False + + return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken") or isinstance( + getattr(hf_tokenizer, "encoder", None), Encoding + ) + + +def convert_tiktoken_model_tokenizer( + hf_tokenizer: "PreTrainedTokenizerBase", + with_decoder: bool = False, +) -> Union[Model, Tuple[Model, Model]]: + encoding = getattr(hf_tokenizer, "tokenizer", None) or hf_tokenizer.encoder + split_pattern = encoding._pat_str + + pipeline = TokenizerPipeline() + pipeline.add_steps( + [ + NormalizeUnicode("NFC"), + RegexSplitStep(split_pattern), + BytesToCharsStep(), + BPETokenizationStep.from_tiktoken_encoding(encoding), + TruncationStep( + max_length=hf_tokenizer.model_max_length, truncate_right=(hf_tokenizer.truncation_side == "right") + ), + PaddingStep(pad_right=(hf_tokenizer.padding_side == "right")), + VocabDecoderStep(), + CharsToBytesStep(), + ] + ) + if not with_decoder: + return pipeline.get_encoder_ov_subgraph() + + return pipeline.get_encoder_ov_subgraph(), pipeline.get_decoder_ov_subgraph() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py new file mode 100644 index 000000000..e2b6ed63c --- /dev/null +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from pathlib import Path +from typing import Union + +from openvino.runtime.utils.node_factory import NodeFactory + + +factory = NodeFactory() + + +def init_extension(extension_path: Union[str, Path]) -> None: + """ + Initialize factory with compiled tokenizer extension. + + :param extension_path: path to prebuilt C++ tokenizer library. + """ + factory.add_extension(extension_path) + + +if _extension_path := os.environ.get("OV_TOKENIZER_PREBUILD_EXTENSION_PATH"): + init_extension(_extension_path) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py new file mode 100644 index 000000000..ed7c1d9e7 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/str_pack.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from io import BytesIO +from typing import List + +import numpy as np +from numpy.typing import NDArray +from openvino.runtime.exceptions import UserInputError + + +def to_bytes(number: int) -> bytes: + return number.to_bytes(4, "little") + + +def pack_string(string: str) -> NDArray: + return np.frombuffer(bytes(string, "utf-8"), dtype=np.uint8) + + +def pack_strings(strings: List[str]) -> NDArray: + """ + Convert any list of string to U8/1D numpy array compatible with converted OV model input + """ + if not isinstance(strings, list): + raise UserInputError("") + + batch_size = len(strings) + if batch_size == 0: + return to_bytes(0) + + buffer = BytesIO() + buffer.write(to_bytes(batch_size)) + symbols = BytesIO() + offset = 0 + buffer.write(to_bytes(offset)) + for string in 
strings: + byte_string = string.encode("utf-8") + offset += len(byte_string) + + buffer.write(to_bytes(offset)) + symbols.write(byte_string) + + buffer.write(symbols.getvalue()) + return np.frombuffer(buffer.getvalue(), np.uint8) + + +# TODO: handle possible signed values in batch size and offsets +def unpack_strings(u8_tensor: NDArray, decoding_errors: str = "replace") -> List[str]: + """ + Convert an array of uint8 elements to a list of strings; inverse of pack_strings + """ + + def from_bytes(offset: int, size: int) -> int: + return int.from_bytes(u8_tensor[offset : offset + size], "little") + + batch_size = from_bytes(0, 4) + strings = [] + for i in range(batch_size): + begin = from_bytes(4 + i * 4, 4) + end = from_bytes(4 + (i + 1) * 4, 4) + length = end - begin + begin += 4 * (batch_size + 2) + strings.append(bytes(u8_tensor[begin : begin + length]).decode("utf-8", errors=decoding_errors)) + return strings diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py new file mode 100644 index 000000000..270124b57 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tiktoken_parser.py @@ -0,0 +1,74 @@ +import logging +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from tiktoken import Encoding + + +# from transformers.models.gpt2.tokenization_gpt2 +@lru_cache() +def bytes_to_unicode() -> Dict[int, str]: + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = (chr(n) for n in cs) + return dict(zip(bs, cs)) + + +# https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee +byte_encoder = bytes_to_unicode() + + +def token_bytes_to_string(b: bytes) -> str: + 
def token_bytes_to_string(b: bytes) -> str:
    """Map every byte of ``b`` to its printable stand-in from ``byte_encoder``."""
    # Iterating over ``bytes`` already yields the integer byte values, so no
    # latin-1 decode / ``ord`` round-trip is needed.
    return "".join(byte_encoder[byte] for byte in b)


def bpe(mergeable_ranks: Dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> List[bytes]:
    """
    Split ``token`` into parts by repeatedly merging the lowest-ranked adjacent pair.

    :param mergeable_ranks: mapping from byte sequence to merge rank
    :param token: byte sequence to split
    :param max_rank: when given, merging stops as soon as the best available
        pair has a rank greater than or equal to this value
    :return: the remaining parts, in order
    """
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        # Stop when nothing can be merged or the best merge is not allowed yet.
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]
    return parts


def generate_vocab_and_merges(encoding: "Encoding") -> Tuple[Dict[str, int], List[str]]:
    """
    Build a string vocabulary and a merges list from a tiktoken encoding.

    Reads ``encoding._mergeable_ranks`` and ``encoding._special_tokens``.

    :param encoding: tiktoken encoding object
    :return: ``(vocab, merges)`` where ``vocab`` maps token string to id and
        ``merges`` holds space-separated pairs of token strings
    """
    mergeable_ranks = encoding._mergeable_ranks

    merges = []
    vocab = {}
    skipped_merges = 0
    for token, rank in mergeable_ranks.items():
        vocab[token_bytes_to_string(token)] = rank

        if len(token) == 1:
            continue
        merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))

        # If special tokens were added to the tokenizer, the BPE split may produce more than 2 parts.
        # E.g. if "\t" is in the vocab and the token "\t\t\t" was added before "\t\t", it is split
        # into 3 parts: bpe("\t\t\t") -> ["\t", "\t", "\t"], which cannot be expressed as a merge.
        if len(merged) == 2:
            merges.append(" ".join(map(token_bytes_to_string, merged)))
        else:
            skipped_merges += 1

    # Emit the warning once instead of once per skipped token to avoid flooding the log.
    if skipped_merges:
        logging.warning("Skip merges for added tokens. Tokenization results might be different.")

    # Also add special tokens
    vocab.update(encoding._special_tokens)

    return vocab, merges
@dataclass
class RegexNormalizationStep(NormalizationStep):
    """Normalization step that applies one regex search-and-replace via the ``RegexNormalization`` op."""

    # Pattern passed to the op as the search expression.
    regex_search_pattern: str
    # Replacement passed to the op; may contain back-references such as ``\1``.
    replace_term: str

    @classmethod
    def strip_accents_regex(cls) -> "RegexNormalizationStep":
        return cls(regex_search_pattern=r"\p{Mn}", replace_term="")

    @classmethod
    def add_prefix_whitespace_regex(cls) -> "RegexNormalizationStep":
        return cls(regex_search_pattern=r"^(\S)", replace_term=r" \1")

    @classmethod
    def del_control_chars_regex(cls) -> "RegexNormalizationStep":
        # https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L17
        return cls(
            regex_search_pattern=r"((?=[^\n\t\r])\p{Cc})|((?=[^\n\t\r])\p{Cf})",
            replace_term=" ",
        )

    @classmethod
    def clean_up_tokenization_spaces(cls) -> "RegexNormalizationStep":
        return cls(
            regex_search_pattern=r" ([\.\?\!\,])| ('[ms])| (') | ('[rv]e)",
            # Must be a raw string: a plain "\1" is the control character U+0001,
            # not a back-reference to the first capture group.
            replace_term=r"\1",
        )

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        """Append the pattern and replacement constants to ``input_nodes`` and create the op.

        Note: ``input_nodes`` is extended in place.
        """
        input_nodes.extend(
            (
                *self.create_string_constant_node(self.regex_search_pattern).outputs(),
                *self.create_string_constant_node(self.replace_term).outputs(),
            )
        )
        return factory.create("RegexNormalization", input_nodes).outputs()


@dataclass
class NMTNormalizationStep(NormalizationStep):
    """Normalization based on NMT task.

    https://github.com/huggingface/tokenizers/blob/28cd3dce2a75d106572392194ff2564574c33235/tokenizers/src/normalizers/unicode.rs#L44
    """


@dataclass
class StripAccentsStep(NormalizationStep):
    """Removes accent marks by delegating to ``RegexNormalizationStep.strip_accents_regex``."""

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        # The delegate already returns a list of outputs; calling ``.outputs()``
        # on that list would raise AttributeError.
        return RegexNormalizationStep.strip_accents_regex().get_ov_subgraph(input_nodes)


@dataclass
class DelControlCharsStep(NormalizationStep):
    """Replaces control characters by delegating to ``RegexNormalizationStep.del_control_chars_regex``."""

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        # The delegate already returns a list of outputs (see StripAccentsStep).
        return RegexNormalizationStep.del_control_chars_regex().get_ov_subgraph(input_nodes)
@dataclass
class WhitespaceSplitStep(PreTokenizatinStep):
    """Works like python `str.split`."""

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        # RegexSplitStep.get_ov_subgraph already returns the op's outputs as a
        # list; calling ``.outputs()`` on that list would raise AttributeError.
        return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes)
@dataclass
class BPETokenizationStep(TokenizationModelStep):
    """Tokenization model step that creates a ``BPETokenizer`` op from a vocab and a merges list."""

    vocab: List[str] = field(repr=False)
    merges: List[str] = field(repr=False)
    unk_token: str = ""
    fuse_unk: bool = False
    suffix_indicator: str = ""
    end_suffix: str = ""
    byte_fallback: bool = False
    # Maps token id -> token content; appended to ``vocab`` in id order.
    added_tokens: Optional[Dict[int, str]] = None

    def __post_init__(self):
        if self.added_tokens is not None:
            self.extend_vocab_with_added_tokens()

    def extend_vocab_with_added_tokens(self) -> None:
        """Append added tokens to the vocab, ordered by token id.

        A new list is assigned to ``self.vocab`` so the list object passed in
        by the caller is not modified.
        """
        if not self.added_tokens:
            return
        self.vocab = [*self.vocab, *(token for _, token in sorted(self.added_tokens.items()))]

    @classmethod
    def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
        """Create the step from a parsed Hugging Face ``tokenizer.json`` dictionary.

        Added tokens whose id is below the model vocab size are skipped.
        """
        vocab = [token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])]
        return cls(
            unk_token=tokenizer_json["model"]["unk_token"] or "",
            fuse_unk=tokenizer_json["model"]["fuse_unk"] or False,
            suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"] or "",
            end_suffix=tokenizer_json["model"]["end_of_word_suffix"] or "",
            vocab=vocab,
            merges=tokenizer_json["model"]["merges"],
            added_tokens={
                token["id"]: token["content"] for token in tokenizer_json["added_tokens"] if token["id"] >= len(vocab)
            },
        )

    @classmethod
    def from_tiktoken_encoding(
        cls,
        encoding: "Encoding",  # noqa
        added_tokens: Optional[Dict[int, str]] = None,
    ) -> "BPETokenizationStep":
        """Create the step from a tiktoken encoding; the vocab is ordered by token id."""
        from .tiktoken_parser import generate_vocab_and_merges

        vocab, merges = generate_vocab_and_merges(encoding)
        return cls(
            unk_token="",
            fuse_unk=False,
            suffix_indicator="",
            end_suffix="",
            vocab=[token for token, idx in sorted(vocab.items(), key=lambda x: x[1])],
            merges=merges,
            added_tokens=added_tokens,
        )

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        """Append vocab and merges constants to ``input_nodes`` and create the ``BPETokenizer`` op.

        The vocab outputs are also stored on the owning pipeline as
        ``vocab_node_outputs``. Note: ``input_nodes`` is extended in place.
        """
        pipeline = self.get_pipeline()
        vocab_outputs = self.create_string_constant_node(self.vocab).outputs()
        pipeline.vocab_node_outputs = vocab_outputs
        input_nodes.extend(
            (
                *vocab_outputs,
                *self.create_string_constant_node(self.merges).outputs(),
            )
        )
        return factory.create(
            "BPETokenizer",
            input_nodes,
            {
                "unk_token": self.unk_token,
                "fuse_unk": self.fuse_unk,
                "suffix_indicator": self.suffix_indicator,
                "end_suffix": self.end_suffix,
                "byte_fallback": self.byte_fallback,
            },
        ).outputs()
@dataclass
class TruncationStep(PostTokenizationStep):
    """Post-tokenization step that limits every ragged row to at most ``max_length`` elements."""

    max_length: int
    # True: keep the first ``max_length`` elements; False: keep the last ones.
    truncate_right: bool = True
    axis: int = -1

    @classmethod
    def from_hf_json(cls, tokenizer_json: Dict[str, Any], num_of_added_tokens: int = 0) -> "TruncationStep":
        """Create the step from the ``truncation`` section of a parsed ``tokenizer.json``."""
        max_length = min(
            tokenizer_json["truncation"]["max_length"] - num_of_added_tokens,
            2**31 - 1 - num_of_added_tokens,
        )
        return cls(
            max_length=max_length,
            truncate_right=tokenizer_json["truncation"]["direction"] == "Right",
        )

    @classmethod
    def from_hf_object(cls, tokenizer: Any, num_of_added_tokens: int = 0) -> "TruncationStep":
        """Create the step from ``tokenizer.model_max_length`` and ``tokenizer.truncation_side``."""
        max_length = min(
            tokenizer.model_max_length - num_of_added_tokens,
            2**31 - 1 - num_of_added_tokens,
        )
        return cls(
            max_length=max_length,
            truncate_right=tokenizer.truncation_side == "right",
        )

    @staticmethod
    def validate_inputs(input_nodes):
        """Raise ``UserInputError`` unless exactly three nodes (one decomposed ragged tensor) are given."""
        if len(input_nodes) != 3:
            raise UserInputError("Only one input ragged tensor is supported as an input for TruncationStep")

    def get_ov_subgraph(self, input_nodes: List[Output]):
        """Return ``[begins, ends, data]`` with the row bounds narrowed to ``max_length``.

        :param input_nodes: three outputs, used here as (begins, ends, data)
        """
        # TODO: Check if axis is the right-most dimension
        self.validate_inputs(input_nodes)
        begins, ends, data = input_nodes

        # Per-row length after truncation: min(end - begin, max_length).
        kept_length = opset.minimum(
            opset.subtract(ends, begins),
            make_constant_node(self.max_length, Type.i32),
        )
        if self.truncate_right:
            # Keep the head of every row: move the end towards the begin.
            return [begins, opset.add(begins, kept_length).output(0), data]
        # Keep the tail of every row: move the begin towards the end.
        return [opset.subtract(ends, kept_length).output(0), ends, data]
@dataclass
class CombineSegmentsStep(PostTokenizationStep):
    """Post-tokenization step that interleaves sequence inputs with special tokens via ``CombineSegments``."""

    # Template of the combined output: ``Sequence`` placeholders and ``AddToken`` special tokens.
    inputs: List[TokenWithTypeId] = field(default_factory=list)
    # One segment id per entry of ``inputs``; derived in ``__post_init__`` when not given.
    segment_ids: Optional[List[int]] = None
    axis: int = -1

    def __post_init__(self):
        if self.segment_ids is not None:
            return

        segment_ids_tensor = [node.token_type_id for node in self.inputs]
        # If any entry has no type id, fall back to segment 0 for all entries.
        if any(segment is None for segment in segment_ids_tensor):
            segment_ids_tensor = [0] * len(self.inputs)

        self.segment_ids = segment_ids_tensor

    def set_tokens_ids(self, vocab: Optional[List[str]]) -> None:
        """Resolve the id of every ``AddToken`` entry by looking its token up in ``vocab``."""
        for input_ in self.inputs:
            if isinstance(input_, AddToken):
                input_.set_token_id(vocab)

    @property
    def number_of_added_tokens(self) -> int:
        return sum(1 for input_ in self.inputs if isinstance(input_, AddToken))

    @classmethod
    def from_hf_json_template_postprocessor(
        cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
    ) -> "CombineSegmentsStep":
        """Build the step from the ``single`` or ``pair`` template of the ``post_processor`` section."""
        inputs: List[TokenWithTypeId] = []
        if number_of_inputs == 1:
            post_processor = tokenizer_json["post_processor"]["single"]
        else:
            post_processor = tokenizer_json["post_processor"]["pair"]

        for template_dict in post_processor:
            if "SpecialToken" in template_dict:
                step = AddToken(
                    token=template_dict["SpecialToken"]["id"],
                    token_type_id=template_dict["SpecialToken"]["type_id"],
                )
                inputs.append(step)
            else:
                inputs.append(Sequence(token_type_id=template_dict["Sequence"]["type_id"]))
        return cls(inputs)

    @classmethod
    def from_hf_json_bert_postprocessor(
        cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
    ) -> "CombineSegmentsStep":
        """Build ``cls, seq, sep`` (plus ``seq, sep`` for a second input) from the ``post_processor`` section."""
        post_processor_dict = tokenizer_json["post_processor"]
        inputs: List[TokenWithTypeId] = [
            AddToken(
                token=post_processor_dict["cls"][0],
                token_type_id=0,
            ),
            Sequence(token_type_id=0),
            AddToken(
                token=post_processor_dict["sep"][0],
                token_type_id=0,
            ),
        ]
        if number_of_inputs == 2:
            inputs.extend(
                [
                    Sequence(token_type_id=1),
                    AddToken(
                        token=post_processor_dict["sep"][0],
                        token_type_id=1,
                    ),
                ]
            )
        return cls(inputs)

    @classmethod
    def from_hf_json_roberta_processor(
        cls, tokenizer_json: Dict[str, Any], number_of_inputs: int = 1
    ) -> "CombineSegmentsStep":
        """Build ``cls, seq, sep`` for a single input; two inputs raise ``UserInputError``."""
        if number_of_inputs == 2:
            raise UserInputError("Two inputs not supported for RoBERTa processor")

        post_processor_dict = tokenizer_json["post_processor"]

        inputs: List[TokenWithTypeId] = [Sequence(token_type_id=0)]

        if not post_processor_dict.get("add_special_tokens", True):
            return cls(inputs)

        inputs.insert(0, AddToken(token=post_processor_dict["cls"][0], token_type_id=0))
        inputs.append(AddToken(token=post_processor_dict["sep"][0], token_type_id=0))
        return cls(inputs)

    def validate_inputs(self, input_nodes: List[Output]) -> None:
        """Raise ``UserInputError`` unless there are exactly three nodes per ``Sequence`` entry."""
        number_of_sequence_inputs = sum(1 for input_ in self.inputs if isinstance(input_, Sequence))
        expected_nodes = 3 * number_of_sequence_inputs
        if len(input_nodes) != expected_nodes:
            raise UserInputError(
                f"Number of input nodes: {len(input_nodes)}, must be equal to {expected_nodes} "
                f"(3 nodes for each of {number_of_sequence_inputs} sequence inputs)"
            )

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        """Create the ``CombineSegments`` op from the sequence inputs and the special-token constants."""
        self.validate_inputs(input_nodes)

        op_inputs = []
        input_nodes_iter = iter(input_nodes)
        for node in self.inputs:
            if isinstance(node, Sequence):
                # Every sequence consumes the next (begins, ends, data) triple.
                op_inputs.extend(islice(input_nodes_iter, 3))
            elif isinstance(node, AddToken):
                if node._token_id is None:
                    # Fail early with a readable message instead of building a constant from None.
                    raise UserInputError(
                        f"Token id for special token {node.token!r} is not set; "
                        "call set_tokens_ids with a vocab that contains it"
                    )
                # Put a scalar as a ragged tensor with scalar shape and a single element
                op_inputs.extend(make_constant_node(0, Type.i32).outputs())
                op_inputs.extend(make_constant_node(1, Type.i32).outputs())
                op_inputs.append(make_constant_node(np.array([node._token_id]), Type.i32).output(0))
            else:
                raise UserInputError(f"Unexpected node type in CombineSegments: {type(node)}")

        op_inputs.append(make_constant_node(self.segment_ids, Type.i32).output(0))
        return factory.create("CombineSegments", op_inputs).outputs()
@dataclass
class DecodingStep(BasePipelineStep):
    """Base class for steps collected by ``TokenizerPipeline.decoding_steps``."""


@dataclass
class VocabDecoderStep(DecodingStep):
    """Decoding step that creates a ``VocabDecoder`` op using the vocab constant stored on the pipeline."""

    def get_vocab_node_outputs(self) -> Optional[List[Output]]:
        return self.get_pipeline().vocab_node_outputs

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        vocab_outputs = self.get_vocab_node_outputs()
        if vocab_outputs is None:
            # Without this check ``extend(None)`` fails with an unrelated-looking TypeError.
            raise UserInputError(
                "Vocabulary node is not set on the pipeline; "
                "a tokenization step must populate `vocab_node_outputs` before the decoder is built"
            )
        input_nodes.extend(vocab_outputs)
        return factory.create("VocabDecoder", input_nodes, {}).outputs()


@dataclass
class CharsToBytesStep(DecodingStep):
    """Decoding step that creates a ``CharsToBytes`` op."""

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        return factory.create("CharsToBytes", input_nodes, {}).outputs()


@dataclass
class RegexDecodingStep(DecodingStep):
    """Decoding step that applies one regex search-and-replace via the ``RegexNormalization`` op."""

    regex_search_pattern: str
    replace_term: str

    @classmethod
    def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep":
        return cls(
            regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)",
            replace_term=r"\1",
        )

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        # Note: ``input_nodes`` is extended in place with the two string constants.
        input_nodes.extend(
            (
                *self.create_string_constant_node(self.regex_search_pattern).outputs(),
                *self.create_string_constant_node(self.replace_term).outputs(),
            )
        )
        return factory.create("RegexNormalization", input_nodes).outputs()

    @classmethod
    def replace_sp_spaces(cls) -> "RegexDecodingStep":
        return cls(
            regex_search_pattern="▁",
            replace_term=" ",
        )
__getitem__(self, item: int) -> BasePipelineStep: + return self.steps[item] + + def get_encoder_ov_subgraph(self) -> Model: + string_inputs = [op.Parameter(Type.u8, PartialShape(["?"])) for _ in range(self.number_of_inputs)] + + processing_outputs = [] + for input_node in string_inputs: + input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs() + for step in self.normalization_steps: + input_node = step.get_ov_subgraph(input_node) + input_node = self.add_ragged_dimension(input_node) + + for step in chain(self.pre_tokenization_steps, self.tokenization_steps): + input_node = step.get_ov_subgraph(input_node) + + processing_outputs.extend(input_node) + + for step in self.post_tokenization_steps: + processing_outputs = step.get_ov_subgraph(processing_outputs) + + return Model(processing_outputs, string_inputs, name=TOKENIZER_ENCODER_NAME) + + @property + def normalization_steps(self) -> List[NormalizationStep]: + return [step for step in self.steps if isinstance(step, NormalizationStep)] + + @property + def pre_tokenization_steps(self) -> List[PreTokenizatinStep]: + return [step for step in self.steps if isinstance(step, PreTokenizatinStep)] + + @property + def tokenization_steps(self) -> List[TokenizationModelStep]: + return [step for step in self.steps if isinstance(step, TokenizationModelStep)] + + @property + def post_tokenization_steps(self) -> List[PostTokenizationStep]: + return [step for step in self.steps if isinstance(step, PostTokenizationStep)] + + @property + def decoding_steps(self) -> List[DecodingStep]: + return [step for step in self.steps if isinstance(step, DecodingStep)] + + @staticmethod + def add_ragged_dimension(input_node: List[Output]) -> List[Output]: + shape = opset.shape_of(input_node[0]) + batch_size = opset.gather(shape, as_node(0), as_node(0)) + ragged_begins = opset.range(as_node(0), batch_size, as_node(1), output_type="i32").outputs() + ragged_ends = opset.range( + as_node(1), opset.add(batch_size, as_node(1)), 
def connect_models(
    first: Model,
    second: Model,
    name_map: Optional[Union[Sequence[Tuple[str, str]], Dict[str, str]]] = None,
    by_indices: bool = False,
    keep_second_model_unaligned_inputs: bool = True,
    keep_remaining_first_model_outputs: bool = False,
) -> Model:
    """
    Connect outputs of ``first`` to inputs of ``second`` and return the combined model.

    :param first: model whose outputs feed the second model
    :param second: model whose inputs are replaced by outputs of the first model
    :param name_map: pairs (or dict) of ``first output name -> second input name``;
        when omitted and ``by_indices`` is False, ports are matched by the first model's output names
    :param by_indices: match outputs to inputs by position instead of by name
    :param keep_second_model_unaligned_inputs: keep unmatched inputs of ``second`` as inputs of the result
    :param keep_remaining_first_model_outputs: keep unmatched outputs of ``first`` as outputs of the result
    :return: the connected model, named ``<first>_with_<second>``
    """
    if by_indices:
        min_len = min(len(first.outputs), len(second.inputs))
        aligned_first_outputs = first.outputs[:min_len]
        aligned_second_inputs = second.inputs[:min_len]
    elif name_map is None:
        aligned_first_outputs = first.outputs
        aligned_second_inputs = [second.input(model1_output.get_any_name()) for model1_output in aligned_first_outputs]
    else:
        if isinstance(name_map, dict):
            name_map = list(name_map.items())
        aligned_first_outputs = [first.output(name1) for name1, _ in name_map]
        aligned_second_inputs = [second.input(name2) for _, name2 in name_map]

    for second_input, first_output in zip(aligned_second_inputs, aligned_first_outputs):
        logger.debug(f"Connecting: {first_output.get_any_name()} -> {second_input.get_any_name()}")
        for target in second_input.get_target_inputs():
            # Rewire to the value feeding the first model's result node rather than to the result itself.
            target.replace_source_output(first_output.get_node().input_value(0))
            # target.replace_source_output(model1_output)  # TODO: Produces incorrect topology

    new_inputs = first.get_parameters()
    remaining_inputs = [input_ for input_ in second.inputs if input_ not in aligned_second_inputs]
    if keep_second_model_unaligned_inputs:
        new_inputs.extend(remaining_inputs)
    elif remaining_inputs:
        # Use get_any_name(), as in the debug message above, and name the real keyword argument.
        logger.info(
            "Some inputs of the second model were left uncovered and not included in the connected model: "
            + ", ".join(input_.get_any_name() for input_ in remaining_inputs)
            + ". To add them set `keep_second_model_unaligned_inputs` to `True`"
        )

    new_outputs = second.outputs
    remaining_outputs = [output for output in first.outputs if output not in aligned_first_outputs]
    if keep_remaining_first_model_outputs:
        new_outputs.extend(remaining_outputs)
    elif remaining_outputs:
        logger.info(
            "Some outputs of the first model were left uncovered and not included in the connected model: "
            + ", ".join(output.get_any_name() for output in remaining_outputs)
            + ". To add them set `keep_remaining_first_model_outputs` to `True`"
        )

    connected_model = Model(new_outputs, new_inputs, f"{first.get_name()}_with_{second.get_name()}")
    # TODO: Cleanup model1 and mode2 to avoid using them, they are ill-formed after the reconnection
    connected_model.validate_nodes_and_infer_types()
    return connected_model
"ruff", + "pytest", +] +transformers = [ + "transformers[sentencepiece,tiktoken]" +] +tiktoken = [ + "tiktoken" +] +all = [ + "ov_tokenizer[dev,transformers,tiktoken]" +] + +[tool.ruff] +ignore = ["C901", "E501", "E741", "W605"] +select = ["C", "E", "F", "I", "W"] +line-length = 119 + +[tool.ruff.per-file-ignores] +"__init__.py" = ["F401"] +"ov_tokenizer/hf_parser.py" = ["F821"] + +[tool.ruff.isort] +lines-after-imports = 2 diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py new file mode 100644 index 000000000..054388410 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py @@ -0,0 +1,53 @@ +import json +import os +from math import isclose +from pathlib import Path + +import pytest + + +def prebuild_extenson_path(): + ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH") + if not ext_path: + raise EnvironmentError( + "No extension path found in the environment. " + "Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable." 
@pytest.hookimpl(trylast=True)
def pytest_sessionfinish(session, exitstatus) -> None:
    """
    Compare the session pass rate with the stored one.

    Only acts when the session finished with failed tests. The run is marked
    as OK when the pass rate is close to or above the stored value (the stored
    value is updated in the latter case); otherwise the failure status is kept.
    """
    if exitstatus != pytest.ExitCode.TESTS_FAILED:
        return

    # Key of the stored rate: common prefix of all collected test ids.
    node_ids = [item.nodeid for item in session.items]
    rate_key = os.path.commonprefix(node_ids).strip("[]")

    with open(PASS_RATES_FILE) as rates_file:
        stored_rates = json.load(rates_file)

    current_rate = 1 - session.testsfailed / session.testscollected
    stored_rate = stored_rates.get(rate_key, 0)

    terminal = session.config.pluginmanager.get_plugin("terminalreporter")

    if isclose(current_rate, stored_rate):
        session.exitstatus = pytest.ExitCode.OK
        terminal.write_line(f"New pass rate isclose to previous: {current_rate}")
    elif current_rate > stored_rate:
        terminal.write_line(f"New pass rate {current_rate} is bigger then previous: {stored_rate}")
        session.exitstatus = pytest.ExitCode.OK
        stored_rates[rate_key] = current_rate

        with open(PASS_RATES_FILE, "w") as rates_file:
            json.dump(stored_rates, rates_file, indent=4)
    else:
        terminal.write_line(f"Pass rate is lower! Current: {current_rate}, previous: {stored_rate}")
test, string?!", + "Multiline\nstring!\nWow!", + "A lot\t w!", + "A lot\t\tof whitespaces!", + "\n\n\n\t\t A lot\t\tof\twhitespaces\n!\n\n\n\t\n\n", + "Eng, but with d1gits: 123; 0987654321, stop." "0987654321 - eng, but with d1gits: 123", +] +multilingual_test_strings = [ + "Тестовая строка!", + "Testzeichenfolge?", + "Tester, la chaîne...", + "測試字符串", + "سلسلة الاختبار", + "מחרוזת בדיקה", + "Сынақ жолы", + "رشته تست", +] +emoji_test_strings = [ + "😀", + "😁😁", + "🤣🤣🤣😁😁😁😁", + "🫠", # melting face + "🤷‍♂️", + "🤦🏼‍♂️", +] +misc_strings = [ + "", + " ", + " " * 10, + "\n", + " \t\n", +] + +wordpiece_models = [ + "bert-base-multilingual-cased", + "bert-large-cased", + "cointegrated/rubert-tiny2", + "distilbert-base-uncased-finetuned-sst-2-english", + "sentence-transformers/all-MiniLM-L6-v2", + "rajiv003/ernie-finetuned-qqp", # ernie model with fast tokenizer + "google/electra-base-discriminator", + "google/mobilebert-uncased", + "jhgan/ko-sbert-sts", + "squeezebert/squeezebert-uncased", + "prajjwal1/bert-mini", + "ProsusAI/finbert", + "rasa/LaBSE", +] +bpe_models = [ + "stabilityai/stablecode-completion-alpha-3b-4k", + "EleutherAI/gpt-neo-125m", + "EleutherAI/gpt-j-6b", + "roberta-base", + "sentence-transformers/all-roberta-large-v1", # standin for setfit + "facebook/bart-large-mnli", + "facebook/opt-66b", + "gpt2", + "EleutherAI/gpt-neox-20b", + "ai-forever/rugpt3large_based_on_gpt2", + "KoboldAI/fairseq-dense-13B", + "facebook/galactica-120b", + "EleutherAI/pythia-12b-deduped", + "microsoft/deberta-base", + "bigscience/bloom", + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", + "Salesforce/codegen-16B-multi", + # "google/flan-t5-xxl", # needs Precompiled/CharsMap + # "jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer + # "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work +] +sentencepiece_models = [ + "codellama/CodeLlama-7b-hf", + "camembert-base", + "NousResearch/Llama-2-13b-hf", + "xlm-roberta-base", + 
"microsoft/deberta-v3-base", + "xlnet-base-cased", + # "THUDM/chatglm-6b", # hf_tokenizer init error + "THUDM/chatglm2-6b", # detokenizer cannot filter special tokens + "THUDM/chatglm3-6b", + # "t5-base", # crashes tests +] +tiktiken_models = [ + "Qwen/Qwen-14B-Chat", + "Salesforce/xgen-7b-8k-base", +] + + +def get_tokenizer(request, fast_tokenizer=True, trust_remote_code=False): + hf_tokenizer = AutoTokenizer.from_pretrained( + request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code + ) + ov_tokenizer = convert_tokenizer(hf_tokenizer, with_decoder=False) + compiled_tokenizer = core.compile_model(ov_tokenizer) + return hf_tokenizer, compiled_tokenizer + + +def get_tokenizer_detokenizer(request, fast_tokenizer=True, trust_remote_code=False): + hf_tokenizer = AutoTokenizer.from_pretrained( + request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code + ) + ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True) + compiled_tokenizer = core.compile_model(ov_tokenizer) + compiled_detokenizer = core.compile_model(ov_detokenizer) + return hf_tokenizer, compiled_tokenizer, compiled_detokenizer + + +@pytest.fixture(scope="session", params=wordpiece_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_and_ov_wordpiece_tokenizers(request): + return get_tokenizer(request) + + +@pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_and_ov_bpe_tokenizers(request): + return get_tokenizer_detokenizer(request) + + +@pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_and_ov_bpe_detokenizer(request): + return get_tokenizer_detokenizer(request) + + +@pytest.fixture(scope="session", params=[True, False], ids=lambda is_fast: "Fast" if is_fast else "Slow") +def is_fast_tokenizer(request): + return request.param + + +@pytest.fixture(scope="session", params=sentencepiece_models, ids=lambda checkpoint: 
checkpoint.split("/")[-1]) +def sentencepice_model_tokenizers(request, is_fast_tokenizer): + return get_tokenizer_detokenizer(request, is_fast_tokenizer, trust_remote_code=True) + + +@pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def tiktoken_tokenizers(request): + return get_tokenizer(request, trust_remote_code=True) + + +@pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def tiktoken_detokenizers(request): + return get_tokenizer_detokenizer(request, trust_remote_code=True) + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_hf_wordpiece_tokenizers(hf_and_ov_wordpiece_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers + packed_strings = pack_strings([test_string]) + + hf_tokenized = hf_tokenizer([test_string], return_tensors="np") + ov_tokenized = ov_tokenizer(packed_strings) + + for output_name, hf_result in hf_tokenized.items(): + assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + eng_test_strings, + multilingual_test_strings, + emoji_test_strings, + misc_strings, + ], +) +def test_hf_wordpiece_tokenizers_multiple_strings(hf_and_ov_wordpiece_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = hf_and_ov_wordpiece_tokenizers + packed_strings = pack_strings(test_string) + + hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True) + ov_tokenized = ov_tokenizer(packed_strings) + + for output_name, hf_result in hf_tokenized.items(): + assert np.all((ov_result := ov_tokenized[output_name]) == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def 
test_sentencepiece_model_tokenizer(sentencepice_model_tokenizers, test_string): + hf_tokenizer, ov_tokenizer, _ = sentencepice_model_tokenizers + + hf_tokenized = hf_tokenizer(test_string, return_tensors="np") + ov_tokenized = ov_tokenizer(pack_strings([test_string])) + + for output_name, hf_result in hf_tokenized.items(): + # chatglm has token_type_ids output that we omit + if (ov_result := ov_tokenized.get(output_name)) is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_sentencepiece_model_detokenizer(sentencepice_model_tokenizers, test_string): + hf_tokenizer, _, ov_detokenizer = sentencepice_model_tokenizers + + token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids + hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) + ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) + + assert ov_output == hf_output + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_hf_bpe_tokenizers_outputs(hf_and_ov_bpe_tokenizers, test_string): + hf_tokenizer, ov_tokenizer, _ = hf_and_ov_bpe_tokenizers + packed_strings = pack_strings([test_string]) + + hf_tokenized = hf_tokenizer([test_string], return_tensors="np") + ov_tokenized = ov_tokenizer(packed_strings) + + for output_name, hf_result in hf_tokenized.items(): + # galactica tokenizer has 3 output, but model has 2 inputs + if (ov_result := ov_tokenized.get(output_name)) is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): 
+ hf_tokenizer, _, ov_detokenizer = hf_and_ov_bpe_detokenizer + + token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids + hf_output = hf_tokenizer.batch_decode(token_ids) + ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) + + assert ov_output == hf_output + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string): + hf_tokenizer, ov_tokenizer = tiktoken_tokenizers + + hf_tokenized = hf_tokenizer(test_string, return_tensors="np") + ov_tokenized = ov_tokenizer(pack_strings([test_string])) + + for output_name, hf_result in hf_tokenized.items(): + if (ov_result := ov_tokenized.get(output_name)) is not None: + assert np.all(ov_result == hf_result), f"{hf_result}\n{ov_result}" + + +@pytest.mark.parametrize( + "test_string", + [ + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ], +) +def test_tiktoken_detokenizer(tiktoken_detokenizers, test_string): + hf_tokenizer, _, ov_detokenizer = tiktoken_detokenizers + + token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids + hf_output = hf_tokenizer.batch_decode(token_ids, skip_special_tokens=True) + ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"]) + + assert ov_output == hf_output + + +def test_streaming_detokenizer(): + hf_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2") + _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True, streaming_decoder=True) + ov_detokenizer = core.compile_model(ov_detokenizer) + + test_string = "this is a test string" + tokenized_string = hf_tokenizer(test_string).input_ids + hf_detokenized = hf_tokenizer.decode(tokenized_string) + + detokenized_string = "" + for token in tokenized_string: + ov_output = 
unpack_strings(ov_detokenizer(np.atleast_2d(token))["string_output"])[0] + detokenized_string += ov_output + + assert detokenized_string == hf_detokenized diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp new file mode 100644 index 000000000..6276d13df --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ragged_tensor_pack.hpp" +#include "utils.hpp" + +using namespace ov; + + +void RaggedTensorPack::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 3); + OPENVINO_ASSERT(get_input_element_type(0) == element::i32); + OPENVINO_ASSERT(get_input_element_type(1) == element::i32); + + // Pass through the base tensor which is used to build ragged dimensions + // TODO: Provide correct implementation that saves information about ragged structure + // TODO: Requires single-tensor packed representation for ragged tensor + set_output_type(0, get_input_element_type(2), get_input_partial_shape(2)); +} + + +bool RaggedTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto input_shape = inputs[0].get_shape(); + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto num_elements = shape_size(input_shape); + + inputs[2].copy_to(outputs[0]); + + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp new file mode 100644 index 000000000..edcbf4bbc --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_tensor_pack.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Having a decomposed 
representation for a tensor, converts it to a single string tensor for debugging purposes and to facilitate model conversion +// Base tensor on which this operation builds a ragged tensor can have any shape or type, this operation doesn't try to interpret it. +class RaggedTensorPack : public ov::op::Op { +public: + OPENVINO_OP("RaggedTensorPack"); + + RaggedTensorPack () = default; + + RaggedTensorPack(ov::OutputVector inputs) + : ov::op::Op(inputs) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool has_evaluate() const { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp new file mode 100644 index 000000000..acb145d11 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "ragged_to_dense.hpp" +#include "utils.hpp" + +using namespace ov; +using op::v0::Constant; + +void RaggedToDense::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 3 + 1 + 1); + + // Input ragged tensor + check_ragged_input(this, 0); + + // Target size along ragged dimension + OPENVINO_ASSERT(get_input_element_type(3).is_integral_number()); + auto rank = get_input_partial_shape(3).rank(); + OPENVINO_ASSERT( + rank.is_dynamic() || + rank.get_length() == 0 || + rank.get_length() == 1 && get_input_partial_shape(3)[0].compatible(1), + "Target dense dimension size for RaggedToDense should be a 0D or 1D tensor 
with a single element"); + + // Default value to fill out of ragged range elements in output tensor + OPENVINO_ASSERT(get_input_element_type(4).compatible(get_input_element_type(2))); + auto input4_rank = get_input_partial_shape(4).rank(); + OPENVINO_ASSERT(input4_rank.compatible(0)); + + set_input_is_relevant_to_shape(3); + + if(get_input_partial_shape(0).rank().is_dynamic()) { + set_output_type(0, get_input_element_type(2), PartialShape::dynamic()); + set_output_type(1, element::boolean, PartialShape::dynamic()); + } else { + auto shape = get_input_partial_shape(0); + if(auto target_dim = dynamic_cast(get_input_node_ptr(3))) { + shape.push_back(target_dim->cast_vector()[0]); + } else { + shape.push_back(Dimension()); + } + set_output_type(0, get_input_element_type(2), shape); + set_output_type(1, element::boolean, shape); + } +} + + +bool RaggedToDense::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + // FIXME: Works for POD types only (not for strings!) + // FIXME: Output mask is calculated even if there are no consumers + auto begins = inputs[0].data(); + auto ends = inputs[1].data(); + auto nelems = inputs[0].get_size(); + auto elems = reinterpret_cast(inputs[2].data()); + auto elem_size = inputs[2].get_element_type().size(); + auto default_value = reinterpret_cast(inputs[4].data()); + + // Suppose validate was called and set correct output shape + // Take a target shape value for ragged dimension + size_t target_dim = outputs[0].get_shape().back(); + + auto out_elems = reinterpret_cast(outputs[0].data()); + auto out_mask = outputs[1].data(); + + auto out_elem_orig = out_elems; + auto out_mask_orig = out_mask; + + for(size_t i = 0; i < nelems; ++i) { + auto begin = elems + elem_size*begins[i]; + auto len = std::min(size_t(ends[i] - begins[i]), target_dim); // truncation + auto end = begin + elem_size*len; + out_elems = std::copy(begin, end, out_elems); + out_mask = std::fill_n(out_mask, len, char(1)); + if(len < target_dim) + 
out_mask = std::fill_n(out_mask, target_dim - len, char(0)); + while(len < target_dim) { + out_elems = std::copy(default_value, default_value + elem_size, out_elems); + ++len; + } + } + + OPENVINO_ASSERT(out_elems == out_elem_orig + outputs[0].get_byte_size()); + OPENVINO_ASSERT(out_mask == out_mask_orig + outputs[1].get_byte_size()); + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp new file mode 100644 index 000000000..2d543fdb2 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/ragged_to_dense.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Takes a ragged tensor with one ragged right-most dimension and produces a normal tensor +class RaggedToDense : public ov::op::Op { +public: + OPENVINO_OP("RaggedToDense"); + + RaggedToDense () = default; + + RaggedToDense(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp new file mode 100644 index 000000000..bc93545f6 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + + + +#include 
"regex_normalization.hpp" +#include "utils.hpp" + +using namespace ov; + + +RegexNormalization::RegexNormalization(const ov::OutputVector& arguments) : + ov::op::Op(arguments) { + constructor_validate_and_infer_types(); + } + + +RegexNormalization::RegexNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr& search_pattern_re, + const absl::string_view replace_pattern + ) : ov::op::Op(arguments), m_search_pattern_re(search_pattern_re), m_replace_pattern(replace_pattern) { + if (m_search_pattern_re == nullptr) { + auto search_pattern_const = as_type_ptr(arguments[3].get_node_shared_ptr()); + auto replace_pattern_const = as_type_ptr(arguments[4].get_node_shared_ptr()); + auto search_pattern_buf = static_cast(search_pattern_const->get_data_ptr()); + auto replace_pattern_buf = static_cast(replace_pattern_const->get_data_ptr()); + auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size()); + m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size()); + m_search_pattern_re = std::make_shared(search_pattern); + }; + constructor_validate_and_infer_types(); + } + + +void RegexNormalization::validate_and_infer_types() { + check_string_input(this, 0); + check_string_scalar_input(this, 3); + check_string_scalar_input(this, 4); + set_string_output(this, 0, get_input_partial_shape(0)); +} + + +bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + return evaluate_normalization_helper( + outputs, inputs, + [this](const std::string& str) { + // FIXME: if regex is not valid re2, return string without changing (use another regex engine) + if (m_search_pattern_re->NumberOfCapturingGroups() == -1) + return str; + + std::string result = str; + re2::RE2::GlobalReplace(&result, *m_search_pattern_re, m_replace_pattern); + return result; + }); +} diff --git 
a/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp new file mode 100644 index 000000000..2f3924ec7 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_normalization.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "normalizer.h" // for absl::string_view + +#include +#include "openvino/opsets/opset10.hpp" +#include "fast_tokenizer/normalizers/normalizers.h" + +using namespace ov; +using namespace ov::opset10; + +class RegexNormalization : public ov::op::Op { +public: + OPENVINO_OP("RegexNormalization"); + + RegexNormalization () = default; + RegexNormalization(const ov::OutputVector& arguments); // does not used + RegexNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr& search_pattern_re, + const absl::string_view replace_pattern + ); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_search_pattern_re, m_replace_pattern); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } +private: + std::shared_ptr m_search_pattern_re; + absl::string_view m_replace_pattern; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp new file mode 100644 index 000000000..235a54cce --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/util/framework_node.hpp" 
+#include "openvino/opsets/opset10.hpp" + +#include "regex_split.hpp" +#include "utils.hpp" +#include "fast_tokenizer/normalizers/normalizers.h" + +using namespace ov; +using namespace ov::opset10; + +namespace { + +using paddlenlp::fast_tokenizer::core::SplitMode; +const std::map split_modes = { + {"remove", SplitMode::REMOVED}, + {"isolate", SplitMode::ISOLATED}, + {"contiguous", SplitMode::CONTIGUOUS}, + {"merge_with_previous", SplitMode::MERGED_WITH_PREVIOUS}, + {"merge_with_next", SplitMode::MERGED_WITH_NEXT}, +}; + +} + + +RegexSplit::RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour, bool invert) : + ov::op::Op(arguments), + m_behaviour(behaviour), + m_invert(invert) { + constructor_validate_and_infer_types(); +} + + +RegexSplit::RegexSplit( + const ov::OutputVector& arguments, + const std::shared_ptr& pretokenizer, + const std::string& behaviour, + bool invert +) : + ov::op::Op(arguments), + m_pretokenizer(pretokenizer), + m_behaviour(behaviour), + m_invert(invert) { + + if (m_pretokenizer == nullptr) { + auto split_pattern_const = as_type_ptr(arguments[5].get_node_shared_ptr()); + auto split_pattern_buf = static_cast(split_pattern_const->get_data_ptr()); + auto split_pattern = std::string(split_pattern_buf, split_pattern_const->get_byte_size()); + m_pretokenizer = std::make_shared(split_pattern, split_modes.at(behaviour), invert); + }; + + constructor_validate_and_infer_types(); +} + + +void RegexSplit::validate_and_infer_types() { + check_ragged_string_input(this, 0); + check_string_scalar_input(this, 5); + OPENVINO_ASSERT(split_modes.find(m_behaviour) != split_modes.end(), "RegexSplit doesn't support unknown split mode: " + m_behaviour); + set_ragged_string_output(this, 0, get_input_partial_shape(0)); +} + +bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ragged_begins = inputs[0].data(); + auto ragged_ends = inputs[1].data(); + auto begins = inputs[2].data(); + auto ends = 
inputs[3].data(); + auto chars = inputs[4].data(); + + outputs[4] = inputs[4]; + const size_t num_rows = inputs[0].get_size(); + const size_t num_chars = inputs[4].get_size(); + + outputs[0].set_shape(inputs[0].get_shape()); + outputs[1].set_shape(inputs[1].get_shape()); + + outputs[2].set_shape(Shape{num_chars}); + outputs[3].set_shape(Shape{num_chars}); + + outputs[4] = inputs[4]; + + // Get pointers in the output tensors + auto new_ragged_begins = outputs[0].data(); + auto new_ragged_ends = outputs[1].data(); + auto new_begins = outputs[2].data(); + auto new_ends = outputs[3].data(); + int32_t ragged_offset = 0; + + for(size_t seq = 0; seq < num_rows; ++seq) { + new_ragged_begins[seq] = ragged_offset; + + for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) { + auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]); + paddlenlp::fast_tokenizer::pretokenizers::PreTokenizedString pretokenized(str); + (*m_pretokenizer)(&pretokenized); + size_t num_splits = pretokenized.GetSplitsSize(); + + for (size_t j = 0; j < num_splits; ++j) { + auto split = pretokenized.GetSplit(j); + const auto& value = split.normalized_.GetStr(); + auto offset = split.normalized_.GetOrginalOffset(); + new_begins[ragged_offset] = begins[ragged_col] + offset.first; + new_ends[ragged_offset++] = begins[ragged_col] + offset.second; + }; + } + + new_ragged_ends[seq] = ragged_offset; + } + + // Fix real shape based on collected results + outputs[2].set_shape({size_t(ragged_offset)}); + outputs[3].set_shape({size_t(ragged_offset)}); + + return true; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp new file mode 100644 index 000000000..a1f001f64 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/regex_split.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#pragma once + +#include +#include "openvino/opsets/opset10.hpp" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" + +using namespace ov; +using namespace paddlenlp::fast_tokenizer; + + +class RegexSplit : public ov::op::Op { +public: + OPENVINO_OP("RegexSplit"); + + RegexSplit () = default; + RegexSplit(const ov::OutputVector& arguments, const std::string& behaviour = "remove", bool invert = false); + RegexSplit( + const ov::OutputVector& arguments, + const std::shared_ptr& pretokenizer, + const std::string& behaviour = "remove", + bool invert = false + ); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + return std::make_shared(inputs, m_pretokenizer, m_behaviour, m_invert); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("behaviour", m_behaviour); + visitor.on_attribute("invert", m_invert); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + std::shared_ptr m_pretokenizer; + std::string m_behaviour = "remove"; + bool m_invert = false; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp new file mode 100644 index 000000000..c46472a2d --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp @@ -0,0 +1,399 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "normalizer.h" +#include "model_interface.h" + +#include "openvino/op/util/framework_node.hpp" +#include "openvino/opsets/opset10.hpp" + +#include "sentence_piece.hpp" +#include "utils.hpp" + +using sentencepiece::SentencePieceProcessor; +using sentencepiece::util::Status; +using namespace TemplateExtension; +using 
namespace ov; +using namespace ov::frontend; +using namespace ov::opset10; + +// TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor + +SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse) : m_sp(std::make_shared()), + m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), + m_reverse(reverse), Op(args) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + + // form extra options to configure SentencePieceProcessor + std::string extra_options = ""; + if (m_add_bos) { + extra_options += "bos"; + } + if (m_add_eos) { + extra_options = extra_options.empty() ? extra_options : extra_options + ":"; + extra_options += "eos"; + } + /* TODO: TF ignores this option, so we are ignoring it as well; need to understand what should we do + if (m_reverse) { + extra_options = extra_options.empty() ? extra_options : extra_options + ":"; + extra_options += "reverse"; + } + */ + // example of extra_options, if "bos:eos:reverse" + CHECK_OK(m_sp->SetEncodeExtraOptions(extra_options)); + constructor_validate_and_infer_types(); +} + +SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const std::shared_ptr& sp, + int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse) : + m_sp((sp == nullptr) ? 
std::make_shared(): sp), + m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos), + m_reverse(reverse), Op(args) { + // constructor above without sp argument never called when the node is created with python factory, so need to init and cache m_sp here + if (!m_sp->status().ok()) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + FRONT_END_GENERAL_CHECK(sp_model_const, "SentencepieceTokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + + // form extra options to configure SentencePieceProcessor + std::string extra_options = ""; + if (m_add_bos) { + extra_options += "bos"; + } + if (m_add_eos) { + extra_options = extra_options.empty() ? extra_options : extra_options + ":"; + extra_options += "eos"; + } + if (m_reverse) { + extra_options = extra_options.empty() ? 
extra_options : extra_options + ":"; + extra_options += "reverse"; + } + // example of extra_options, if "bos:eos:reverse" + CHECK_OK(m_sp->SetEncodeExtraOptions(extra_options)); + }; + constructor_validate_and_infer_types(); +} + +void SentencepieceTokenizer::validate_and_infer_types() { + + #if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + + FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)"); + FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); + FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor"); + FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor"); + FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor"); + + #else + + FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences"); + FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor"); + + #if USE_STRING_TENSORS + + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + FRONT_END_GENERAL_CHECK( + get_input_element_type(1) == element::string || get_input_element_type(1) == element::u8, + "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 or string depending on the current stage of model preparation"); + #else + FRONT_END_GENERAL_CHECK( + get_input_element_type(1) == element::string, + "SentencepieceTokenizer accepts sentences as the 
second input and it should be of type string tensor"); + #endif + + #else + +#if 0 // change to 0 when compiled with master and the bug with data propagation from within inline context is not solved + FRONT_END_GENERAL_CHECK( + get_input_element_type(1) == element::u8, + "SentencepieceTokenizer accepts sentences as the second input and it should be of type u8 tensor, but got " + + get_input_element_type(1).get_type_name()); +#endif + + #endif + + #endif + + // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values + // and dense shape + set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) }); + set_output_type(1, element::i32, PartialShape{ Dimension() }); + set_output_type(2, element::i64, PartialShape{ Dimension(2) }); +} + +bool SentencepieceTokenizer::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("nbest_size", m_nbest_size); + visitor.on_attribute("alpha", m_alpha); + visitor.on_attribute("add_bos", m_add_bos); + visitor.on_attribute("add_eos", m_add_eos); + visitor.on_attribute("reverse", m_reverse); + return true; +} + +bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + std::vector sparse_indices; + std::vector sparse_values; + std::vector sparse_dense_shape; + +#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + + auto begin_ids = inputs[1].data(); + auto end_ids = inputs[2].data(); + auto data = inputs[3].data(); + + auto batch_size = shape_size(inputs[1].get_shape()); + +#else + +#if USE_STRING_TENSORS + + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + const ov::Tensor& strings_tensor = **reinterpret_cast(inputs[1].data()); + #else + const ov::Tensor& strings_tensor = inputs[1]; + #endif + + const std::string* strings = strings_tensor.data(); + size_t batch_size = ov::shape_size(strings_tensor.get_shape()); + +#else + + int32_t batch_size; + const int32_t* begin_ids; + const int32_t* end_ids; + const uint8_t* data; + 
parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data); + +#endif + +#endif + size_t max_token_id = 0; + for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) { +#if USE_STRING_TENSORS && !SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + const std::string& sentence = strings[batch_ind]; +#else + auto begin_ind = begin_ids[batch_ind]; + auto end_ind = end_ids[batch_ind]; + absl::string_view sentence((const char*)data + begin_ind, end_ind - begin_ind); +#endif + std::vector ids; + CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids)); + // put into resulted vectors + for (size_t token_id = 0; token_id < ids.size(); ++token_id) { + sparse_indices.push_back(static_cast(batch_ind)); + sparse_indices.push_back(static_cast(token_id)); + sparse_values.push_back(static_cast(ids[token_id])); + } + max_token_id = max_token_id < ids.size() ? ids.size() : max_token_id; + } + sparse_dense_shape.push_back(static_cast(batch_size)); + sparse_dense_shape.push_back(static_cast(max_token_id)); + + outputs[0].set_shape({ sparse_indices.size() / 2, 2 }); + memcpy(outputs[0].data(), sparse_indices.data(), sizeof(int64_t) * sparse_indices.size()); + outputs[1].set_shape({ sparse_values.size() }); + memcpy(outputs[1].data(), sparse_values.data(), sizeof(int32_t) * sparse_values.size()); + outputs[2].set_shape({ 2 }); + memcpy(outputs[2].data(), sparse_dense_shape.data(), sizeof(int64_t) * sparse_dense_shape.size()); + + return true; +} + +bool SentencepieceTokenizer::has_evaluate() const { + return true; +} + +std::shared_ptr SentencepieceTokenizer::clone_with_new_inputs(const OutputVector& new_args) const { + return std::make_shared(new_args, m_sp, m_nbest_size, m_alpha, m_add_bos, m_add_eos, m_reverse); +} + + +// Detokenizer + +SentencepieceDetokenizer::SentencepieceDetokenizer(const OutputVector& args) : + m_sp(std::make_shared()), Op(args) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, 
"SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + constructor_validate_and_infer_types(); +} + +SentencepieceDetokenizer::SentencepieceDetokenizer(const OutputVector& args, const std::shared_ptr& sp) : + m_sp((sp == nullptr) ? std::make_shared(): sp), Op(args) { + // constructor above without sp argument never called when the node is created with python factory, so need to init and cache m_sp here + if (!m_sp->status().ok()) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + }; + constructor_validate_and_infer_types(); +} + +void SentencepieceDetokenizer::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 2, "SentencepieceDetokenizer expects two inputs: sp model and token ids"); + OPENVINO_ASSERT(get_input_element_type(0) == element::u8, "SentencepieceDetokenizer accepts sp model as the first input and it should be of type u8 tensor"); + OPENVINO_ASSERT(get_input_partial_shape(1).size() == 2, "SentencepieceDetokenizer expects 2D tensor as second input"); + + auto batch_size = PartialShape({get_input_partial_shape(1)[0]}); + set_string_output(this, 0, batch_size); +} + +bool SentencepieceDetokenizer::visit_attributes(AttributeVisitor& visitor) { + return true; +} + +bool SentencepieceDetokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + 
auto batch_size = inputs[1].get_shape()[0]; + auto seq_len = inputs[1].get_shape()[1]; + auto input_data = inputs[1].data(); + + outputs[0].set_shape({batch_size}); + outputs[1].set_shape({batch_size}); + outputs[2].set_shape({batch_size * seq_len * 100}); // 100 chars - heuristic max token length; writes below are bounds-checked against this capacity + + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); + uint32_t char_offset = 0; + + for(size_t batch = 0; batch < batch_size; ++batch) { + auto start = batch * seq_len; + + std::vector token_ids(seq_len); + std::memcpy(token_ids.data(), &input_data[start], sizeof(int32_t) * seq_len); + + std::string detokenized; + CHECK_OK(m_sp->Decode(token_ids, &detokenized)); OPENVINO_ASSERT(char_offset + detokenized.size() <= outputs[2].get_size(), "SentencepieceDetokenizer: decoded text does not fit into the preallocated output buffer"); + std::copy(detokenized.begin(), detokenized.end(), &chars[char_offset]); + + begins[batch] = char_offset; + char_offset += detokenized.size(); + ends[batch] = char_offset; + } + outputs[2].set_shape({char_offset}); + return true; +} + +bool SentencepieceDetokenizer::has_evaluate() const { + return true; +} + +std::shared_ptr SentencepieceDetokenizer::clone_with_new_inputs(const OutputVector& new_args) const { + return std::make_shared(new_args, m_sp); +} + + +// Stream Detokenizer + +SentencepieceStreamDetokenizer::SentencepieceStreamDetokenizer(const OutputVector& args) : + m_sp(std::make_shared()), Op(args) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + constructor_validate_and_infer_types(); +} + +SentencepieceStreamDetokenizer::SentencepieceStreamDetokenizer(const OutputVector& args, const std::shared_ptr& sp) : + m_sp((sp == nullptr) ? 
std::make_shared(): sp), Op(args) { + // constructor above without sp argument never called when the node is created with python factory, so need to init and cache m_sp here + if (!m_sp->status().ok()) { + auto sp_model_const = as_type_ptr(args[0].get_node_shared_ptr()); + OPENVINO_ASSERT(sp_model_const, "SentencepieceDetokenizer expects SentencePiece model to be constant."); + auto spm_model = static_cast(sp_model_const->get_data_ptr()); + auto spm_model_size = sp_model_const->get_byte_size(); + + // configure SentencePieceProcessor + std::string model_proto(spm_model, spm_model_size); + CHECK_OK(m_sp->LoadFromSerializedProto(model_proto)); + }; + constructor_validate_and_infer_types(); +} + +void SentencepieceStreamDetokenizer::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 2, "SentencepieceDetokenizer expects two inputs: sp model and token ids"); + OPENVINO_ASSERT(get_input_element_type(0) == element::u8, "SentencepieceDetokenizer accepts sp model as the first input and it should be of type u8 tensor"); + OPENVINO_ASSERT(get_input_partial_shape(1).size() == 2, "SentencepieceDetokenizer expects 2D tensor as second input"); + + auto batch_size = PartialShape({get_input_partial_shape(1)[0]}); + set_string_output(this, 0, batch_size); +} + +bool SentencepieceStreamDetokenizer::visit_attributes(AttributeVisitor& visitor) { + return true; +} + +bool SentencepieceStreamDetokenizer::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + auto batch_size = inputs[1].get_shape()[0]; + auto seq_len = inputs[1].get_shape()[1]; + auto input_data = inputs[1].data(); + + outputs[0].set_shape({batch_size}); + outputs[1].set_shape({batch_size}); + outputs[2].set_shape({batch_size * seq_len * 100}); // 100 chars - max token length + + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); + uint32_t char_offset = 0; + + for(size_t batch = 0; batch < batch_size; ++batch) { + const auto start = batch * 
seq_len; + + begins[batch] = char_offset; + for(size_t seq = start; seq < start + seq_len; ++seq) { + const auto token_id = input_data[seq]; + const auto token = m_sp->IdToPiece(token_id); OPENVINO_ASSERT(char_offset + token.size() <= outputs[2].get_size(), "SentencepieceStreamDetokenizer: token text does not fit into the preallocated output buffer"); + + if(token.rfind("<") == 0 && token.rfind(">") == 5) { + // convert "byte tokens" into bytes + int ch = sentencepiece::PieceToByte(token); + chars[char_offset++] = ch; + } else { + std::copy(token.begin(), token.end(), &chars[char_offset]); + char_offset += token.size(); + }; + }; + ends[batch] = char_offset; + } + outputs[2].set_shape({char_offset}); + return true; +} + +bool SentencepieceStreamDetokenizer::has_evaluate() const { + return true; +} + +std::shared_ptr SentencepieceStreamDetokenizer::clone_with_new_inputs(const OutputVector& new_args) const { + return std::make_shared(new_args, m_sp); +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp new file mode 100644 index 000000000..fbc0394aa --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.hpp @@ -0,0 +1,89 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace sentencepiece { + class SentencePieceProcessor; +} + +namespace TemplateExtension { + class SentencepieceTokenizer : public ov::op::Op { + public: + OPENVINO_OP("SentencepieceTokenizer"); + + SentencepieceTokenizer() = default; + SentencepieceTokenizer(const ov::OutputVector& args, int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse); + SentencepieceTokenizer(const ov::OutputVector& args, const std::shared_ptr& sp, int32_t nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool 
evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const override; + + private: + std::shared_ptr m_sp; + int32_t m_nbest_size; + float m_alpha; + bool m_add_bos; + bool m_add_eos; + bool m_reverse; + }; + + + class SentencepieceDetokenizer : public ov::op::Op { + public: + OPENVINO_OP("SentencepieceDetokenizer"); + + SentencepieceDetokenizer() = default; + SentencepieceDetokenizer(const ov::OutputVector& args); + SentencepieceDetokenizer(const ov::OutputVector& args, + const std::shared_ptr& sp); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const override; + + private: + std::shared_ptr m_sp; + }; + + + class SentencepieceStreamDetokenizer : public ov::op::Op { + public: + OPENVINO_OP("SentencepieceStreamDetokenizer"); + + SentencepieceStreamDetokenizer() = default; + SentencepieceStreamDetokenizer(const ov::OutputVector& args); + SentencepieceStreamDetokenizer(const ov::OutputVector& args, + const std::shared_ptr& sp); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const override; + + private: + std::shared_ptr m_sp; + }; +} // namespace TemplateExtension diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp new file mode 100644 index 000000000..f89c465c6 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.cpp 
@@ -0,0 +1,56 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "string_tensor_pack.hpp" +#include "utils.hpp" + +using namespace ov; + + +void StringTensorPack::validate_and_infer_types() { + OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but get " + m_mode); + check_string_input(this, 0); + #if USE_STRING_TENSORS + set_output_type(0, element::string, get_input_partial_shape(0)); + #else + set_output_type(0, element::u8, PartialShape{Dimension()}); + #endif +} + +bool StringTensorPack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { +#if USE_STRING_TENSORS + // TODO + return false; +#else + auto rank = inputs[0].get_shape().size(); + if (rank != 1) { + std::cerr << "[ WARNING ] StringTensorPack ignores the rank " << rank << " of input tensor and set rank=1 in the output\n"; + } + + auto num_elements = shape_size(inputs[0].get_shape()); + auto num_chars = shape_size(inputs[2].get_shape()); + auto num_output_elements = 4*(1 + 1 + num_elements) + num_chars; + outputs[0].set_shape(Shape{num_output_elements}); + + // FIXME: Do the repacking, otherwise cannot handle string tensors with gaps between strings + //auto begins = inputs[0].data(); // this is not needed as no repacking happens in this version of code + auto ends = inputs[1].data(); + auto chars = inputs[2].data(); + + auto output = outputs[0].data(); + auto output_int32 = reinterpret_cast(output); + + *output_int32++ = num_elements; + *output_int32++ = 0; + output_int32 = std::copy(ends, ends + num_elements, output_int32); + output = reinterpret_cast(output_int32); + output = std::copy(chars, chars + num_chars, output); + + OPENVINO_ASSERT(num_output_elements == output - outputs[0].data(), "[ INTERNAL ERROR ] StringTensorPack output tensor is corrupted"); + + // WARNING! Chars are not repacked. If there are gaps between strings, they will remain. 
+ + return true; +#endif +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp new file mode 100644 index 000000000..8766e6062 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_pack.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Having a decomposed representation for a tensor, converts it to a single string tensor +// (packed u8 or natively supported element::string depending on whether or not USE_STRING_TENSORS is defined). +class StringTensorPack : public ov::op::Op { +public: + OPENVINO_OP("StringTensorPack"); + + StringTensorPack () = default; + + StringTensorPack(ov::OutputVector inputs, const std::string& mode = "begins_ends") + : ov::op::Op(inputs), m_mode(mode) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs, m_mode); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("mode", m_mode); + return true; + } + + bool has_evaluate() const override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + +private: + + std::string m_mode = "begins_ends"; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp new file mode 100644 index 000000000..bd1320f44 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "string_tensor_unpack.hpp" +#include "utils.hpp" + 
+using namespace ov; + + +void StringTensorUnpack::validate_and_infer_types() { + OPENVINO_ASSERT( + get_input_size() == 1, + "Number of inputs for StringTensorUnpack is not equal to 1"); + + auto output_shape = PartialShape::dynamic(); + + // In case of explicit string tensors the shape is carried by input tensor itself +// OPENVINO_ASSERT( +// input_shape == PartialShape::dynamic(), +// "Excplicitly set shape for a string tensor in the unpacking is not supported"); + + // There are three cases that affect expected element type of the input tensor: + // - when string tensor is passed and we are before the hack is applied (element::string) and + // - when string tensor is passed and we are after the hack in CPU (element::u8) and + // - when stirng tensor is not really used, and we expect a packed string tensor in this case (element::u8) + +#if OPENVINO_ELEMENT_STRING_SUPPORTED + OPENVINO_ASSERT( + get_input_element_type(0) == element::string || + get_input_element_type(0) == element::dynamic, + "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation or when element::string is not supported"); +#endif +#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS + OPENVINO_ASSERT( + get_input_element_type(0) == element::u8 || + get_input_element_type(0) == element::dynamic, + "Type of StringTensorUnpack input is expected to be element::string before a model compilation or element::u8 after the compilation or when element::string is not supported"); +#endif + +#if OPENVINO_ELEMENT_STRING_SUPPORTED + if(get_input_element_type(0) == element::string) { + output_shape = get_input_partial_shape(0); + } +#endif + +#if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK || !USE_STRING_TENSORS + if(get_input_element_type(0) == element::u8) + { + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + // After the plugin hack, a tensor is represented as a wrapping u8 tensor that will hold a pointer to 
a string tensor. + // The original shape of a string tensor is stored in RT attribute of a tensor descriptor. + const auto& rt_info = get_input_tensor(0).get_rt_info(); + auto it = rt_info.find("__original_partial_shape"); + + // StringTensorUnpack expects __original_partial_shape attribute of type PartialShape in the input tensor. + // If it is not found that means that model compilation wasn't pass the expected transformation where a string tensor + // is wrapped to a u8 tensor holding a pointer, or because evaluation of this node is in progress and tensor attributes aren't preserved. + if(it != rt_info.end() && it->second.is()) { + output_shape = it->second.as(); + } else { + #endif + #if !USE_STRING_TENSORS + // If string tensors shouldn't be used, then the packed u8 format is also expected + // as an input, but in this case only rank is known + OPENVINO_ASSERT( + get_input_partial_shape(0).rank().is_dynamic() || get_input_partial_shape(0).rank().get_length() == 1, + "StringTensorUnpack expects a u8 tensor with rank 1 that holds packed batched string tensor as an input, but observes type " + + get_input_element_type(0).get_type_name() + " and shape " + get_input_partial_shape(0).to_string()); + + output_shape = PartialShape({Dimension()}); // [?] 
+ #endif + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + } + #endif + } +#endif + + OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supporst only 'begins_ends' mode, but get " + m_mode); + + if (m_mode == "begins_ends") { + set_string_output(this, 0, output_shape); + } +} + +bool StringTensorUnpack::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + auto ptensor = &inputs[0]; + #if OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + if(ptensor->get_element_type() == element::u8 && ptensor->get_byte_size() == sizeof(void*)) { + auto data = *reinterpret_cast(ptensor->data()); + if(data != nullptr) { + ptensor = reinterpret_cast(data); + } + } + #endif + + auto tensor = *ptensor; + +#if OPENVINO_ELEMENT_STRING_SUPPORTED + if(tensor.get_element_type() == element::string) { + Shape input_shape = tensor.get_shape(); + const std::string* input_strings = tensor.data(); + unpack_strings_to_tensors(input_strings, input_shape, outputs[0], outputs[1], outputs[2]); + return true; + } else { +#endif + +#if USE_STRING_TENSORS + OPENVINO_ASSERT(false, "Detected a u8 tensor but element::string tensor should be provided"); +#endif + + int32_t batch_size; + const int32_t* begin_ids; + const int32_t* end_ids; + const uint8_t* data; + parse_packed_strings(tensor, batch_size, begin_ids, end_ids, data); + auto num_chars = batch_size > 0 ? end_ids[batch_size - 1] : 0; // an empty batch has no last end offset to read + + outputs[0].set_shape(Shape{static_cast(batch_size)}); + outputs[1].set_shape(Shape{static_cast(batch_size)}); + outputs[2].set_shape(Shape{static_cast(num_chars)}); + auto begins = outputs[0].data(); + auto ends = outputs[1].data(); + auto chars = outputs[2].data(); + std::copy(begin_ids, begin_ids + batch_size, begins); + std::copy(end_ids, end_ids + batch_size, ends); + std::copy(data, data + num_chars, chars); + + return true; + +#if OPENVINO_ELEMENT_STRING_SUPPORTED + } +#endif +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp 
b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp new file mode 100644 index 000000000..2570b9596 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/string_tensor_unpack.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +// Unpack a string tensor representation regardless of the source format, which +// can be an OV tensor with element::string element type (if supported) or u8 +// packed representation, to a decomposed tensor representation that may potentially +// consist of multiple tensors. The destination format is defined by `mode` attribute. +// Shape of the output tensor is completely recognized from the input (if supported) +// or defined partially by a dedicated input attribute `shape`. If `shape` is not set, +// which defaults to a completely dynamic `shape`, then output shape is defined +// by an input tensor. +class StringTensorUnpack : public ov::op::Op { +public: + OPENVINO_OP("StringTensorUnpack"); + + StringTensorUnpack () = default; + + StringTensorUnpack(ov::OutputVector inputs, const std::string& mode = "begins_ends") + : ov::op::Op(inputs), m_mode(mode) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { + auto result = std::make_shared(inputs, m_mode); + return result; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("mode", m_mode); + return true; + } + + bool has_evaluate() const override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + +private: + + std::string m_mode = "begins_ends"; +}; diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp new file 
mode 100644 index 000000000..51179dcac --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp @@ -0,0 +1,251 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/util/framework_node.hpp" +#include "openvino/opsets/opset10.hpp" + +#include "tensorflow_translators.hpp" +#include "utils.hpp" + +#include "string_tensor_pack.hpp" +#include "string_tensor_unpack.hpp" +#include "sentence_piece.hpp" +#include "case_fold.hpp" +#include "normalize_unicode.hpp" +#include "regex_normalization.hpp" +#include "regex_split.hpp" + +#include "wordpiece_tokenizer.hpp" + +using namespace TemplateExtension; +using namespace ov; +using namespace ov::frontend; +using namespace ov::opset10; + +namespace { + template + T extract_scalar_const_value(const std::shared_ptr& node, const std::string& const_name) { + auto const_node = as_type_ptr(node); + FRONT_END_GENERAL_CHECK(const_node, "Conversion expects " + const_name + " to be constant."); + std::vector const_value = const_node->cast_vector(); + FRONT_END_GENERAL_CHECK(const_value.size() == 1, "Conversion expects " + const_name + " to be a scalar."); + return const_value[0]; + } +} // namespace + +OutputVector translate_sentencepiece_op(const NodeContext& node) { + // extract model to configure SentencePieceTokenizer + auto sp_model_ov_any = node.get_attribute_as_any("model"); + FRONT_END_GENERAL_CHECK(sp_model_ov_any.is(), + "SentencePieceOp configuration model is in incorrect format"); + auto str_spm_model = sp_model_ov_any.as(); + auto sp_model_const = std::make_shared(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data()); + return { sp_model_const }; +} + +NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) { + // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer, + // and RaggedTensorToSparse operation- into a custom operation 
SentencepieceTokenizerExtensionOp + FRONT_END_GENERAL_CHECK(node.get_input_size() > 0, "RaggedTensorToSparse expects at least one input."); + auto node_name = node.get_name(); + + // check that producers of RaggedTensorToSparse is SentencePieceTokenizer + auto sp_tokenize_op = node.get_input(0).get_node_shared_ptr(); + FRONT_END_GENERAL_CHECK(sp_tokenize_op->get_input_size() > 6, + "SentencepieceTokenizeOp expects at least six inputs"); + + // prepare inputs that go to custom operation + // prepare input 0 - SentencePieceTokenizer configuration model + auto sp_model_const = as_type_ptr(sp_tokenize_op->input_value(0).get_node_shared_ptr()); + FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant."); + + // prepare input six inputs + auto inputs = sp_tokenize_op->input_value(1); + + // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes + auto nbest_size = extract_scalar_const_value(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size"); + auto alpha = extract_scalar_const_value(sp_tokenize_op->input_value(3).get_node_shared_ptr(), "alpha"); + auto add_bos = extract_scalar_const_value(sp_tokenize_op->input_value(4).get_node_shared_ptr(), "add_bos"); + auto add_eos = extract_scalar_const_value(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos"); + auto reverse = extract_scalar_const_value(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse"); + +#if !USE_STRING_TENSORS + // Override type of input tensor if this is a Parameter + if (auto parameter = std::dynamic_pointer_cast(inputs.get_node_shared_ptr())) { + parameter->set_partial_shape(PartialShape{ Dimension() }); + parameter->set_element_type(element::u8); + parameter->validate_and_infer_types(); + } +#endif + +#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS + + OutputVector inputs_vector = OutputVector{ sp_model_const }; + auto unpacked_outputs = std::make_shared(OutputVector{inputs}, "begins_ends")->outputs(); + 
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end()); + +#else + + OutputVector inputs_vector = OutputVector{ sp_model_const, inputs }; + +#endif + + // create a node with custom operation + auto sp_tokenizer_ext = std::make_shared(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse); + FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3, + "Internal error: SentencepieceTokenizer operation extension must have three outputs."); + + // set tensor names + sp_tokenizer_ext->output(0).add_names({ node_name + ":0" }); + sp_tokenizer_ext->output(1).add_names({ node_name + ":1" }); + sp_tokenizer_ext->output(2).add_names({ node_name + ":2" }); + + // create named outputs for the conversion extension + NamedOutputVector named_results; + named_results.push_back({ "sparse_indices", sp_tokenizer_ext->output(0) }); + named_results.push_back({ "sparse_values", sp_tokenizer_ext->output(1) }); + named_results.push_back({ "sparse_dense_shape", sp_tokenizer_ext->output(2) }); + + return named_results; +} + +ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); + return { post_translate_string_tensor_output(std::make_shared( + pre_translate_string_tensor_input(node.get_input(0)))->outputs()) }; +} + +ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "NormalizeUTF8 expects only 1 input"); + return { post_translate_string_tensor_output(std::make_shared( + pre_translate_string_tensor_input(node.get_input(0)), + node.get_attribute("normalization_form"))->outputs()) }; +} + +ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); + ov::OutputVector inputs = 
pre_translate_string_tensor_input(node.get_input(0)); + inputs.push_back(string_attribute_to_constant(node, "pattern")); + inputs.push_back(string_attribute_to_constant(node, "rewrite")); + return { post_translate_string_tensor_output(std::make_shared(inputs)->outputs()) }; +} + +ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "RegexSplitWithOffsets expects 3 inputs"); + ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0)); + auto delim_regex_pattern = node.get_input(1).get_node()->input_value(2); // use u8 part of packed string tensor as we are expecting a scalar string: TODO: verify it is really there + inputs.push_back(delim_regex_pattern); + // TODO: Use node.get_input(2) with keep_delim_regex_pattern, most likely it should be handled in another RegexSplit with `isolate` behaviour + auto outputs = std::make_shared(inputs)->outputs(); + auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]}); + return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) }; +} + +ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "WordpieceTokenizeWithOffsets expects 2 inputs"); + ov::OutputVector inputs = pre_translate_ragged_string_tensor_input(node.get_input(0)); + + #if USE_STRING_TENSORS + // It may seem enough to call pre_translate_string_tensor_input that will override Parameter element + // type in case if string tensors are not used. + // But a Parameter is still required to be overridden even if string tensors are used because in TF model + // it is represented not as a string tensor, but as a resource with hash table for lookup that we cannot interpret + // and have to replace by 1D string tensor. 
+ override_parameter(node.get_input(1).get_node_shared_ptr(), element::string, PartialShape{Dimension()}); + #endif + + auto vocab = pre_translate_string_tensor_input(node.get_input(1)); + inputs.insert(inputs.end(), vocab.begin(), vocab.end()); + // FIXME: Cannot set real value for unk_token_id from attributes because it is not known in this operation + // TODO: Set other attributes. + auto wp_tokenizer = std::make_shared( + inputs, + node.get_attribute("suffix_indicator"), + node.get_attribute("max_bytes_per_word") + ); + return { post_translate_ragged_tensor_output(wp_tokenizer->outputs()) }; +} + +ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node) { + FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFindV2 expects 3 inputs"); + + // Check if this node is used in a combination with already converted WordpieceTokenizeWithOffsets + auto wp_tokenizer_outputs = pre_translate_ragged_tensor_input(node.get_input(1)); + auto wp_tokenizer = dynamic_cast(wp_tokenizer_outputs[0].get_node()); + OPENVINO_ASSERT(wp_tokenizer, "Conversion of LookupTableFindV2 without coupled WordpieceTokenizer is not yet supported"); + + // TODO: Check vocab matching for LookupTableFindV2 and WordpieceTokenizer + + // TODO: Check if overflow really happens in real models due to i64 to i32 conversion + auto unk_token_id = std::make_shared(node.get_input(2), element::i32); + + auto wp_tokenizer_inputs = wp_tokenizer->input_values(); + wp_tokenizer_inputs.push_back(unk_token_id); + //std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n"; + + auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs); + return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) }; +} + +ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) { + // This is a copied-and-pasted and adopted fragment of TF reshape translator from OV. 
+ // It checks if the input tensor has string type, and then perform custom tranlation. + // Otherwise it should operate identically to the stock version of Reshape translator in TF FE. + // TODO: Introduce an API to call original translators from an extension without copying the code to an extension. + + FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "Tensorflow Reshape op should have two inputs"); + auto tensor = node.get_input(0); + auto shape = node.get_input(1); + if(auto pack = dynamic_cast(tensor.get_node())) { + // TODO: If it is a beginning of the graph, how to detect strings? It falls in 'else' branch in this case. + // FIXME: Needs extension for a Parameter to prepare it first + auto begins = std::make_shared(pack->input_value(0), shape, false); + auto ends = std::make_shared(pack->input_value(1), shape, false); + auto chars = pack->input_value(2); + auto reshape = post_translate_string_tensor_output({begins, ends, chars}); + return {reshape}; + } else { + auto reshape = std::make_shared(tensor, shape, false); + return {reshape}; + } + // set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals +} + +// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes +ov::OutputVector translate_const(const ov::frontend::NodeContext& node) { + auto ov_type = node.get_attribute_as_any("dtype"); + std::shared_ptr const_node; + if (!ov_type.is() || ov_type.as() == ov::element::dynamic || + ov_type.as() == ov::element::undefined) { + if (ov_type.is() && ov_type.as() == "DT_STRING") { + auto value_as_any = node.get_attribute_as_any("value"); + const auto& values = value_as_any.as>(); + ov::Tensor begins(element::i32, {}), ends(element::i32, {}), chars(element::u8, {}); + unpack_strings_to_tensors(&values[0], {values.size()}, begins, ends, chars); + const_node = std::make_shared(OutputVector{ + std::make_shared(begins), + std::make_shared(ends), + std::make_shared(chars) + }); + } else { + const_node 
= std::make_shared(OutputVector{}); + } + } else { + //static std::vector tensors; + auto tensor = node.get_attribute("value"); + //tensors.push_back(tensor); + const_node = std::make_shared(tensor); + #if OPENVINO_ELEMENT_STRING_SUPPORTED + if (const_node->get_element_type() == element::string) { + if(shape_size(tensor.get_shape()) > 0) { + auto strings = std::dynamic_pointer_cast(const_node)->get_data_ptr(); + } + const_node = std::make_shared(const_node->outputs()); + const_node = std::make_shared(const_node->outputs()); + } + #endif + } + //set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name + return {const_node}; +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp new file mode 100644 index 000000000..8d501dced --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +ov::OutputVector translate_sentencepiece_op(const ov::frontend::NodeContext& node); +ov::frontend::NamedOutputVector translate_sentencepiece_tokenizer(const ov::frontend::NodeContext& node); +ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node); +ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node); +ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node); +ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node); +ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); +ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext& node); +ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node); +ov::OutputVector translate_const(const 
ov::frontend::NodeContext& node);
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp
new file mode 100644
index 000000000..2eb4dcb20
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/tokenizer.hpp
@@ -0,0 +1,23 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "string_tensor_pack.hpp"
+#include "string_tensor_unpack.hpp"
+#include "ragged_tensor_pack.hpp"
+#include "sentence_piece.hpp"
+#include "case_fold.hpp"
+#include "normalize_unicode.hpp"
+#include "regex_normalization.hpp"
+#include "regex_split.hpp"
+#include "combine_segments.hpp"
+#include "bytes_to_chars.hpp"
+#include "wordpiece_tokenizer.hpp"
+#include "bpe_tokenizer.hpp"
+#include "ragged_to_dense.hpp"
+#include "vocab_decoder.hpp"
+#include "chars_to_bytes.hpp"
+
+#include "tensorflow_translators.hpp"
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
new file mode 100644
index 000000000..b5c7fa08c
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
@@ -0,0 +1,227 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/op/util/framework_node.hpp"
+#include "openvino/opsets/opset10.hpp"
+#include "utils.hpp"
+#include "string_tensor_pack.hpp"
+#include "string_tensor_unpack.hpp"
+#include "ragged_tensor_pack.hpp"
+
+using namespace ov;
+using namespace ov::frontend;
+using namespace ov::opset10;
+
+// Parses the u8 packed string tensor layout: an i32 batch size, then batch_size + 1 i32 offsets
+// (the begin of the first string followed by the end of every string), then the concatenated symbols.
+// On return begin_ids, end_ids and symbols point into the memory of `packed`.
+void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) {
+    auto strings = packed.data();
+    auto bitstream_size = packed.get_byte_size();
+    // check the format of the input bitstream representing the string tensor
+    FRONT_END_GENERAL_CHECK(bitstream_size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
+    batch_size = *reinterpret_cast(strings + 0);
+    // batch_size comes from the tensor data: reject negative values and do the size arithmetic in size_t to avoid int overflow
+    FRONT_END_GENERAL_CHECK(batch_size >= 0 && bitstream_size >= 4 + 4 + 4 * size_t(batch_size),
+        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
+    begin_ids = reinterpret_cast(strings + 4);
+    end_ids = begin_ids + 1;
+    symbols = strings + 4 + 4 + 4 * size_t(batch_size);
+}
+
+void check_string_input(const Node* node, size_t input_index) {
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed string representation");
+    FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::u8, "Expected a u8 tensor as the third part of the decomposed string representation");
+}
+
+void check_string_scalar_input(const Node* node, size_t input_index) {
+    auto shape = node->get_input_partial_shape(input_index);
+    auto element_type = node->get_input_element_type(input_index);
+
+    #if false && USE_STRING_TENSORS
+    // This block is not used when we convert ops to decomposed representation (and we really do)
+
+    OPENVINO_ASSERT(
+        (element_type == element::dynamic || element_type == element::string) &&
+        (shape.rank().is_dynamic() || shape.rank().get_length() == 0),
+        "string/0D tensor is expected, but observed: " + element_type.get_type_name() + shape.to_string());
+
+    #else
+
+    OPENVINO_ASSERT(
+        (element_type == element::dynamic || element_type == element::u8) &&
+        (shape.rank().is_dynamic() || shape.rank().get_length() == 1),
+        "u8/1D tensor is expected");
+
+    #endif
+}
+
+void check_ragged_input(const Node* node, size_t input_index) {
+
FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged representation"); + auto rank = node->get_input_partial_shape(input_index+2).rank(); + FRONT_END_GENERAL_CHECK(rank.is_dynamic() || rank.get_length() == 1, "The last tensor in ragged tensor representation should be a 1D tensor"); +} + +void check_ragged_string_input(const Node* node, size_t input_index) { + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+0) == element::i32, "Expected an i32 tensor as the first part of the decomposed ragged string representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+1) == element::i32, "Expected an i32 tensor as the second part of the decomposed ragged string representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+2) == element::i32, "Expected an i32 tensor as the third part of the decomposed ragged string representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+3) == element::i32, "Expected an i32 tensor as the forth part of the decomposed ragged string representation"); + FRONT_END_GENERAL_CHECK(node->get_input_element_type(input_index+4) == element::u8, "Expected a u8 tensor as the fifth part of the decomposed ragged string representation"); +} + +void set_string_output(Node* node, size_t output_index, const PartialShape& shape) { + node->set_output_type(output_index+0, element::i32, shape); // byte offset in output[+2] -- begin of each string + node->set_output_type(output_index+1, element::i32, shape); // byte offset in output[+2] -- end of each string + node->set_output_type(output_index+2, element::u8, PartialShape{Dimension()}); // symbols from all strings concatenated +} + +void 
set_ragged_string_output(Node* node, size_t output_index, const PartialShape& shape) { + node->set_output_type(output_index+0, element::i32, shape); // element offset in output[+2] -- begin of each ragged dimension elements + node->set_output_type(output_index+1, element::i32, shape); // element offset in output[+3] -- end of each ragged dimension elements + node->set_output_type(output_index+2, element::i32, PartialShape{Dimension()}); // byte offset in output[+4] -- begin of each string + node->set_output_type(output_index+3, element::i32, PartialShape{Dimension()}); // byte offset in output[+4] -- end of each string + node->set_output_type(output_index+4, element::u8, PartialShape{Dimension()}); // symbols from all strings cnocatenated +} + +void set_ragged_output(Node* node, size_t output_index, const PartialShape& shape, element::Type type) { + node->set_output_type(output_index+0, element::i32, shape); // element offset in output[+2] -- begin of each ragged dimension elements + node->set_output_type(output_index+1, element::i32, shape); // element offset in output[+2] -- end of each ragged dimension elements + node->set_output_type(output_index+2, type, PartialShape{Dimension()}); // flatten elements +} + + +void unpack_strings_to_tensors (const std::string* strings, const Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars) { // TODO: no need for a reference to a ov::Tensor? 
+ auto nelements = shape_size(shape); + + size_t total = 0; + for(size_t i = 0; i < nelements; ++i) + total += strings[i].length(); + + begins.set_shape(shape); + ends.set_shape(shape); + chars.set_shape(Shape{total}); + + auto pbegins = begins.data(); + auto pends = ends.data(); + auto poutput_symbols = reinterpret_cast(chars.data()); + size_t offset = 0; + + for(size_t i = 0; i < nelements; ++i) + { + pbegins[i] = offset; + poutput_symbols = std::copy(strings[i].begin(), strings[i].end(), poutput_symbols); + offset += strings[i].length(); + pends[i] = offset; + } +} + +void override_parameter (std::shared_ptr node, element::Type type, const PartialShape& shape) { + if (auto parameter = std::dynamic_pointer_cast(node)) { + // TODO: Apply this change conditionally based on real Parameter value + std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n"; + parameter->set_partial_shape(shape); + parameter->set_element_type(type); + parameter->validate_and_infer_types(); + } +} + +// TODO: replace NodeContext and input_index by a single input +OutputVector pre_translate_string_tensor_input(ov::Output input) { + auto input_node = input.get_node_shared_ptr(); + +#if !USE_STRING_TENSORS + override_parameter(input_node, element::u8, PartialShape{Dimension()}); +#endif + + if (auto struct_pack = std::dynamic_pointer_cast(input_node)) { + FRONT_END_GENERAL_CHECK(struct_pack->get_input_size() == 3, "Expected 3 inputs to StringTensorPack which represents a string tensor"); + return struct_pack->input_values(); + } else { + #if USE_STRING_TENSORS || true // always + return std::make_shared(OutputVector{input}, "begins_ends")->outputs(); + #else + // Suppose this is u8 packed string tensor with a single batch dimension + // Unpack this tensor using standard operations + + // Cannot do that because there is not ReinterprectCast operation in OV + // TODO: Find a way to make it without reinterpretation operation or introduce it as an extension 
(easy)
+    #endif
+    }
+}
+
+OutputVector pre_translate_ragged_tensor_input(ov::Output input) {
+    auto ragged_pack = dynamic_cast(input.get_node());
+    OPENVINO_ASSERT(ragged_pack, "Expected RaggedTensorPack but didn't find it");
+    return ragged_pack->input_values();
+}
+
+OutputVector pre_translate_ragged_string_tensor_input(ov::Output input) {
+    auto ragged_inputs = pre_translate_ragged_tensor_input(input);
+    auto string_inputs = pre_translate_string_tensor_input(ragged_inputs[2]);
+    ragged_inputs.pop_back();
+    ragged_inputs.insert(ragged_inputs.end(), string_inputs.begin(), string_inputs.end());
+    return ragged_inputs;
+}
+
+ov::Output post_translate_string_tensor_output(const OutputVector& outputs) {
+    FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed string tensor representation");
+    return std::make_shared(outputs, "begins_ends");
+}
+
+ov::Output post_translate_ragged_tensor_output(const OutputVector& outputs) {
+    FRONT_END_GENERAL_CHECK(outputs.size() == 3, "Expected 3 tensors in decomposed ragged tensor representation");
+    return std::make_shared(outputs);
+}
+
+bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorVector& inputs, std::function normalizer) {
+    auto begins = inputs[0].data();
+    auto ends = inputs[1].data();
+    auto chars = inputs[2].data();
+
+    // Set output shapes
+    outputs[0].set_shape(inputs[0].get_shape());
+    outputs[1].set_shape(inputs[1].get_shape());
+    const size_t num_elements = inputs[0].get_size();
+
+    // TODO: How to avoid copying from this temporary buffer?
+    // TODO: It can be possible to collect output symbols directly in the output tensor memory if `normalizer` has reasonable estimation for the final size.
+ std::deque buffer; + + // For the whole implementation below the input shapes can be ignored, we are working with the flatten representaions + // and only number of elements in the original tensors matter + + // Get pointers in the output tensors + auto new_begins = outputs[0].data(); + auto new_ends = outputs[1].data(); + + for(size_t i = 0; i < num_elements; ++i) { + new_begins[i] = buffer.size(); + std::string new_str = normalizer(std::string(chars + begins[i], chars + ends[i])); + buffer.insert(buffer.end(), new_str.begin(), new_str.end()); + new_ends[i] = buffer.size(); + } + + // Copy collected symbols to the target output tensor + + outputs[2].set_shape(Shape{buffer.size()}); + auto new_chars = outputs[2].data(); + std::copy(buffer.begin(), buffer.end(), new_chars); + + return true; +} + +std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name) { + auto value = node.get_attribute(name); + + // TODO: How to translate attribute `replace_global`? 
+ + #if USE_STRING_TENSORS + return std::make_shared(element::string, Shape{}, &value); + #else + return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data()); + #endif +} diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp new file mode 100644 index 000000000..8ffbc9e04 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + + +#ifndef OPENVINO_ELEMENT_STRING_SUPPORTED + #define OPENVINO_ELEMENT_STRING_SUPPORTED 0 +#endif + +#ifndef OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK + #define OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK 0 +#endif + +#define USE_STRING_TENSORS 0 // modify this depending on willingness to use explicit string tensors + +#if USE_STRING_TENSORS && !OPENVINO_ELEMENT_STRING_SUPPORTED + #error "USE_STRING_TENSORS = 1 can be used only when OpenVINO supports element::string that is determined by OPENVINO_ELEMENT_STRING_SUPPORTED == 1" +#endif + +#define SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS 0 + + +void parse_packed_strings ( + const ov::Tensor& packed, + int32_t& batch_size, + const int32_t*& begin_ids, + const int32_t*& end_ids, + const uint8_t*& symbols); + + +void check_string_input(const ov::Node* node, size_t input_index); + +void check_string_scalar_input(const ov::Node* node, size_t input_index); + +void check_ragged_input(const ov::Node* node, size_t input_index); + +void check_ragged_string_input(const ov::Node* node, size_t input_index); + +void set_string_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape); + +void set_ragged_string_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape); + +void set_ragged_output(ov::Node* node, size_t output_index, const ov::PartialShape& shape, 
ov::element::Type type);
+
+void unpack_strings_to_tensors(const std::string* strings, const ov::Shape shape, ov::Tensor& begins, ov::Tensor& ends, ov::Tensor& chars);
+
+void override_parameter (std::shared_ptr node, ov::element::Type type, const ov::PartialShape& shape);
+
+ov::OutputVector pre_translate_string_tensor_input(ov::Output input);
+
+ov::OutputVector pre_translate_ragged_tensor_input(ov::Output input);
+
+ov::OutputVector pre_translate_ragged_string_tensor_input(ov::Output input);
+
+ov::Output post_translate_string_tensor_output(const ov::OutputVector& outputs);
+
+ov::Output post_translate_ragged_tensor_output(const ov::OutputVector& outputs);
+
+bool evaluate_normalization_helper (
+    ov::TensorVector& outputs,
+    const ov::TensorVector& inputs,
+    std::function normalizer);
+
+std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name);
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
new file mode 100644
index 000000000..1d173abce
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "fast_tokenizer/normalizers/normalizers.h"
+#include "fast_tokenizer/models/models.h"
+#include "fast_tokenizer/pretokenizers/pretokenizers.h"
+
+#include "vocab_decoder.hpp"
+#include "utils.hpp"
+
+using namespace ov;
+
+// Input 0 holds token ids; inputs 1..3 hold the vocabulary as a decomposed string tensor.
+// The output is a ragged string tensor with one row per element of the first input dimension.
+void VocabDecoder::validate_and_infer_types() {
+    check_string_input(this, 1);
+    const auto shape = get_input_partial_shape(0);
+    // A shape of dynamic rank cannot be indexed, fall back to a dynamic row count in that case.
+    set_ragged_string_output(this, 0, shape.rank().is_dynamic() ? PartialShape{Dimension()} : PartialShape{shape[0]});
+}
+
+// Replaces every token id of the [batch, sequence] input by its vocabulary string.
+bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    // Validate the inputs before any of them is accessed.
+    OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern");
+    OPENVINO_ASSERT(inputs[0].get_shape().size() == 2, "VocabDecoder expects a 2D tensor with token ids as the first input");
+
+    auto batch_size = inputs[0].get_shape()[0];
+    auto seq_len    = inputs[0].get_shape()[1];
+    auto input_data = inputs[0].data();
+
+    auto vocab_begins = inputs[1].data();
+    auto vocab_ends   = inputs[2].data();
+    auto vocab_chars  = inputs[3].data();
+    auto vocab_size   = inputs[1].get_size();
+
+    // First pass: check every token id and compute the exact number of output symbols, so that the
+    // symbols buffer no longer relies on a fixed per-token budget that a longer token could overrun.
+    size_t total_chars = 0;
+    for(size_t i = 0; i < batch_size * seq_len; ++i) {
+        const auto token_id = input_data[i];
+        OPENVINO_ASSERT(token_id >= 0 && size_t(token_id) < vocab_size, "VocabDecoder: token id is out of the vocabulary range");
+        OPENVINO_ASSERT(vocab_begins[token_id] <= vocab_ends[token_id], "VocabDecoder: malformed vocabulary entry");
+        total_chars += vocab_ends[token_id] - vocab_begins[token_id];
+    }
+
+    // Set output shapes
+    outputs[0].set_shape({batch_size});
+    outputs[1].set_shape({batch_size});
+    outputs[2].set_shape({batch_size * seq_len});
+    outputs[3].set_shape({batch_size * seq_len});
+    outputs[4].set_shape({total_chars});
+
+    // Get pointers in the output tensors
+    auto new_ragged_begins = outputs[0].data();
+    auto new_ragged_ends   = outputs[1].data();
+    auto new_begins = outputs[2].data();
+    auto new_ends   = outputs[3].data();
+    auto new_chars  = outputs[4].data();
+    uint32_t char_offset = 0;
+
+    for(size_t batch = 0; batch < batch_size; ++batch) {
+        new_ragged_begins[batch] = batch * seq_len;
+        new_ragged_ends[batch]   = new_ragged_begins[batch] + seq_len;
+
+        for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) {
+            const auto token_id = input_data[seq];
+            // Copy straight from the vocabulary symbols; no intermediate per-token vector is needed.
+            const auto token_begin = vocab_chars + vocab_begins[token_id];
+            const auto token_end = vocab_chars + vocab_ends[token_id];
+
+            new_begins[seq] = char_offset;
+            std::copy(token_begin, token_end, new_chars + char_offset);
+            char_offset += token_end - token_begin;
+            new_ends[seq] = char_offset;
+        }
+    }
+    return true;
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp
new file mode 100644
index 000000000..14d91032c
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/vocab_decoder.hpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2018-2023 Intel
Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include 
+
+class VocabDecoder : public ov::op::Op {
+public:
+    OPENVINO_OP("VocabDecoder");
+
+    VocabDecoder () = default;
+
+    VocabDecoder(const ov::OutputVector& arguments) :
+        ov::op::Op(arguments) {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        return std::make_shared(inputs);
+    }
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        return true;
+    }
+
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+    bool has_evaluate() const override {
+        return true;
+    }
+};
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp
new file mode 100644
index 000000000..a4e853ec7
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.cpp
@@ -0,0 +1,118 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "wordpiece_tokenizer.hpp"
+#include "utils.hpp"
+#include "openvino/opsets/opset10.hpp"
+
+using namespace ov;
+using namespace ov::opset10;
+
+
+WordpieceTokenizer::WordpieceTokenizer(
+    const ov::OutputVector& arguments,
+    const std::string& suffix_indicator,
+    int max_bytes_per_word
+) :
+    ov::op::Op(arguments),
+    m_suffix_indicator(suffix_indicator),
+    m_max_bytes_per_word(max_bytes_per_word) {
+
+    constructor_validate_and_infer_types();
+}
+
+WordpieceTokenizer::WordpieceTokenizer(
+    const ov::OutputVector& arguments,
+    const std::shared_ptr& tokenizer,
+    const std::string& suffix_indicator,
+    int max_bytes_per_word
+) :
+    ov::op::Op(arguments),
+    m_tokenizer(tokenizer),
+    m_suffix_indicator(suffix_indicator),
+    m_max_bytes_per_word(max_bytes_per_word) {
+
+    // The tokenizer is built from the vocabulary (input 5) and unk_token_id (input 8); with fewer
+    // inputs `arguments[8]` would be read out of range, so the tokenizer is left unset in that case.
+    if (m_tokenizer == nullptr && arguments.size() > 8) {
+        // vocab constant folding doesn't work, get packed constant
+        auto packed_vocab_const = as_type_ptr(arguments[5].get_node_shared_ptr()->get_input_node_shared_ptr(0));
+        OPENVINO_ASSERT(packed_vocab_const, "WordpieceTokenizer: packed vocabulary constant is not found");
+        auto packed_vocab_buf = static_cast(packed_vocab_const->get_data_ptr());
+        auto vocab_size = *reinterpret_cast(packed_vocab_buf + 0);
+        auto vocab_begins = reinterpret_cast(packed_vocab_buf + 4);
+        auto vocab_ends = reinterpret_cast(packed_vocab_buf + 4 + 4);
+        auto vocab_chars = packed_vocab_buf + 4 + 4 + 4 * vocab_size;
+
+        auto unk_token_id_const = as_type_ptr(arguments[8].get_node_shared_ptr());
+        OPENVINO_ASSERT(unk_token_id_const, "WordpieceTokenizer: unk_token_id constant is not found");
+        auto unk_token_id = *static_cast(unk_token_id_const->get_data_ptr());
+
+        core::Vocab vocab;
+        std::string unk_token;
+        if(unk_token_id < 0)
+            unk_token_id += vocab_size;
+        for(size_t id = 0; id < vocab_size; ++id) {
+            auto token = std::string(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]);
+            vocab[token] = int32_t(id); // TODO: Check range
+            if(id == unk_token_id)
+                unk_token = token;
+        }
+        m_tokenizer = std::make_shared(vocab, unk_token, m_max_bytes_per_word, m_suffix_indicator, true);
+    }
+    constructor_validate_and_infer_types();
+}
+
+
+void WordpieceTokenizer::validate_and_infer_types() {
+    check_ragged_string_input(this, 0);
+    check_string_input(this, 5);
+    set_ragged_output(this, 0, get_input_partial_shape(0), element::i32);
+}
+
+
+bool WordpieceTokenizer::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    // m_tokenizer stays null when the op was created without a tokenizer and without the unk_token_id input.
+    OPENVINO_ASSERT(m_tokenizer, "WordpieceTokenizer cannot be evaluated: the tokenizer model is not initialized");
+    auto ragged_begins = inputs[0].data();
+    auto ragged_ends = inputs[1].data();
+    auto begins = inputs[2].data();
+    auto ends = inputs[3].data();
+    auto chars = inputs[4].data();
+
+    // Set output shapes
+    outputs[0].set_shape(inputs[0].get_shape());
+    outputs[1].set_shape(inputs[1].get_shape());
+    const size_t num_rows = inputs[0].get_size();
+
+
+    // FIXME: Not accurate estimation as there is theoretical possibility for re-use the same symbol area
+    // to represent different elements in ragged tensor
+    outputs[2].set_shape({inputs[4].get_size()});
+
+    // Get pointers in the output tensors
+    auto new_begins = outputs[0].data();
+    auto new_ends = outputs[1].data();
+    auto new_elems = outputs[2].data();
+    int32_t ragged_offset = 0;
+
+    for(size_t seq = 0; seq < num_rows; ++seq) {
+        new_begins[seq] = ragged_offset;
+
+        for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) {
+
+            auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]);
+            std::vector results = m_tokenizer->Tokenize(str);
+
+            for (const core::Token& token : results) {
+                OPENVINO_ASSERT(ragged_offset < outputs[2].get_size());
+                new_elems[ragged_offset++] = token.id_;
+            };
+        }
+        new_ends[seq] = ragged_offset;
+    }
+    outputs[2].set_shape({size_t(ragged_offset)});
+    return true;
+}
+
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp
new file mode 100644
index 000000000..c6e785e55
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/wordpiece_tokenizer.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include 
+#include "fast_tokenizer/models/models.h"
+
+using namespace paddlenlp::fast_tokenizer;
+
+#undef tokenizer
+
+class WordpieceTokenizer : public ov::op::Op {
+public:
+    OPENVINO_OP("WordpieceTokenizer");
+
+    WordpieceTokenizer () = default;
+    WordpieceTokenizer(
+        const ov::OutputVector& arguments,
+        const std::string& suffix_indicator = "##",
+        int max_bytes_per_word = 100
+    );
+    WordpieceTokenizer(
+        const ov::OutputVector& arguments,
+        const std::shared_ptr& tokenizer,
+        const std::string& suffix_indicator = "##",
+        int max_bytes_per_word = 100
+    );
+    void validate_and_infer_types() override;
+
+    std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        return std::make_shared(inputs, m_tokenizer, m_suffix_indicator, m_max_bytes_per_word);
+    }
+
+    bool
visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("suffix_indicator", m_suffix_indicator); + visitor.on_attribute("max_bytes_per_word", m_max_bytes_per_word); + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + + bool has_evaluate() const { + return true; + } + +private: + std::shared_ptr m_tokenizer; + std::string m_suffix_indicator = "##"; + int m_max_bytes_per_word = 100; // TODO: Can it be done outside the op as preprocessing of the input? +}; diff --git a/third-party-programs.txt b/third-party-programs.txt index 89624c7dc..1d9a9fcd6 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -3272,6 +3272,501 @@ limitations under the License. ------------------------------------------------------------- +fast_tokenizer + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. 
If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. 
You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +------------------------------------------------------------- + +re2 + +// Copyright (c) 2009 The RE2 Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------- + +icu4c + +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +------------------------------------------------------------- + +sentencepiece + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. 
+ +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a 
NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +------------------------------------------------------------- + vscode-extension-samples Copyright (c) Microsoft Corporation