diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml
index 4c41f1ddd..37b9e94cf 100644
--- a/.ci/azure/linux.yml
+++ b/.ci/azure/linux.yml
@@ -172,7 +172,7 @@ jobs:
       python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt
       python -m pip install $(INSTALL_DIR)/tools/openvino-*.whl
       python -m pip install $(OPENVINO_REPO_DIR)/tools/mo/
-      python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all]
+      python -m pip install $(REPO_DIR)/modules/custom_operations/.[all]
     workingDirectory: $(WORK_DIR)
     displayName: 'Create virtual env'
 
diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml
index 121a19029..946f57609 100644
--- a/.ci/azure/mac.yml
+++ b/.ci/azure/mac.yml
@@ -163,7 +163,7 @@ jobs:
       python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt
       python -m pip install $(OPENVINO_REPO_DIR)/tools/mo/
       python -m pip install $(INSTALL_DIR)/tools/openvino-*.whl
-      python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[transformers]
+      python -m pip install $(REPO_DIR)/modules/custom_operations/.[transformers]
     workingDirectory: $(WORK_DIR)
     displayName: 'Create virtual env'
 
diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml
index c04b95af8..3c0536a29 100644
--- a/.ci/azure/windows.yml
+++ b/.ci/azure/windows.yml
@@ -104,6 +104,7 @@ jobs:
       $(PYTHON_EXE) -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\requirements.txt
       $(PYTHON_EXE) -m pip install -r $(REPO_DIR)\modules\custom_operations\tests\requirements.txt
       $(PYTHON_EXE) -m pip install $(OPENVINO_REPO_DIR)\tools\mo
+      $(PYTHON_EXE) -m pip install $(REPO_DIR)\modules\custom_operations\.[all]
       powershell -command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))"
       choco install opencv -y
     workingDirectory: $(WORK_DIR)
@@ -180,3 +181,9 @@ jobs:
       $(PYTHON_EXE) -m pytest -k "not sparse_conv" tests\run_tests.py
     workingDirectory: $(REPO_DIR)\modules\custom_operations
     displayName: 'Custom user operation tests'
+
+  - script: |
+      call $(SETUPVARS) -pyver 3.8 && ^
+      $(PYTHON_EXE) -m pytest --tb=no tokenizers_test.py
+    workingDirectory: $(REPO_DIR)\modules\custom_operations\user_ie_extensions\tokenizer\python\tests
+    displayName: 'Tokenizers extension regression test'
diff --git a/modules/custom_operations/pyproject.toml b/modules/custom_operations/pyproject.toml
new file mode 100644
index 000000000..416bc6ccf
--- /dev/null
+++ b/modules/custom_operations/pyproject.toml
@@ -0,0 +1,58 @@
+[project]
+name = "ov_tokenizer"
+version = "0.0.1"
+description = "Convert tokenizers into OpenVINO models"
+requires-python = ">=3.8"
+authors = [
+    { name = "OpenVINO Developers", email = "openvino@intel.com" },
+]
+classifiers = [
+    'Programming Language :: Python :: 3.8',
+    'Programming Language :: Python :: 3.9',
+    'Programming Language :: Python :: 3.10',
+    'Programming Language :: Python :: 3.11',
+]
+
+dependencies = [
+    "openvino>=2023.1",
+    "numpy"
+]
+
+[project.optional-dependencies]
+dev = [
+    "black",
+    "ruff",
+    "pytest",
+]
+transformers = [
+    "transformers[sentencepiece]"
+]
+all = [
+    "ov_tokenizer[dev,transformers]"
+]
+
+
+[tool.black]
+line-length = 119
+target-version = ["py38", "py39", "py310", "py311", "py312"]
+
+
+[tool.ruff]
+ignore = ["C901", "E501", "E741", "W605"]
+select = ["C", "E", "F", "I", "W"]
+line-length = 119
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["F401"]
+"ov_tokenizer/hf_parser.py" = ["F821"]
+
+[tool.ruff.isort]
+lines-after-imports = 2
+
+[build-system]
+requires = [
+    "setuptools>=42",
+    "scikit-build~=0.17.0",
+    "cmake>=3.14"
+]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/modules/custom_operations/setup.py b/modules/custom_operations/setup.py
new file mode 100644
index 000000000..710b52816
--- /dev/null
+++ b/modules/custom_operations/setup.py
@@ -0,0 +1,15 @@
+from skbuild import setup
+from skbuild import constants
+
+setup(
+    packages=["ov_tokenizer"],
+    package_dir={"": "user_ie_extensions/tokenizer/python"},
+    cmake_install_dir="user_ie_extensions/tokenizer/python/ov_tokenizer/libs",
+    cmake_args=['-DCUSTOM_OPERATIONS:STRING=tokenizer',
+                '-DBUILD_FAST_TOKENIZERS=OFF']
+)
+
+# When building extension modules `cmake_install_dir` should always be set to the
+# location of the package you are building extension modules for.
+# Specifying the installation directory in the CMakeLists subtly breaks the relative
+# paths in the helloTargets.cmake file to all of the library components.
\ No newline at end of file
diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
index 433b2b10a..bc5e08031 100644
--- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -96,9 +96,9 @@ if(tokenizer IN_LIST CUSTOM_OPERATIONS)
     add_subdirectory(tokenizer)
 
     # post build steps
-    if(extra_dlls)
-        add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${extra_dlls} $<TARGET_FILE_DIR:${TARGET_NAME}>)
+    if(extra_libs)
+        add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy ${extra_libs} $<TARGET_FILE_DIR:${TARGET_NAME}>)
     endif()
 endif()
 
@@ -108,3 +108,24 @@ target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_A
 
 # TODO: remove
 target_include_directories(${TARGET_NAME} PUBLIC ./include/)
+
+# Wheel packaging using skbuild
+if(DEFINED SKBUILD)
+    # Installing the extension module to the root of the package
+    if(LINUX)
+        set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN")
+        install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION .)
+    elseif(APPLE)
+        set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "@loader_path")
+        install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION .)
+    elseif(WIN32 AND X86_64)
+        install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION .)
+    else()
+        message(FATAL_ERROR "Unsupported build platform")
+    endif()
+
+    if(extra_libs)
+        install(FILES ${extra_libs} DESTINATION .)
+    endif()
+
+endif()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
index 1bc875238..c4885a268 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -163,12 +163,16 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}"
 #
 # Post build steps to copy core_tokenizers dependencies
 #
-if(WIN32 AND X86_64)
-    if(BUILD_FAST_TOKENIZERS)
-        # TODO
-    else()
-        set(extra_dlls "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
-                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
-                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
+if(BUILD_FAST_TOKENIZERS)
+    # TODO
+else()
+    if(WIN32 AND X86_64)
+        set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
+                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
+                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
+    elseif(LINUX)
+        set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.so" PARENT_SCOPE)
+    elseif(APPLE)
+        set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.dylib" PARENT_SCOPE)
     endif()
 endif()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
index ce757b861..5b86aba3a 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
@@ -2,7 +2,45 @@
 # Copyright (C) 2018-2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import os
+import sys
+
+import openvino
+from openvino.runtime.utils.node_factory import NodeFactory
+
 from .convert_tokenizer import convert_tokenizer
-from .node_factory import init_extension
+from .node_factory import init_extension, _extension_path
 from .str_pack import pack_strings, unpack_strings
 from .utils import add_greedy_decoding, connect_models
+
+_ext_name = "user_ov_extensions"
+if _extension_path:
+    # when the path to the extension is set manually
+    _ext_libs_path = os.path.dirname(_extension_path)
+else:
+    # python installation case
+    _ext_libs_path = os.path.join(os.path.dirname(__file__), "libs")
+
+if sys.platform == "win32":
+    _ext_path = os.path.join(_ext_libs_path, f'{_ext_name}.dll')
+    if os.path.isdir(_ext_libs_path):
+        # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH.
+        os.add_dll_directory(os.path.abspath(_ext_libs_path))
+    else:
+        sys.exit(f'Error: extension library path {_ext_libs_path} not found')
+elif sys.platform == "darwin":
+    _ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.dylib')
+elif sys.platform == "linux":
+    _ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.so')
+else:
+    sys.exit(f'Error: extension does not support platform {sys.platform}')
+
+# patching openvino
+old_core_init = openvino.runtime.Core.__init__
+def new_core_init(self, *k, **kw):
+    old_core_init(self, *k, **kw)
+    self.add_extension(_ext_path)
+openvino.runtime.Core.__init__ = new_core_init
+
+_factory = NodeFactory()
+_factory.add_extension(_ext_path)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
index 401c8ea2b..92c27b03b 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
@@ -25,7 +25,7 @@
     TOKENIZER_DECODER_NAME,
     TOKENIZER_ENCODER_NAME,
 )
-from .node_factory import factory
+from . import _factory
 from .tokenizer_pipeline import (
     BPETokenizationStep,
     BytesToCharsStep,
@@ -365,7 +365,7 @@ def convert_sentencepiece_model_tokenizer(
     )
 
     add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False
-    tokenizer_node = factory.create(
+    tokenizer_node = _factory.create(
         "SentencepieceTokenizer",
         [sp_model_node, input_node],
         {
@@ -383,7 +383,7 @@ def convert_sentencepiece_model_tokenizer(
     default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type)
     broadcast = opset.broadcast(default_value, dense_shape)
 
-    scatternd_input_ids = factory.create(
+    scatternd_input_ids = _factory.create(
         "ScatterNDUpdate",
         [broadcast, indices, values],  # FIXME: pad left side instead of right
     )
@@ -399,7 +399,7 @@ def convert_sentencepiece_model_tokenizer(
     outputs = scatternd_input_ids.outputs()
 
     if add_attention_mask:
-        attention_mask = factory.create(
+        attention_mask = _factory.create(
             "ScatterNDUpdate",
             [
                 broadcast,
@@ -432,7 +432,7 @@ def convert_sentencepiece_model_tokenizer(
 def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model:
     token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"]))  # (batch, sequence)
 
-    decoder = factory.create(
+    decoder = _factory.create(
         "SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer",
         [sp_model_node, token_ids],
     ).outputs()
@@ -440,7 +440,7 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode
     if streaming_decoder:
         decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)
 
-    string_output = factory.create("StringTensorPack", decoder).outputs()
+    string_output = _factory.create("StringTensorPack", decoder).outputs()
     string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
     tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
     tokenizer_decoder.validate_nodes_and_infer_types()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
index 74654344a..75e25d45f 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
@@ -22,7 +22,7 @@
     TOKENIZER_DECODER_NAME,
     TOKENIZER_ENCODER_NAME,
 )
-from .node_factory import factory
+from . import _factory
 from .str_pack import pack_string, pack_strings
 
 
@@ -61,7 +61,7 @@ def create_string_constant_node(value: Union[str, List[str]]) -> op.Constant:
     else:
         # support only 1D strings for now
         ps = pack_strings(value)
-        return factory.create("StringTensorUnpack", op.Constant(ps).outputs())
+        return _factory.create("StringTensorUnpack", op.Constant(ps).outputs())
 
 
 @dataclass
@@ -74,7 +74,7 @@ class NormalizeUnicode(NormalizationStep):
     normalization_form: str = "NFD"
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
-        return factory.create(
+        return _factory.create(
             "NormalizeUnicode",
             input_nodes,
             {"normalization_form": self.normalization_form},
@@ -84,7 +84,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
 @dataclass
 class CaseFoldStep(NormalizationStep):
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
-        return factory.create("CaseFold", input_nodes).outputs()
+        return _factory.create("CaseFold", input_nodes).outputs()
 
 
 @dataclass
@@ -122,7 +122,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
                 self.create_string_constant_node(self.replace_term),
             )
         )
-        return factory.create("RegexNormalization", input_nodes).outputs()
+        return _factory.create("RegexNormalization", input_nodes).outputs()
 
 
 @dataclass
@@ -233,7 +233,7 @@ def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
-        return factory.create(
+        return _factory.create(
             "RegexSplit",
             input_nodes,
             {
@@ -263,7 +263,7 @@ class BytesToCharsStep(PreTokenizatinStep):
     """Maps chars to other chars for Byte-level BPE Tokenizer"""
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
-        return factory.create(
+        return _factory.create(
             "BytesToChars",
             input_nodes,
         ).outputs()
@@ -307,7 +307,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
                 *as_node(self.unk_token_id).outputs(),
             )
         )
-        return factory.create(
+        return _factory.create(
             "WordpieceTokenizer",
             input_nodes,
             {
@@ -379,7 +379,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
                 *self.create_string_constant_node(self.merges).outputs(),
             )
         )
-        return factory.create(
+        return _factory.create(
             "BPETokenizer",
             input_nodes,
             {
@@ -587,7 +587,7 @@ def get_ov_subgraph(self, input_nodes):
                 raise UserInputError(f"Unexpected node type in CombineSegments: {type(node)}")
             op_inputs.append(make_constant_node(self.segment_ids, Type.i32).output(0))
 
-        return factory.create("CombineSegments", op_inputs).outputs()
+        return _factory.create("CombineSegments", op_inputs).outputs()
 
 
 @dataclass
@@ -632,7 +632,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
 
         names = [TOKEN_IDS_INPUT_NAME, TOKEN_TYPE_IDS_INPUT_NAME][: len(input_nodes) // 3]
         for i, name in enumerate(names):
-            cur_outputs = factory.create(
+            cur_outputs = _factory.create(
                 "RaggedToDense",
                 input_nodes[3 * i : 3 * (i + 1)] + max_length.outputs() + make_constant_node(0, Type.i32).outputs(),
             ).outputs()
@@ -662,13 +662,13 @@ def get_vocab_node_outputs(self) -> Optional[List[Output]]:
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(self.get_vocab_node_outputs())
-        return factory.create("VocabDecoder", input_nodes, {}).outputs()
+        return _factory.create("VocabDecoder", input_nodes, {}).outputs()
 
 
 @dataclass
 class CharsToBytesStep(DecodingStep):
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
-        return factory.create("CharsToBytes", input_nodes, {}).outputs()
+        return _factory.create("CharsToBytes", input_nodes, {}).outputs()
 
 
 @dataclass
@@ -690,7 +690,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
                 *self.create_string_constant_node(self.replace_term).outputs(),
             )
         )
-        return factory.create("RegexNormalization", input_nodes).outputs()
+        return _factory.create("RegexNormalization", input_nodes).outputs()
 
     @classmethod
     def replace_sp_spaces(cls) -> "RegexDecodingStep":
@@ -733,7 +733,7 @@ def get_encoder_ov_subgraph(self) -> Model:
 
         processing_outputs = []
         for input_node in string_inputs:
-            input_node = factory.create("StringTensorUnpack", input_node.outputs()).outputs()
+            input_node = _factory.create("StringTensorUnpack", input_node.outputs()).outputs()
             for step in self.normalization_steps:
                 input_node = step.get_ov_subgraph(input_node)
             input_node = self.add_ragged_dimension(input_node)
@@ -783,7 +783,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
             pipeline_step = step.get_ov_subgraph(input_nodes)
             input_nodes = pipeline_step
 
-        return factory.create("StringTensorPack", input_nodes).outputs()
+        return _factory.create("StringTensorPack", input_nodes).outputs()
 
     def get_decoder_ov_subgraph(self) -> Model:
         input_node = op.Parameter(Type.i32, PartialShape(["?", "?"]))