Enabled wheel packaging
mryzhov committed Dec 1, 2023
1 parent 4baa7ed commit d370125
Showing 10 changed files with 178 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .ci/azure/linux.yml
@@ -172,7 +172,7 @@ jobs:
python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt
python -m pip install $(INSTALL_DIR)/tools/openvino-*.whl
python -m pip install $(OPENVINO_REPO_DIR)/tools/mo/
-python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[all]
+python -m pip install $(REPO_DIR)/modules/custom_operations/.[all]
workingDirectory: $(WORK_DIR)
displayName: 'Create virtual env'
2 changes: 1 addition & 1 deletion .ci/azure/mac.yml
@@ -163,7 +163,7 @@ jobs:
python -m pip install -r $(REPO_DIR)/modules/custom_operations/tests/requirements.txt
python -m pip install $(OPENVINO_REPO_DIR)/tools/mo/
python -m pip install $(INSTALL_DIR)/tools/openvino-*.whl
-python -m pip install $(REPO_DIR)/modules/custom_operations/user_ie_extensions/tokenizer/python/.[transformers]
+python -m pip install $(REPO_DIR)/modules/custom_operations/.[transformers]
workingDirectory: $(WORK_DIR)
displayName: 'Create virtual env'
7 changes: 7 additions & 0 deletions .ci/azure/windows.yml
@@ -104,6 +104,7 @@ jobs:
$(PYTHON_EXE) -m pip install -r $(OPENVINO_REPO_DIR)\src\bindings\python\requirements.txt
$(PYTHON_EXE) -m pip install -r $(REPO_DIR)\modules\custom_operations\tests\requirements.txt
$(PYTHON_EXE) -m pip install $(OPENVINO_REPO_DIR)\tools\mo
+$(PYTHON_EXE) -m pip install $(REPO_DIR)\modules\custom_operations\.[all]
powershell -command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))"
choco install opencv -y
workingDirectory: $(WORK_DIR)
@@ -180,3 +181,9 @@ jobs:
$(PYTHON_EXE) -m pytest -k "not sparse_conv" tests\run_tests.py
workingDirectory: $(REPO_DIR)\modules\custom_operations
displayName: 'Custom user operation tests'
+- script: |
+    call $(SETUPVARS) -pyver 3.8 && ^
+    $(PYTHON_EXE) -m pytest --tb=no tokenizers_test.py
+  workingDirectory: $(REPO_DIR)\modules\custom_operations\user_ie_extensions\tokenizer\python\tests
+  displayName: 'Tokenizers extension regression test'
58 changes: 58 additions & 0 deletions modules/custom_operations/pyproject.toml
@@ -0,0 +1,58 @@
[project]
name = "ov_tokenizer"
version = "0.0.1"
description = "Convert tokenizers into OpenVINO models"
requires-python = ">=3.8"
authors = [
{ name = "OpenVINO Developers", email = "[email protected]" },
]
classifiers = [
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]

dependencies = [
"openvino>=2023.1",
"numpy"
]

[project.optional-dependencies]
dev = [
"black",
"ruff",
"pytest",
]
transformers = [
"transformers[sentencepiece]"
]
all = [
"ov_tokenizer[dev,transformers]"
]


[tool.black]
line-length = 119
target-version = ["py38", "py39", "py310", "py311", "py312"]


[tool.ruff]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
line-length = 119

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]
"ov_tokenizer/hf_parser.py" = ["F821"]

[tool.ruff.isort]
lines-after-imports = 2

[build-system]
requires = [
"setuptools>=42",
"scikit-build~=0.17.0",
"cmake>=3.14"
]
build-backend = "setuptools.build_meta"
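
Note: the all extra is self-referential; installing ov_tokenizer[all] resolves the package's own dev and transformers extras, which is what lets the CI jobs above install everything with .[all]. A small sketch for checking the declared extras from an installed environment (the expected output is an assumption based on the table above):

import importlib.metadata

# Read the metadata generated from this pyproject.toml after installation.
md = importlib.metadata.metadata("ov_tokenizer")
print(md.get_all("Provides-Extra"))  # expected: ['dev', 'transformers', 'all']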
15 changes: 15 additions & 0 deletions modules/custom_operations/setup.py
@@ -0,0 +1,15 @@
from skbuild import setup
from skbuild import constants

setup(
    packages=["ov_tokenizer"],
    package_dir={"": "user_ie_extensions/tokenizer/python"},
    cmake_install_dir="user_ie_extensions/tokenizer/python/ov_tokenizer/libs",
    cmake_args=['-DCUSTOM_OPERATIONS:STRING=tokenizer',
                '-DBUILD_FAST_TOKENIZERS=OFF']
)

# When building extension modules `cmake_install_dir` should always be set to the
# location of the package you are building extension modules for.
# Specifying the installation directory in the CMakeLists subtly breaks the relative
# paths in the helloTargets.cmake file to all of the library components.
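
With cmake_install_dir pointing inside the package, the built wheel should ship the native extension and the copied fast_tokenizer libraries under ov_tokenizer/libs. A hedged sanity check after installing the wheel (the library names vary by platform and are illustrative):

import os

import ov_tokenizer

# The package's __init__.py (below) resolves the extension from <package>/libs
# when no explicit extension path is set.
libs_dir = os.path.join(os.path.dirname(ov_tokenizer.__file__), "libs")
print(sorted(os.listdir(libs_dir)))  # e.g. ['libcore_tokenizers.so', 'libuser_ov_extensions.so']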
27 changes: 24 additions & 3 deletions modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -96,9 +96,9 @@ if(tokenizer IN_LIST CUSTOM_OPERATIONS)
add_subdirectory(tokenizer)

# post build steps
-if(extra_dlls)
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy ${extra_dlls} $<TARGET_FILE_DIR:${TARGET_NAME}>)
+if(extra_libs)
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${extra_libs} $<TARGET_FILE_DIR:${TARGET_NAME}>)
endif()
endif()

@@ -108,3 +108,24 @@ target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_A

# TODO: remove
target_include_directories(${TARGET_NAME} PUBLIC ./include/)

+# Wheel packaging using skbuild
+if(DEFINED SKBUILD)
+    # Installing the extension module to the root of the package
+    if(LINUX)
+        set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN")
+        install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION .)
+    elseif(APPLE)
+        set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "@loader_path")
+        install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION .)
+    elseif(WIN32 AND X86_64)
+        install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION .)
+    else()
+        message(FATAL_ERROR "Unsupported build platform")
+    endif()
+
+    if(extra_libs)
+        install(FILES ${extra_libs} DESTINATION .)
+    endif()
+
+endif()
modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -163,12 +163,16 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}"
# Post build steps to copy core_tokenizers dependencies
#

-if(WIN32 AND X86_64)
-    if(BUILD_FAST_TOKENIZERS)
-        # TODO
-    else()
-        set(extra_dlls "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
-                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
-                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
+if(BUILD_FAST_TOKENIZERS)
+    # TODO
+else()
+    if(WIN32 AND X86_64)
+        set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
+                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
+                       "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
+    elseif(LINUX)
+        set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.so" PARENT_SCOPE)
+    elseif(APPLE)
+        set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.dylib" PARENT_SCOPE)
+    endif()
+endif()
modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
@@ -2,7 +2,45 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

+import os
+import sys
+
+import openvino
+from openvino.runtime.utils.node_factory import NodeFactory
+
from .convert_tokenizer import convert_tokenizer
-from .node_factory import init_extension
+from .node_factory import init_extension, _extension_path
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models

+_ext_name = "user_ov_extensions"
+if _extension_path:
+    # when the path to the extension is set manually
+    _ext_libs_path = os.path.dirname(_extension_path)
+else:
+    # python installation case
+    _ext_libs_path = os.path.join(os.path.dirname(__file__), "libs")
+
+if sys.platform == "win32":
+    _ext_path = os.path.join(_ext_libs_path, f'{_ext_name}.dll')
+    if os.path.isdir(_ext_libs_path):
+        # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH.
+        os.add_dll_directory(os.path.abspath(_ext_libs_path))
+    else:
+        sys.exit(f'Error: extension library path {_ext_libs_path} not found')
+elif sys.platform == "darwin":
+    _ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.dylib')
+elif sys.platform == "linux":
+    _ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.so')
+else:
+    sys.exit(f'Error: extension does not support platform {sys.platform}')
+
+# patching openvino
+old_core_init = openvino.runtime.Core.__init__
+def new_core_init(self, *k, **kw):
+    old_core_init(self, *k, **kw)
+    self.add_extension(_ext_path)
+openvino.runtime.Core.__init__ = new_core_init
+
+_factory = NodeFactory()
+_factory.add_extension(_ext_path)
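
The effect of the patch above: importing ov_tokenizer is enough to register the extension, and every openvino.runtime.Core created afterwards picks it up. A minimal usage sketch (assumes the wheel is installed):

import openvino
import ov_tokenizer  # noqa: F401  the import itself patches Core.__init__

core = openvino.runtime.Core()
# The extension is already attached; no manual core.add_extension() call is
# needed before reading models that use the custom tokenizer operations.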
modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
@@ -25,7 +25,7 @@
TOKENIZER_DECODER_NAME,
TOKENIZER_ENCODER_NAME,
)
-from .node_factory import factory
+from . import _factory
from .tokenizer_pipeline import (
BPETokenizationStep,
BytesToCharsStep,
@@ -365,7 +365,7 @@ def convert_sentencepiece_model_tokenizer(
)
add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False

-tokenizer_node = factory.create(
+tokenizer_node = _factory.create(
"SentencepieceTokenizer",
[sp_model_node, input_node],
{
@@ -383,7 +383,7 @@

default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type)
broadcast = opset.broadcast(default_value, dense_shape)
-scatternd_input_ids = factory.create(
+scatternd_input_ids = _factory.create(
"ScatterNDUpdate",
[broadcast, indices, values], # FIXME: pad left side instead of right
)
@@ -399,7 +399,7 @@
outputs = scatternd_input_ids.outputs()

if add_attention_mask:
-attention_mask = factory.create(
+attention_mask = _factory.create(
"ScatterNDUpdate",
[
broadcast,
@@ -432,15 +432,15 @@ def convert_sentencepiece_model_tokenizer(
def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model:
token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence)

-decoder = factory.create(
+decoder = _factory.create(
"SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer",
[sp_model_node, token_ids],
).outputs()

if streaming_decoder:
decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)

-string_output = factory.create("StringTensorPack", decoder).outputs()
+string_output = _factory.create("StringTensorPack", decoder).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
tokenizer_decoder.validate_nodes_and_infer_types()
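
For context, the conversion path modified above can be exercised end to end roughly as follows. A hedged sketch: the checkpoint name is an arbitrary sentencepiece-based example, and convert_tokenizer returning a single encoder Model is an assumption based on its usage here.

import openvino
from transformers import AutoTokenizer  # needs the transformers extra

from ov_tokenizer import convert_tokenizer, pack_strings

hf_tokenizer = AutoTokenizer.from_pretrained("t5-small")  # arbitrary example
ov_model = convert_tokenizer(hf_tokenizer)  # assumed return: an openvino Model

core = openvino.runtime.Core()  # Core is already patched by the import above
compiled = core.compile_model(ov_model, "CPU")
packed = pack_strings(["Hello, world!"])  # pack a batch of strings for input
print(compiled(packed))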