[TOKENIZERS] Ported PRs from master (#801)
* [TOKENIZERS] Disabled C4703 (#796)

* disabled error C4703

* Update modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt

---------

Co-authored-by: Ilya Lavrenov <[email protected]>

* [Tokenizer] Add CLI Tokenizer Converter (#792)

* Add CLI Tokenizer Converter

* Fix space

* Add more flags to CLI tool

* [TOKENIZERS] Update license field and version (#793)

* update license field and version

* flex dependency in master

* removed version of openvino in master

* [TOKENIZERS] Extended extension searching paths (#797)

* extended site-packages paths

* Apply suggestions from code review

Co-authored-by: Artur Paniukov <[email protected]>

---------

Co-authored-by: Artur Paniukov <[email protected]>

* [TOKENIZERS] added build of tokenizers in wheel (#798)

* added build tokenizers for wheel

* fixed azure pipeline

* [Tokenizers] Update README.md (#799)

* Fix --streaming-detokenizer flag

* Rewrite README.md

* Rewrite README.md

* Rewrite README.md

---------

Co-authored-by: Mikhail Ryzhov <[email protected]>
Co-authored-by: Artur Paniukov <[email protected]>
3 people authored Dec 22, 2023
1 parent 99a9928 commit b32281d
Showing 11 changed files with 290 additions and 61 deletions.
3 changes: 3 additions & 0 deletions .ci/azure/linux.yml
@@ -188,6 +188,9 @@ jobs:
source $(WORK_DIR)/.env3/bin/activate && source $(SETUPVARS)
python -m pip install build
python -m build --wheel --outdir $(BUILD_WHEEL_DIR) $(REPO_DIR)/modules/custom_operations
env:
CMAKE_ARGS: '-DBUILD_FAST_TOKENIZERS=ON'
CMAKE_GENERATOR: 'Unix Makefiles'
workingDirectory: $(WORK_DIR)
displayName: 'Build tokenizers wheel'
9 changes: 6 additions & 3 deletions modules/custom_operations/pyproject.toml
@@ -1,9 +1,10 @@
[project]
name = "openvino-tokenizers"
version = "2023.3.0"
version = "2023.3.0.0"
description = "Convert tokenizers into OpenVINO models"
requires-python = ">=3.8"
readme = {file = "user_ie_extensions/tokenizer/python/README.md", content-type="text/markdown"}
license = {text = "OSI Approved :: Apache Software License"}

authors = [
{ name = "OpenVINO Developers", email = "[email protected]" },
@@ -17,8 +18,7 @@ classifiers = [
]

dependencies = [
"openvino",
"numpy"
'openvino'
]

[project.optional-dependencies]
@@ -39,6 +39,9 @@ all = [
"openvino_tokenizers[dev,transformers,tiktoken]"
]

[project.scripts]
convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"

[tool.ruff]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
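The `[project.scripts]` entry added above wires the `convert_tokenizer` command from PR #792 to `openvino_tokenizers.cli:convert_hf_tokenizer`. That module is not part of this diff, so the snippet below is only a hypothetical sketch of what such an entry point typically looks like: the positional model name, `--with-detokenizer`, and `-o` mirror the CLI example in the README further down, while the remaining argument names and the `with_detokenizer=` keyword are assumptions.

```python
# Hypothetical sketch of a console-script target such as convert_hf_tokenizer.
# Not the repository's cli.py; names beyond the positional model id,
# --with-detokenizer, and -o/--output are assumptions.
from argparse import ArgumentParser
from pathlib import Path

from openvino import save_model
from openvino_tokenizers import convert_tokenizer
from transformers import AutoTokenizer


def convert_hf_tokenizer() -> None:
    parser = ArgumentParser(description="Convert a HuggingFace tokenizer into OpenVINO models")
    parser.add_argument("name", help="Model id on the HuggingFace Hub or path to a tokenizer saved on disk")
    parser.add_argument("-o", "--output", type=Path, default=Path("."), help="Output directory")
    parser.add_argument("--with-detokenizer", action="store_true", help="Also convert the detokenizer")
    args = parser.parse_args()

    hf_tokenizer = AutoTokenizer.from_pretrained(args.name)
    converted = convert_tokenizer(hf_tokenizer, with_detokenizer=args.with_detokenizer)
    models = converted if isinstance(converted, tuple) else (converted,)

    args.output.mkdir(parents=True, exist_ok=True)
    for model, file_name in zip(models, ("openvino_tokenizer.xml", "openvino_detokenizer.xml")):
        save_model(model, str(args.output / file_name))


if __name__ == "__main__":
    convert_hf_tokenizer()
```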
modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -8,8 +8,6 @@ endif()

option(BUILD_FAST_TOKENIZERS OFF)

# to build only direct dependencies
set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL ON)

#
# Compile flags
@@ -31,6 +29,11 @@ if(SUGGEST_OVERRIDE_SUPPORTED)
set(cxx_flags "${cxx_flags} -Wno-suggest-override")
endif()

if(WIN32 AND X86_64)
# disable compiler warning C4703
set(cxx_flags "${cxx_flags} /wd4703")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${cxx_flags} ${c_cxx_flags}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${c_cxx_flags}")

@@ -168,8 +171,12 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}"
# Post build steps to copy core_tokenizers dependencies
#

set_property(DIRECTORY ${sentencepiece_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL ON)

if(BUILD_FAST_TOKENIZERS)
# TODO
install(TARGETS core_tokenizers
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
else()
if(WIN32 AND X86_64)
set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -1,46 +1,89 @@
# OpenVINO Tokenizers

OpenVINO Tokenizers adds text processing operations to OpenVINO.

## Features

- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer:
- Fast tokenizers based on Wordpiece and BPE models
- Slow tokenizers based on SentencePiece model file
- Perform tokenization and detokenization without third-party dependencies
- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer
- Combine OpenVINO models into a single model
- Add greedy decoding pipeline to text generation model

## Installation

1. Install [OpenVINO Runtime for C++](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#for-c-developers).
2. (Recommended) Create and activate virtual env:
(Recommended) Create and activate virtual env:
```bash
python3 -m venv venv
source venv/bin/activate
# or
conda create --name openvino_tokenizer
conda activate openvino_tokenizer
```
3. Go to `modules/custom_operations` and run:

### Minimal Installation

Use minimal installation when you have a converted OpenVINO tokenizer:
```bash
pip install openvino-tokenizers
# or
conda install -c conda-forge openvino openvino-tokenizers
```

### Convert Tokenizers Installation

If you want to convert HuggingFace tokenizers into OpenVINO tokenizers:
```bash
pip install openvino-tokenizers[transformers]
# or
conda install -c conda-forge openvino openvino-tokenizers && pip install transformers[sentencepiece] tiktoken
```

### Build and install from source after [OpenVINO installation](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html)
```bash
source path/to/installed/openvino/setupvars.sh
git clone https://github.com/openvinotoolkit/openvino_contrib.git
cd openvino_contrib/modules/custom_operations/
pip install -e .[transformers]
```

### Build and install for development
```bash
# to use converted tokenizers or models combined with tokenizers
pip install .
# to convert tokenizers from transformers library
pip install .[transformers]
# for development and testing the library
source path/to/installed/openvino/setupvars.sh
git clone https://github.com/openvinotoolkit/openvino_contrib.git
cd openvino_contrib/modules/custom_operations/
pip install -e .[all]
# verify installation by running tests
cd user_ie_extensions/tokenizer/python/tests/
pytest .
```

## Usage

### Convert HuggingFace tokenizer

OpenVINO Tokenizers ships with a CLI tool that can convert tokenizers from the HuggingFace Hub
or HuggingFace tokenizers saved on disk:

```shell
convert_tokenizer codellama/CodeLlama-7b-hf --with-detokenizer -o output_dir
```

There is also a `convert_tokenizer` function that can convert a tokenizer Python object.

```python
import numpy as np
from transformers import AutoTokenizer
from openvino import compile_model
from openvino import compile_model, save_model
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ov_tokenizer = convert_tokenizer(hf_tokenizer)

compiled_tokenzier = compile_model(ov_tokenizer)
text_input = "Test string"
text_input = ["Test string"]

hf_output = hf_tokenizer([text_input], return_tensors="np")
ov_output = compiled_tokenzier([text_input])
hf_output = hf_tokenizer(text_input, return_tensors="np")
ov_output = compiled_tokenzier(text_input)

for output_name in hf_output:
print(f"OpenVINO {output_name} = {ov_output[output_name]}")
@@ -51,10 +94,20 @@ for output_name in hf_output:
# HuggingFace token_type_ids = [[0 0 0 0]]
# OpenVINO attention_mask = [[1 1 1 1]]
# HuggingFace attention_mask = [[1 1 1 1]]

# save tokenizer for later use
save_model(ov_tokenizer, "openvino_tokenizer.xml")

loaded_tokenizer = compile_model("openvino_tokenizer.xml")
loaded_ov_output = loaded_tokenizer(text_input)
for output_name in hf_output:
assert np.all(loaded_ov_output[output_name] == ov_output[output_name])
```

### Connect Tokenizer to a Model

To infer and convert the original model, install `torch` or `torch-cpu` into the virtual environment.

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from openvino import compile_model, convert_model
@@ -83,10 +136,12 @@ print(f"HuggingFace logits {hf_output.logits}")
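The hunks above show only the start and end of the "Connect Tokenizer to a Model" example. Below is a minimal sketch of the flow this section describes, assuming `connect_models(tokenizer, model)` from `openvino_tokenizers` takes the two models in that order and using `bert-base-uncased` as an illustrative checkpoint rather than the one used in the full README:

```python
# Minimal sketch, not the README's exact example: the checkpoint, the
# connect_models argument order, and the 'logits' output name are assumptions.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from openvino import compile_model, convert_model, save_model
from openvino_tokenizers import connect_models, convert_tokenizer

checkpoint = "bert-base-uncased"  # illustrative sequence-classification checkpoint
hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
hf_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

text_input = ["Test string"]
hf_input = hf_tokenizer(text_input, return_tensors="pt")
hf_output = hf_model(**hf_input)

ov_tokenizer = convert_tokenizer(hf_tokenizer)
ov_model = convert_model(hf_model, example_input=hf_input.data)
combined_model = connect_models(ov_tokenizer, ov_model)  # tokenizer outputs feed the model inputs
save_model(combined_model, "combined_model.xml")

compiled_combined_model = compile_model(combined_model)
ov_output = compiled_combined_model(text_input)  # raw strings in, logits out

print(f"OpenVINO logits: {ov_output['logits']}")
print(f"HuggingFace logits {hf_output.logits}")
```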

### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer

To work with converted tokenizer and detokenizer, numpy string tensors are used.
Importing `openvino_tokenizers` will add all tokenizer-related operations to OpenVINO,
after which you can work with saved tokenizers and detokenizers.

```python
import numpy as np
import openvino_tokenizers
from openvino import Core

core = Core()
@@ -160,17 +215,27 @@ print(f"HuggingFace output string: `{hf_output}`")
# HuggingFace output string: `['Quick brown fox was walking through the forest. He was looking for something']`
```

## Test Coverage
## Supported Tokenizer Types

| Huggingface <br/>Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
|---------------------------------|----------------------|----------|------------|
| Fast | WordPiece |||
| | BPE |||
| | Unigram |||
| Legacy | SentencePiece .model |||
| Custom | tiktoken |||

## Test Results

This report is autogenerated and includes tokenizers and detokenizers tests. To update it run pytest with `--update_readme` flag.
This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column shows the percentage of test strings for which the results of OpenVINO and HuggingFace Tokenizers are the same. To update the report, run `pytest tokenizers_test.py --update_readme` in the `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.

### Coverage by Tokenizer Type
### Output Match by Tokenizer Type

<table>
<thead>
<tr>
<th >Tokenizer Type</th>
<th >Pass Rate, %</th>
<th >Output Matched, %</th>
<th >Number of Tests</th>
</tr>
</thead>
@@ -198,14 +263,14 @@ This report is autogenerated and includes tokenizers and detokenizers tests. To
</tbody>
</table>

### Coverage by Model Type
### Output Match by Model

<table>
<thead>
<tr>
<th >Tokenizer Type</th>
<th >Model</th>
<th >Pass Rate, %</th>
<th >Output Matched, %</th>
<th >Number of Tests</th>
</tr>
</thead>
modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py
@@ -4,7 +4,8 @@
import functools
import os
import sys
import sysconfig
from itertools import chain
import site
from pathlib import Path

import openvino
@@ -14,15 +15,6 @@
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models


_extension_path = os.environ.get("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
if _extension_path:
# when the path to the extension set manually
_ext_libs_path = Path(_extension_path).parent
else:
# python installation case
_ext_libs_path = Path(sysconfig.get_paths()["purelib"]) / __name__ / "lib"

_ext_name = "user_ov_extensions"
if sys.platform == "win32":
_ext_name = f"{_ext_name}.dll"
@@ -33,23 +25,34 @@
else:
sys.exit(f"Error: extension does not support the platform {sys.platform}")

_ext_path = _ext_libs_path / _ext_name
if not _ext_path.is_file():
# Case when the library can be found in the PATH/LD_LIBRAY_PATH
_ext_path = _ext_name
# when the path to the extension set manually
_extension_path = os.environ.get("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
if _extension_path and Path(_extension_path).is_file():
# when the path to the extension set manually
_ext_path = Path(_extension_path)
else:
site_packages = chain((Path(__file__).parent.parent, ), site.getusersitepackages(), site.getsitepackages())
_ext_path = next(
(
ext
for site_package in map(Path, site_packages)
if (ext := site_package / __name__ / "lib" / _ext_name).is_file()
),
_ext_name # Case when the library can be found in the PATH/LD_LIBRAY_PATH
)

del _ext_name
del _ext_libs_path
del _extension_path

# patching openvino
old_core_init = openvino.runtime.Core.__init__


@functools.wraps(old_core_init)
def new_core_init(self, *args, **kwargs):
old_core_init(self, *args, **kwargs)
self.add_extension(str(_ext_path)) # Core.add_extension doesn't support Path object


openvino.runtime.Core.__init__ = new_core_init

_factory = NodeFactory()
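The new lookup above honors `OV_TOKENIZER_PREBUILD_EXTENSION_PATH` first (and now requires it to point at an existing file), then falls back to searching the package directory and site-packages for `openvino_tokenizers/lib/<extension name>`. A minimal sketch of pointing the package at a locally built extension before the first import follows; the Linux library file name is an assumption, while the Windows name `user_ov_extensions.dll` comes from the code above.

```python
import os

# Must be set before openvino_tokenizers is imported for the first time.
# The .so name below is an assumed Linux build artifact; on Windows the code
# above looks for user_ov_extensions.dll.
os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "/path/to/build/libuser_ov_extensions.so"

import openvino_tokenizers  # noqa: E402,F401  # registers the custom operations
from openvino import Core  # noqa: E402

core = Core()  # the patched Core.__init__ adds the extension automatically
```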