diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index 925c049cc..760a6de6c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -1,46 +1,89 @@
# OpenVINO Tokenizers
+OpenVINO Tokenizers adds text processing operations to OpenVINO.
+
## Features
-- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer:
- - Fast tokenizers based on Wordpiece and BPE models
- - Slow tokenizers based on SentencePiece model file
+- Perform tokenization and detokenization without third-party dependencies
+- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer
- Combine OpenVINO models into a single model
- Add greedy decoding pipeline to text generation model
## Installation
-1. Install [OpenVINO Runtime for C++](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#for-c-developers).
-2. (Recommended) Create and activate virtual env:
+(Recommended) Create and activate virtual env:
```bash
python3 -m venv venv
source venv/bin/activate
+ # or
+conda create --name openvino_tokenizer
+conda activate openvino_tokenizer
```
-3. Go to `modules/custom_operations` and run:
+
+### Minimal Installation
+
+Use the minimal installation when you already have a converted OpenVINO tokenizer:
+```bash
+pip install openvino-tokenizers
+ # or
+conda install -c conda-forge openvino openvino-tokenizers
+```
+
+### Convert Tokenizers Installation
+
+If you want to convert HuggingFace tokenizers into OpenVINO tokenizers:
+```bash
+pip install openvino-tokenizers[transformers]
+ # or
+conda install -c conda-forge openvino openvino-tokenizers && pip install transformers[sentencepiece] tiktoken
+```
+
+### Build and install from source after [OpenVINO installation](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html)
+```bash
+source path/to/installed/openvino/setupvars.sh
+git clone https://github.com/openvinotoolkit/openvino_contrib.git
+cd openvino_contrib/modules/custom_operations/
+pip install -e .[transformers]
+```
+
+### Build and install for development
```bash
-# to use converted tokenizers or models combined with tokenizers
-pip install .
-# to convert tokenizers from transformers library
-pip install .[transformers]
-# for development and testing the library
+source path/to/installed/openvino/setupvars.sh
+git clone https://github.com/openvinotoolkit/openvino_contrib.git
+cd openvino_contrib/modules/custom_operations/
pip install -e .[all]
+# verify installation by running tests
+cd user_ie_extensions/tokenizer/python/tests/
+pytest .
```
+## Usage
+
### Convert HuggingFace tokenizer
+OpenVINO Tokenizers ships with a CLI tool that can convert tokenizers from the Huggingface Hub
+or Huggingface tokenizers saved on disk:
+
+```shell
+convert_tokenizer codellama/CodeLlama-7b-hf --with-detokenizer -o output_dir
+```
+
+There is also a `convert_tokenizer` function that can convert a tokenizer Python object.
+
```python
+import numpy as np
from transformers import AutoTokenizer
-from openvino import compile_model
+from openvino import compile_model, save_model
from openvino_tokenizers import convert_tokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ov_tokenizer = convert_tokenizer(hf_tokenizer)
compiled_tokenzier = compile_model(ov_tokenizer)
-text_input = "Test string"
+text_input = ["Test string"]
-hf_output = hf_tokenizer([text_input], return_tensors="np")
-ov_output = compiled_tokenzier([text_input])
+hf_output = hf_tokenizer(text_input, return_tensors="np")
+ov_output = compiled_tokenzier(text_input)
for output_name in hf_output:
print(f"OpenVINO {output_name} = {ov_output[output_name]}")
@@ -51,10 +94,20 @@ for output_name in hf_output:
# HuggingFace token_type_ids = [[0 0 0 0]]
# OpenVINO attention_mask = [[1 1 1 1]]
# HuggingFace attention_mask = [[1 1 1 1]]
+
+# save tokenizer for later use
+save_model(ov_tokenizer, "openvino_tokenizer.xml")
+
+loaded_tokenizer = compile_model("openvino_tokenizer.xml")
+loaded_ov_output = loaded_tokenizer(text_input)
+for output_name in hf_output:
+ assert np.all(loaded_ov_output[output_name] == ov_output[output_name])
```
### Connect Tokenizer to a Model
+To infer and convert the original model, install torch or torch-cpu in the virtual environment.
+
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from openvino import compile_model, convert_model
@@ -83,10 +136,12 @@ print(f"HuggingFace logits {hf_output.logits}")
### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer
-To work with converted tokenizer and detokenizer, numpy string tensors are used.
+Importing `openvino_tokenizers` adds all tokenizer-related operations to OpenVINO,
+after which you can work with saved tokenizers and detokenizers.
```python
import numpy as np
+import openvino_tokenizers
from openvino import Core
core = Core()
@@ -160,17 +215,27 @@ print(f"HuggingFace output string: `{hf_output}`")
# HuggingFace output string: `['Quick brown fox was walking through the forest. He was looking for something']`
```
-## Test Coverage
+## Supported Tokenizer Types
+
+| Huggingface Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
+|---------------------------------|----------------------|----------|------------|
+| Fast | WordPiece | ✅ | ❌ |
+| | BPE | ✅ | ✅ |
+| | Unigram | ❌ | ❌ |
+| Legacy | SentencePiece .model | ✅ | ✅ |
+| Custom | tiktoken | ✅ | ✅ |
+
+## Test Results
-This report is autogenerated and includes tokenizers and detokenizers tests. To update it run pytest with `--update_readme` flag.
+This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column shows the percent of test strings for which the results of OpenVINO and Huggingface Tokenizers are the same. To update the report run `pytest tokenizers_test.py --update_readme` in the `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.
-### Coverage by Tokenizer Type
+### Output Match by Tokenizer Type
Tokenizer Type |
- Pass Rate, % |
+ Output Matched, % |
Number of Tests |
@@ -198,14 +263,14 @@ This report is autogenerated and includes tokenizers and detokenizers tests. To
-### Coverage by Model Type
+### Output Match by Model
Tokenizer Type |
Model |
- Pass Rate, % |
+ Output Matched, % |
Number of Tests |
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
index 03365eae2..d55799712 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
@@ -64,7 +64,7 @@ def get_parser() -> ArgumentParser:
"Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
"tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
"See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
- "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
+ "custom_operations/user_ie_extensions/tokenizer/python#output-match-by-model to check the potential "
"difference between original and OpenVINO tokenizers"
),
)
@@ -96,6 +96,7 @@ def get_parser() -> ArgumentParser:
parser.add_argument(
"--streaming-detokenizer",
required=False,
+ action="store_true",
help=(
"[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
"Can be used to stream a model output without TextStreamer buffer"
@@ -105,8 +106,14 @@ def get_parser() -> ArgumentParser:
def convert_hf_tokenizer() -> None:
- from transformers import AutoTokenizer
-
+ try:
+ from transformers import AutoTokenizer
+ except (ImportError, ModuleNotFoundError):
+ raise EnvironmentError(
+ "No transformers library in the environment. Install required dependencies with one of two options:\n"
+ "1. pip install openvino-tokenizers[transformers]\n"
+ "2. pip install transformers[sentencepiece] tiktoken\n"
+ )
args = get_parser().parse_args()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
index 2cb0d7750..35a6f05bb 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
@@ -61,6 +61,12 @@ def convert_tokenizer(
with_detokenizer=with_detokenizer,
skip_special_tokens=skip_special_tokens,
)
+ else:
+ raise EnvironmentError(
+ "No transformers library in the environment. Install required dependencies with one of two options:\n"
+ "1. pip install openvino-tokenizers[transformers]\n"
+ "2. pip install transformers[sentencepiece] tiktoken\n"
+ )
if ov_tokenizers is None:
raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
index 47402ee29..fff64d451 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
@@ -42,30 +42,33 @@ def add_tokenizer_type(row):
results_df = results_df[["Tokenizer Type", "Model", "test_string", "status"]]
grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg({"status": ["mean", "count"]}).reset_index()
- grouped_by_model.columns = ["Tokenizer Type", "Model", "Pass Rate, %", "Number of Tests"]
- grouped_by_model["Pass Rate, %"] *= 100
+ grouped_by_model.columns = ["Tokenizer Type", "Model", "Output Matched, %", "Number of Tests"]
+ grouped_by_model["Output Matched, %"] *= 100
grouped_by_type = results_df.groupby(["Tokenizer Type"]).agg({"status": ["mean", "count"]}).reset_index()
- grouped_by_type.columns = ["Tokenizer Type", "Pass Rate, %", "Number of Tests"]
- grouped_by_type["Pass Rate, %"] *= 100
+ grouped_by_type.columns = ["Tokenizer Type", "Output Matched, %", "Number of Tests"]
+ grouped_by_type["Output Matched, %"] *= 100
readme_path = Path("../README.md")
with open(readme_path) as f:
- old_readme = f.read().split("## Test Coverage")[0]
+ old_readme = f.read().split("## Test Results")[0]
new_readme = StringIO()
new_readme.write(old_readme)
new_readme.write(
- "## Test Coverage\n\n"
+ "## Test Results\n\n"
"This report is autogenerated and includes tokenizers and detokenizers tests. "
- "To update it run pytest with `--update_readme` flag.\n\n"
- "### Coverage by Tokenizer Type\n\n"
+ "The `Output Matched, %` column shows the percent of test strings "
+ "for which the results of OpenVINO and Hugingface Tokenizers are the same. "
+ "To update the report run `pytest tokenizers_test.py --update_readme` in "
+ "`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
+ "### Output Match by Tokenizer Type\n\n"
)
is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
if is_pandas_2:
grouped_by_type.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
else:
grouped_by_type.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
- new_readme.write("\n### Coverage by Model Type\n\n")
+ new_readme.write("\n### Output Match by Model\n\n")
if is_pandas_2:
grouped_by_model.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
else: