From c6a4cfc5ae751581e8ed7829087b4cfeeff5db51 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Thu, 21 Dec 2023 23:49:07 +0000
Subject: [PATCH] [Tokenizers] Update README.md (#799)

* Fix --streaming-detokenizer flag

* Rewrite README.md

* Rewrite README.md

* Rewrite README.md

---
 .../tokenizer/python/README.md                | 109 ++++++++++++++----
 .../python/openvino_tokenizers/cli.py         |  13 ++-
 .../openvino_tokenizers/convert_tokenizer.py  |   6 +
 .../tokenizer/python/tests/conftest.py        |  21 ++--
 4 files changed, 115 insertions(+), 34 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index 925c049cc..760a6de6c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -1,46 +1,89 @@
 # OpenVINO Tokenizers
 
+OpenVINO Tokenizers adds text processing operations to OpenVINO.
+
 ## Features
 
-- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer:
-  - Fast tokenizers based on Wordpiece and BPE models
-  - Slow tokenizers based on SentencePiece model file
+- Perform tokenization and detokenization without third-party dependencies
+- Convert a HuggingFace tokenizer into OpenVINO tokenizer and detokenizer models
 - Combine OpenVINO models into a single model
 - Add greedy decoding pipeline to text generation model
 
 ## Installation
 
-1. Install [OpenVINO Runtime for C++](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#for-c-developers).
-2. (Recommended) Create and activate virtual env:
+(Recommended) Create and activate a virtual env:
 ```bash
 python3 -m venv venv
 source venv/bin/activate
+ # or
+conda create --name openvino_tokenizer
+conda activate openvino_tokenizer
 ```
-3. Go to `modules/custom_operations` and run:
+
+### Minimal Installation
+
+Use the minimal installation when you have a converted OpenVINO tokenizer:
+```bash
+pip install openvino-tokenizers
+ # or
+conda install -c conda-forge openvino openvino-tokenizers
+```
+
+### Convert Tokenizers Installation
+
+If you want to convert HuggingFace tokenizers into OpenVINO tokenizers:
+```bash
+pip install openvino-tokenizers[transformers]
+ # or
+conda install -c conda-forge openvino openvino-tokenizers && pip install transformers[sentencepiece] tiktoken
+```
+
+### Build and install from source after [OpenVINO installation](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html)
+```bash
+source path/to/installed/openvino/setupvars.sh
+git clone https://github.com/openvinotoolkit/openvino_contrib.git
+cd openvino_contrib/modules/custom_operations/
+pip install -e .[transformers]
+```
+
+### Build and install for development
 ```bash
-# to use converted tokenizers or models combined with tokenizers
-pip install .
-# to convert tokenizers from transformers library
-pip install .[transformers]
-# for development and testing the library
+source path/to/installed/openvino/setupvars.sh
+git clone https://github.com/openvinotoolkit/openvino_contrib.git
+cd openvino_contrib/modules/custom_operations/
 pip install -e .[all]
+# verify installation by running tests
+cd user_ie_extensions/tokenizer/python/tests/
+pytest .
 ```
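+
+Regardless of the installation option, a quick way to check the result is to import the
+package (a minimal smoke test; it only confirms that the extension loads, not that
+conversion works):
+
+```python
+# importing the package registers the custom tokenizer operations with OpenVINO
+import openvino_tokenizers
+from openvino_tokenizers import convert_tokenizer  # conversion entry point
+
+print("openvino_tokenizers is ready to use")
+```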
 
+## Usage
+
 ### Convert HuggingFace tokenizer
 
+OpenVINO Tokenizers ships with a CLI tool that can convert tokenizers from the HuggingFace Hub
+or HuggingFace tokenizers saved on disk:
+
+```shell
+convert_tokenizer codellama/CodeLlama-7b-hf --with-detokenizer -o output_dir
+```
+
+There is also a `convert_tokenizer` function that converts a tokenizer Python object.
+
 ```python
+import numpy as np
 from transformers import AutoTokenizer
-from openvino import compile_model
+from openvino import compile_model, save_model
 from openvino_tokenizers import convert_tokenizer
 
 hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 ov_tokenizer = convert_tokenizer(hf_tokenizer)
 
 compiled_tokenzier = compile_model(ov_tokenizer)
-text_input = "Test string"
+text_input = ["Test string"]
 
-hf_output = hf_tokenizer([text_input], return_tensors="np")
-ov_output = compiled_tokenzier([text_input])
+hf_output = hf_tokenizer(text_input, return_tensors="np")
+ov_output = compiled_tokenzier(text_input)
 
 for output_name in hf_output:
     print(f"OpenVINO {output_name} = {ov_output[output_name]}")
@@ -51,10 +94,20 @@ for output_name in hf_output:
 # HuggingFace token_type_ids = [[0 0 0 0]]
 # OpenVINO attention_mask = [[1 1 1 1]]
 # HuggingFace attention_mask = [[1 1 1 1]]
+
+# save tokenizer for later use
+save_model(ov_tokenizer, "openvino_tokenizer.xml")
+
+loaded_tokenizer = compile_model("openvino_tokenizer.xml")
+loaded_ov_output = loaded_tokenizer(text_input)
+for output_name in hf_output:
+    assert np.all(loaded_ov_output[output_name] == ov_output[output_name])
 ```
 
 ### Connect Tokenizer to a Model
 
+To infer and convert the original model, install torch or torch-cpu into the virtual environment.
+
 ```python
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from openvino import compile_model, convert_model
@@ -83,10 +136,12 @@ print(f"HuggingFace logits {hf_output.logits}")
 
 ### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer
 
-To work with converted tokenizer and detokenizer, numpy string tensors are used.
+Importing `openvino_tokenizers` adds all tokenizer-related operations to OpenVINO,
+after which you can work with saved tokenizers and detokenizers.
 
 ```python
 import numpy as np
+import openvino_tokenizers
 from openvino import Core
 
 core = Core()
@@ -160,17 +215,27 @@ print(f"HuggingFace output string: `{hf_output}`")
 # HuggingFace output string: `['Quick brown fox was walking through the forest. He was looking for something']`
 ```
 
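+A tokenizer and detokenizer can also be converted as a pair and used for a round trip.
+This sketch assumes `convert_tokenizer` returns a `(tokenizer, detokenizer)` pair when
+called with `with_detokenizer=True` (mirroring the CLI's `--with-detokenizer` flag) and
+that the detokenizer's single string output can be indexed positionally:
+
+```python
+from transformers import AutoTokenizer
+from openvino import compile_model
+from openvino_tokenizers import convert_tokenizer
+
+hf_tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
+ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
+
+tokenize = compile_model(ov_tokenizer)
+detokenize = compile_model(ov_detokenizer)
+
+# tokenize a batch of one string, then decode it back
+token_ids = tokenize(["Round-trip this string"])["input_ids"]
+print(detokenize(token_ids)[0])
+```
+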
-## Test Coverage
+## Supported Tokenizer Types
+
+| Huggingface<br/>Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
+|--------------------------------|----------------------|-----------|-------------|
+| Fast                           | WordPiece            | ✅        | ❌          |
+|                                | BPE                  | ✅        | ✅          |
+|                                | Unigram              | ❌        | ❌          |
+| Legacy                         | SentencePiece .model | ✅        | ✅          |
+| Custom                         | tiktoken             | ✅        | ✅          |
+
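+For the `Legacy` row, conversion goes through the original SentencePiece `.model` file. A
+minimal sketch of forcing this slow-tokenizer path with `use_fast=False` (the CLI help in
+this patch describes the same option; the model name is just an example):
+
+```python
+from transformers import AutoTokenizer
+from openvino_tokenizers import convert_tokenizer
+
+# use_fast=False loads the legacy, SentencePiece-based HuggingFace tokenizer,
+# which may produce slightly different tokens than the fast version
+slow_tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", use_fast=False)
+ov_tokenizer, ov_detokenizer = convert_tokenizer(slow_tokenizer, with_detokenizer=True)
+```
+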
Tokenizer TypePass Rate, %Output Matched, % Number of Tests
 
-### Coverage by Model Type
+### Output Match by Model
 
 <table>
   <thead>
     <tr>
       <th >Tokenizer Type</th>
       <th >Model</th>
-      <th >Pass Rate, %</th>
+      <th >Output Matched, %</th>
       <th >Number of Tests</th>

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
index 03365eae2..d55799712 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
@@ -64,7 +64,7 @@ def get_parser() -> ArgumentParser:
             "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
             "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
             "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
-            "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
+            "custom_operations/user_ie_extensions/tokenizer/python#output-match-by-model to check the potential "
             "difference between original and OpenVINO tokenizers"
         ),
     )
@@ -96,6 +96,7 @@ def get_parser() -> ArgumentParser:
     parser.add_argument(
         "--streaming-detokenizer",
         required=False,
+        action="store_true",
         help=(
             "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
             "Can be used to stream a model output without TextStreamer buffer"
@@ -105,8 +106,14 @@ def get_parser() -> ArgumentParser:
 
 
 def convert_hf_tokenizer() -> None:
-    from transformers import AutoTokenizer
-
+    try:
+        from transformers import AutoTokenizer
+    except (ImportError, ModuleNotFoundError):
+        raise EnvironmentError(
+            "No transformers library in the environment. Install required dependencies with one of two options:\n"
+            "1. pip install openvino-tokenizers[transformers]\n"
+            "2. pip install transformers[sentencepiece] tiktoken\n"
+        )
 
     args = get_parser().parse_args()
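The new `action="store_true"` line is what makes `--streaming-detokenizer` work as a
boolean switch. A standalone sketch of the argparse behavior being fixed (not project
code; the flag name is reused only for illustration):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
# without action="store_true", argparse would expect a value after the flag;
# with it, the flag's mere presence sets the attribute to True
parser.add_argument("--streaming-detokenizer", required=False, action="store_true")

assert parser.parse_args(["--streaming-detokenizer"]).streaming_detokenizer is True
assert parser.parse_args([]).streaming_detokenizer is False  # False when omitted
```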
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
index 2cb0d7750..35a6f05bb 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
@@ -61,6 +61,12 @@ def convert_tokenizer(
             with_detokenizer=with_detokenizer,
             skip_special_tokens=skip_special_tokens,
         )
+    else:
+        raise EnvironmentError(
+            "No transformers library in the environment. Install required dependencies with one of two options:\n"
+            "1. pip install openvino-tokenizers[transformers]\n"
+            "2. pip install transformers[sentencepiece] tiktoken\n"
+        )
 
     if ov_tokenizers is None:
         raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
index 47402ee29..fff64d451 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py
@@ -42,30 +42,33 @@ def add_tokenizer_type(row):
     results_df = results_df[["Tokenizer Type", "Model", "test_string", "status"]]
 
     grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg({"status": ["mean", "count"]}).reset_index()
-    grouped_by_model.columns = ["Tokenizer Type", "Model", "Pass Rate, %", "Number of Tests"]
-    grouped_by_model["Pass Rate, %"] *= 100
+    grouped_by_model.columns = ["Tokenizer Type", "Model", "Output Matched, %", "Number of Tests"]
+    grouped_by_model["Output Matched, %"] *= 100
     grouped_by_type = results_df.groupby(["Tokenizer Type"]).agg({"status": ["mean", "count"]}).reset_index()
-    grouped_by_type.columns = ["Tokenizer Type", "Pass Rate, %", "Number of Tests"]
-    grouped_by_type["Pass Rate, %"] *= 100
+    grouped_by_type.columns = ["Tokenizer Type", "Output Matched, %", "Number of Tests"]
+    grouped_by_type["Output Matched, %"] *= 100
 
     readme_path = Path("../README.md")
     with open(readme_path) as f:
-        old_readme = f.read().split("## Test Coverage")[0]
+        old_readme = f.read().split("## Test Results")[0]
 
     new_readme = StringIO()
     new_readme.write(old_readme)
     new_readme.write(
-        "## Test Coverage\n\n"
+        "## Test Results\n\n"
         "This report is autogenerated and includes tokenizers and detokenizers tests. "
-        "To update it run pytest with `--update_readme` flag.\n\n"
-        "### Coverage by Tokenizer Type\n\n"
+        "The `Output Matched, %` column shows the percentage of test strings "
+        "for which the results of OpenVINO and HuggingFace Tokenizers are the same. "
+        "To update the report, run `pytest tokenizers_test.py --update_readme` in "
+        "the `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
+        "### Output Match by Tokenizer Type\n\n"
     )
     is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
     if is_pandas_2:
         grouped_by_type.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
     else:
         grouped_by_type.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
 
-    new_readme.write("\n### Coverage by Model Type\n\n")
+    new_readme.write("\n### Output Match by Model\n\n")
     if is_pandas_2:
         grouped_by_model.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
     else:
         grouped_by_model.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
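The pandas branching in this hunk exists because `Styler.hide_index()` was deprecated in
pandas 1.4 and removed in 2.0 in favor of `Styler.hide(axis="index")`. A standalone sketch
of the same version check, using a dummy one-row frame:

```python
import pandas as pd

# dummy report row, shaped like the grouped results above
df = pd.DataFrame(
    {"Tokenizer Type": ["BPE"], "Output Matched, %": [100.0], "Number of Tests": [10]}
)
styler = df.style.format(precision=2)

if tuple(map(int, pd.__version__.split(".")[:2])) >= (2, 0):
    html = styler.hide(axis="index").to_html(exclude_styles=True)  # pandas >= 2.0
else:
    html = styler.hide_index().to_html(exclude_styles=True)  # pandas < 2.0
print(html)  # bare <table> markup suitable for embedding in the README
```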
Tokenizer Type ModelPass Rate, %Output Matched, % Number of Tests