From 3d589596feeb7bf9235464514734cc648638eb06 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Mon, 18 Dec 2023 20:28:10 +0000
Subject: [PATCH 1/3] Add CLI Tokenizer Converter

---
 modules/custom_operations/pyproject.toml      |  3 +
 .../python/openvino_tokenizers/__init__.py    |  2 +
 .../python/openvino_tokenizers/cli.py         | 78 +++++++++++++++++++
 .../python/openvino_tokenizers/constants.py   |  4 +
 .../openvino_tokenizers/convert_tokenizer.py  |  7 +-
 .../python/openvino_tokenizers/hf_parser.py   |  2 +-
 .../openvino_tokenizers/tokenizer_pipeline.py |  5 +-
 7 files changed, 93 insertions(+), 8 deletions(-)
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py

diff --git a/modules/custom_operations/pyproject.toml b/modules/custom_operations/pyproject.toml
index 0eeba0a25..1120ba5bb 100644
--- a/modules/custom_operations/pyproject.toml
+++ b/modules/custom_operations/pyproject.toml
@@ -39,6 +39,9 @@ all = [
     "openvino_tokenizers[dev,transformers,tiktoken]"
 ]
 
+[project.scripts]
+convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"
+
 [tool.ruff]
 ignore = ["C901", "E501", "E741", "W605"]
 select = ["C", "E", "F", "I", "W"]
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py
index c2adec7ce..91585619f 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py
@@ -45,11 +45,13 @@
 # patching openvino
 old_core_init = openvino.runtime.Core.__init__
 
+
 @functools.wraps(old_core_init)
 def new_core_init(self, *args, **kwargs):
     old_core_init(self, *args, **kwargs)
     self.add_extension(str(_ext_path))  # Core.add_extension doesn't support Path object
 
+
 openvino.runtime.Core.__init__ = new_core_init
 
 _factory = NodeFactory()
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
new file mode 100644
index 000000000..cf69ac859
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import ArgumentParser
+from pathlib import Path
+
+from openvino import save_model
+
+from openvino_tokenizers import convert_tokenizer
+
+
+def get_parser() -> ArgumentParser:
+    parser = ArgumentParser(
+        prog="convert_tokenizer", description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model."
+    )
+    parser.add_argument(
+        "name",
+        type=str,
+        help=(
+            "The model id of a tokenizer hosted inside a model repo on huggingface.co "
+            "or a path to a saved Huggingface tokenizer directory"
+        ),
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=Path(),
+        required=False,
+        help="Output directory",
+    )
+    parser.add_argument(
+        "--with-detokenizer",
+        required=False,
+        action="store_true",
+        help="Add a detokenizer model to the output",
+    )
+    parser.add_argument(
+        "--trust-remote-code",
+        required=False,
+        action="store_true",
+        help=(
+            "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will"
+            "execute code present on the Hub on your local machine."
+        ),
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        required=False,
+        action="store_true",
+    )
+    parser.add_argument(
+        "-s",
+        "--streaming-detokenizer",
+        required=False,
+        help=(
+            "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
+            "Can be used to stream a model output without TextStreamer buffer."
+        ),
+    )
+    return parser
+
+
+def convert_hf_tokenizer() -> None:
+    from transformers import AutoTokenizer
+
+    args = get_parser().parse_args()
+    hf_tokenizer = AutoTokenizer.from_pretrained(args.name, trust_remote_code=args.trust_remote_code)
+    converted = convert_tokenizer(
+        hf_tokenizer, with_detokenizer=args.with_detokenizer, streaming_detokenizer=args.streaming_detokenizer
+    )
+    if not isinstance(converted, tuple):
+        converted = (converted,)
+
+    for converted_model, name in zip(converted, ("tokenizer", "detokenizer")):
+        save_model(converted_model, args.output / f"{name}.xml")
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
index 64720f970..208eaafd8 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
@@ -1,3 +1,7 @@
+#!/usr/bin/env python3
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 ATTENTION_MASK_INPUT_NAME = "attention_mask"
 TOKEN_IDS_INPUT_NAME = "input_ids"
 TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids"
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
index 5c8cf256a..2cb0d7750 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
@@ -17,17 +17,12 @@
 def convert_tokenizer(
     tokenizer_object: Any,
-    number_of_inputs: int = 1,
     with_detokenizer: bool = False,
     streaming_detokenizer: bool = False,
     skip_special_tokens: bool = False,
     tokenizer_output_type: Type = Type.i64,
     detokenizer_input_type: Type = Type.i64,
 ) -> Union[Model, Tuple[Model, Model]]:
-    # todo: add support for more then 1 input
-    if number_of_inputs > 1:
-        raise ValueError("Tokenizers with more then one input are not supported yet.")
-
     ov_tokenizers = None
 
     if "transformers" in sys.modules:
@@ -62,7 +57,7 @@ def convert_tokenizer(
             logger.info("Convert Huggingface Fast tokenizer pipeline.")
             ov_tokenizers = convert_fast_tokenizer(
                 tokenizer_object,
-                number_of_inputs=number_of_inputs,
+                number_of_inputs=1,
                 with_detokenizer=with_detokenizer,
                 skip_special_tokens=skip_special_tokens,
             )
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py
index 0193b8a55..665a5a3d8 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py
@@ -21,10 +21,10 @@
 from . import _factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
+    DETOKENIZER_NAME,
     STRING_OUTPUT_NAME,
     TOKEN_IDS_INPUT_NAME,
     TOKEN_TYPE_IDS_INPUT_NAME,
-    DETOKENIZER_NAME,
     TOKENIZER_NAME,
 )
 from .tokenizer_pipeline import (
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py
index abb4e63c3..507426d7c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -17,10 +17,10 @@
 from . import _factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
+    DETOKENIZER_NAME,
     STRING_OUTPUT_NAME,
     TOKEN_IDS_INPUT_NAME,
     TOKEN_TYPE_IDS_INPUT_NAME,
-    DETOKENIZER_NAME,
     TOKENIZER_NAME,
 )
 from .str_pack import pack_string, pack_strings
@@ -807,6 +807,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
         return _factory.create("StringTensorPack", input_nodes).outputs()
 
     def get_detokenizer_ov_subgraph(self) -> Model:
+        if not any(isinstance(step, VocabDecoderStep) for step in self.decoding_steps):
+            raise NotImplementedError("Detokenizer is not supported for this model yet!")
+
         input_node = op.Parameter(Type.i32, PartialShape(["?", "?"]))
         token_ids = input_node
         outputs = self.create_decoding_pipeline([token_ids])

From 7cd708e9c414c1f66c6e7c6e283228e9b6fb8658 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Mon, 18 Dec 2023 20:35:59 +0000
Subject: [PATCH 2/3] Fix space

---
 .../tokenizer/python/openvino_tokenizers/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
index cf69ac859..d0497965f 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
@@ -41,7 +41,7 @@ def get_parser() -> ArgumentParser:
         required=False,
         action="store_true",
         help=(
-            "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will"
+            "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will "
             "execute code present on the Hub on your local machine."
         ),
     )

From 5d3a2943ee0deb44703f9330bcb5bac422b44ee1 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 20 Dec 2023 18:45:41 +0000
Subject: [PATCH 3/3] Add more flags to CLI tool

---
 .../python/openvino_tokenizers/cli.py         | 74 ++++++++++++++++---
 .../python/openvino_tokenizers/constants.py   |  2 +-
 2 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
index d0497965f..03365eae2 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
@@ -2,17 +2,28 @@
 # Copyright (C) 2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Action
 from pathlib import Path
 
-from openvino import save_model
+from openvino import save_model, Type
 
 from openvino_tokenizers import convert_tokenizer
 
 
+class StringToTypeAction(Action):
+    string_to_type_dict = {
+        "i32": Type.i32,
+        "i64": Type.i64,
+    }
+
+    def __call__(self, parser, namespace, values, option_string=None) -> None:
+        setattr(namespace, self.dest, self.string_to_type_dict[values])
+
+
 def get_parser() -> ArgumentParser:
     parser = ArgumentParser(
-        prog="convert_tokenizer", description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model."
+        prog="convert_tokenizer",
+        description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model.",
     )
     parser.add_argument(
         "name",
         type=str,
@@ -36,28 +47,58 @@ def get_parser() -> ArgumentParser:
         required=False,
         action="store_true",
         help="Add a detokenizer model to the output",
     )
+    parser.add_argument(
+        "--skip_special_tokens",
+        required=False,
+        action="store_true",
+        help=(
+            "Produce detokenizer that will skip special tokens during decoding, similar to "
+            "huggingface_tokenizer.decode(token_ids, skip_special_tokens=True)."
+        ),
+    )
+    parser.add_argument(
+        "--use-fast-false",
+        required=False,
+        action="store_false",
+        help=(
+            "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
+            "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
+            "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
+            "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
+            "difference between original and OpenVINO tokenizers"
+        ),
+    )
     parser.add_argument(
         "--trust-remote-code",
         required=False,
         action="store_true",
         help=(
             "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will "
-            "execute code present on the Hub on your local machine."
+            "execute code present on the Hub on your local machine"
         ),
     )
     parser.add_argument(
-        "-v",
-        "--verbose",
+        "--tokenizer-output-type",
         required=False,
-        action="store_true",
+        action=StringToTypeAction,
+        default=Type.i64,
+        choices=["i32", "i64"],
+        help="Type of the output tensors for tokenizer",
+    )
+    parser.add_argument(
+        "--detokenizer-input-type",
+        required=False,
+        action=StringToTypeAction,
+        default=Type.i64,
+        choices=["i32", "i64"],
+        help="Type of the input tensor for detokenizer",
     )
     parser.add_argument(
-        "-s",
         "--streaming-detokenizer",
         required=False,
         help=(
             "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
-            "Can be used to stream a model output without TextStreamer buffer."
+            "Can be used to stream a model output without TextStreamer buffer"
         ),
     )
     return parser
@@ -66,13 +107,24 @@ def get_parser() -> ArgumentParser:
 def convert_hf_tokenizer() -> None:
     from transformers import AutoTokenizer
 
+
     args = get_parser().parse_args()
+
+    print("Loading Huggingface Tokenizer...")
     hf_tokenizer = AutoTokenizer.from_pretrained(args.name, trust_remote_code=args.trust_remote_code)
+
+    print("Converting Huggingface Tokenizer to OpenVINO...")
     converted = convert_tokenizer(
-        hf_tokenizer, with_detokenizer=args.with_detokenizer, streaming_detokenizer=args.streaming_detokenizer
+        hf_tokenizer,
+        with_detokenizer=args.with_detokenizer,
+        tokenizer_output_type=args.tokenizer_output_type,
+        detokenizer_input_type=args.detokenizer_input_type,
+        streaming_detokenizer=args.streaming_detokenizer,
     )
     if not isinstance(converted, tuple):
         converted = (converted,)
 
     for converted_model, name in zip(converted, ("tokenizer", "detokenizer")):
-        save_model(converted_model, args.output / f"{name}.xml")
+        save_path = args.output / f"openvino_{name}.xml"
+        save_model(converted_model, save_path)
+        print(f"Saved OpenVINO {name.capitalize()}: {save_path}, {save_path.with_suffix('.bin')}")
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
index 208eaafd8..9fc27e8ba 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 # Copyright (C) 2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0