diff --git a/modules/custom_operations/pyproject.toml b/modules/custom_operations/pyproject.toml index 0eeba0a25..1120ba5bb 100644 --- a/modules/custom_operations/pyproject.toml +++ b/modules/custom_operations/pyproject.toml @@ -39,6 +39,9 @@ all = [ "openvino_tokenizers[dev,transformers,tiktoken]" ] +[project.scripts] +convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer" + [tool.ruff] ignore = ["C901", "E501", "E741", "W605"] select = ["C", "E", "F", "I", "W"] diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py index c2adec7ce..91585619f 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py @@ -45,11 +45,13 @@ # patching openvino old_core_init = openvino.runtime.Core.__init__ + @functools.wraps(old_core_init) def new_core_init(self, *args, **kwargs): old_core_init(self, *args, **kwargs) self.add_extension(str(_ext_path)) # Core.add_extension doesn't support Path object + openvino.runtime.Core.__init__ = new_core_init _factory = NodeFactory() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py new file mode 100644 index 000000000..03365eae2 --- /dev/null +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from argparse import ArgumentParser, Action +from pathlib import Path + +from openvino import save_model, Type + +from openvino_tokenizers import convert_tokenizer + + +class StringToTypeAction(Action): + string_to_type_dict = { + "i32": Type.i32, + 
"i64": Type.i64, + } + + def __call__(self, parser, namespace, values, option_string=None) -> None: + setattr(namespace, self.dest, self.string_to_type_dict[values]) + + +def get_parser() -> ArgumentParser: + parser = ArgumentParser( + prog="convert_tokenizer", + description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model.", + ) + parser.add_argument( + "name", + type=str, + help=( + "The model id of a tokenizer hosted inside a model repo on huggingface.co " + "or a path to a saved Huggingface tokenizer directory" + ), + ) + parser.add_argument( + "-o", + "--output", + type=Path, + default=Path(), + required=False, + help="Output directory", + ) + parser.add_argument( + "--with-detokenizer", + required=False, + action="store_true", + help="Add a detokenizer model to the output", + ) + parser.add_argument( + "--skip_special_tokens", + required=False, + action="store_true", + help=( + "Produce detokenizer that will skip special tokens during decoding, similar to " + "huggingface_tokenizer.decode(token_ids, skip_special_tokens=True)." + ), + ) + parser.add_argument( + "--use-fast-false", + required=False, + action="store_false", + help=( + "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace " + "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. " + "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/" + "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential " + "difference between original and OpenVINO tokenizers" + ), + ) + parser.add_argument( + "--trust-remote-code", + required=False, + action="store_true", + help=( + "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. 
It will " + "execute code present on the Hub on your local machine" + ), + ) + parser.add_argument( + "--tokenizer-output-type", + required=False, + action=StringToTypeAction, + default=Type.i64, + choices=["i32", "i64"], + help="Type of the output tensors for tokenizer", + ) + parser.add_argument( + "--detokenizer-input-type", + required=False, + action=StringToTypeAction, + default=Type.i64, + choices=["i32", "i64"], + help="Type of the input tensor for detokenizer", + ) + parser.add_argument( + "--streaming-detokenizer", + action="store_true", + help=( + "[Experimental] Modify SentencePiece based detokenizer to keep the leading space. " + "Can be used to stream a model output without TextStreamer buffer" + ), + ) + return parser + + + def convert_hf_tokenizer() -> None: + from transformers import AutoTokenizer + + + args = get_parser().parse_args() + + print("Loading Huggingface Tokenizer...") + hf_tokenizer = AutoTokenizer.from_pretrained(args.name, use_fast=args.use_fast_false, trust_remote_code=args.trust_remote_code) + + print("Converting Huggingface Tokenizer to OpenVINO...") + converted = convert_tokenizer( + hf_tokenizer, + with_detokenizer=args.with_detokenizer, skip_special_tokens=args.skip_special_tokens, + tokenizer_output_type=args.tokenizer_output_type, + detokenizer_input_type=args.detokenizer_input_type, + streaming_detokenizer=args.streaming_detokenizer, + ) + if not isinstance(converted, tuple): + converted = (converted,) + + for converted_model, name in zip(converted, ("tokenizer", "detokenizer")): + save_path = args.output / f"openvino_{name}.xml" + save_model(converted_model, save_path) + print(f"Saved OpenVINO {name.capitalize()}: {save_path}, {save_path.with_suffix('.bin')}") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py index 64720f970..9fc27e8ba 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py @@ -1,3 +1,7 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + ATTENTION_MASK_INPUT_NAME = "attention_mask" TOKEN_IDS_INPUT_NAME = "input_ids" TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids" diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py index 5c8cf256a..2cb0d7750 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py @@ -17,17 +17,12 @@ def convert_tokenizer( tokenizer_object: Any, - number_of_inputs: int = 1, with_detokenizer: bool = False, streaming_detokenizer: bool = False, skip_special_tokens: bool = False, tokenizer_output_type: Type = Type.i64, detokenizer_input_type: Type = Type.i64, ) -> Union[Model, Tuple[Model, Model]]: - # todo: add support for more then 1 input - if number_of_inputs > 1: - raise ValueError("Tokenizers with more then one input are not supported yet.") - ov_tokenizers = None if "transformers" in sys.modules: @@ -62,7 +57,7 @@ def convert_tokenizer( logger.info("Convert Huggingface Fast tokenizer pipeline.") ov_tokenizers = convert_fast_tokenizer( tokenizer_object, - number_of_inputs=number_of_inputs, + number_of_inputs=1, with_detokenizer=with_detokenizer, skip_special_tokens=skip_special_tokens, ) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py index 0193b8a55..665a5a3d8 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py +++ 
b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py @@ -21,10 +21,10 @@ from . import _factory from .constants import ( ATTENTION_MASK_INPUT_NAME, + DETOKENIZER_NAME, STRING_OUTPUT_NAME, TOKEN_IDS_INPUT_NAME, TOKEN_TYPE_IDS_INPUT_NAME, - DETOKENIZER_NAME, TOKENIZER_NAME, ) from .tokenizer_pipeline import ( diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py index abb4e63c3..507426d7c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py @@ -17,10 +17,10 @@ from . import _factory from .constants import ( ATTENTION_MASK_INPUT_NAME, + DETOKENIZER_NAME, STRING_OUTPUT_NAME, TOKEN_IDS_INPUT_NAME, TOKEN_TYPE_IDS_INPUT_NAME, - DETOKENIZER_NAME, TOKENIZER_NAME, ) from .str_pack import pack_string, pack_strings @@ -807,6 +807,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]: return _factory.create("StringTensorPack", input_nodes).outputs() def get_detokenizer_ov_subgraph(self) -> Model: + if not any(isinstance(step, VocabDecoderStep) for step in self.decoding_steps): + raise NotImplementedError("Detokenizer is not supported for this model yet!") + input_node = op.Parameter(Type.i32, PartialShape(["?", "?"])) token_ids = input_node outputs = self.create_decoding_pipeline([token_ids])