[Tokenizer] Add CLI Tokenizer Converter (#792)

* Add CLI Tokenizer Converter * Fix space * Add more flags to CLI tool
openvinotoolkit · Dec 20, 2023 · 9c6cce9 · 9c6cce9
1 parent e89a1d9
commit 9c6cce9
Show file tree

Hide file tree

Showing 7 changed files with 145 additions and 8 deletions.
diff --git a/modules/custom_operations/pyproject.toml b/modules/custom_operations/pyproject.toml
@@ -39,6 +39,9 @@ all = [
     "openvino_tokenizers[dev,transformers,tiktoken]"
 ]
 
+[project.scripts]
+convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"
+
 [tool.ruff]
 ignore = ["C901", "E501", "E741", "W605"]
 select = ["C", "E", "F", "I", "W"]

diff --git a/...les/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py b/...les/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/__init__.py
@@ -45,11 +45,13 @@
 # patching openvino
 old_core_init = openvino.runtime.Core.__init__
 
+
 @functools.wraps(old_core_init)
 def new_core_init(self, *args, **kwargs):
+    """Call the original ``Core.__init__`` and then add the extension located at ``_ext_path``."""
     old_core_init(self, *args, **kwargs)
     self.add_extension(str(_ext_path))  # Core.add_extension doesn't support Path object
 
+
 openvino.runtime.Core.__init__ = new_core_init
 
 _factory = NodeFactory()

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import ArgumentParser, Action
+from pathlib import Path
+
+from openvino import save_model, Type
+
+from openvino_tokenizers import convert_tokenizer
+
+
+class StringToTypeAction(Action):
+    """Argparse action that stores an OpenVINO ``Type`` instead of its string name.
+
+    The command-line value (``"i32"`` or ``"i64"``) is looked up in
+    ``string_to_type_dict`` and the matching ``Type`` member is written to the
+    namespace attribute of the option this action is attached to.
+    """
+
+    # Supported command-line spellings and the OpenVINO element types they select.
+    string_to_type_dict = dict(i32=Type.i32, i64=Type.i64)
+
+    def __call__(self, parser, namespace, values, option_string=None) -> None:
+        # A value missing from the mapping raises KeyError; the options using this
+        # action restrict ``choices`` to the mapping keys.
+        ov_type = self.string_to_type_dict[values]
+        setattr(namespace, self.dest, ov_type)
+
+
+def get_parser() -> ArgumentParser:
+    """Build the argument parser of the ``convert_tokenizer`` command-line tool.
+
+    Returns:
+        An ``ArgumentParser`` with one positional argument (``name``) and optional
+        flags for the output directory, detokenizer generation, tensor element
+        types and the way the Huggingface tokenizer is loaded.
+    """
+    parser = ArgumentParser(
+        prog="convert_tokenizer",
+        description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model.",
+    )
+    parser.add_argument(
+        "name",
+        type=str,
+        help=(
+            "The model id of a tokenizer hosted inside a model repo on huggingface.co "
+            "or a path to a saved Huggingface tokenizer directory"
+        ),
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=Path(),
+        required=False,
+        help="Output directory",
+    )
+    parser.add_argument(
+        "--with-detokenizer",
+        required=False,
+        action="store_true",
+        help="Add a detokenizer model to the output",
+    )
+    # The dashed spelling matches the other flags; the original underscore spelling
+    # is kept as an alias. argparse derives the destination from the first long
+    # option, so the value is still stored as ``skip_special_tokens``.
+    parser.add_argument(
+        "--skip-special-tokens",
+        "--skip_special_tokens",
+        required=False,
+        action="store_true",
+        help=(
+            "Produce detokenizer that will skip special tokens during decoding, similar to "
+            "huggingface_tokenizer.decode(token_ids, skip_special_tokens=True)."
+        ),
+    )
+    # ``store_false`` means the namespace attribute ``use_fast_false`` is True by
+    # default and becomes False when the flag is given, i.e. it directly holds the
+    # value intended for ``use_fast``.
+    parser.add_argument(
+        "--use-fast-false",
+        required=False,
+        action="store_false",
+        help=(
+            "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
+            "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
+            "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
+            "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
+            "difference between original and OpenVINO tokenizers"
+        ),
+    )
+    parser.add_argument(
+        "--trust-remote-code",
+        required=False,
+        action="store_true",
+        help=(
+            "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will "
+            "execute code present on the Hub on your local machine"
+        ),
+    )
+    parser.add_argument(
+        "--tokenizer-output-type",
+        required=False,
+        action=StringToTypeAction,
+        default=Type.i64,
+        choices=["i32", "i64"],
+        help="Type of the output tensors for tokenizer",
+    )
+    parser.add_argument(
+        "--detokenizer-input-type",
+        required=False,
+        action=StringToTypeAction,
+        default=Type.i64,
+        choices=["i32", "i64"],
+        help="Type of the input tensor for detokenizer",
+    )
+    # Must be a boolean switch: without ``store_true`` the option demands a value
+    # and yields a string (or None) rather than the bool that is passed on as
+    # ``streaming_detokenizer``.
+    parser.add_argument(
+        "--streaming-detokenizer",
+        required=False,
+        action="store_true",
+        help=(
+            "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
+            "Can be used to stream a model output without TextStreamer buffer"
+        ),
+    )
+    return parser
+
+
+def convert_hf_tokenizer() -> None:
+    """Entry point of the ``convert_tokenizer`` command-line tool.
+
+    Parses the command line, loads the requested Huggingface tokenizer, converts
+    it with ``convert_tokenizer`` and saves each resulting model as
+    ``openvino_tokenizer.xml`` / ``openvino_detokenizer.xml`` in the output
+    directory, printing the saved paths.
+    """
+    # Parse first so that ``--help`` and usage errors do not pay for (or depend
+    # on) importing transformers.
+    args = get_parser().parse_args()
+
+    # Imported inside the function so that importing this module does not itself
+    # import transformers.
+    from transformers import AutoTokenizer
+
+    print("Loading Huggingface Tokenizer...")
+    hf_tokenizer = AutoTokenizer.from_pretrained(
+        args.name,
+        # ``--use-fast-false`` is a store_false flag: the attribute is True unless
+        # the flag was given, so it is exactly the value wanted for ``use_fast``.
+        use_fast=args.use_fast_false,
+        trust_remote_code=args.trust_remote_code,
+    )
+
+    print("Converting Huggingface Tokenizer to OpenVINO...")
+    converted = convert_tokenizer(
+        hf_tokenizer,
+        with_detokenizer=args.with_detokenizer,
+        skip_special_tokens=args.skip_special_tokens,
+        tokenizer_output_type=args.tokenizer_output_type,
+        detokenizer_input_type=args.detokenizer_input_type,
+        streaming_detokenizer=args.streaming_detokenizer,
+    )
+    # Without a detokenizer a single model is returned; normalise to a tuple so
+    # both cases share the saving loop below.
+    if not isinstance(converted, tuple):
+        converted = (converted,)
+
+    # Make sure the target directory exists before writing the model files.
+    args.output.mkdir(parents=True, exist_ok=True)
+
+    for converted_model, name in zip(converted, ("tokenizer", "detokenizer")):
+        save_path = args.output / f"openvino_{name}.xml"
+        save_model(converted_model, save_path)
+        print(f"Saved OpenVINO {name.capitalize()}: {save_path}, {save_path.with_suffix('.bin')}")
diff --git a/...es/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py b/...es/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/constants.py
@@ -1,3 +1,7 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 ATTENTION_MASK_INPUT_NAME = "attention_mask"
 TOKEN_IDS_INPUT_NAME = "input_ids"
 TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids"

diff --git a/...m_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py b/...m_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py
@@ -17,17 +17,12 @@
 
 def convert_tokenizer(
     tokenizer_object: Any,
-    number_of_inputs: int = 1,
     with_detokenizer: bool = False,
     streaming_detokenizer: bool = False,
     skip_special_tokens: bool = False,
     tokenizer_output_type: Type = Type.i64,
     detokenizer_input_type: Type = Type.i64,
 ) -> Union[Model, Tuple[Model, Model]]:
-    # todo: add support for more then 1 input
-    if number_of_inputs > 1:
-        raise ValueError("Tokenizers with more then one input are not supported yet.")
-
     ov_tokenizers = None
 
     if "transformers" in sys.modules:
@@ -62,7 +57,7 @@ def convert_tokenizer(
                 logger.info("Convert Huggingface Fast tokenizer pipeline.")
                 ov_tokenizers = convert_fast_tokenizer(
                     tokenizer_object,
-                    number_of_inputs=number_of_inputs,
+                    number_of_inputs=1,
                     with_detokenizer=with_detokenizer,
                     skip_special_tokens=skip_special_tokens,
                 )

diff --git a/...es/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py b/...es/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/hf_parser.py
@@ -21,10 +21,10 @@
 from . import _factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
+    DETOKENIZER_NAME,
     STRING_OUTPUT_NAME,
     TOKEN_IDS_INPUT_NAME,
     TOKEN_TYPE_IDS_INPUT_NAME,
-    DETOKENIZER_NAME,
     TOKENIZER_NAME,
 )
 from .tokenizer_pipeline import (

diff --git a/..._operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py b/..._operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -17,10 +17,10 @@
 from . import _factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
+    DETOKENIZER_NAME,
     STRING_OUTPUT_NAME,
     TOKEN_IDS_INPUT_NAME,
     TOKEN_TYPE_IDS_INPUT_NAME,
-    DETOKENIZER_NAME,
     TOKENIZER_NAME,
 )
 from .str_pack import pack_string, pack_strings
@@ -807,6 +807,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
         return _factory.create("StringTensorPack", input_nodes).outputs()
 
     def get_detokenizer_ov_subgraph(self) -> Model:
+        if not any(isinstance(step, VocabDecoderStep) for step in self.decoding_steps):
+            raise NotImplementedError("Detokenizer is not supported for this model yet!")
+
         input_node = op.Parameter(Type.i32, PartialShape(["?", "?"]))
         token_ids = input_node
         outputs = self.create_decoding_pipeline([token_ids])