Skip to content

Commit

Permalink
[Tokenizer] Add CLI Tokenizer Converter (#792)
Browse files Browse the repository at this point in the history
* Add CLI Tokenizer Converter

* Fix space

* Add more flags to CLI tool
  • Loading branch information
apaniukov authored Dec 20, 2023
1 parent e89a1d9 commit 9c6cce9
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 8 deletions.
3 changes: 3 additions & 0 deletions modules/custom_operations/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ all = [
"openvino_tokenizers[dev,transformers,tiktoken]"
]

[project.scripts]
convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"

[tool.ruff]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@
# patching openvino
old_core_init = openvino.runtime.Core.__init__


# Replacement for Core.__init__: run the original initializer, then register
# the extension library on the freshly constructed Core instance.
# (A docstring is deliberately not used: functools.wraps copies __doc__ from
# the wrapped initializer and would overwrite it.)
@functools.wraps(old_core_init)
def new_core_init(self, *args, **kwargs):
    old_core_init(self, *args, **kwargs)
    # Core.add_extension doesn't support Path object
    extension_location = str(_ext_path)
    self.add_extension(extension_location)


openvino.runtime.Core.__init__ = new_core_init

_factory = NodeFactory()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from argparse import ArgumentParser, Action
from pathlib import Path

from openvino import save_model, Type

from openvino_tokenizers import convert_tokenizer


class StringToTypeAction(Action):
    """argparse action that stores an OpenVINO ``Type`` instead of its string name.

    The command-line value is looked up in ``string_to_type_dict`` and the
    mapped ``Type`` is written to the namespace under the argument's ``dest``.
    """

    # Command-line spelling -> OpenVINO element type.
    string_to_type_dict = dict(i32=Type.i32, i64=Type.i64)

    def __call__(self, parser, namespace, values, option_string=None) -> None:
        # Raises KeyError for a string that is not a key of the mapping.
        element_type = self.string_to_type_dict[values]
        setattr(namespace, self.dest, element_type)


def get_parser() -> ArgumentParser:
    """Build the argument parser of the ``convert_tokenizer`` command-line tool.

    Returns:
        An ``ArgumentParser`` with one positional argument (``name``) and
        optional flags for the output directory, detokenizer creation,
        tokenizer loading options and tensor element types.
    """
    parser = ArgumentParser(
        prog="convert_tokenizer",
        description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model.",
    )
    parser.add_argument(
        "name",
        type=str,
        help=(
            "The model id of a tokenizer hosted inside a model repo on huggingface.co "
            "or a path to a saved Huggingface tokenizer directory"
        ),
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=Path(),
        required=False,
        help="Output directory",
    )
    parser.add_argument(
        "--with-detokenizer",
        required=False,
        action="store_true",
        help="Add a detokenizer model to the output",
    )
    parser.add_argument(
        # The hyphenated spelling matches every other flag of this tool; the
        # original underscore spelling stays as an alias for compatibility.
        # Both map to the same destination: ``skip_special_tokens``.
        "--skip-special-tokens",
        "--skip_special_tokens",
        required=False,
        action="store_true",
        help=(
            "Produce detokenizer that will skip special tokens during decoding, similar to "
            "huggingface_tokenizer.decode(token_ids, skip_special_tokens=True)."
        ),
    )
    parser.add_argument(
        # ``store_false``: the parsed attribute ``use_fast_false`` is True by
        # default and becomes False when the flag is given, i.e. it already
        # holds the value meant for ``use_fast``.
        "--use-fast-false",
        required=False,
        action="store_false",
        help=(
            "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
            "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
            "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
            "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
            "difference between original and OpenVINO tokenizers"
        ),
    )
    parser.add_argument(
        "--trust-remote-code",
        required=False,
        action="store_true",
        help=(
            "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will "
            "execute code present on the Hub on your local machine"
        ),
    )
    parser.add_argument(
        "--tokenizer-output-type",
        required=False,
        action=StringToTypeAction,
        default=Type.i64,
        choices=["i32", "i64"],
        help="Type of the output tensors for tokenizer",
    )
    parser.add_argument(
        "--detokenizer-input-type",
        required=False,
        action=StringToTypeAction,
        default=Type.i64,
        choices=["i32", "i64"],
        help="Type of the input tensor for detokenizer",
    )
    parser.add_argument(
        "--streaming-detokenizer",
        required=False,
        # Boolean switch: without an explicit action argparse would demand a
        # value and store it as a string (or None when the flag is absent).
        action="store_true",
        help=(
            "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
            "Can be used to stream a model output without TextStreamer buffer"
        ),
    )
    return parser


def convert_hf_tokenizer() -> None:
    """Entry point of the ``convert_tokenizer`` command-line tool.

    Parses the command line, loads the Huggingface tokenizer named by the
    positional ``name`` argument, converts it with ``convert_tokenizer`` and
    saves each resulting model as ``openvino_<name>.xml`` in the output
    directory, printing the saved paths.
    """
    # Imported here so that importing this module does not import transformers.
    from transformers import AutoTokenizer

    args = get_parser().parse_args()

    print("Loading Huggingface Tokenizer...")
    hf_tokenizer = AutoTokenizer.from_pretrained(
        args.name,
        # `--use-fast-false` is a store_false flag, so the attribute is True
        # unless the flag was given; it is exactly the `use_fast` value.
        use_fast=args.use_fast_false,
        trust_remote_code=args.trust_remote_code,
    )

    print("Converting Huggingface Tokenizer to OpenVINO...")
    converted = convert_tokenizer(
        hf_tokenizer,
        with_detokenizer=args.with_detokenizer,
        # Previously parsed but never forwarded, so the flag had no effect.
        skip_special_tokens=args.skip_special_tokens,
        tokenizer_output_type=args.tokenizer_output_type,
        detokenizer_input_type=args.detokenizer_input_type,
        streaming_detokenizer=args.streaming_detokenizer,
    )
    # A single model is returned when no detokenizer was requested; normalize
    # to a tuple so the save loop handles both cases.
    if not isinstance(converted, tuple):
        converted = (converted,)

    for converted_model, name in zip(converted, ("tokenizer", "detokenizer")):
        save_path = args.output / f"openvino_{name}.xml"
        save_model(converted_model, save_path)
        print(f"Saved OpenVINO {name.capitalize()}: {save_path}, {save_path.with_suffix('.bin')}")
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

ATTENTION_MASK_INPUT_NAME = "attention_mask"
TOKEN_IDS_INPUT_NAME = "input_ids"
TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,12 @@

def convert_tokenizer(
tokenizer_object: Any,
number_of_inputs: int = 1,
with_detokenizer: bool = False,
streaming_detokenizer: bool = False,
skip_special_tokens: bool = False,
tokenizer_output_type: Type = Type.i64,
detokenizer_input_type: Type = Type.i64,
) -> Union[Model, Tuple[Model, Model]]:
# todo: add support for more then 1 input
if number_of_inputs > 1:
raise ValueError("Tokenizers with more then one input are not supported yet.")

ov_tokenizers = None

if "transformers" in sys.modules:
Expand Down Expand Up @@ -62,7 +57,7 @@ def convert_tokenizer(
logger.info("Convert Huggingface Fast tokenizer pipeline.")
ov_tokenizers = convert_fast_tokenizer(
tokenizer_object,
number_of_inputs=number_of_inputs,
number_of_inputs=1,
with_detokenizer=with_detokenizer,
skip_special_tokens=skip_special_tokens,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
from . import _factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
DETOKENIZER_NAME,
TOKENIZER_NAME,
)
from .tokenizer_pipeline import (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from . import _factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
DETOKENIZER_NAME,
TOKENIZER_NAME,
)
from .str_pack import pack_string, pack_strings
Expand Down Expand Up @@ -807,6 +807,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
return _factory.create("StringTensorPack", input_nodes).outputs()

def get_detokenizer_ov_subgraph(self) -> Model:
if not any(isinstance(step, VocabDecoderStep) for step in self.decoding_steps):
raise NotImplementedError("Detokenizer is not supported for this model yet!")

input_node = op.Parameter(Type.i32, PartialShape(["?", "?"]))
token_ids = input_node
outputs = self.create_decoding_pipeline([token_ids])
Expand Down

0 comments on commit 9c6cce9

Please sign in to comment.