Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tokenizer] Add CLI Tokenizer Converter #792

Merged
merged 4 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions modules/custom_operations/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ all = [
"openvino_tokenizers[dev,transformers,tiktoken]"
]

[project.scripts]
convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"

[tool.ruff]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@
# Patch openvino so that constructing a Core also registers the extension
# library located at `_ext_path`.
old_core_init = openvino.runtime.Core.__init__


@functools.wraps(old_core_init)
def new_core_init(self, *args, **kwargs):
    # Run the original constructor first, forwarding every argument untouched.
    old_core_init(self, *args, **kwargs)
    # Core.add_extension doesn't support Path object
    extension_location = str(_ext_path)
    self.add_extension(extension_location)


openvino.runtime.Core.__init__ = new_core_init

_factory = NodeFactory()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from argparse import ArgumentParser, Action
from pathlib import Path

from openvino import save_model, Type

from openvino_tokenizers import convert_tokenizer


class StringToTypeAction(Action):
    """argparse action that stores an OpenVINO ``Type`` instead of its string name."""

    # Accepted command-line spellings and the element type each one selects.
    string_to_type_dict = dict(i32=Type.i32, i64=Type.i64)

    def __call__(self, parser, namespace, values, option_string=None) -> None:
        """Look ``values`` up in ``string_to_type_dict`` and store the result.

        The looked-up type is written to ``namespace`` under ``self.dest``.
        A ``KeyError`` is raised when ``values`` is not a key of the mapping.
        """
        element_type = self.string_to_type_dict[values]
        setattr(namespace, self.dest, element_type)


def get_parser() -> ArgumentParser:
    """Build the argument parser for the ``convert_tokenizer`` command.

    Returns:
        An ``ArgumentParser`` whose parsed namespace has the attributes
        ``name``, ``output``, ``with_detokenizer``, ``skip_special_tokens``,
        ``use_fast_false``, ``trust_remote_code``, ``tokenizer_output_type``,
        ``detokenizer_input_type`` and ``streaming_detokenizer``.
    """
    parser = ArgumentParser(
        prog="convert_tokenizer",
        description="Converts tokenizers from Huggingface Hub to OpenVINO Tokenizer model.",
    )
    parser.add_argument(
        "name",
        type=str,
        help=(
            "The model id of a tokenizer hosted inside a model repo on huggingface.co "
            "or a path to a saved Huggingface tokenizer directory"
        ),
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=Path(),
        required=False,
        help="Output directory",
    )
    parser.add_argument(
        "--with-detokenizer",
        required=False,
        action="store_true",
        help="Add a detokenizer model to the output",
    )
    # The dashed spelling matches every other flag; the original underscore
    # spelling is kept as an alias. Both store into `skip_special_tokens`.
    parser.add_argument(
        "--skip-special-tokens",
        "--skip_special_tokens",
        required=False,
        action="store_true",
        help=(
            "Produce detokenizer that will skip special tokens during decoding, similar to "
            "huggingface_tokenizer.decode(token_ids, skip_special_tokens=True)."
        ),
    )
    # NOTE: with `store_false` the destination `use_fast_false` is True by
    # default and becomes False when the flag is given, i.e. it already holds
    # the value to pass as `use_fast`.
    parser.add_argument(
        "--use-fast-false",
        required=False,
        action="store_false",
        help=(
            "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
            "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
            "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
            "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
            "difference between original and OpenVINO tokenizers"
        ),
    )
    parser.add_argument(
        "--trust-remote-code",
        required=False,
        action="store_true",
        help=(
            "Pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`. It will "
            "execute code present on the Hub on your local machine"
        ),
    )
    parser.add_argument(
        "--tokenizer-output-type",
        required=False,
        action=StringToTypeAction,
        default=Type.i64,
        choices=["i32", "i64"],
        help="Type of the output tensors for tokenizer",
    )
    parser.add_argument(
        "--detokenizer-input-type",
        required=False,
        action=StringToTypeAction,
        default=Type.i64,
        choices=["i32", "i64"],
        help="Type of the input tensor for detokenizer",
    )
    # A boolean switch: without `store_true` argparse would demand a value and
    # hand back a string (or None when the flag is absent).
    parser.add_argument(
        "--streaming-detokenizer",
        required=False,
        action="store_true",
        help=(
            "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. "
            "Can be used to stream a model output without TextStreamer buffer"
        ),
    )
    return parser


def convert_hf_tokenizer() -> None:
    """Entry point of the ``convert_tokenizer`` command-line script.

    Parses the command line, loads the Huggingface tokenizer named by the
    ``name`` argument, converts it with ``convert_tokenizer`` and saves each
    resulting model as ``openvino_tokenizer.xml`` / ``openvino_detokenizer.xml``
    inside the output directory.
    """
    # Imported lazily so that `transformers` is only required when the CLI runs.
    from transformers import AutoTokenizer

    args = get_parser().parse_args()

    print("Loading Huggingface Tokenizer...")
    hf_tokenizer = AutoTokenizer.from_pretrained(
        args.name,
        trust_remote_code=args.trust_remote_code,
        # `--use-fast-false` is a `store_false` flag: the attribute is True by
        # default and False when the flag is given, so it is the `use_fast` value.
        use_fast=args.use_fast_false,
    )

    print("Converting Huggingface Tokenizer to OpenVINO...")
    converted = convert_tokenizer(
        hf_tokenizer,
        with_detokenizer=args.with_detokenizer,
        skip_special_tokens=args.skip_special_tokens,
        tokenizer_output_type=args.tokenizer_output_type,
        detokenizer_input_type=args.detokenizer_input_type,
        # Coerce to bool so the converter never receives None or a raw string.
        streaming_detokenizer=bool(args.streaming_detokenizer),
    )
    # Without a detokenizer a single model is returned; normalize to a tuple.
    if not isinstance(converted, tuple):
        converted = (converted,)

    # Make sure the target directory exists before writing the model files.
    args.output.mkdir(parents=True, exist_ok=True)

    for converted_model, name in zip(converted, ("tokenizer", "detokenizer")):
        save_path = args.output / f"openvino_{name}.xml"
        save_model(converted_model, save_path)
        print(f"Saved OpenVINO {name.capitalize()}: {save_path}, {save_path.with_suffix('.bin')}")
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

ATTENTION_MASK_INPUT_NAME = "attention_mask"
TOKEN_IDS_INPUT_NAME = "input_ids"
TOKEN_TYPE_IDS_INPUT_NAME = "token_type_ids"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,12 @@

def convert_tokenizer(
tokenizer_object: Any,
number_of_inputs: int = 1,
with_detokenizer: bool = False,
streaming_detokenizer: bool = False,
skip_special_tokens: bool = False,
tokenizer_output_type: Type = Type.i64,
detokenizer_input_type: Type = Type.i64,
) -> Union[Model, Tuple[Model, Model]]:
# todo: add support for more then 1 input
if number_of_inputs > 1:
raise ValueError("Tokenizers with more then one input are not supported yet.")

ov_tokenizers = None

if "transformers" in sys.modules:
Expand Down Expand Up @@ -62,7 +57,7 @@ def convert_tokenizer(
logger.info("Convert Huggingface Fast tokenizer pipeline.")
ov_tokenizers = convert_fast_tokenizer(
tokenizer_object,
number_of_inputs=number_of_inputs,
number_of_inputs=1,
with_detokenizer=with_detokenizer,
skip_special_tokens=skip_special_tokens,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
from . import _factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
DETOKENIZER_NAME,
TOKENIZER_NAME,
)
from .tokenizer_pipeline import (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from . import _factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
DETOKENIZER_NAME,
TOKENIZER_NAME,
)
from .str_pack import pack_string, pack_strings
Expand Down Expand Up @@ -807,6 +807,9 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
return _factory.create("StringTensorPack", input_nodes).outputs()

def get_detokenizer_ov_subgraph(self) -> Model:
if not any(isinstance(step, VocabDecoderStep) for step in self.decoding_steps):
raise NotImplementedError("Detokenizer is not supported for this model yet!")

input_node = op.Parameter(Type.i32, PartialShape(["?", "?"]))
token_ids = input_node
outputs = self.create_decoding_pipeline([token_ids])
Expand Down