Commit

Merge branch 'openvinotoolkit:master' into add-cli-tool
apaniukov authored Dec 20, 2023
2 parents 37beeab + 80f9bc1 commit 7cab10a
Showing 15 changed files with 420 additions and 352 deletions.

Large diffs are not rendered by default.

@@ -19,6 +19,7 @@ def convert_tokenizer(
tokenizer_object: Any,
with_detokenizer: bool = False,
streaming_detokenizer: bool = False,
skip_special_tokens: bool = False,
tokenizer_output_type: Type = Type.i64,
detokenizer_input_type: Type = Type.i64,
) -> Union[Model, Tuple[Model, Model]]:
@@ -43,19 +44,22 @@ def convert_tokenizer(
add_attention_mask=True,
with_detokenizer=with_detokenizer,
streaming_detokenizer=streaming_detokenizer,
skip_special_tokens=skip_special_tokens,
)
elif is_tiktoken_model(tokenizer_object):
logger.info("Convert tiktoken-based tokenizer")
ov_tokenizers = convert_tiktoken_model_tokenizer(
tokenizer_object,
with_detokenizer=with_detokenizer,
skip_special_tokens=skip_special_tokens,
)
elif isinstance(tokenizer_object, PreTrainedTokenizerFast):
logger.info("Convert Huggingface Fast tokenizer pipeline.")
ov_tokenizers = convert_fast_tokenizer(
tokenizer_object,
number_of_inputs=1,
with_detokenizer=with_detokenizer,
skip_special_tokens=skip_special_tokens,
)

if ov_tokenizers is None:
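For context, a minimal usage sketch of the extended convert_tokenizer signature shown above; the import path, model id, and variable names are illustrative assumptions, not taken from this diff:

from transformers import AutoTokenizer
from openvino_tokenizers import convert_tokenizer  # assumed import path

hf_tokenizer = AutoTokenizer.from_pretrained("<model-id>")  # placeholder model id
ov_tokenizer, ov_detokenizer = convert_tokenizer(
    hf_tokenizer,
    with_detokenizer=True,
    skip_special_tokens=True,  # new flag: drop special tokens during detokenization
)
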
@@ -5,6 +5,7 @@
import json
import tempfile
from copy import deepcopy
from functools import partial
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -124,15 +125,15 @@ def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None:
self.number_of_inputs = number_of_inputs
self.num_of_added_tokens = 0

def parse(self, number_of_inputs: Optional[int] = None) -> TokenizerPipeline:
def parse(self, number_of_inputs: Optional[int] = None, skip_special_tokens: bool = False) -> TokenizerPipeline:
self.number_of_inputs = self.number_of_inputs if number_of_inputs is None else number_of_inputs
self.pipeline.number_of_inputs = self.number_of_inputs
for add_steps in [
self.normalization,
self.pre_tokenization,
self.tokenization_model,
self.post_tokenization,
self.decoding,
partial(self.decoding, skip_special_tokens=skip_special_tokens),
]:
add_steps()

@@ -261,25 +262,48 @@ def add_padding(self) -> None:
else:
self.pipeline.add_steps(PaddingStep())

def decoding(self) -> None:
def decoding(self, skip_special_tokens: bool = False) -> None:
if self.tokenizer_json["decoder"] is None:
return

skip_tokens = parse_special_tokens(self.original_tokenizer) if skip_special_tokens else {}
if self.tokenizer_json["decoder"]["type"] == "ByteLevel":
self.pipeline.add_steps(VocabDecoderStep())
self.pipeline.add_steps(VocabDecoderStep(list(skip_tokens)))
self.pipeline.add_steps(CharsToBytesStep())

if suffix := self.tokenizer_json["model"].get("end_of_word_suffix"):
self.pipeline.add_steps(RegexDecodingStep.replace_end_of_word_suffix(suffix))

if prefix := self.tokenizer_json["model"].get("continuing_subword_prefix"):
self.pipeline.add_steps(RegexDecodingStep.replace_continuing_subword_prefix(prefix))

if self.original_tokenizer.clean_up_tokenization_spaces and self.pipeline.decoding_steps:
self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces())
return



def parse_special_tokens(hf_tokenizer: "PreTrainedTokenizerBase") -> Dict[int, str]:
# the order matters
if getattr(hf_tokenizer, "added_tokens_decoder", False):
return {idx: added_token.content for idx, added_token in hf_tokenizer.added_tokens_decoder.items() if added_token.special}
elif getattr(hf_tokenizer, "tokenizer", False) and getattr(hf_tokenizer.tokenizer, "index_special_tokens", False):
return hf_tokenizer.tokenizer.index_special_tokens
elif getattr(hf_tokenizer, "special_tokens", False):
return {idx: token for token, idx in sorted(hf_tokenizer.special_tokens.items(), key=lambda x: x[1])}

return {}


def convert_fast_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
number_of_inputs: int = 1,
with_detokenizer: bool = False,
skip_special_tokens: bool = False,
) -> Union[Model, Tuple[Model, Model]]:
pipeline = TransformersTokenizerPipelineParser(hf_tokenizer).parse(number_of_inputs=number_of_inputs)
pipeline = TransformersTokenizerPipelineParser(hf_tokenizer).parse(
number_of_inputs=number_of_inputs, skip_special_tokens=skip_special_tokens
)
ov_tokenizer = pipeline.get_tokenizer_ov_subgraph()
output_names = hf_tokenizer.model_input_names

@@ -312,17 +336,36 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model")


def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None:
def modify_sentencepiece_model(
sp_model_path: Path, add_tokens: Dict[int, str], skip_special_tokens: bool = False, reference_vocab: Optional[List[str]] = None
) -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
with open(sp_model_path, "rb") as model_file:
model.ParseFromString(model_file.read())

add_token_dict = hf_tokenizer.tokenizer.index_special_tokens
for idx, token in sorted(add_token_dict.items()):
new_piece = deepcopy(model.pieces[-1])
new_piece.piece = token
model.pieces.append(new_piece)
existing = {piece.piece: piece for piece in model.pieces}
for idx, token in sorted(add_tokens.items()):
if to_add := ((idx >= len(model.pieces) or model.pieces[idx].piece != token)):
if exists := existing.get(token):
new_piece = model.pieces.pop(next(idx for idx, piece in enumerate(model.pieces) if piece == exists))
else:
new_piece = deepcopy(model.pieces[-1])
new_piece.piece = token
else:
new_piece = model.pieces[idx]

if skip_special_tokens and new_piece.type != 2: # type 2 is for unk symbol
new_piece.type = 3 # make it control symbol so it will not decode during detokenization
elif not skip_special_tokens and new_piece.type == 3:
new_piece.type = 4 # change control type to userdef type

if to_add:
model.pieces.insert(idx, new_piece)

# change unk token representation from ⁇ to token string
unk_token = next(piece for piece in model.pieces if piece.type == 2)
model.trainer_spec.unk_surface = unk_token.piece

with open(sp_model_path, "wb") as model_file:
model_file.write(model.SerializeToString())
@@ -333,6 +376,7 @@ def convert_sentencepiece_model_tokenizer(
add_attention_mask: bool = True,
with_detokenizer: bool = False,
streaming_detokenizer: bool = False,
skip_special_tokens: bool = False,
) -> Union[Model, Tuple[Model, Model]]:
if not is_sentencepiece_model(hf_tokenizer):
raise OVTypeError("Cannot convert tokenizer that does not have `.model` file.")
@@ -343,8 +387,13 @@ def convert_sentencepiece_model_tokenizer(
hf_tokenizer.save_pretrained(tmp)
vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"]

if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer":
add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer)
add_tokens = parse_special_tokens(hf_tokenizer)
modify_sentencepiece_model(
sp_model_path=vocab_file,
add_tokens=add_tokens,
skip_special_tokens=skip_special_tokens,
# reference_vocab=[token for token, idx in sorted(hf_tokenizer.vocab.items(), key=lambda x: x[1])],
)

sp_model = np.fromfile(vocab_file, dtype=np.uint8)
sp_model_node = as_node(sp_model)
@@ -353,9 +402,10 @@ def convert_sentencepiece_model_tokenizer(
hf_slow_tokenizer = hf_tokenizer.slow_tokenizer_class.from_pretrained(tmp)
fairseq_offset = getattr(hf_slow_tokenizer, "fairseq_offset", None)

input_node = op.Parameter(Type.u8, PartialShape(["?"]))
input_node = op.Parameter(Type.string, PartialShape(["?"]))
input_node.set_friendly_name("string_input")

is_chatglm = getattr(hf_tokenizer, "name", None) == "GLMTokenizer"
if is_chatglm:
add_eos_token = False
elif hasattr(hf_tokenizer, "add_eos_token"):
@@ -380,8 +430,8 @@ def convert_sentencepiece_model_tokenizer(

indices, values, dense_shape = tokenizer_node.outputs()

if fairseq_offset:
values = opset.add(values, make_constant_node(fairseq_offset, values.element_type)).output(0)
# if fairseq_offset:
# values = opset.add(values, make_constant_node(fairseq_offset, values.element_type)).output(0)

default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type)
broadcast = opset.broadcast(default_value, dense_shape)
@@ -432,7 +482,7 @@ def convert_sentencepiece_model_tokenizer(


def get_sp_detokenizer(sp_model_node: Node, streaming_detokenizer: bool = False) -> Model:
token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence)
model_input = token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence)

detokenizer = _factory.create(
"SentencepieceStreamDetokenizer" if streaming_detokenizer else "SentencepieceDetokenizer",
@@ -444,7 +494,7 @@ def get_sp_detokenizer(sp_model_node: Node, streaming_detokenizer: bool = False)

string_output = _factory.create("StringTensorPack", detokenizer).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_detokenizer = Model(string_output, [token_ids], DETOKENIZER_NAME)
tokenizer_detokenizer = Model(string_output, [model_input], DETOKENIZER_NAME)
tokenizer_detokenizer.validate_nodes_and_infer_types()
return tokenizer_detokenizer

@@ -463,11 +513,16 @@ def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def convert_tiktoken_model_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
with_detokenizer: bool = False,
skip_special_tokens: bool = False,
) -> Union[Model, Tuple[Model, Model]]:
encoding = getattr(hf_tokenizer, "tokenizer", None) or hf_tokenizer.encoder
split_pattern = encoding._pat_str

pipeline = TokenizerPipeline()
skip_tokens = []
if skip_special_tokens:
skip_tokens = list(parse_special_tokens(hf_tokenizer))

pipeline.add_steps(
[
NormalizeUnicode("NFC"),
@@ -478,7 +533,7 @@ def convert_tiktoken_model_tokenizer(
max_length=hf_tokenizer.model_max_length, truncate_right=(hf_tokenizer.truncation_side == "right")
),
PaddingStep(pad_right=(hf_tokenizer.padding_side == "right")),
VocabDecoderStep(),
VocabDecoderStep(skip_tokens),
CharsToBytesStep(),
]
)
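As background for the piece-type handling in modify_sentencepiece_model above, a small hedged sketch that inspects the type field of a SentencePiece model; it assumes the sentencepiece package ships its protobuf bindings (recent releases do) and uses a placeholder model path rather than anything from this diff:

from sentencepiece import sentencepiece_model_pb2 as sp_pb2  # assumes bundled protobuf bindings

model = sp_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # placeholder path to a SentencePiece model file
    model.ParseFromString(f.read())

# Piece types referenced in the diff: 1 = NORMAL, 2 = UNKNOWN (unk symbol),
# 3 = CONTROL (not emitted by decode), 4 = USER_DEFINED (kept by decode).
for piece in model.pieces[:5]:
    print(piece.piece, piece.type)
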
@@ -657,12 +657,18 @@ class DecodingStep(BasePipelineStep):

@dataclass
class VocabDecoderStep(DecodingStep):
skip_tokens: Optional[List[int]] = None

def __post_init__(self):
if self.skip_tokens is None:
self.skip_tokens = self.get_pipeline().skip_tokens or {}

def get_vocab_node_outputs(self) -> Optional[List[Output]]:
return self.get_pipeline().vocab_node_outputs

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(self.get_vocab_node_outputs())
return _factory.create("VocabDecoder", input_nodes, {}).outputs()
return _factory.create("VocabDecoder", input_nodes, {"skip_tokens": self.skip_tokens}).outputs()


@dataclass
@@ -683,6 +689,20 @@ def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep":
replace_term=r"\1",
)

@classmethod
def replace_end_of_word_suffix(cls, suffix: str = "</w>") -> "RegexDecodingStep":
return cls(
regex_search_pattern=suffix,
replace_term=" ",
)

@classmethod
def replace_continuing_subword_prefix(cls, prefix: str = "##") -> "RegexDecodingStep":
return cls(
regex_search_pattern=prefix,
replace_term="",
)

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(
(
@@ -704,6 +724,7 @@ def replace_sp_spaces(cls) -> "RegexDecodingStep":
class TokenizerPipeline:
steps: List[BasePipelineStep] = field(default_factory=list)
vocab: Optional[List[str]] = field(default=None, repr=False)
skip_tokens: Optional[List[int]] = field(default=None, repr=False)
number_of_inputs: int = 1
vocab_node_outputs: Optional[List[Output]] = field(default=None, repr=False)

@@ -729,7 +750,7 @@ def __getitem__(self, item: int) -> BasePipelineStep:
return self.steps[item]

def get_tokenizer_ov_subgraph(self) -> Model:
string_inputs = [op.Parameter(Type.u8, PartialShape(["?"])) for _ in range(self.number_of_inputs)]
string_inputs = [op.Parameter(Type.string, PartialShape(["?"])) for _ in range(self.number_of_inputs)]

processing_outputs = []
for input_node in string_inputs:
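To illustrate the two new RegexDecodingStep helpers at the string level, here are plain-Python re equivalents of the replacements; the token strings are made up for illustration and this is not the pipeline's actual OpenVINO subgraph:

import re

# replace_end_of_word_suffix("</w>"): the BPE end-of-word marker becomes a space
print(re.sub(re.escape("</w>"), " ", "hello</w>world</w>"))  # -> "hello world "

# replace_continuing_subword_prefix("##"): the WordPiece continuation marker is dropped
print(re.sub(re.escape("##"), "", "straw##berry"))  # -> "strawberry"
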
@@ -3,6 +3,7 @@
from io import StringIO
from math import isclose
from pathlib import Path
from importlib.metadata import version

import pytest

@@ -19,31 +20,31 @@ def build_coverege_report(session: pytest.Session) -> None:
from pytest_harvest import get_session_results_df

def add_tokenizer_type(row):
if not pd.isnull(row["wordpiece_tokenizers_param"]):
if not pd.isnull(row["hf_wordpiece_tokenizers_param"]):
return "WordPiece"
if not pd.isnull(row["bpe_tokenizers_param"]):
if not pd.isnull(row["hf_bpe_tokenizers_param"]):
return "BPE"
if not pd.isnull(row["sentencepice_tokenizers_param"]):
if not pd.isnull(row["hf_sentencepiece_tokenizers_param"]):
return "SentencePiece"
if not pd.isnull(row["tiktoken_tokenizers_param"]):
if not pd.isnull(row["hf_tiktoken_tokenizers_param"]):
return "Tiktoken"

results_df = get_session_results_df(session)
results_df["Tokenizer Type"] = results_df.apply(add_tokenizer_type, axis=1)
results_df.wordpiece_tokenizers_param.fillna(results_df.bpe_tokenizers_param, inplace=True)
results_df.wordpiece_tokenizers_param.fillna(results_df.sentencepice_tokenizers_param, inplace=True)
results_df.wordpiece_tokenizers_param.fillna(results_df.tiktoken_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_bpe_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_sentencepiece_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_tiktoken_tokenizers_param, inplace=True)
results_df.is_fast_tokenizer_param.fillna(True, inplace=True)
results_df.status = (results_df.status == "passed").astype(int)
results_df["Model"] = results_df.wordpiece_tokenizers_param + results_df.is_fast_tokenizer_param.apply(
results_df["Model"] = results_df.hf_wordpiece_tokenizers_param + results_df.is_fast_tokenizer_param.apply(
lambda x: "" if x else "_slow"
)

results_df = results_df[["Tokenizer Type", "Model", "test_string", "status"]]
grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg(["mean", "count"]).reset_index()
grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg({"status": ["mean", "count"]}).reset_index()
grouped_by_model.columns = ["Tokenizer Type", "Model", "Pass Rate, %", "Number of Tests"]
grouped_by_model["Pass Rate, %"] *= 100
grouped_by_type = results_df.groupby(["Tokenizer Type"]).agg(["mean", "count"]).reset_index()
grouped_by_type = results_df.groupby(["Tokenizer Type"]).agg({"status": ["mean", "count"]}).reset_index()
grouped_by_type.columns = ["Tokenizer Type", "Pass Rate, %", "Number of Tests"]
grouped_by_type["Pass Rate, %"] *= 100

@@ -59,10 +60,16 @@ def add_tokenizer_type(row):
"To update it run pytest with `--update_readme` flag.\n\n"
"### Coverage by Tokenizer Type\n\n"
)
grouped_by_type.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
if is_pandas_2:
grouped_by_type.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
else:
grouped_by_type.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
new_readme.write("\n### Coverage by Model Type\n\n")
grouped_by_model.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)

if is_pandas_2:
grouped_by_model.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
else:
grouped_by_model.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
with open(readme_path, "w") as f:
f.write(new_readme.getvalue())

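The conftest change above branches on the installed pandas version because Styler.hide_index() was removed in pandas 2.0 in favour of Styler.hide(axis="index"). A small hedged sketch of the same compatibility check; the DataFrame contents are illustrative only:

from importlib.metadata import version

import pandas as pd

# Illustrative values, not real coverage numbers
df = pd.DataFrame({"Tokenizer Type": ["BPE"], "Pass Rate, %": [89.55], "Number of Tests": [1000]})
styler = df.style.format(precision=2)

if tuple(map(int, version("pandas").split("."))) >= (2, 0, 0):
    html = styler.hide(axis="index").to_html(exclude_styles=True)  # pandas >= 2.0 API
else:
    html = styler.hide_index().to_html(exclude_styles=True)  # pandas < 2.0 API
print(html)
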
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.7941872254139912
"tokenizers_test.py::test_": 0.8955393718707328
}