From e666b7477ff0dc623113ddb33be418bb98ed4376 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Thu, 4 Jan 2024 19:46:22 +0000
Subject: [PATCH] Add New Test String

---
 .../tokenizer/python/README.md                | 196 +++++++++---------
 .../tokenizer/python/tests/pass_rates.json    |   2 +-
 .../tokenizer/python/tests/tokenizers_test.py |   1 +
 3 files changed, 100 insertions(+), 99 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index 40c55964e..1ef42c3d0 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -244,23 +244,23 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The

Summary table (old → new):

| Tokenizer Type | Pass Rate, % (old → new) | Number of Tests (old → new) |
|----------------|--------------------------|-----------------------------|
| BPE            | 95.76 → 95.82            | 3325 → 3420                 |
| SentencePiece  | 86.14 → 86.28            | 2800 → 2880                 |
| Tiktoken       | 97.62 → 97.69            | 210 → 216                   |
| WordPiece      | 86.79 → 82.12            | 507 → 520                   |

@@ -280,302 +280,302 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The

Per-model table (old → new; a single value means unchanged):

| Tokenizer Type | Model | Pass Rate, % (old → new) | Number of Tests (old → new) |
|----------------|-------|--------------------------|-----------------------------|
| BPE | EleutherAI/gpt-j-6b | 98.29 → 98.33 | 175 → 180 |
| BPE | EleutherAI/gpt-neo-125m | 98.29 → 98.33 | 175 → 180 |
| BPE | EleutherAI/gpt-neox-20b | 97.71 → 97.78 | 175 → 180 |
| BPE | EleutherAI/pythia-12b-deduped | 97.71 → 97.78 | 175 → 180 |
| BPE | KoboldAI/fairseq-dense-13B | 98.86 → 98.89 | 175 → 180 |
| BPE | Salesforce/codegen-16B-multi | 97.14 → 97.22 | 175 → 180 |
| BPE | ai-forever/rugpt3large_based_on_gpt2 | 97.71 → 97.78 | 175 → 180 |
| BPE | bigscience/bloom | 99.43 → 99.44 | 175 → 180 |
| BPE | databricks/dolly-v2-3b | 97.71 → 97.78 | 175 → 180 |
| BPE | facebook/bart-large-mnli | 97.14 → 97.22 | 175 → 180 |
| BPE | facebook/galactica-120b | 98.29 → 98.33 | 175 → 180 |
| BPE | facebook/opt-66b | 98.86 → 98.89 | 175 → 180 |
| BPE | gpt2 | 97.14 → 97.22 | 175 → 180 |
| BPE | laion/CLIP-ViT-bigG-14-laion2B-39B-b160k | 61.14 → 61.11 | 175 → 180 |
| BPE | microsoft/deberta-base | 96.00 → 96.11 | 175 → 180 |
| BPE | roberta-base | 96.00 → 96.11 | 175 → 180 |
| BPE | sentence-transformers/all-roberta-large-v1 | 96.00 → 96.11 | 175 → 180 |
| BPE | stabilityai/stablecode-completion-alpha-3b-4k | 98.29 → 98.33 | 175 → 180 |
| BPE | stabilityai/stablelm-tuned-alpha-7b | 97.71 → 97.78 | 175 → 180 |
| SentencePiece | NousResearch/Llama-2-13b-hf | 100.00 | 175 → 180 |
| SentencePiece | NousResearch/Llama-2-13b-hf_slow | 100.00 | 175 → 180 |
| SentencePiece | THUDM/chatglm2-6b | 100.00 | 175 → 180 |
| SentencePiece | THUDM/chatglm2-6b_slow | 100.00 | 175 → 180 |
| SentencePiece | THUDM/chatglm3-6b | 100.00 | 175 → 180 |
| SentencePiece | THUDM/chatglm3-6b_slow | 100.00 | 175 → 180 |
| SentencePiece | camembert-base | 0.00 | 175 → 180 |
| SentencePiece | camembert-base_slow | 74.29 → 75.00 | 175 → 180 |
| SentencePiece | codellama/CodeLlama-7b-hf | 100.00 | 175 → 180 |
| SentencePiece | codellama/CodeLlama-7b-hf_slow | 100.00 | 175 → 180 |
| SentencePiece | microsoft/deberta-v3-base | 93.14 → 93.33 | 175 → 180 |
| SentencePiece | microsoft/deberta-v3-base_slow | 100.00 | 175 → 180 |
| SentencePiece | xlm-roberta-base | 98.86 → 98.89 | 175 → 180 |
| SentencePiece | xlm-roberta-base_slow | 98.86 → 98.89 | 175 → 180 |
| SentencePiece | xlnet-base-cased | 60.57 → 61.11 | 175 → 180 |
| SentencePiece | xlnet-base-cased_slow | 52.57 → 53.33 | 175 → 180 |
| Tiktoken | Qwen/Qwen-14B-Chat | 98.10 → 98.15 | 105 → 108 |
| Tiktoken | Salesforce/xgen-7b-8k-base | 97.14 → 97.22 | 105 → 108 |
| WordPiece | ProsusAI/finbert | 84.62 → 80.00 | 39 → 40 |
| WordPiece | bert-base-multilingual-cased | 84.62 → 80.00 | 39 → 40 |
| WordPiece | bert-large-cased | 84.62 → 80.00 | 39 → 40 |
| WordPiece | cointegrated/rubert-tiny2 | 84.62 → 80.00 | 39 → 40 |
| WordPiece | distilbert-base-uncased-finetuned-sst-2-english | 84.62 → 80.00 | 39 → 40 |
| WordPiece | google/electra-base-discriminator | 84.62 → 80.00 | 39 → 40 |
| WordPiece | google/mobilebert-uncased | 100.00 → 95.00 | 39 → 40 |
| WordPiece | jhgan/ko-sbert-sts | 79.49 → 75.00 | 39 → 40 |
| WordPiece | prajjwal1/bert-mini | 100.00 → 95.00 | 39 → 40 |
| WordPiece | rajiv003/ernie-finetuned-qqp | 100.00 → 95.00 | 39 → 40 |
| WordPiece | rasa/LaBSE | 76.92 → 72.50 | 39 → 40 |
| WordPiece | sentence-transformers/all-MiniLM-L6-v2 | 79.49 → 75.00 | 39 → 40 |
| WordPiece | squeezebert/squeezebert-uncased | 84.62 → 80.00 | 39 → 40 |

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
index 1567f78c3..5ff8bad4c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tokenizers_test.py::test_": 0.9121858562244302
+    "tokenizers_test.py::test_": 0.9096334185848253
 }
\ No newline at end of file

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
index a9bae43fc..90bf2230c 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
@@ -61,6 +61,7 @@ def unpack_strings(strings):
 ]
 misc_strings = [
     "",
+    b"\x06".decode(),  # control char
     " ",
     " " * 10,
     "\n",
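For reference, the new `misc_strings` entry added in `tokenizers_test.py` is just the ACK control character decoded from a single byte. Below is a minimal standalone sketch in plain Python (not part of the test harness; the list entries are copied from the hunk above, and the assertions are illustrative only) showing what the new entry evaluates to and why it is a useful non-printable edge case:

```python
# Sketch only: reproduces the misc_strings entries from the diff above
# to show what the new control-character input looks like.

misc_strings = [
    "",
    b"\x06".decode(),  # control char: bytes.decode() defaults to UTF-8, yielding "\x06" (ACK, U+0006)
    " ",
    " " * 10,
    "\n",
]

assert misc_strings[1] == "\x06"          # a single one-character string
assert not misc_strings[1].isprintable()  # non-printable input, the edge case the new test string targets
```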