llama : remove notion of CLS token #11064

Open · wants to merge 1 commit into master
2 changes: 0 additions & 2 deletions gguf-py/gguf/constants.py

@@ -183,7 +183,6 @@ class Tokenizer:
         UNK_ID = "tokenizer.ggml.unknown_token_id"
         SEP_ID = "tokenizer.ggml.seperator_token_id"
         PAD_ID = "tokenizer.ggml.padding_token_id"
-        CLS_ID = "tokenizer.ggml.cls_token_id"
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
@@ -1782,7 +1781,6 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
-KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

3 changes: 0 additions & 3 deletions gguf-py/gguf/gguf_writer.py

@@ -854,9 +854,6 @@ def add_sep_token_id(self, id: int) -> None:
     def add_pad_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.PAD_ID, id)
 
-    def add_cls_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.CLS_ID, id)
-
     def add_mask_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.MASK_ID, id)
 
5 changes: 4 additions & 1 deletion include/llama.h

@@ -924,11 +924,14 @@ extern "C" {
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
     LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
-    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
+    // CLS is equivalent to BOS
+    DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_model * model), // classification
+            "use llama_token_bos instead");
+
     LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
     LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
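For callers, the practical effect of the header change above is that `llama_token_cls()` survives only as a deprecated alias for `llama_token_bos()`. A minimal migration sketch, assuming only the public declarations shown in this hunk:

```c
#include "llama.h"

// Where embedding code previously asked for the CLS token explicitly,
// it can now request BOS; for WPM/BERT vocabs this returns the same id
// that used to be reported as CLS.
static llama_token sequence_start_token(const struct llama_model * model) {
    // old: return llama_token_cls(model);   // now deprecated
    return llama_token_bos(model);
}
```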
9 changes: 1 addition & 8 deletions src/llama-model.cpp

@@ -1134,7 +1134,6 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         vocab.special_unk_id = LLAMA_TOKEN_NULL;
         vocab.special_sep_id = LLAMA_TOKEN_NULL;
         vocab.special_pad_id = LLAMA_TOKEN_NULL;
-        vocab.special_cls_id = LLAMA_TOKEN_NULL;
         vocab.special_mask_id = LLAMA_TOKEN_NULL;
         vocab.linefeed_id = LLAMA_TOKEN_NULL;
 
@@ -1155,18 +1154,16 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         vocab.special_unk_id = 0;
         vocab.special_sep_id = LLAMA_TOKEN_NULL;
         vocab.special_pad_id = LLAMA_TOKEN_NULL;
-        vocab.special_cls_id = LLAMA_TOKEN_NULL;
         vocab.special_mask_id = LLAMA_TOKEN_NULL;
     } else if (tokenizer_model == "bert") {
         vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
         // default special tokens
-        vocab.special_bos_id = LLAMA_TOKEN_NULL;
+        vocab.special_bos_id = 101;
         vocab.special_eos_id = LLAMA_TOKEN_NULL;
         vocab.special_unk_id = 100;
         vocab.special_sep_id = 102;
         vocab.special_pad_id = 0;
-        vocab.special_cls_id = 101;
         vocab.special_mask_id = 103;
     } else if (tokenizer_model == "gpt2") {
         vocab.type = LLAMA_VOCAB_TYPE_BPE;
@@ -1201,7 +1198,6 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         vocab.special_unk_id = LLAMA_TOKEN_NULL;
         vocab.special_sep_id = LLAMA_TOKEN_NULL;
         vocab.special_pad_id = LLAMA_TOKEN_NULL;
-        vocab.special_cls_id = LLAMA_TOKEN_NULL;
         vocab.special_mask_id = LLAMA_TOKEN_NULL;
     } else if (tokenizer_model == "t5") {
         vocab.type = LLAMA_VOCAB_TYPE_UGM;
@@ -1212,7 +1208,6 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         vocab.special_unk_id = 2;
         vocab.special_sep_id = LLAMA_TOKEN_NULL;
         vocab.special_pad_id = 0;
-        vocab.special_cls_id = LLAMA_TOKEN_NULL;
         vocab.special_mask_id = LLAMA_TOKEN_NULL;
 
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
@@ -1495,7 +1490,6 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
             { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
             { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
             { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
-            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
             { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
             { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
             { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
@@ -1930,7 +1924,6 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
     if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
     if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
     if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
 
     if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
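The net effect of the BERT/WPM branch above is that `special_bos_id` now defaults to 101, the id previously stored in `special_cls_id`, so converted embedding models keep the same leading token. A small sketch to check this against a converted BERT model; the model path is hypothetical and the loader calls assume the C API of roughly the same vintage as this patch:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    // hypothetical path to a BERT-style GGUF converted with gguf-py
    struct llama_model * model = llama_load_model_from_file("bert-base.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // for a WPM (BERT) vocab this should print 101, the former CLS id
    printf("BOS token id: %d\n", llama_token_bos(model));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```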
10 changes: 3 additions & 7 deletions src/llama-vocab.cpp

@@ -1522,8 +1522,8 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
         case LLAMA_VOCAB_TYPE_WPM:
             {
                 if (add_special) {
-                    GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
-                    output.push_back(vocab.special_cls_id);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(vocab.special_bos_id);
                 }
 
                 llm_tokenizer_wpm_session session(vocab);
@@ -1650,7 +1650,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
 }
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
-    return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
+    return vocab.special_bos_id;
 }
 
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
@@ -1665,10 +1665,6 @@ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
     return vocab.special_eom_id;
 }
 
-llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
-    return vocab.special_cls_id;
-}
-
 llama_token llama_token_sep_impl(const struct llama_vocab & vocab) {
     return vocab.special_sep_id;
 }
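On the tokenizer side, the WPM path above now asserts on and prepends BOS rather than CLS, so tokenization with special tokens added is unchanged for BERT-style models. A hedged usage sketch; `model` is assumed to be an already loaded WPM/BERT model, and the call goes through the public `llama_tokenize` API rather than the internal function patched here:

```c
#include <string.h>
#include "llama.h"

// Tokenize with special tokens added; for a WPM vocab the first id returned
// should equal llama_token_bos(model), i.e. the token formerly exposed as CLS.
static int32_t tokenize_with_specials(const struct llama_model * model,
                                      const char * text,
                                      llama_token * tokens, int32_t n_max) {
    return llama_tokenize(model, text, (int32_t) strlen(text),
                          tokens, n_max,
                          /*add_special=*/true, /*parse_special=*/false);
}
```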
2 changes: 0 additions & 2 deletions src/llama-vocab.h

@@ -57,7 +57,6 @@ struct llama_vocab {
     id special_unk_id = 0;
     id special_sep_id = LLAMA_TOKEN_NULL;
     id special_pad_id = LLAMA_TOKEN_NULL;
-    id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
     id special_mask_id = LLAMA_TOKEN_NULL;
 
     id linefeed_id = 13;
@@ -124,7 +123,6 @@ llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
-llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
 llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
2 changes: 1 addition & 1 deletion src/llama.cpp

@@ -12267,7 +12267,7 @@ llama_token llama_token_eot(const struct llama_model * model) {
 }
 
 llama_token llama_token_cls(const struct llama_model * model) {
-    return llama_token_cls_impl(model->vocab);
+    return llama_token_bos_impl(model->vocab);
 }
 
 llama_token llama_token_sep(const struct llama_model * model) {