Skip to content

Commit

Permalink
update tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
zmh-program committed Jan 2, 2024
1 parent 628f370 commit 89e5e13
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 13 deletions.
2 changes: 1 addition & 1 deletion adapter/azure/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func formatMessages(props *ChatProps) interface{} {
}
props.Message[len(props.Message)-1].Content = base
return props.Message
} else if props.Model == globals.GPT41106VisionPreview {
} else if globals.IsGPT41106VisionPreview(props.Model) {
return utils.Each[globals.Message, Message](props.Message, func(message globals.Message) Message {
if message.Role == globals.User {
urls := utils.ExtractImageUrls(message.Content)
Expand Down
2 changes: 1 addition & 1 deletion adapter/chatgpt/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func formatMessages(props *ChatProps) interface{} {
}
props.Message[len(props.Message)-1].Content = base
return props.Message
} else if props.Model == globals.GPT41106VisionPreview {
} else if globals.IsGPT41106VisionPreview(props.Model) {
return utils.Each[globals.Message, Message](props.Message, func(message globals.Message) Message {
if message.Role == globals.User {
urls := utils.ExtractImageUrls(message.Content)
Expand Down
4 changes: 4 additions & 0 deletions globals/variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,7 @@ func IsClaude100KModel(model string) bool {
// IsMidjourneyFastModel reports whether model is the Midjourney fast model.
func IsMidjourneyFastModel(model string) bool {
	isFast := MidjourneyFast == model
	return isFast
}

// IsGPT41106VisionPreview reports whether model refers to the
// gpt-4-1106-vision-preview model, either by its exact name or by a
// variant name that embeds it (e.g. a provider-prefixed deployment name).
func IsGPT41106VisionPreview(model string) bool {
	// strings.Contains(s, s) is always true, so the substring check alone
	// also covers exact equality; the former `model == GPT41106VisionPreview`
	// clause was redundant.
	return strings.Contains(model, GPT41106VisionPreview)
}
8 changes: 3 additions & 5 deletions utils/image.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ func (i *Image) GetPixelColor(x int, y int) (int, int, int) {
}

func (i *Image) CountTokens(model string) int {
switch model {
case globals.GPT41106VisionPreview:
if globals.IsGPT41106VisionPreview(model) {
// tile size is 512x512
// the max size of image is 2048x2048
// the image that is larger than 2048x2048 will be resized in 16 tiles
Expand All @@ -97,8 +96,7 @@ func (i *Image) CountTokens(model string) int {
tiles := int(x) * int(y)

return 85 + 170*tiles

default:
return 0
}

return 0
}
16 changes: 10 additions & 6 deletions utils/tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,24 @@ func GetWeightByModel(model string) int {
}
}
func NumTokensFromMessages(messages []globals.Message, model string) (tokens int) {
weight := GetWeightByModel(model)
tokensPerMessage := GetWeightByModel(model)
tkm, err := tiktoken.EncodingForModel(model)
if err != nil {
// the method above was deprecated, use the recall method instead
// can not encode messages, use length of messages as a proxy for number of tokens
// using rune instead of byte to account for unicode characters (e.g. emojis, non-english characters)
// data := Marshal(messages)
// return len([]rune(data)) * weight

data := Marshal(messages)
return len([]rune(data)) * weight
// use the recall method instead (default encoder model is gpt-3.5-turbo-0613)
return NumTokensFromMessages(messages, globals.GPT3Turbo0613)
}

for _, message := range messages {
tokens += weight
tokens += len(tkm.Encode(message.Content, nil, nil))
tokens += len(tkm.Encode(message.Role, nil, nil))
tokens +=
len(tkm.Encode(message.Content, nil, nil)) +
len(tkm.Encode(message.Role, nil, nil)) +
tokensPerMessage
}
tokens += 3 // every reply is primed with <|start|>assistant<|message|>
return tokens
Expand Down

0 comments on commit 89e5e13

Please sign in to comment.