FEATURE: Update OpenAI tokenizer to GPT-4o and later (#1467)

This commit is contained in:
Rafael dos Santos Silva 2025-06-26 15:26:09 -03:00 committed by GitHub
parent 2fe99a0bec
commit a40e2d3156
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 7 additions and 29 deletions

View File

@@ -1,13 +0,0 @@
# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    # Tokenizer for GPT-4o-era OpenAI models, which use the "o200k_base"
    # Tiktoken encoding rather than the parent's "cl100k_base".
    class OpenAiGpt4oTokenizer < OpenAiTokenizer
      class << self
        # Returns the memoized Tiktoken encoder for this class.
        #
        # Fixed: the original memoized into the class variable @@tokenizer.
        # Class variables are shared across the inheritance tree, and the
        # parent OpenAiTokenizer uses the same @@tokenizer name — so whichever
        # class loaded its encoder first would win for both. A class-level
        # instance variable (@tokenizer) is scoped to this class alone.
        def tokenizer
          @tokenizer ||= Tiktoken.get_encoding("o200k_base")
        end
      end
    end
  end
end

View File

@@ -5,7 +5,7 @@ module DiscourseAi
   class OpenAiTokenizer < BasicTokenizer
     class << self
       def tokenizer
-        @@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
+        @@tokenizer ||= Tiktoken.get_encoding("o200k_base")
       end
       def tokenize(text)

View File

@@ -99,7 +99,8 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
     end
     it "limits the system message to 60% of available tokens" do
-      prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
+      prompt =
+        DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay")
       prompt.push(type: :user, content: five_token_msg)
       dialect = TestDialect.new(prompt, llm_model)
@@ -109,7 +110,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
       expect(trimmed).to eq(
        [
-         { type: :system, content: "I'm a system message consisting of 10" },
+         { type: :system, content: "I'm a system message consisting of 10 tokens" },
          { type: :user, content: five_token_msg },
        ],
      )

View File

@@ -18,7 +18,7 @@ class OpenAiMock < EndpointMock
       model: "gpt-3.5-turbo-0301",
       usage: {
         prompt_tokens: 8,
-        completion_tokens: 13,
+        completion_tokens: 12,
         total_tokens: 499,
       },
       choices: [

View File

@@ -79,7 +79,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
     it "truncates a sentence successfully at a multibyte unicode character" do
       sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
-      expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
+      expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
     end
(NOTE: the two sides of this hunk render identically — the changed expectation differs only in invisible ZWJ/emoji byte content; verify against the original diff.)
it "truncates unicode characters properly when they use more than one token per char" do it "truncates unicode characters properly when they use more than one token per char" do
@@ -104,17 +104,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
     end
     it "handles unicode characters properly when they use more than one token per char" do
-      expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
+      expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
     end
   end
 end
-describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
-  describe "#size" do
-    describe "returns a token count" do
-      it "for a sentence with punctuation and capitalization and numbers" do
-        expect(described_class.size("Hello, World! 123")).to eq(6)
-      end
-    end
-  end
-end
 end