FEATURE: Update OpenAI tokenizer to GPT-4o and later (#1467)
This commit is contained in:
parent
2fe99a0bec
commit
a40e2d3156
|
@ -1,13 +0,0 @@
|
||||||
# frozen_string_literal: true
|
|
||||||
|
|
||||||
module DiscourseAi
|
|
||||||
module Tokenizer
|
|
||||||
class OpenAiGpt4oTokenizer < OpenAiTokenizer
|
|
||||||
class << self
|
|
||||||
def tokenizer
|
|
||||||
@@tokenizer ||= Tiktoken.get_encoding("o200k_base")
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -5,7 +5,7 @@ module DiscourseAi
|
||||||
class OpenAiTokenizer < BasicTokenizer
|
class OpenAiTokenizer < BasicTokenizer
|
||||||
class << self
|
class << self
|
||||||
def tokenizer
|
def tokenizer
|
||||||
@@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
|
@@tokenizer ||= Tiktoken.get_encoding("o200k_base")
|
||||||
end
|
end
|
||||||
|
|
||||||
def tokenize(text)
|
def tokenize(text)
|
||||||
|
|
|
@ -99,7 +99,8 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
|
||||||
end
|
end
|
||||||
|
|
||||||
it "limits the system message to 60% of available tokens" do
|
it "limits the system message to 60% of available tokens" do
|
||||||
prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
|
prompt =
|
||||||
|
DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay")
|
||||||
prompt.push(type: :user, content: five_token_msg)
|
prompt.push(type: :user, content: five_token_msg)
|
||||||
|
|
||||||
dialect = TestDialect.new(prompt, llm_model)
|
dialect = TestDialect.new(prompt, llm_model)
|
||||||
|
@ -109,7 +110,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
|
||||||
|
|
||||||
expect(trimmed).to eq(
|
expect(trimmed).to eq(
|
||||||
[
|
[
|
||||||
{ type: :system, content: "I'm a system message consisting of 10" },
|
{ type: :system, content: "I'm a system message consisting of 10 tokens" },
|
||||||
{ type: :user, content: five_token_msg },
|
{ type: :user, content: five_token_msg },
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -18,7 +18,7 @@ class OpenAiMock < EndpointMock
|
||||||
model: "gpt-3.5-turbo-0301",
|
model: "gpt-3.5-turbo-0301",
|
||||||
usage: {
|
usage: {
|
||||||
prompt_tokens: 8,
|
prompt_tokens: 8,
|
||||||
completion_tokens: 13,
|
completion_tokens: 12,
|
||||||
total_tokens: 499,
|
total_tokens: 499,
|
||||||
},
|
},
|
||||||
choices: [
|
choices: [
|
||||||
|
|
|
@ -79,7 +79,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
|
||||||
|
|
||||||
it "truncates a sentence successfully at a multibyte unicode character" do
|
it "truncates a sentence successfully at a multibyte unicode character" do
|
||||||
sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
|
sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
|
||||||
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
|
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
|
||||||
end
|
end
|
||||||
|
|
||||||
it "truncates unicode characters properly when they use more than one token per char" do
|
it "truncates unicode characters properly when they use more than one token per char" do
|
||||||
|
@ -104,17 +104,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
|
||||||
end
|
end
|
||||||
|
|
||||||
it "handles unicode characters properly when they use more than one token per char" do
|
it "handles unicode characters properly when they use more than one token per char" do
|
||||||
expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
|
expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
|
|
||||||
describe "#size" do
|
|
||||||
describe "returns a token count" do
|
|
||||||
it "for a sentence with punctuation and capitalization and numbers" do
|
|
||||||
expect(described_class.size("Hello, World! 123")).to eq(6)
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue