discourse-ai/spec/lib/utils/research/llm_formatter_spec.rb

75 lines
2.5 KiB
Ruby

# frozen_string_literal: true
#
# Exercises the private truncation and post-formatting helpers of
# DiscourseAi::Utils::Research::LlmFormatter through `send`.
describe DiscourseAi::Utils::Research::LlmFormatter do
  fab!(:user) { Fabricate(:user, username: "test_user") }
  fab!(:topic) { Fabricate(:topic, title: "This is a Test Topic", user: user) }
  fab!(:post) { Fabricate(:post, topic: topic, user: user) }

  let(:tokenizer) { DiscourseAi::Tokenizer::OpenAiTokenizer }
  let(:filter) { DiscourseAi::Utils::Research::Filter.new("@#{user.username}") }

  # Builds the formatter under test. Only the per-post token limit varies
  # between examples; the batch limit is fixed at 1000 for all of them.
  def formatter_with(max_tokens_per_post:)
    described_class.new(
      filter,
      max_tokens_per_batch: 1000,
      tokenizer: tokenizer,
      max_tokens_per_post: max_tokens_per_post,
    )
  end

  describe "#truncate_if_needed" do
    it "returns original content when under token limit" do
      subject_formatter = formatter_with(max_tokens_per_post: 100)
      brief = "This is a short post"

      result = subject_formatter.send(:truncate_if_needed, brief)

      expect(result).to eq(brief)
    end

    it "truncates content when over token limit" do
      # 200 space-separated words against a 50-token limit; the expectations
      # below assume each repeated word is counted as a single token.
      oversized = Array.new(200, "word").join(" ")
      subject_formatter = formatter_with(max_tokens_per_post: 50)

      result = subject_formatter.send(:truncate_if_needed, oversized)

      expect(result).to include("... elided 150 tokens ...")
      expect(result).not_to eq(oversized)

      # The output is expected to be three blank-line-separated sections:
      # leading text, the elision marker, and trailing text. The leading and
      # trailing sections should each hold roughly 25 words (half of
      # max_tokens_per_post).
      leading, _marker, trailing = result.split("\n\n")
      expect(leading.split.size).to be_within(5).of(25)
      expect(trailing.split.size).to be_within(5).of(25)
    end
  end

  describe "#format_post" do
    it "formats posts with truncation for long content" do
      # A post whose raw body is long enough to exceed the 50-token limit.
      oversized = Array.new(200, "word").join(" ")
      lengthy_post = Fabricate(:post, raw: oversized, topic: topic, user: user)
      subject_formatter = formatter_with(max_tokens_per_post: 50)

      rendered = subject_formatter.send(:format_post, lengthy_post)

      # Standard header elements are present.
      expect(rendered).to include("## Post by #{user.username}")
      expect(rendered).to include(
        "Post url: /t/-/#{lengthy_post.topic_id}/#{lengthy_post.post_number}",
      )

      # The body carries the truncation marker.
      expect(rendered).to include("... elided 150 tokens ...")
    end
  end
end