discourse-ai/spec/lib/modules/embeddings/vector_spec.rb

153 lines
5.2 KiB
Ruby

# frozen_string_literal: true
RSpec.describe DiscourseAi::Embeddings::Vector do
# Shared contract exercised once per embeddings provider.
#
# The including context must supply:
#   * `vdef` - the embedding definition under test.
#   * `stub_vector_mapping(text, expected_embedding)` - registers a provider
#     stub so that requesting an embedding for `text` yields `expected_embedding`.
shared_examples "generates and store embeddings using a vector definition" do
  subject(:vector) { described_class.new(vdef) }

  let(:expected_embedding_1) { [0.0038493] * vdef.dimensions }
  let(:expected_embedding_2) { [0.0037684] * vdef.dimensions }

  let(:topics_schema) { DiscourseAi::Embeddings::Schema.for(Topic) }
  let(:posts_schema) { DiscourseAi::Embeddings::Schema.for(Post) }

  fab!(:topic)
  fab!(:post) { Fabricate(:post, post_number: 1, topic: topic) }
  fab!(:post2) { Fabricate(:post, post_number: 2, topic: topic) }

  # Select the definition under test as the active embeddings model.
  before { SiteSetting.ai_embeddings_selected_model = vdef.id }

  describe "#vector_from" do
    it "creates a vector from a given string" do
      text = "This is a piece of text"
      stub_vector_mapping(text, expected_embedding_1)

      expect(vector.vector_from(text)).to eq(expected_embedding_1)
    end
  end

  describe "#generate_representation_from" do
    it "creates a vector from a topic and stores it in the database" do
      text = vdef.prepare_target_text(topic)
      stub_vector_mapping(text, expected_embedding_1)

      vector.generate_representation_from(topic)

      expect(topics_schema.find_by_embedding(expected_embedding_1).topic_id).to eq(topic.id)
    end

    it "creates a vector from a post and stores it in the database" do
      # The stubbed text must be derived from the same post that is embedded
      # and asserted on below; otherwise the stub is registered for text that
      # is never requested.
      text = vdef.prepare_target_text(post)
      stub_vector_mapping(text, expected_embedding_1)

      vector.generate_representation_from(post)

      expect(posts_schema.find_by_embedding(expected_embedding_1).post_id).to eq(post.id)
    end
  end

  describe "#gen_bulk_reprensentations" do
    fab!(:topic_2) { Fabricate(:topic) }
    fab!(:post_2_1) { Fabricate(:post, post_number: 1, topic: topic_2) }
    fab!(:post_2_2) { Fabricate(:post, post_number: 2, topic: topic_2) }

    it "creates a vector for each object in the relation" do
      text = vdef.prepare_target_text(topic)
      text2 = vdef.prepare_target_text(topic_2)
      stub_vector_mapping(text, expected_embedding_1)
      stub_vector_mapping(text2, expected_embedding_2)

      vector.gen_bulk_reprensentations(Topic.where(id: [topic.id, topic_2.id]))

      expect(topics_schema.find_by_embedding(expected_embedding_1).topic_id).to eq(topic.id)
      # Every record in the relation must have been embedded, not only the first.
      # NOTE(review): looked up by target rather than by embedding because both
      # expected embeddings are constant vectors, which a direction-based
      # similarity search may not tell apart - confirm against the schema.
      expect(topics_schema.find_by_target(topic_2)).to be_present
    end

    it "does nothing if passed record has no content" do
      expect { vector.gen_bulk_reprensentations([Topic.new]) }.not_to raise_error
    end

    it "doesn't ask for a new embedding if digest is the same" do
      text = vdef.prepare_target_text(topic)
      stub_vector_mapping(text, expected_embedding_1)

      original_vector_gen = Time.zone.parse("2021-06-04 10:00")

      freeze_time(original_vector_gen) do
        vector.gen_bulk_reprensentations(Topic.where(id: [topic.id]))
      end

      # check vector exists
      expect(topics_schema.find_by_embedding(expected_embedding_1).topic_id).to eq(topic.id)

      # Second run happens outside the frozen time window: an unchanged
      # `updated_at` shows the stored embedding was not rewritten.
      vector.gen_bulk_reprensentations(Topic.where(id: [topic.id]))

      expect(topics_schema.find_by_target(topic).updated_at).to eq_time(original_vector_gen)
    end
  end
end
context "with open_ai as the provider" do
  fab!(:vdef) { Fabricate(:open_ai_embedding_def) }

  # Helper consumed by the shared examples: registers an OpenAI stub, keyed by
  # the definition's configured model name, that maps `text` to
  # `expected_embedding`.
  def stub_vector_mapping(text, expected_embedding)
    model_name = vdef.lookup_custom_param("model_name")
    EmbeddingsGenerationStubs.openai_service(model_name, text, expected_embedding)
  end

  it_behaves_like "generates and store embeddings using a vector definition"

  context "when matryoshka_dimensions is enabled" do
    it "passes the dimensions param" do
      reduced_size = 10
      input = "This is a piece of text"
      reduced_embedding = Array.new(reduced_size, 0.0038493)

      # Persist the smaller size first so the definition reflects it before
      # the stub is registered and the vector is requested.
      vdef.update!(dimensions: reduced_size, matryoshka_dimensions: true)

      # The stub is registered with the extra `dimensions` argument set to the
      # reduced size.
      EmbeddingsGenerationStubs.openai_service(
        vdef.lookup_custom_param("model_name"),
        input,
        reduced_embedding,
        extra_args: {
          dimensions: reduced_size,
        },
      )

      generated = described_class.new(vdef).vector_from(input)

      expect(generated).to eq(reduced_embedding)
    end
  end
end
context "with hugging_face as the provider" do
  fab!(:vdef) { Fabricate(:embedding_definition) }

  # Helper consumed by the shared examples: registers a Hugging Face stub
  # that maps `input` to `embedding`.
  def stub_vector_mapping(input, embedding)
    EmbeddingsGenerationStubs.hugging_face_service(input, embedding)
  end

  it_behaves_like "generates and store embeddings using a vector definition"
end
context "with google as the provider" do
  fab!(:vdef) { Fabricate(:gemini_embedding_def) }

  # Helper consumed by the shared examples: registers a Gemini stub, using the
  # definition's API key, that maps `input` to `embedding`.
  def stub_vector_mapping(input, embedding)
    api_key = vdef.api_key
    EmbeddingsGenerationStubs.gemini_service(api_key, input, embedding)
  end

  it_behaves_like "generates and store embeddings using a vector definition"
end
context "with cloudflare as the provider" do
  fab!(:vdef) { Fabricate(:cloudflare_embedding_def) }

  # Helper consumed by the shared examples: registers a Cloudflare stub that
  # maps `input` to `embedding`.
  def stub_vector_mapping(input, embedding)
    EmbeddingsGenerationStubs.cloudflare_service(input, embedding)
  end

  it_behaves_like "generates and store embeddings using a vector definition"
end
end