FIX: Fix embeddings truncation strategy (#139)

2023-08-16 15:09:41 -03:00 · 2023-08-16 15:09:41 -03:00 · 0738f67fa4
parent 525c8b0913
commit 0738f67fa4
2 changed files with 38 additions and 7 deletions
--- a/lib/modules/embeddings/strategies/truncation.rb
+++ b/lib/modules/embeddings/strategies/truncation.rb
@ -22,17 +22,17 @@ module DiscourseAi
          @model = model
          @target = target
          @tokenizer = @model.tokenizer
-          @max_length = @model.max_sequence_length
+          @max_length = @model.max_sequence_length - 2
-          @processed_target = +""
+          @processed_target = nil
        end
        # Need a better name for this method
        def process!
          case @target
          when Topic
-            topic_truncation(@target)
+            @processed_target = topic_truncation(@target)
          when Post
-            post_truncation(@target)
+            @processed_target = post_truncation(@target)
          else
            raise ArgumentError, "Invalid target type"
          end
@ -41,7 +41,7 @@ module DiscourseAi
        end
        def topic_truncation(topic)
-          t = @processed_target
+          t = +""
          t << topic.title
          t << "\n\n"
@ -54,7 +54,7 @@ module DiscourseAi
          topic.posts.find_each do |post|
            t << post.raw
-            break if @tokenizer.size(t) >= @max_length
+            break if @tokenizer.size(t) >= @max_length #maybe keep a partial counter to speed this up?
            t << "\n\n"
          end
@ -62,7 +62,7 @@ module DiscourseAi
        end
        def post_truncation(post)
-          t = processed_target
+          t = +""
          t << post.topic.title
          t << "\n\n"
--- a/spec/lib/modules/embeddings/strategies/truncation_spec.rb
+++ b/spec/lib/modules/embeddings/strategies/truncation_spec.rb
@ -0,0 +1,31 @@
 # frozen_string_literal: true
 RSpec.describe DiscourseAi::Embeddings::Strategies::Truncation do
  describe "#process!" do
    context "when the model uses OpenAI to create embeddings" do
      before { SiteSetting.max_post_length = 100_000 }
      fab!(:topic) { Fabricate(:topic) }
      fab!(:post) do
        Fabricate(:post, topic: topic, raw: "Baby, bird, bird, bird\nBird is the word\n" * 500)
      end
      fab!(:post) do
        Fabricate(
          :post,
          topic: topic,
          raw: "Don't you know about the bird?\nEverybody knows that the bird is a word\n" * 400,
        )
      end
      fab!(:post) { Fabricate(:post, topic: topic, raw: "Surfin' bird\n" * 800) }
      let(:model) { DiscourseAi::Embeddings::Models::Base.descendants.sample(1).first }
      let(:truncation) { described_class.new(topic, model) }
      it "truncates a topic" do
        truncation.process!
        expect(model.tokenizer.size(truncation.processed_target)).to be <= model.max_sequence_length
      end
    end
  end
 end