discourse-ai/lib/ai_helper/semantic_categorizer.rb

# frozen_string_literal: true
module DiscourseAi
  module AiHelper
    class SemanticCategorizer
      def initialize(user, opts)
        @user = user
        @text = opts[:text]
        @vector = DiscourseAi::Embeddings::Vector.instance
        @schema = DiscourseAi::Embeddings::Schema.for(Topic)
        @topic_id = opts[:topic_id]
      end

      def categories
        return [] if @text.blank? && @topic_id.nil?
        return [] if !DiscourseAi::Embeddings.enabled?

        candidates = nearest_neighbors
        return [] if candidates.empty?

        candidate_ids = candidates.map(&:first)

        ::Topic
          .joins(:category)
          .where(id: candidate_ids)
          .where("categories.id IN (?)", Category.topic_create_allowed(@user.guardian).pluck(:id))
          .order("array_position(ARRAY#{candidate_ids}, topics.id)")
          .pluck(
            "categories.id",
            "categories.name",
            "categories.slug",
            "categories.color",
            "categories.topic_count",
          )
          .map
          .with_index do |(id, name, slug, color, topic_count), index|
            {
              id: id,
              name: name,
              slug: slug,
              color: color,
              topicCount: topic_count,
              score: candidates[index].last,
            }
          end
          .map do |c|
            # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators
            c[:score] = (c[:score] + 1).abs if @vector.vdef.pg_function = "<#>"

            c[:score] = 1 / (c[:score] + 1) # inverse of the distance
            c
          end
          .group_by { |c| c[:name] }
          .map { |name, scores| scores.first.merge(score: scores.sum { |s| s[:score] }) }
          .sort_by { |c| -c[:score] }
          .take(5)
      end

      def tags
        return [] if @text.blank? && @topic_id.nil?
        return [] if !DiscourseAi::Embeddings.enabled?

        candidates = nearest_neighbors(limit: 100)
        return [] if candidates.empty?

        candidate_ids = candidates.map(&:first)

        count_column = Tag.topic_count_column(@user.guardian) # Determine the count column

        ::Topic
          .joins(:topic_tags, :tags)
          .where(id: candidate_ids)
          .where("tags.id IN (?)", DiscourseTagging.visible_tags(@user.guardian).pluck(:id))
          .group("topics.id")
          .order("array_position(ARRAY#{candidate_ids}, topics.id)")
          .pluck("array_agg(tags.name)")
          .map(&:uniq)
          .map
          .with_index { |tag_list, index| { tags: tag_list, score: candidates[index].last } }
          .flat_map { |c| c[:tags].map { |t| { name: t, score: c[:score] } } }
          .map do |c|
            # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators
            c[:score] = (c[:score] + 1).abs if @vector.vdef.pg_function = "<#>"

            c[:score] = 1 / (c[:score] + 1) # inverse of the distance
            c
          end
          .group_by { |c| c[:name] }
          .map { |name, scores| { name: name, score: scores.sum { |s| s[:score] } } }
          .sort_by { |c| -c[:score] }
          .take(7)
          .then do |tags|
            models = Tag.where(name: tags.map { _1[:name] }).index_by(&:name)
            tags.map do |tag|
              tag[:id] = models.dig(tag[:name])&.id
              tag[:count] = models.dig(tag[:name])&.public_send(count_column) || 0
              tag
            end
          end
      end

      private

      def nearest_neighbors(limit: 50)
        if @topic_id
          target = Topic.find_by(id: @topic_id)
          embeddings = @schema.find_by_target(target)&.embeddings

          if embeddings.blank?
            @text =
              DiscourseAi::Summarization::Strategies::TopicSummary
                .new(target)
                .targets_data
                .pluck(:text)
            raw_vector = @vector.vector_from(@text)
          else
            raw_vector = JSON.parse(embeddings)
          end
        else
          raw_vector = @vector.vector_from(@text)
        end

        muted_category_ids = nil
        if @user.present?
          muted_category_ids =
            CategoryUser.where(
              user: @user,
              notification_level: CategoryUser.notification_levels[:muted],
            ).pluck(:category_id)
        end

        @schema
          .asymmetric_similarity_search(raw_vector, limit: limit, offset: 0) do |builder|
            builder.join("topics t on t.id = topic_id")
            unless muted_category_ids.empty?
              builder.where(<<~SQL, exclude_category_ids: muted_category_ids.map(&:to_i))
                t.category_id NOT IN (:exclude_category_ids) AND
                t.category_id NOT IN (SELECT categories.id FROM categories WHERE categories.parent_category_id IN (:exclude_category_ids))
              SQL
            end
          end
          .map { |r| [r.topic_id, r.distance] }
      end
    end
  end
end