Dedup concepts

This commit is contained in:
Rafael Silva 2025-05-15 16:38:45 -03:00
parent 1c4806beb6
commit 0664ec512b
8 changed files with 218 additions and 70 deletions

View File

@ -6,76 +6,80 @@ module Jobs
# This job runs daily and generates new concepts from popular topics and posts
# It selects items based on engagement metrics and generates concepts from their content
def execute(args = {})
def execute(_args)
return unless SiteSetting.inferred_concepts_enabled
process_popular_topics
process_popular_posts
end
private
def process_popular_topics
private
def process_popular_topics
# Find candidate topics that are popular and don't have concepts yet
candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
min_posts: SiteSetting.inferred_concepts_min_posts || 5,
min_likes: SiteSetting.inferred_concepts_min_likes || 10,
min_views: SiteSetting.inferred_concepts_min_views || 100,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
)
candidates =
DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
min_posts: SiteSetting.inferred_concepts_min_posts || 5,
min_likes: SiteSetting.inferred_concepts_min_likes || 10,
min_views: SiteSetting.inferred_concepts_min_views || 100,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago,
)
return if candidates.blank?
# Process candidate topics - first generate concepts, then match
Jobs.enqueue(
:generate_inferred_concepts,
item_type: 'topics',
item_ids: candidates.map(&:id),
batch_size: 10
)
# Schedule a follow-up job to match existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: 'topics',
item_type: "topics",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true
)
end
def process_popular_posts
if SiteSetting.inferred_concepts_background_match
# Schedule a follow-up job to match existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: "topics",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true,
)
end
end
def process_popular_posts
# Find candidate posts that are popular and don't have concepts yet
candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_posts(
limit: SiteSetting.inferred_concepts_daily_posts_limit || 30,
min_likes: SiteSetting.inferred_concepts_post_min_likes || 5,
exclude_first_posts: true,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
)
candidates =
DiscourseAi::InferredConcepts::Manager.find_candidate_posts(
limit: SiteSetting.inferred_concepts_daily_posts_limit || 30,
min_likes: SiteSetting.inferred_concepts_post_min_likes || 5,
exclude_first_posts: true,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago,
)
return if candidates.blank?
# Process candidate posts - first generate concepts, then match
Jobs.enqueue(
:generate_inferred_concepts,
item_type: 'posts',
item_ids: candidates.map(&:id),
batch_size: 10
)
# Schedule a follow-up job to match against existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: 'posts',
item_type: "posts",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true
)
if SiteSetting.inferred_concepts_background_match
# Schedule a follow-up job to match against existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: "posts",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true,
)
end
end
end
end
end

View File

@ -405,32 +405,39 @@ discourse_ai:
inferred_concepts_enabled:
default: false
client: true
description: "Enable the inferred concepts system that automatically generates and applies concepts to topics"
inferred_concepts_background_match:
default: false
client: false
inferred_concepts_daily_topics_limit:
default: 20
client: false
description: "Maximum number of topics to process each day for concept generation"
inferred_concepts_min_posts:
default: 5
client: false
description: "Minimum number of posts a topic must have to be considered for concept generation"
inferred_concepts_min_likes:
default: 10
client: false
description: "Minimum number of likes a topic must have to be considered for concept generation"
inferred_concepts_min_views:
default: 100
client: false
description: "Minimum number of views a topic must have to be considered for concept generation"
inferred_concepts_lookback_days:
default: 30
client: false
description: "Only consider topics created within this many days for concept generation"
inferred_concepts_daily_posts_limit:
default: 30
client: false
description: "Maximum number of posts to process each day for concept generation"
inferred_concepts_post_min_likes:
default: 5
client: false
description: "Minimum number of likes a post must have to be considered for concept generation"
inferred_concepts_generate_persona:
default: "-15"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"
inferred_concepts_match_persona:
default: "-16"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"
inferred_concepts_deduplicate_persona:
default: "-17"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"

View File

@ -2,14 +2,12 @@
class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.0]
def change
create_table :topics_inferred_concepts do |t|
create_table :topics_inferred_concepts, primary_key: %i[topic_id inferred_concept_id] do |t|
t.integer :topic_id, null: false
t.integer :inferred_concept_id, null: false
t.timestamps
end
add_index :topics_inferred_concepts, [:topic_id, :inferred_concept_id], unique: true, name: 'idx_unique_topic_inferred_concept'
add_index :topics_inferred_concepts, :topic_id
add_index :topics_inferred_concepts, :inferred_concept_id
end
end
end

View File

@ -2,14 +2,12 @@
class CreatePostsInferredConcepts < ActiveRecord::Migration[7.0]
def change
create_table :posts_inferred_concepts do |t|
create_table :posts_inferred_concepts, primary_key: %i[post_id inferred_concept_id] do |t|
t.integer :post_id, null: false
t.integer :inferred_concept_id, null: false
t.timestamps
end
add_index :posts_inferred_concepts, [:post_id, :inferred_concept_id], unique: true, name: 'idx_unique_post_inferred_concept'
add_index :posts_inferred_concepts, :post_id
add_index :posts_inferred_concepts, :inferred_concept_id
end
end
end

View File

@ -9,8 +9,13 @@ module DiscourseAi
return [] if content.blank?
# Use the ConceptFinder persona to identify concepts
llm = DiscourseAi::Completions::Llm.default_llm
persona = DiscourseAi::Personas::ConceptFinder.new
persona =
AiPersona
.all_personas(enabled_only: false)
.find { |persona| persona.id == SiteSetting.inferred_concepts_generate_persona.to_i }
.new
llm = LlmModel.find(persona.class.default_llm_id)
context =
DiscourseAi::Personas::BotContext.new(
messages: [{ type: :user, content: content }],
@ -18,12 +23,11 @@ module DiscourseAi
inferred_concepts: DiscourseAi::InferredConcepts::Manager.list_concepts,
)
prompt = persona.craft_prompt(context)
response = llm.completion(prompt, extract_json: true)
bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
return [] unless response.success?
response = bot.reply(context)
concepts = response.parsed_output["concepts"]
concepts = JSON.parse(response[0][0]).dig("concepts")
concepts || []
end
@ -68,7 +72,7 @@ module DiscourseAi
query = query.where("topics.created_at >= ?", created_after) if created_after.present?
# Exclude PM topics (if they exist in Discourse)
query = query.where(archetype: Topic.public_archetype)
query = query.where(archetype: Archetype.default)
# Exclude topics that already have concepts
topics_with_concepts = <<~SQL
@ -134,6 +138,34 @@ module DiscourseAi
# Return limited number of posts
query.limit(limit)
end
# Deduplicate and standardize a list of concepts using the configured
# ConceptDeduplicator persona.
#
# @param concept_names [Array<String>] List of concept names to deduplicate
# @return [Array<String>] Streamlined list of concept names; empty when the
#   input is blank or the persona returns no tags
def self.deduplicate_concepts(concept_names)
  # Return an Array here: callers (e.g. Manager.deduplicate_concepts_by_letter)
  # concat the result, so the previous Hash early-return would raise TypeError.
  return [] if concept_names.blank?

  # Use the ConceptDeduplicator persona to deduplicate concepts
  persona =
    AiPersona
      .all_personas(enabled_only: false)
      .find { |persona| persona.id == SiteSetting.inferred_concepts_deduplicate_persona.to_i }
      .new
  llm = LlmModel.find(persona.class.default_llm_id)

  # Create the input for the deduplicator
  input = { type: :user, content: concept_names.join(", ") }

  context =
    DiscourseAi::Personas::BotContext.new(messages: [input], user: Discourse.system_user)

  bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
  response = bot.reply(context)

  # bot.reply returns nested arrays with the raw JSON payload first; guard the
  # dig so a missing key yields [] (mirrors Finder.identify_concepts above).
  concepts = JSON.parse(response[0][0]).dig("streamlined_tags")
  concepts || []
end
end
end
end

View File

@ -14,6 +14,65 @@ module DiscourseAi
query.pluck(:name)
end
# Deduplicate concepts in batches by letter
# This method will:
# 1. Group concepts by first letter
# 2. Process each letter group separately through the deduplicator
# 3. Do a final pass with all deduplicated concepts
#
# @param per_letter_batch [Integer] Max concepts sent per letter-group batch
# @param full_pass_batch [Integer] Max concepts per batch in the final pass
# @return [void]
def self.deduplicate_concepts_by_letter(per_letter_batch: 50, full_pass_batch: 150)
  # Get all concepts
  all_concepts = list_concepts
  return if all_concepts.empty?

  letter_groups = Hash.new { |h, k| h[k] = [] }

  # Group concepts by first letter; non-alphabetic or empty names go in "#"
  all_concepts.each do |concept|
    first_char = concept[0]&.upcase
    if first_char&.match?(/[A-Z]/)
      letter_groups[first_char] << concept
    else
      letter_groups["#"] << concept
    end
  end

  # Process each letter group in bounded batches (letter key itself is unused)
  letter_deduplicated_concepts = []
  letter_groups.each_value do |concepts|
    concepts.each_slice(per_letter_batch) do |batch|
      # Array() guards against a nil/blank result from the LLM-backed call
      letter_deduplicated_concepts.concat(Array(Finder.deduplicate_concepts(batch)))
    end
  end

  return if letter_deduplicated_concepts.blank?

  # Final pass with all deduplicated concepts
  final_result = []
  letter_deduplicated_concepts.each_slice(full_pass_batch) do |batch|
    final_result.concat(Array(Finder.deduplicate_concepts(batch)))
  end

  # Remove duplicates; bail out rather than wiping data when the LLM passes
  # produced nothing (insert_all raises on an empty row list).
  final_result.uniq!
  return if final_result.empty?

  # Replace the stored concepts atomically so a failed insert cannot leave the
  # table empty.
  InferredConcept.transaction do
    InferredConcept.destroy_all
    InferredConcept.insert_all(final_result.map { { name: it } })
  end
end
# Generate new concepts for a topic and apply them
# @param topic [Topic] A Topic instance
# @return [Array<InferredConcept>] The concepts that were applied
@ -139,7 +198,7 @@ module DiscourseAi
# @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time
# @return [Array<Topic>] Array of Topic objects that are good candidates
def self.find_candidate_topics(opts = {})
Finder.find_candidate_topics(opts)
Finder.find_candidate_topics(**opts)
end
# Find candidate posts that are good for concept generation

View File

@ -0,0 +1,49 @@
# frozen_string_literal: true
module DiscourseAi
module Personas
# Persona that merges a machine-generated tag/concept list into a smaller,
# streamlined set. Selected via the `inferred_concepts_deduplicate_persona`
# site setting (default id -17) and invoked from
# InferredConcepts::Finder.deduplicate_concepts.
class ConceptDeduplicator < Persona
# System prompt instructing the LLM to merge/trim the supplied tag list.
# The model is expected to respond with a JSON object containing a
# "streamlined_tags" array (see response_format below).
def system_prompt
<<~PROMPT.strip
You will be given a list of machine-generated tags.
Your task is to streamline this list by merging entries who are similar or related.
Please follow these steps to create a streamlined list of tags:
1. Review the entire list of tags carefully.
2. Identify and remove any exact duplicates.
3. Look for tags that are too specific or niche, and consider removing them or replacing them with more general terms.
4. If there are multiple tags that convey similar concepts, choose the best one and remove the others, or add a new one that covers the missing aspect.
5. Ensure that the remaining tags are relevant and useful for describing the content.
When deciding which tags are "best", consider the following criteria:
- Relevance: How well does the tag describe the core content or theme?
- Generality: Is the tag specific enough to be useful, but not so specific that it's unlikely to be searched for?
- Clarity: Is the tag easy to understand and free from ambiguity?
- Popularity: Would this tag likely be used by people searching for this type of content?
Example Input:
AI Bias, AI Bots, AI Ethics, AI Helper, AI Integration, AI Moderation, AI Search, AI-Driven Moderation, AI-Generated Post Illustrations, AJAX Events, AJAX Requests, AMA Events, API, API Access, API Authentication, API Automation, API Call, API Changes, API Compliance, API Configuration, API Costs, API Documentation, API Endpoint, API Endpoints, API Functions, API Integration, API Key, API Keys, API Limitation, API Limitations, API Permissions, API Rate Limiting, API Request, API Request Optimization, API Requests, API Security, API Suspension, API Token, API Tokens, API Translation, API Versioning, API configuration, API endpoint, API key, APIs, APK, APT Package Manager, ARIA, ARIA Tags, ARM Architecture, ARM-based, AWS, AWS Lightsail, AWS RDS, AWS S3, AWS Translate, AWS costs, AWS t2.micro, Abbreviation Expansion, Abbreviations
Example Output:
AI, AJAX, API, APK, APT Package Manager, ARIA, ARM Architecture, AWS, Abbreviations
Please provide your streamlined list of tags within <streamlined_tags> key.
Remember, the goal is to create a more focused and effective set of tags while maintaining the essence of the original list.
Your output should be in the following format:
<o>
{
"streamlined_tags": ["tag1", "tag3"]
}
</o>
PROMPT
end
# Structured-output schema: a single "streamlined_tags" array, matching the
# key parsed by Finder.deduplicate_concepts.
def response_format
[{ key: "streamlined_tags", type: "array" }]
end
end
end
end

View File

@ -54,6 +54,7 @@ module DiscourseAi
ForumResearcher => -14,
ConceptFinder => -15,
ConceptMatcher => -16,
ConceptDeduplicator => -17,
}
end