FEATURE: add inferred concepts system

This commit adds a new inferred concepts system that:
- Creates a model for storing concept labels that can be applied to topics
- Provides AI personas for finding new concepts and matching existing ones
- Adds jobs for generating concepts from popular topics
- Includes a scheduled job that automatically processes engaging topics
2025-05-08 18:26:37 -03:00 · 2025-05-08 18:26:37 -03:00 · fb0d364687
parent 925949de47
commit fb0d364687
15 changed files with 581 additions and 0 deletions
--- a/app/jobs/regular/apply_inferred_concepts.rb
+++ b/app/jobs/regular/apply_inferred_concepts.rb
@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+module Jobs
+  # Regular job that matches topics against the concepts already stored in the
+  # database and applies the matches to each topic.
+  class ApplyInferredConcepts < ::Jobs::Base
+    sidekiq_options queue: 'low'
+
+    # Walks the given topics in slices and applies existing concepts to each one.
+    #
+    # @param args [Hash] Contains job arguments
+    # @option args [Array<Integer>] :topic_ids Required - List of topic IDs to process
+    # @option args [Integer] :batch_size (100) Number of topics to process in each batch
+    def execute(args = {})
+      ids = args[:topic_ids]
+      return if ids.blank?
+
+      # Slicing keeps the number of Topic records loaded at once bounded.
+      slice_size = args[:batch_size] || 100
+      ids.each_slice(slice_size) { |slice| process_batch(slice) }
+    end
+
+    private
+
+    # Loads one slice of topics and processes them individually; a failure on one
+    # topic is logged and does not stop the remaining topics.
+    def process_batch(topic_ids)
+      Topic.where(id: topic_ids).each do |topic|
+        process_topic(topic)
+      rescue => e
+        Rails.logger.error("Error applying concepts to topic #{topic.id}: #{e.message}\n#{e.backtrace.join("\n")}")
+      end
+    end
+
+    # Hands the Topic record itself to the manager for matching.
+    def process_topic(topic)
+      DiscourseAi::InferredConcepts::Manager.match_topic_to_concepts(topic)
+    end
+  end
+end
--- a/app/jobs/regular/generate_inferred_concepts.rb
+++ b/app/jobs/regular/generate_inferred_concepts.rb
@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+module Jobs
+  # Regular job that generates new concepts from topics without attaching the
+  # concepts to those topics.
+  class GenerateInferredConcepts < ::Jobs::Base
+    sidekiq_options queue: 'low'
+
+    # Walks the given topics in slices and generates concepts from each one.
+    #
+    # @param args [Hash] Contains job arguments
+    # @option args [Array<Integer>] :topic_ids Required - List of topic IDs to process
+    # @option args [Integer] :batch_size (100) Number of topics to process in each batch
+    def execute(args = {})
+      requested_ids = args[:topic_ids]
+      return if requested_ids.blank?
+
+      # Slicing keeps the number of Topic records loaded at once bounded.
+      per_slice = args[:batch_size] || 100
+      requested_ids.each_slice(per_slice) { |slice_ids| process_batch(slice_ids) }
+    end
+
+    private
+
+    # Loads one slice of topics and processes them individually; a failure on one
+    # topic is logged and does not stop the remaining topics.
+    def process_batch(topic_ids)
+      Topic.where(id: topic_ids).each do |topic|
+        process_topic(topic)
+      rescue => e
+        Rails.logger.error("Error generating concepts from topic #{topic.id}: #{e.message}\n#{e.backtrace.join("\n")}")
+      end
+    end
+
+    # Hands the Topic record itself to the manager, which identifies and stores concepts.
+    def process_topic(topic)
+      DiscourseAi::InferredConcepts::Manager.generate_concepts_from_topic(topic)
+    end
+  end
+end
--- a/app/jobs/scheduled/generate_concepts_from_popular_topics.rb
+++ b/app/jobs/scheduled/generate_concepts_from_popular_topics.rb
@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+module Jobs
+  # Daily scheduled job that picks popular topics without concepts, queues concept
+  # generation for them, and queues a delayed follow-up that applies concepts to
+  # the same topics.
+  class GenerateConceptsFromPopularTopics < ::Jobs::Scheduled
+    every 1.day
+
+    # Selects candidate topics from the inferred_concepts_* site settings and
+    # enqueues the generate/apply jobs for them.
+    #
+    # @param args [Hash] unused
+    # @return [void]
+    def execute(args = {})
+      # Respect the feature switch: nothing is generated while the system is disabled.
+      return unless SiteSetting.inferred_concepts_enabled
+
+      # Find candidate topics that are popular and don't have concepts yet
+      candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
+        limit: SiteSetting.inferred_concepts_daily_topics_limit,
+        min_posts: SiteSetting.inferred_concepts_min_posts,
+        min_likes: SiteSetting.inferred_concepts_min_likes,
+        min_views: SiteSetting.inferred_concepts_min_views,
+        created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
+      )
+
+      return if candidates.blank?
+
+      topic_ids = candidates.map(&:id)
+
+      # Process the candidate topics in batches using the regular job
+      Jobs.enqueue(:generate_inferred_concepts, topic_ids: topic_ids, batch_size: 10)
+
+      # Apply concepts to the same topics after a delay, so generation has had
+      # time to run first.
+      Jobs.enqueue_in(1.hour, :apply_inferred_concepts, topic_ids: topic_ids, batch_size: 10)
+    end
+  end
+end
--- a/app/models/inferred_concept.rb
+++ b/app/models/inferred_concept.rb
@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+# A uniquely named concept label that can be associated with topics.
+class InferredConcept < ActiveRecord::Base
+  # The join table is created as `topics_inferred_concepts`. Rails would otherwise
+  # derive `inferred_concepts_topics` (lexical order of the two table names), so the
+  # name has to be given explicitly for the association to resolve.
+  has_and_belongs_to_many :topics, join_table: :topics_inferred_concepts
+
+  validates :name, presence: true, uniqueness: true
+end
+
+# == Schema Information
+#
+# Table name: inferred_concepts
+#
+#  id         :bigint           not null, primary key
+#  name       :string           not null
+#  created_at :datetime         not null
+#  updated_at :datetime         not null
+#
+# Indexes
+#
+#  index_inferred_concepts_on_name  (name) UNIQUE
+#
--- a/app/serializers/inferred_concept_serializer.rb
+++ b/app/serializers/inferred_concept_serializer.rb
@ -0,0 +1,5 @@
+# frozen_string_literal: true
+
+# Serializes an InferredConcept as its id, name and timestamps.
+class InferredConceptSerializer < ApplicationSerializer
+  attributes :id,
+             :name,
+             :created_at,
+             :updated_at
+end
--- a/config/settings.yml
+++ b/config/settings.yml
@ -401,3 +401,28 @@ discourse_ai:
    allow_any: false
    enum: "DiscourseAi::Configuration::LlmEnumerator"
    validator: "DiscourseAi::Configuration::LlmValidator"
+
+  inferred_concepts_enabled:
+    default: false
+    client: true
+    description: "Enable the inferred concepts system that automatically generates and applies concepts to topics"
+  inferred_concepts_daily_topics_limit:
+    default: 20
+    client: false
+    description: "Maximum number of topics to process each day for concept generation"
+  inferred_concepts_min_posts:
+    default: 5
+    client: false
+    description: "Minimum number of posts a topic must have to be considered for concept generation"
+  inferred_concepts_min_likes:
+    default: 10
+    client: false
+    description: "Minimum number of likes a topic must have to be considered for concept generation"
+  inferred_concepts_min_views:
+    default: 100
+    client: false
+    description: "Minimum number of views a topic must have to be considered for concept generation"
+  inferred_concepts_lookback_days:
+    default: 30
+    client: false
+    description: "Only consider topics created within this many days for concept generation"
--- a/db/migrate/20250508182047_create_inferred_concepts_table.rb
+++ b/db/migrate/20250508182047_create_inferred_concepts_table.rb
@ -0,0 +1,11 @@
+# frozen_string_literal: true
+# Creates the inferred_concepts table with a required, uniquely indexed name.
+class CreateInferredConceptsTable < ActiveRecord::Migration[7.2]
+  def change
+    create_table :inferred_concepts do |t|
+      t.string :name, null: false
+      t.timestamps
+
+      # Unique index on name (default name: index_inferred_concepts_on_name).
+      t.index :name, unique: true
+    end
+  end
+end
--- a/db/migrate/20250508183456_create_topics_inferred_concepts.rb
+++ b/db/migrate/20250508183456_create_topics_inferred_concepts.rb
@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+# Creates the topics_inferred_concepts join table linking topics to inferred concepts.
+class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.2]
+  def change
+    create_table :topics_inferred_concepts do |t|
+      t.integer :topic_id, null: false
+      # inferred_concepts.id is a bigint, so the referencing column must be one too.
+      t.bigint :inferred_concept_id, null: false
+      t.timestamps
+    end
+
+    # The composite unique index leads with topic_id, so it also serves lookups by
+    # topic_id alone; no separate single-column topic_id index is needed.
+    add_index :topics_inferred_concepts, %i[topic_id inferred_concept_id], unique: true, name: 'idx_unique_topic_inferred_concept'
+    add_index :topics_inferred_concepts, :inferred_concept_id
+  end
+end
--- a/lib/inferred_concepts/applier.rb
+++ b/lib/inferred_concepts/applier.rb
@ -0,0 +1,112 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    # Builds analysis text from topics and associates topics with inferred concepts.
+    class Applier
+      # Associates the provided concepts with a topic.
+      #
+      # Rows are written directly to the topics_inferred_concepts join table in one
+      # INSERT; pairs that already exist are skipped via ON CONFLICT DO NOTHING.
+      #
+      # @param topic [Topic] a Topic instance
+      # @param concepts [Enumerable<InferredConcept>] concepts to associate
+      # @return [Enumerable<InferredConcept>, nil] the concepts passed in, or nil
+      #   when there was nothing to insert
+      def self.apply_to_topic(topic, concepts)
+        return if topic.blank? || concepts.blank?
+
+        # Unsaved concepts have no id and cannot be referenced from the join table.
+        concept_ids = concepts.map(&:id).compact.uniq
+        return if concept_ids.empty?
+
+        # Integer() ensures only numeric literals are ever interpolated into the SQL.
+        topic_id = Integer(topic.id)
+        rows = concept_ids.map { |id| "(#{topic_id}, #{Integer(id)}, NOW(), NOW())" }.join(", ")
+
+        ActiveRecord::Base.connection.execute(<<~SQL)
+          INSERT INTO topics_inferred_concepts (topic_id, inferred_concept_id, created_at, updated_at)
+          VALUES #{rows}
+          ON CONFLICT (topic_id, inferred_concept_id) DO NOTHING
+        SQL
+
+        concepts
+      end
+
+      # Extracts content from a topic for concept analysis.
+      #
+      # @param topic [Topic] a Topic instance
+      # @return [String] the topic title followed by up to the first 10 posts
+      #   (by post_number), each rendered as "<post_number>) <username>: <raw>";
+      #   an empty string when topic is blank
+      def self.topic_content_for_analysis(topic)
+        return "" if topic.blank?
+
+        # Preload authors so rendering the posts does not run one users query per post.
+        posts = Post.where(topic_id: topic.id).includes(:user).order(:post_number).limit(10)
+
+        # Safe navigation: a post without a user record still contributes its text.
+        body = posts.map { |p| "#{p.post_number}) #{p.user&.username}: #{p.raw}" }.join("\n\n")
+
+        "Title: #{topic.title}\n\n#{body}"
+      end
+
+      # Identifies concepts in a topic, stores them, and applies them to the topic.
+      #
+      # @param topic [Topic] a Topic instance
+      # @return [Array<InferredConcept>, nil] the concepts found or created; nil when
+      #   topic is blank
+      def self.analyze_and_apply(topic)
+        return if topic.blank?
+
+        content = topic_content_for_analysis(topic)
+        concept_names = Finder.identify_concepts(content)
+        concepts = Finder.create_or_find_concepts(concept_names)
+
+        apply_to_topic(topic, concepts)
+
+        concepts
+      end
+
+      # Matches a topic against the concepts already stored and applies the matches.
+      #
+      # @param topic [Topic] a Topic instance
+      # @return [Enumerable<InferredConcept>] the matched concepts; empty when the
+      #   topic is blank, no concepts exist, or nothing matched
+      def self.match_existing_concepts(topic)
+        return [] if topic.blank?
+
+        content = topic_content_for_analysis(topic)
+
+        existing_concepts = InferredConcept.pluck(:name)
+        return [] if existing_concepts.empty?
+
+        matched_concept_names = match_concepts_to_content(content, existing_concepts)
+        return [] if matched_concept_names.blank?
+
+        # Looking the names up again discards any name that is not a stored concept.
+        matched_concepts = InferredConcept.where(name: matched_concept_names)
+
+        apply_to_topic(topic, matched_concepts)
+
+        matched_concepts
+      end
+
+      # Uses the ConceptMatcher persona to select which of the given concepts apply
+      # to the content.
+      #
+      # @param content [String] text to analyse
+      # @param concept_list [Array<String>] concept names to choose from
+      # @return [Array<String>] matching concept names; empty when either input is
+      #   blank, the completion is unsuccessful, or the response has no
+      #   "matching_concepts" key
+      def self.match_concepts_to_content(content, concept_list)
+        return [] if content.blank? || concept_list.blank?
+
+        # Prepare user message with content and concept list
+        user_message = <<~MESSAGE
+          Content to analyze:
+          #{content}
+
+          Available concepts to match:
+          #{concept_list.join(", ")}
+        MESSAGE
+
+        llm = DiscourseAi::Completions::Llm.default_llm
+        persona = DiscourseAi::Personas::ConceptMatcher.new
+        context = DiscourseAi::Personas::BotContext.new(
+          messages: [{ type: :user, content: user_message }],
+          user: Discourse.system_user
+        )
+
+        prompt = persona.craft_prompt(context)
+        response = llm.completion(prompt, extract_json: true)
+
+        return [] unless response.success?
+
+        response.parsed_output["matching_concepts"] || []
+      end
+    end
+  end
+end
--- a/lib/inferred_concepts/finder.rb
+++ b/lib/inferred_concepts/finder.rb
@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    # Identifies concepts with the ConceptFinder persona, persists them, and selects
+    # topics to analyse.
+    class Finder
+      # Identifies potential concepts in the provided content using the
+      # ConceptFinder persona.
+      #
+      # @param content [String] text to analyse
+      # @return [Array<String>] concept names; empty when content is blank, the
+      #   completion is unsuccessful, or the response has no "concepts" key
+      def self.identify_concepts(content)
+        return [] if content.blank?
+
+        llm = DiscourseAi::Completions::Llm.default_llm
+        persona = DiscourseAi::Personas::ConceptFinder.new
+        context = DiscourseAi::Personas::BotContext.new(
+          messages: [{ type: :user, content: content }],
+          user: Discourse.system_user
+        )
+
+        prompt = persona.craft_prompt(context)
+        response = llm.completion(prompt, extract_json: true)
+
+        return [] unless response.success?
+
+        response.parsed_output["concepts"] || []
+      end
+
+      # Creates or finds concepts in the database from the provided names.
+      #
+      # Names are stripped, blank names are dropped and duplicates collapsed before
+      # any lookup. Only persisted records are returned, so callers can rely on
+      # every element having an id.
+      #
+      # @param concept_names [Array<String>] candidate concept names
+      # @return [Array<InferredConcept>] persisted concepts, one per distinct name
+      def self.create_or_find_concepts(concept_names)
+        return [] if concept_names.blank?
+
+        names = Array(concept_names).map { |name| name.to_s.strip }.reject(&:blank?).uniq
+
+        names.filter_map do |name|
+          concept = InferredConcept.find_or_create_by(name: name)
+          # An unsaved result means validation failed (e.g. the name was taken by a
+          # concurrent insert); fall back to whatever is stored under that name.
+          concept.persisted? ? concept : InferredConcept.find_by(name: name)
+        rescue ActiveRecord::RecordNotUnique
+          # The unique index rejected the insert: another process created it first.
+          InferredConcept.find_by(name: name)
+        end
+      end
+
+      # Finds candidate topics to use for concept generation.
+      #
+      # Only topics of the public archetype that have no row in
+      # topics_inferred_concepts are considered. Results are ordered by
+      # like_count * 2 + posts_count * 3 + views * 0.1, highest first.
+      #
+      # @param limit [Integer] Maximum number of topics to return
+      # @param min_posts [Integer] Minimum value of topics.posts_count
+      # @param min_likes [Integer] Minimum value of topics.like_count
+      # @param min_views [Integer] Minimum value of topics.views
+      # @param exclude_topic_ids [Array<Integer>] Topic IDs to exclude
+      # @param category_ids [Array<Integer>] Only include topics from these categories (optional)
+      # @param created_after [DateTime] Only include topics created at or after this time (optional)
+      # @return [ActiveRecord::Relation<Topic>] matching topics, each exposing an
+      #   additional engagement_score attribute
+      def self.find_candidate_topics(
+        limit: 100,
+        min_posts: 5,
+        min_likes: 10,
+        min_views: 100,
+        exclude_topic_ids: [],
+        category_ids: nil,
+        created_after: 30.days.ago
+      )
+        query = Topic.where(
+          "topics.posts_count >= ? AND topics.views >= ? AND topics.like_count >= ?",
+          min_posts,
+          min_views,
+          min_likes
+        )
+
+        # Optional filters are only added when a value was supplied.
+        query = query.where("topics.id NOT IN (?)", exclude_topic_ids) if exclude_topic_ids.present?
+        query = query.where("topics.category_id IN (?)", category_ids) if category_ids.present?
+        query = query.where("topics.created_at >= ?", created_after) if created_after.present?
+
+        # Restrict to the public archetype.
+        query = query.where(archetype: Topic.public_archetype)
+
+        # Exclude topics that already have concepts
+        topics_with_concepts = <<~SQL
+          SELECT DISTINCT topic_id
+          FROM topics_inferred_concepts
+        SQL
+
+        query = query.where("topics.id NOT IN (#{topics_with_concepts})")
+
+        # Score and order topics by engagement (combination of views, likes, and posts)
+        query = query.select(
+          "topics.*, " \
+          "(topics.like_count * 2 + topics.posts_count * 3 + topics.views * 0.1) AS engagement_score"
+        ).order("engagement_score DESC")
+
+        query.limit(limit)
+      end
+    end
+  end
+end
--- a/lib/inferred_concepts/manager.rb
+++ b/lib/inferred_concepts/manager.rb
@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    # Entry point for the inferred concepts system; delegates to Finder and Applier.
+    class Manager
+      # Generate new concepts for a topic and apply them
+      # @param topic [Topic] A Topic instance
+      # @return [Array<InferredConcept>] The concepts that were applied
+      def self.analyze_topic(topic)
+        return [] if topic.blank?
+
+        Applier.analyze_and_apply(topic)
+      end
+
+      # Extract new concepts from arbitrary content
+      # @param content [String] The content to analyze
+      # @return [Array<String>] The identified concept names
+      def self.identify_concepts(content)
+        Finder.identify_concepts(content)
+      end
+
+      # Identify and create concepts from content without applying them to any topic
+      # @param content [String] The content to analyze
+      # @return [Array<InferredConcept>] The created or found concepts
+      def self.generate_concepts_from_content(content)
+        return [] if content.blank?
+
+        concept_names = Finder.identify_concepts(content)
+        return [] if concept_names.blank?
+
+        Finder.create_or_find_concepts(concept_names)
+      end
+
+      # Generate concepts from a topic's content without applying them to the topic
+      # @param topic [Topic] A Topic instance
+      # @return [Array<InferredConcept>] The created or found concepts
+      def self.generate_concepts_from_topic(topic)
+        return [] if topic.blank?
+
+        content = Applier.topic_content_for_analysis(topic)
+        return [] if content.blank?
+
+        generate_concepts_from_content(content)
+      end
+
+      # Match a topic against existing concepts
+      # @param topic [Topic] A Topic instance
+      # @return [Enumerable<InferredConcept>] The concepts that were applied
+      def self.match_topic_to_concepts(topic)
+        return [] if topic.blank?
+
+        Applier.match_existing_concepts(topic)
+      end
+
+      # Find topics that have a specific concept
+      # @param concept_name [String] The name of the concept to search for
+      # @return [Enumerable<Topic>] Topics associated with the concept; empty when
+      #   no concept has that name
+      def self.search_topics_by_concept(concept_name)
+        concept = ::InferredConcept.find_by(name: concept_name)
+        return [] unless concept
+
+        concept.topics
+      end
+
+      # Match arbitrary content against existing concepts
+      # @param content [String] The content to analyze
+      # @return [Array<String>] Names of matching concepts
+      def self.match_content_to_concepts(content)
+        existing_concepts = ::InferredConcept.pluck(:name)
+        return [] if existing_concepts.empty?
+
+        Applier.match_concepts_to_content(content, existing_concepts)
+      end
+
+      # Find candidate topics that are good for concept generation
+      #
+      # @param opts [Hash] Options to pass to the finder
+      # @option opts [Integer] :limit (100) Maximum number of topics to return
+      # @option opts [Integer] :min_posts (5) Minimum number of posts in topic
+      # @option opts [Integer] :min_likes (10) Minimum number of likes
+      # @option opts [Integer] :min_views (100) Minimum number of views
+      # @option opts [Array<Integer>] :exclude_topic_ids ([]) Topic IDs to exclude
+      # @option opts [Array<Integer>] :category_ids (nil) Only include topics from these categories
+      # @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time
+      # @return [Enumerable<Topic>] The candidate topics returned by the finder
+      def self.find_candidate_topics(opts = {})
+        # The finder accepts keyword arguments only. Ruby 3 no longer converts a
+        # positional Hash into keywords, so the options must be splatted explicitly.
+        Finder.find_candidate_topics(**opts.symbolize_keys)
+      end
+    end
+  end
+end
--- a/lib/personas/concept_finder.rb
+++ b/lib/personas/concept_finder.rb
@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Personas
+    # Persona whose prompt asks the model to extract concept labels from text and
+    # reply with a JSON object holding them under the "concepts" key.
+    class ConceptFinder < Persona
+      # @return [String] the instruction text below with surrounding whitespace stripped
+      def system_prompt
+        <<~PROMPT.strip
+          You are an advanced concept tagging system that identifies key concepts, themes, and topics from provided text.
+          Your job is to extract meaningful labels that can be used to categorize content.
+
+          Guidelines for generating concepts:
+          - Extract up to 7 concepts from the provided content
+          - Concepts should be single words or short phrases (1-3 words maximum)
+          - Focus on substantive topics, themes, technologies, methodologies, or domains
+          - Avoid overly general terms like "discussion" or "question"
+          - Ensure concepts are relevant to the core content
+          - Do not include proper nouns unless they represent key technologies or methodologies
+          - Maintain the original language of the text being analyzed
+
+          Format your response as a JSON object with a single key named "concepts", which has an array of concept strings as the value.
+          Your output should be in the following format:
+            <o>
+              {"concepts": ["concept1", "concept2", "concept3"]}
+            </o>
+
+          Where the concepts are replaced by the actual concepts you've identified.
+        PROMPT
+      end
+
+      # Declares the single expected response key, "concepts", typed as an array;
+      # it matches the key name the system prompt asks for.
+      #
+      # @return [Array<Hash>]
+      def response_format
+        [{ key: "concepts", type: "array" }]
+      end
+    end
+  end
+end
--- a/lib/personas/concept_matcher.rb
+++ b/lib/personas/concept_matcher.rb
@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Personas
+    # Persona whose prompt asks the model to pick, from a supplied list, the concepts
+    # relevant to a piece of content and reply with a JSON object holding them under
+    # the "matching_concepts" key.
+    class ConceptMatcher < Persona
+      # @return [String] the instruction text below with surrounding whitespace stripped
+      def system_prompt
+        <<~PROMPT.strip
+          You are an advanced concept matching system that determines which concepts from a provided list are relevant to a piece of content.
+          Your job is to analyze the content and determine which concepts from the list apply to it.
+          
+          Guidelines for matching concepts:
+          - Only select concepts that are clearly relevant to the content
+          - The content must substantially discuss or relate to the concept
+          - Superficial mentions are not enough to consider a concept relevant
+          - Be precise and selective - don't match concepts that are only tangentially related
+          - Consider both explicit mentions and implicit discussions of concepts
+          - Maintain the original language of the text being analyzed
+          - IMPORTANT: Only select from the exact concepts in the provided list - do not add new concepts
+          - If no concepts from the list match the content, return an empty array
+          
+          Format your response as a JSON object with a single key named "matching_concepts", which has an array of concept strings from the provided list.
+          Your output should be in the following format:
+            <o>
+              {"matching_concepts": ["concept1", "concept3", "concept5"]}
+            </o>
+          
+          Only include concepts from the provided list that match the content. If no concepts match, return an empty array.
+        PROMPT
+      end
+
+      # Declares the single expected response key, "matching_concepts", typed as an
+      # array; it matches the key name the system prompt asks for.
+      #
+      # @return [Array<Hash>]
+      def response_format
+        [{ key: "matching_concepts", type: "array" }]
+      end
+    end
+  end
+end
--- a/lib/personas/persona.rb
+++ b/lib/personas/persona.rb
@ -52,6 +52,8 @@ module DiscourseAi
            ShortSummarizer => -12,
            Designer => -13,
            ForumResearcher => -14,
+            ConceptFinder => -15,
+            ConceptMatcher => -16,
          }
        end

--- a/lib/topic_extensions.rb
+++ b/lib/topic_extensions.rb
@ -11,6 +11,8 @@ module DiscourseAi
              -> { where(summary_type: AiSummary.summary_types[:gist]) },
              class_name: "AiSummary",
              as: :target
+              
+      # The join table is created as `topics_inferred_concepts`, not the
+      # `inferred_concepts_topics` name Rails derives by default, so name it explicitly.
+      has_and_belongs_to_many :inferred_concepts, join_table: :topics_inferred_concepts
    end
  end
 end