From 04c6e553716d56dfe64d6321f8904da968f8fd62 Mon Sep 17 00:00:00 2001 From: Rafael Silva Date: Thu, 8 May 2025 18:26:37 -0300 Subject: [PATCH] FEATURE: add inferred concepts system This commit adds a new inferred concepts system that: - Creates a model for storing concept labels that can be applied to topics - Provides AI personas for finding new concepts and matching existing ones - Adds jobs for generating concepts from popular topics - Includes a scheduled job that automatically processes engaging topics --- app/jobs/regular/apply_inferred_concepts.rb | 47 ++++++++ .../regular/generate_inferred_concepts.rb | 47 ++++++++ .../generate_concepts_from_popular_topics.rb | 38 ++++++ app/models/inferred_concept.rb | 21 ++++ .../inferred_concept_serializer.rb | 5 + config/settings.yml | 25 ++++ ...08182047_create_inferred_concepts_table.rb | 11 ++ ...8183456_create_topics_inferred_concepts.rb | 15 +++ lib/inferred_concepts/applier.rb | 112 ++++++++++++++++++ lib/inferred_concepts/finder.rb | 91 ++++++++++++++ lib/inferred_concepts/manager.rb | 94 +++++++++++++++ lib/personas/concept_finder.rb | 35 ++++++ lib/personas/concept_matcher.rb | 36 ++++++ lib/personas/persona.rb | 2 + lib/topic_extensions.rb | 2 + 15 files changed, 581 insertions(+) create mode 100644 app/jobs/regular/apply_inferred_concepts.rb create mode 100644 app/jobs/regular/generate_inferred_concepts.rb create mode 100644 app/jobs/scheduled/generate_concepts_from_popular_topics.rb create mode 100644 app/models/inferred_concept.rb create mode 100644 app/serializers/inferred_concept_serializer.rb create mode 100644 db/migrate/20250508182047_create_inferred_concepts_table.rb create mode 100644 db/migrate/20250508183456_create_topics_inferred_concepts.rb create mode 100644 lib/inferred_concepts/applier.rb create mode 100644 lib/inferred_concepts/finder.rb create mode 100644 lib/inferred_concepts/manager.rb create mode 100644 lib/personas/concept_finder.rb create mode 100644 lib/personas/concept_matcher.rb 
diff --git a/app/jobs/regular/apply_inferred_concepts.rb b/app/jobs/regular/apply_inferred_concepts.rb
new file mode 100644
index 00000000..916c3f0c
--- /dev/null
+++ b/app/jobs/regular/apply_inferred_concepts.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+module Jobs
+  class ApplyInferredConcepts < ::Jobs::Base
+    sidekiq_options queue: 'low'
+
+    # Process a batch of topics to apply existing concepts to them
+    #
+    # @param args [Hash] Contains job arguments
+    # @option args [Array] :topic_ids Required - List of topic IDs to process
+    # @option args [Integer] :batch_size (100) Number of topics to process in each batch
+    def execute(args = {})
+      return if args[:topic_ids].blank?
+
+      # Process topics in smaller batches to avoid memory issues
+      batch_size = args[:batch_size] || 100
+
+      # Get the list of topic IDs
+      topic_ids = args[:topic_ids]
+
+      # Process topics in batches
+      topic_ids.each_slice(batch_size) do |batch_topic_ids|
+        process_batch(batch_topic_ids)
+      end
+    end
+
+    private
+
+    def process_batch(topic_ids)
+      topics = Topic.where(id: topic_ids)
+
+      topics.each do |topic|
+        begin
+          process_topic(topic)
+        rescue => e
+          Rails.logger.error("Error applying concepts to topic #{topic.id}: #{e.message}\n#{e.backtrace.join("\n")}")
+        end
+      end
+    end
+
+    def process_topic(topic)
+      # Match topic against existing concepts and apply them
+      # Pass the topic object directly
+      DiscourseAi::InferredConcepts::Manager.match_topic_to_concepts(topic)
+    end
+  end
+end
\ No newline at end of file
diff --git a/app/jobs/regular/generate_inferred_concepts.rb b/app/jobs/regular/generate_inferred_concepts.rb
new file mode 100644
index 00000000..d0b73b89
--- /dev/null
+++ b/app/jobs/regular/generate_inferred_concepts.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+module Jobs
+  class GenerateInferredConcepts < ::Jobs::Base
+    sidekiq_options queue: 'low'
+
+    # Process a batch of topics to generate new concepts (without applying them to topics)
+    #
+    # @param args [Hash] Contains job arguments
+    # @option args [Array] :topic_ids Required - List of topic IDs to process
+    # @option args [Integer] :batch_size (100) Number of topics to process in each batch
+    def execute(args = {})
+      return if args[:topic_ids].blank?
+
+      # Process topics in smaller batches to avoid memory issues
+      batch_size = args[:batch_size] || 100
+
+      # Get the list of topic IDs
+      topic_ids = args[:topic_ids]
+
+      # Process topics in batches
+      topic_ids.each_slice(batch_size) do |batch_topic_ids|
+        process_batch(batch_topic_ids)
+      end
+    end
+
+    private
+
+    def process_batch(topic_ids)
+      topics = Topic.where(id: topic_ids)
+
+      topics.each do |topic|
+        begin
+          process_topic(topic)
+        rescue => e
+          Rails.logger.error("Error generating concepts from topic #{topic.id}: #{e.message}\n#{e.backtrace.join("\n")}")
+        end
+      end
+    end
+
+    def process_topic(topic)
+      # Use the Manager method that handles both identifying and creating concepts
+      # Pass the topic object directly
+      DiscourseAi::InferredConcepts::Manager.generate_concepts_from_topic(topic)
+    end
+  end
+end
\ No newline at end of file
diff --git a/app/jobs/scheduled/generate_concepts_from_popular_topics.rb b/app/jobs/scheduled/generate_concepts_from_popular_topics.rb
new file mode 100644
index 00000000..fe009a1c
--- /dev/null
+++ b/app/jobs/scheduled/generate_concepts_from_popular_topics.rb
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+module Jobs
+  class GenerateConceptsFromPopularTopics < ::Jobs::Scheduled
+    every 1.day
+
+    # Runs daily: finds popular topics (by engagement) that have no concepts yet and
+    # generates concepts from them. Does nothing unless inferred_concepts_enabled is on.
+    def execute(args = {})
+      return unless SiteSetting.inferred_concepts_enabled
+
+      candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
+        limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
+        min_posts: SiteSetting.inferred_concepts_min_posts || 5,
+        min_likes: SiteSetting.inferred_concepts_min_likes || 10,
+        min_views: SiteSetting.inferred_concepts_min_views || 100,
+        created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
+      )
+
+      return if candidates.blank?
+
+      # Process the candidate topics in batches using the regular job
+      Jobs.enqueue(
+        :generate_inferred_concepts,
+        topic_ids: candidates.map(&:id),
+        batch_size: 10
+      )
+
+      # Apply the concepts in a delayed follow-up job so generation has time to finish first
+      Jobs.enqueue_in(
+        1.hour,
+        :apply_inferred_concepts,
+        topic_ids: candidates.map(&:id),
+        batch_size: 10
+      )
+    end
+  end
+end
\ No newline at end of file
diff --git a/app/models/inferred_concept.rb b/app/models/inferred_concept.rb
new file mode 100644
index 00000000..a5b8d877
--- /dev/null
+++ b/app/models/inferred_concept.rb
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+class InferredConcept < ActiveRecord::Base
+  has_and_belongs_to_many :topics, join_table: :topics_inferred_concepts
+
+  validates :name, presence: true, uniqueness: true
+end
+
+# == Schema Information
+#
+# Table name: inferred_concepts
+#
+#  id         :bigint           not null, primary key
+#  name       :string           not null
+#  created_at :datetime         not null
+#  updated_at :datetime         not null
+#
+# Indexes
+#
+#  index_inferred_concepts_on_name  (name) UNIQUE
+#
\ No newline at end of file
diff --git a/app/serializers/inferred_concept_serializer.rb b/app/serializers/inferred_concept_serializer.rb
new file mode 100644
index 00000000..265fe858
--- /dev/null
+++ b/app/serializers/inferred_concept_serializer.rb
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+
+class InferredConceptSerializer < ApplicationSerializer
+  attributes :id, :name, :created_at, :updated_at
+end
\ No newline at end of file
diff --git a/config/settings.yml b/config/settings.yml
index af7f8f60..eaa1f40a 100644
--- a/config/settings.yml
+++ b/config/settings.yml
@@ -417,3 +417,28 @@ discourse_ai:
     default: false
     client: false
     hidden: true
+
+  inferred_concepts_enabled:
+    default: false
+    client: true
+    description: "Enable the inferred concepts system that automatically generates and applies concepts to topics"
+  inferred_concepts_daily_topics_limit:
+    default: 20
+    client: false
+    description: "Maximum number of topics to process each day for concept generation"
+  inferred_concepts_min_posts:
+    default: 5
+    client: false
+    description: "Minimum number of posts a topic must have to be considered for concept generation"
+  inferred_concepts_min_likes:
+    default: 10
+    client: false
+    description: "Minimum number of likes a topic must have to be considered for concept generation"
+  inferred_concepts_min_views:
+    default: 100
+    client: false
+    description: "Minimum number of views a topic must have to be considered for concept generation"
+  inferred_concepts_lookback_days:
+    default: 30
+    client: false
+    description: "Only consider topics created within this many days for concept generation"
diff --git a/db/migrate/20250508182047_create_inferred_concepts_table.rb b/db/migrate/20250508182047_create_inferred_concepts_table.rb
new file mode 100644
index 00000000..6686c040
--- /dev/null
+++ b/db/migrate/20250508182047_create_inferred_concepts_table.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+class CreateInferredConceptsTable < ActiveRecord::Migration[7.2]
+  def change
+    create_table :inferred_concepts do |t|
+      t.string :name, null: false
+      t.timestamps
+    end
+
+    add_index :inferred_concepts, :name, unique: true
+  end
+end
diff --git a/db/migrate/20250508183456_create_topics_inferred_concepts.rb b/db/migrate/20250508183456_create_topics_inferred_concepts.rb
new file mode 100644
index 00000000..6066bfbb
--- /dev/null
+++ b/db/migrate/20250508183456_create_topics_inferred_concepts.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.2]
+  def change
+    create_table :topics_inferred_concepts do |t|
+      t.integer :topic_id, null: false
+      t.bigint :inferred_concept_id, null: false
+      t.timestamps
+    end
+
+    add_index :topics_inferred_concepts, [:topic_id, :inferred_concept_id], unique: true, name: 'idx_unique_topic_inferred_concept'
+    add_index :topics_inferred_concepts, :topic_id
+    add_index :topics_inferred_concepts, :inferred_concept_id
+  end
+end
\ No newline at end of file
diff --git a/lib/inferred_concepts/applier.rb b/lib/inferred_concepts/applier.rb
new file mode 100644
index 00000000..9426ec0a
--- /dev/null
+++ b/lib/inferred_concepts/applier.rb
@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    class Applier
+      # Associates the provided concepts with a topic
+      # topic: a Topic instance
+      # concepts: an array of InferredConcept instances
+      def self.apply_to_topic(topic, concepts)
+        return if topic.blank? || concepts.blank?
+
+        concepts.each do |concept|
+          # Insert straight into the join table; the unique (topic_id, inferred_concept_id)
+          # index plus ON CONFLICT DO NOTHING makes re-applying a concept a no-op
+          ActiveRecord::Base.connection.execute(<<~SQL)
+            INSERT INTO topics_inferred_concepts (topic_id, inferred_concept_id, created_at, updated_at)
+            VALUES (#{topic.id}, #{concept.id}, NOW(), NOW())
+            ON CONFLICT (topic_id, inferred_concept_id) DO NOTHING
+          SQL
+        end
+      end
+
+      # Extracts content from a topic for concept analysis
+      # Returns a string with the topic title and the first 10 posts
+      def self.topic_content_for_analysis(topic)
+        return "" if topic.blank?
+
+        # Combine title and first few posts; eager-load authors to avoid one query per post
+        posts = Post.where(topic_id: topic.id).includes(:user).order(:post_number).limit(10)
+
+        content = "Title: #{topic.title}\n\n"
+        content += posts.map do |p|
+          "#{p.post_number}) #{p.user&.username}: #{p.raw}"
+        end.join("\n\n")
+
+        content
+      end
+
+      # Comprehensive method to analyze a topic and apply concepts
+      def self.analyze_and_apply(topic)
+        return if topic.blank?
+
+        # Get content to analyze
+        content = topic_content_for_analysis(topic)
+
+        # Identify concepts
+        concept_names = Finder.identify_concepts(content)
+
+        # Create or find concepts in the database
+        concepts = Finder.create_or_find_concepts(concept_names)
+
+        # Apply concepts to the topic
+        apply_to_topic(topic, concepts)
+
+        concepts
+      end
+
+      # Match a topic with existing concepts
+      def self.match_existing_concepts(topic)
+        return [] if topic.blank?
+
+        # Get content to analyze
+        content = topic_content_for_analysis(topic)
+
+        # Get all existing concepts
+        existing_concepts = InferredConcept.all.pluck(:name)
+        return [] if existing_concepts.empty?
+
+        # Use the ConceptMatcher persona to match concepts
+        matched_concept_names = match_concepts_to_content(content, existing_concepts)
+
+        # Find concepts in the database
+        matched_concepts = InferredConcept.where(name: matched_concept_names)
+
+        # Apply concepts to the topic
+        apply_to_topic(topic, matched_concepts)
+
+        matched_concepts
+      end
+
+      # Use ConceptMatcher persona to match content against provided concepts
+      def self.match_concepts_to_content(content, concept_list)
+        return [] if content.blank? || concept_list.blank?
+
+        # Prepare user message with content and concept list
+        user_message = <<~MESSAGE
+          Content to analyze:
+          #{content}
+
+          Available concepts to match:
+          #{concept_list.join(", ")}
+        MESSAGE
+
+        # Use the ConceptMatcher persona to match concepts
+        llm = DiscourseAi::Completions::Llm.default_llm
+        persona = DiscourseAi::Personas::ConceptMatcher.new
+        context = DiscourseAi::Personas::BotContext.new(
+          messages: [{ type: :user, content: user_message }],
+          user: Discourse.system_user
+        )
+
+        prompt = persona.craft_prompt(context)
+        response = llm.completion(prompt, extract_json: true)
+
+        return [] unless response.success?
+
+        matching_concepts = response.parsed_output["matching_concepts"]
+        matching_concepts || []
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/inferred_concepts/finder.rb b/lib/inferred_concepts/finder.rb
new file mode 100644
index 00000000..3e870cf1
--- /dev/null
+++ b/lib/inferred_concepts/finder.rb
@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    class Finder
+      # Identifies potential concepts from provided content
+      # Returns an array of concept names (strings)
+      def self.identify_concepts(content)
+        return [] if content.blank?
+
+        # Use the ConceptFinder persona to identify concepts
+        llm = DiscourseAi::Completions::Llm.default_llm
+        persona = DiscourseAi::Personas::ConceptFinder.new
+        context = DiscourseAi::Personas::BotContext.new(
+          messages: [{ type: :user, content: content }],
+          user: Discourse.system_user
+        )
+
+        prompt = persona.craft_prompt(context)
+        response = llm.completion(prompt, extract_json: true)
+
+        return [] unless response.success?
+
+        concepts = response.parsed_output["concepts"]
+        concepts || []
+      end
+
+      # Creates or finds concepts in the database from provided names
+      # Returns an array of persisted InferredConcept instances (blank/invalid names are skipped)
+      def self.create_or_find_concepts(concept_names)
+        return [] if concept_names.blank?
+
+        # Normalize the names first; an unsaved concept has no id to link to a topic, so drop it
+        names = Array(concept_names).filter_map { |name| name.to_s.strip.presence }.uniq
+        names.map { |name| InferredConcept.find_or_create_by(name: name) }.select(&:persisted?)
+      end
+
+      # Finds candidate topics to use for concept generation
+      #
+      # @param limit [Integer] Maximum number of topics to return
+      # @param min_posts [Integer] Minimum number of posts in topic
+      # @param min_likes [Integer] Minimum number of likes across all posts
+      # @param min_views [Integer] Minimum number of views
+      # @param exclude_topic_ids [Array] Topic IDs to exclude
+      # @param category_ids [Array] Only include topics from these categories (optional)
+      # @param created_after [DateTime] Only include topics created after this time (optional)
+      # @return [Array] Array of Topic objects that are good candidates
+      def self.find_candidate_topics(
+        limit: 100,
+        min_posts: 5,
+        min_likes: 10,
+        min_views: 100,
+        exclude_topic_ids: [],
+        category_ids: nil,
+        created_after: 30.days.ago
+      )
+        query = Topic.where(
+          "topics.posts_count >= ? AND topics.views >= ? AND topics.like_count >= ?",
+          min_posts,
+          min_views,
+          min_likes
+        )
+
+        # Apply additional filters
+        query = query.where("topics.id NOT IN (?)", exclude_topic_ids) if exclude_topic_ids.present?
+        query = query.where("topics.category_id IN (?)", category_ids) if category_ids.present?
+        query = query.where("topics.created_at >= ?", created_after) if created_after.present?
+
+        # Exclude PM topics (if they exist in Discourse)
+        query = query.where(archetype: Topic.public_archetype)
+
+        # Exclude topics that already have concepts
+        topics_with_concepts = <<~SQL
+          SELECT DISTINCT topic_id
+          FROM topics_inferred_concepts
+        SQL
+
+        query = query.where("topics.id NOT IN (#{topics_with_concepts})")
+
+        # Score and order topics by engagement (combination of views, likes, and posts)
+        query = query.select(
+          "topics.*,
+          (topics.like_count * 2 + topics.posts_count * 3 + topics.views * 0.1) AS engagement_score"
+        ).order("engagement_score DESC")
+
+        # Return limited number of topics
+        query.limit(limit)
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/inferred_concepts/manager.rb b/lib/inferred_concepts/manager.rb
new file mode 100644
index 00000000..28246891
--- /dev/null
+++ b/lib/inferred_concepts/manager.rb
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    class Manager
+      # Generate new concepts for a topic and apply them
+      # @param topic [Topic] A Topic instance
+      # @return [Array] The concepts that were applied
+      def self.analyze_topic(topic)
+        return [] if topic.blank?
+
+        Applier.analyze_and_apply(topic)
+      end
+
+      # Extract new concepts from arbitrary content
+      # @param content [String] The content to analyze
+      # @return [Array] The identified concept names
+      def self.identify_concepts(content)
+        Finder.identify_concepts(content)
+      end
+
+      # Identify and create concepts from content without applying them to any topic
+      # @param content [String] The content to analyze
+      # @return [Array] The created or found concepts
+      def self.generate_concepts_from_content(content)
+        return [] if content.blank?
+
+        # Identify concepts
+        concept_names = Finder.identify_concepts(content)
+        return [] if concept_names.blank?
+
+        # Create or find concepts in the database
+        Finder.create_or_find_concepts(concept_names)
+      end
+
+      # Generate concepts from a topic's content without applying them to the topic
+      # @param topic [Topic] A Topic instance
+      # @return [Array] The created or found concepts
+      def self.generate_concepts_from_topic(topic)
+        return [] if topic.blank?
+
+        # Get content to analyze
+        content = Applier.topic_content_for_analysis(topic)
+        return [] if content.blank?
+
+        # Generate concepts from the content
+        generate_concepts_from_content(content)
+      end
+
+      # Match a topic against existing concepts
+      # @param topic [Topic] A Topic instance
+      # @return [Array] The concepts that were applied
+      def self.match_topic_to_concepts(topic)
+        return [] if topic.blank?
+
+        Applier.match_existing_concepts(topic)
+      end
+
+      # Find topics that have a specific concept
+      # @param concept_name [String] The name of the concept to search for
+      # @return [Array] Topics that have the specified concept
+      def self.search_topics_by_concept(concept_name)
+        concept = ::InferredConcept.find_by(name: concept_name)
+        return [] unless concept
+        concept.topics
+      end
+
+      # Match arbitrary content against existing concepts
+      # @param content [String] The content to analyze
+      # @return [Array] Names of matching concepts
+      def self.match_content_to_concepts(content)
+        existing_concepts = InferredConcept.all.pluck(:name)
+        return [] if existing_concepts.empty?
+
+        Applier.match_concepts_to_content(content, existing_concepts)
+      end
+
+      # Find candidate topics that are good for concept generation
+      #
+      # @param opts [Hash] Options forwarded to the finder as keyword arguments (symbol keys)
+      # @option opts [Integer] :limit (100) Maximum number of topics to return
+      # @option opts [Integer] :min_posts (5) Minimum number of posts in topic
+      # @option opts [Integer] :min_likes (10) Minimum number of likes across all posts
+      # @option opts [Integer] :min_views (100) Minimum number of views
+      # @option opts [Array] :exclude_topic_ids ([]) Topic IDs to exclude
+      # @option opts [Array] :category_ids (nil) Only include topics from these categories
+      # @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time
+      # @return [Array] Array of Topic objects that are good candidates
+      def self.find_candidate_topics(opts = {})
+        Finder.find_candidate_topics(**opts)
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/personas/concept_finder.rb b/lib/personas/concept_finder.rb
new file mode 100644
index 00000000..2e0502d0
--- /dev/null
+++ b/lib/personas/concept_finder.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Personas
+    class ConceptFinder < Persona
+      def system_prompt
+        <<~PROMPT.strip
+          You are an advanced concept tagging system that identifies key concepts, themes, and topics from provided text.
+          Your job is to extract meaningful labels that can be used to categorize content.
+
+          Guidelines for generating concepts:
+          - Extract up to 7 concepts from the provided content
+          - Concepts should be single words or short phrases (1-3 words maximum)
+          - Focus on substantive topics, themes, technologies, methodologies, or domains
+          - Avoid overly general terms like "discussion" or "question"
+          - Ensure concepts are relevant to the core content
+          - Do not include proper nouns unless they represent key technologies or methodologies
+          - Maintain the original language of the text being analyzed
+
+          Format your response as a JSON object with a single key named "concepts", which has an array of concept strings as the value.
+          Your output should be in the following format:
+
+          {"concepts": ["concept1", "concept2", "concept3"]}
+
+
+          Where the concepts are replaced by the actual concepts you've identified.
+        PROMPT
+      end
+
+      def response_format
+        [{ key: "concepts", type: "array" }]
+      end
+    end
+  end
+end
diff --git a/lib/personas/concept_matcher.rb b/lib/personas/concept_matcher.rb
new file mode 100644
index 00000000..ce398bcb
--- /dev/null
+++ b/lib/personas/concept_matcher.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Personas
+    class ConceptMatcher < Persona
+      def system_prompt
+        <<~PROMPT.strip
+          You are an advanced concept matching system that determines which concepts from a provided list are relevant to a piece of content.
+          Your job is to analyze the content and determine which concepts from the list apply to it.
+
+          Guidelines for matching concepts:
+          - Only select concepts that are clearly relevant to the content
+          - The content must substantially discuss or relate to the concept
+          - Superficial mentions are not enough to consider a concept relevant
+          - Be precise and selective - don't match concepts that are only tangentially related
+          - Consider both explicit mentions and implicit discussions of concepts
+          - Maintain the original language of the text being analyzed
+          - IMPORTANT: Only select from the exact concepts in the provided list - do not add new concepts
+          - If no concepts from the list match the content, return an empty array
+
+          Format your response as a JSON object with a single key named "matching_concepts", which has an array of concept strings from the provided list.
+          Your output should be in the following format:
+
+          {"matching_concepts": ["concept1", "concept3", "concept5"]}
+
+
+          Only include concepts from the provided list that match the content. If no concepts match, return an empty array.
+        PROMPT
+      end
+
+      def response_format
+        [{ key: "matching_concepts", type: "array" }]
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/personas/persona.rb b/lib/personas/persona.rb
index 62426f77..ba3d3be6 100644
--- a/lib/personas/persona.rb
+++ b/lib/personas/persona.rb
@@ -52,6 +52,8 @@ module DiscourseAi
             ShortSummarizer => -12,
             Designer => -13,
             ForumResearcher => -14,
+            ConceptFinder => -15,
+            ConceptMatcher => -16,
           }
         end
 
diff --git a/lib/topic_extensions.rb b/lib/topic_extensions.rb
index 7ab36493..659a3392 100644
--- a/lib/topic_extensions.rb
+++ b/lib/topic_extensions.rb
@@ -11,6 +11,8 @@ module DiscourseAi
               -> { where(summary_type: AiSummary.summary_types[:gist]) },
               class_name: "AiSummary",
               as: :target
+
+      has_and_belongs_to_many :inferred_concepts, join_table: :topics_inferred_concepts
     end
   end
 end