FEATURE: add inferred concepts system

This commit adds a new inferred concepts system that:
- Creates a model for storing concept labels that can be applied to topics
- Provides AI personas for finding new concepts and matching existing ones
- Adds jobs for generating concepts from popular topics
- Includes a scheduled job that automatically processes engaging topics
This commit is contained in:
Rafael Silva 2025-05-08 18:26:37 -03:00
parent 925949de47
commit fb0d364687
15 changed files with 581 additions and 0 deletions

View File

@ -0,0 +1,47 @@
# frozen_string_literal: true
module Jobs
  # Background job that applies already-existing inferred concepts to topics.
  #
  # Every topic is handed to
  # DiscourseAi::InferredConcepts::Manager.match_topic_to_concepts. A failure
  # on one topic is logged and does not prevent the remaining topics from
  # being processed.
  class ApplyInferredConcepts < ::Jobs::Base
    sidekiq_options queue: "low"

    # Slice size used when :batch_size is missing or not a positive integer.
    DEFAULT_BATCH_SIZE = 100

    # Process a list of topics in slices.
    #
    # @param args [Hash] Contains job arguments
    # @option args [Array<Integer>] :topic_ids Required - List of topic IDs to process
    # @option args [Integer] :batch_size (100) Number of topics to load per slice
    # @return [void]
    def execute(args = {})
      topic_ids = Array(args[:topic_ids])
      return if topic_ids.empty?

      topic_ids.each_slice(batch_size_from(args)) do |batch_topic_ids|
        process_batch(batch_topic_ids)
      end
    end

    private

    # Coerces :batch_size to a positive Integer. `each_slice` raises
    # ArgumentError for zero or negative sizes, so anything unusable falls
    # back to DEFAULT_BATCH_SIZE.
    def batch_size_from(args)
      size = args[:batch_size].to_i
      size.positive? ? size : DEFAULT_BATCH_SIZE
    end

    # Loads one slice of topics and processes each of them, isolating errors
    # per topic.
    def process_batch(topic_ids)
      Topic.where(id: topic_ids).each do |topic|
        process_topic(topic)
      rescue => e
        Rails.logger.error("Error applying concepts to topic #{topic.id}: #{e.message}\n#{e.backtrace&.join("\n")}")
      end
    end

    # Matches the topic against existing concepts and applies them.
    def process_topic(topic)
      DiscourseAi::InferredConcepts::Manager.match_topic_to_concepts(topic)
    end
  end
end

View File

@ -0,0 +1,47 @@
# frozen_string_literal: true
module Jobs
  # Background job that generates new inferred concepts from topics without
  # applying them to those topics.
  #
  # Every topic is handed to
  # DiscourseAi::InferredConcepts::Manager.generate_concepts_from_topic. A
  # failure on one topic is logged and does not prevent the remaining topics
  # from being processed.
  class GenerateInferredConcepts < ::Jobs::Base
    sidekiq_options queue: "low"

    # Slice size used when :batch_size is missing or not a positive integer.
    DEFAULT_BATCH_SIZE = 100

    # Process a list of topics in slices.
    #
    # @param args [Hash] Contains job arguments
    # @option args [Array<Integer>] :topic_ids Required - List of topic IDs to process
    # @option args [Integer] :batch_size (100) Number of topics to load per slice
    # @return [void]
    def execute(args = {})
      topic_ids = Array(args[:topic_ids])
      return if topic_ids.empty?

      topic_ids.each_slice(batch_size_from(args)) do |batch_topic_ids|
        process_batch(batch_topic_ids)
      end
    end

    private

    # Coerces :batch_size to a positive Integer. `each_slice` raises
    # ArgumentError for zero or negative sizes, so anything unusable falls
    # back to DEFAULT_BATCH_SIZE.
    def batch_size_from(args)
      size = args[:batch_size].to_i
      size.positive? ? size : DEFAULT_BATCH_SIZE
    end

    # Loads one slice of topics and processes each of them, isolating errors
    # per topic.
    def process_batch(topic_ids)
      Topic.where(id: topic_ids).each do |topic|
        process_topic(topic)
      rescue => e
        Rails.logger.error("Error generating concepts from topic #{topic.id}: #{e.message}\n#{e.backtrace&.join("\n")}")
      end
    end

    # Identifies concepts in the topic and creates any that do not exist yet.
    def process_topic(topic)
      DiscourseAi::InferredConcepts::Manager.generate_concepts_from_topic(topic)
    end
  end
end

View File

@ -0,0 +1,38 @@
# frozen_string_literal: true
module Jobs
  # Daily scheduled job that picks popular topics without concepts, enqueues
  # concept generation for them, and enqueues a delayed job that applies
  # concepts to the same topics.
  class GenerateConceptsFromPopularTopics < ::Jobs::Scheduled
    every 1.day

    # Number of topics each enqueued job loads per slice.
    ENQUEUED_BATCH_SIZE = 10

    # @param args [Hash] Unused; present to satisfy the job interface
    # @return [void]
    def execute(args = {})
      # The feature is opt-in: do nothing (and spend no LLM calls) while the
      # master switch is off.
      return if !SiteSetting.inferred_concepts_enabled

      # Find candidate topics that are popular and don't have concepts yet
      candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
        limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
        min_posts: SiteSetting.inferred_concepts_min_posts || 5,
        min_likes: SiteSetting.inferred_concepts_min_likes || 10,
        min_views: SiteSetting.inferred_concepts_min_views || 100,
        created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
      )

      # Resolve the ids once; both enqueued jobs receive the same list.
      topic_ids = candidates&.map(&:id)
      return if topic_ids.blank?

      # Generate concepts from the candidate topics using the regular job
      Jobs.enqueue(
        :generate_inferred_concepts,
        topic_ids: topic_ids,
        batch_size: ENQUEUED_BATCH_SIZE
      )

      # Apply concepts to the same topics after a delay, so that generation
      # has had time to run first.
      Jobs.enqueue_in(
        1.hour,
        :apply_inferred_concepts,
        topic_ids: topic_ids,
        batch_size: ENQUEUED_BATCH_SIZE
      )
    end
  end
end

View File

@ -0,0 +1,21 @@
# frozen_string_literal: true
# A uniquely named concept label that can be associated with many topics.
class InferredConcept < ActiveRecord::Base
  # The join table is created as `topics_inferred_concepts`, which is not the
  # name Rails derives by default for this pair (`inferred_concepts_topics`),
  # so it has to be named explicitly.
  has_and_belongs_to_many :topics, join_table: :topics_inferred_concepts

  validates :name, presence: true, uniqueness: true
end
# == Schema Information
#
# Table name: inferred_concepts
#
# id :bigint not null, primary key
# name :string not null
# created_at :datetime not null
# updated_at :datetime not null
#
# Indexes
#
# index_inferred_concepts_on_name (name) UNIQUE
#

View File

@ -0,0 +1,5 @@
# frozen_string_literal: true
# Serializes an inferred concept, exposing its id, name and timestamps.
class InferredConceptSerializer < ApplicationSerializer
  attributes :id,
             :name,
             :created_at,
             :updated_at
end

View File

@ -401,3 +401,28 @@ discourse_ai:
allow_any: false
enum: "DiscourseAi::Configuration::LlmEnumerator"
validator: "DiscourseAi::Configuration::LlmValidator"
# --- Inferred concepts ---
# Master switch for the inferred concepts feature (exposed to the client).
inferred_concepts_enabled:
default: false
client: true
description: "Enable the inferred concepts system that automatically generates and applies concepts to topics"
# The settings below are server-side only and are read by the daily
# GenerateConceptsFromPopularTopics job when it selects candidate topics.
# Upper bound on how many candidate topics are selected per run.
inferred_concepts_daily_topics_limit:
default: 20
client: false
description: "Maximum number of topics to process each day for concept generation"
# Engagement thresholds a topic must meet to be selected.
inferred_concepts_min_posts:
default: 5
client: false
description: "Minimum number of posts a topic must have to be considered for concept generation"
inferred_concepts_min_likes:
default: 10
client: false
description: "Minimum number of likes a topic must have to be considered for concept generation"
inferred_concepts_min_views:
default: 100
client: false
description: "Minimum number of views a topic must have to be considered for concept generation"
# Age window, in days, applied to the topic creation date.
inferred_concepts_lookback_days:
default: 30
client: false
description: "Only consider topics created within this many days for concept generation"

View File

@ -0,0 +1,11 @@
# frozen_string_literal: true
# Creates the `inferred_concepts` table with a required, unique `name`.
class CreateInferredConceptsTable < ActiveRecord::Migration[7.2]
  def change
    create_table :inferred_concepts do |table|
      table.string :name, null: false
      table.timestamps

      # Declared with the table; yields the default index name
      # index_inferred_concepts_on_name.
      table.index :name, unique: true
    end
  end
end

View File

@ -0,0 +1,15 @@
# frozen_string_literal: true
# Creates the join table linking topics to inferred concepts.
class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.2]
  def change
    create_table :topics_inferred_concepts do |t|
      # NOTE(review): kept as integer on the assumption that topics.id is an
      # integer column - confirm against the core schema.
      t.integer :topic_id, null: false
      # inferred_concepts.id is a bigint primary key, so the referencing
      # column must be bigint as well.
      t.bigint :inferred_concept_id, null: false
      t.timestamps
    end

    # A concept can be attached to a topic only once. This composite index
    # also serves lookups by topic_id alone (its leading column), so no
    # separate topic_id index is created.
    add_index :topics_inferred_concepts,
              %i[topic_id inferred_concept_id],
              unique: true,
              name: "idx_unique_topic_inferred_concept"

    # Supports the reverse lookup: all topics for a given concept.
    add_index :topics_inferred_concepts, :inferred_concept_id
  end
end

View File

@ -0,0 +1,112 @@
# frozen_string_literal: true
module DiscourseAi
  module InferredConcepts
    # Associates inferred concepts with topics and builds the text that is
    # sent to the LLM for analysis.
    class Applier
      # Number of posts (lowest post_number first) included in the analysed text.
      MAX_POSTS_FOR_ANALYSIS = 10

      # Author label used for posts that have no associated user.
      UNKNOWN_AUTHOR = "unknown"

      # Associates the provided concepts with a topic.
      #
      # All rows are written with a single INSERT; pairs that already exist
      # are skipped by `ON CONFLICT ... DO NOTHING`. Concepts without an id
      # (not persisted) are ignored.
      #
      # @param topic [Topic] a Topic instance
      # @param concepts [Enumerable<InferredConcept>] concepts to attach
      # @return [Enumerable<InferredConcept>, nil] the given concepts, or nil
      #   when topic or concepts is blank
      def self.apply_to_topic(topic, concepts)
        return if topic.blank? || concepts.blank?

        # Integer() guarantees only numeric literals are interpolated below.
        topic_id = Integer(topic.id)
        rows =
          concepts
            .map(&:id)
            .compact
            .uniq
            .map { |concept_id| "(#{topic_id}, #{Integer(concept_id)}, NOW(), NOW())" }
        return concepts if rows.empty?

        ActiveRecord::Base.connection.execute(<<~SQL)
          INSERT INTO topics_inferred_concepts (topic_id, inferred_concept_id, created_at, updated_at)
          VALUES #{rows.join(", ")}
          ON CONFLICT (topic_id, inferred_concept_id) DO NOTHING
        SQL

        concepts
      end

      # Extracts content from a topic for concept analysis.
      #
      # @param topic [Topic] a Topic instance
      # @return [String] the topic title followed by its first posts, or ""
      #   when topic is blank
      def self.topic_content_for_analysis(topic)
        return "" if topic.blank?

        # Users are preloaded to avoid one query per post.
        posts =
          Post
            .where(topic_id: topic.id)
            .includes(:user)
            .order(:post_number)
            .limit(MAX_POSTS_FOR_ANALYSIS)

        post_lines =
          posts.map do |post|
            # A post may have no user; fall back to a placeholder instead of
            # raising NoMethodError.
            author = post.user&.username || UNKNOWN_AUTHOR
            "#{post.post_number}) #{author}: #{post.raw}"
          end

        "Title: #{topic.title}\n\n#{post_lines.join("\n\n")}"
      end

      # Identifies concepts in a topic, creates any that are missing, and
      # applies them to the topic.
      #
      # @param topic [Topic] a Topic instance
      # @return [Array<InferredConcept>, nil] the concepts found or created,
      #   or nil when topic is blank
      def self.analyze_and_apply(topic)
        return if topic.blank?

        content = topic_content_for_analysis(topic)
        concept_names = Finder.identify_concepts(content)
        concepts = Finder.create_or_find_concepts(concept_names)
        apply_to_topic(topic, concepts)
        concepts
      end

      # Matches a topic against the concepts that already exist and applies
      # the matching ones to it.
      #
      # @param topic [Topic] a Topic instance
      # @return [Enumerable<InferredConcept>] the matched concepts ([] when
      #   the topic is blank or no concepts exist)
      def self.match_existing_concepts(topic)
        return [] if topic.blank?

        content = topic_content_for_analysis(topic)

        existing_concepts = ::InferredConcept.pluck(:name)
        return [] if existing_concepts.empty?

        matched_concept_names = match_concepts_to_content(content, existing_concepts)
        matched_concepts = ::InferredConcept.where(name: matched_concept_names)

        apply_to_topic(topic, matched_concepts)
        matched_concepts
      end

      # Uses the ConceptMatcher persona to select which of the provided
      # concepts apply to the content.
      #
      # @param content [String] text to analyse
      # @param concept_list [Array<String>] concept names the model may choose from
      # @return [Array<String>] the chosen names, restricted to concept_list
      def self.match_concepts_to_content(content, concept_list)
        return [] if content.blank? || concept_list.blank?

        user_message = <<~MESSAGE
          Content to analyze:
          #{content}
          Available concepts to match:
          #{concept_list.join(", ")}
        MESSAGE

        # NOTE(review): `Llm.default_llm`, `completion(..., extract_json:)`,
        # `success?` and `parsed_output` are used as-is; confirm they match
        # the completions API available in this plugin version.
        llm = DiscourseAi::Completions::Llm.default_llm
        persona = DiscourseAi::Personas::ConceptMatcher.new
        context = DiscourseAi::Personas::BotContext.new(
          messages: [{ type: :user, content: user_message }],
          user: Discourse.system_user
        )
        prompt = persona.craft_prompt(context)
        response = llm.completion(prompt, extract_json: true)
        return [] unless response.success?

        # Drop anything the model returned that was not in the offered list.
        Array(response.parsed_output["matching_concepts"]) & concept_list
      end
    end
  end
end

View File

@ -0,0 +1,91 @@
# frozen_string_literal: true
module DiscourseAi
  module InferredConcepts
    # Identifies concepts in content, persists them, and finds topics that are
    # good candidates for concept generation.
    class Finder
      # Identifies potential concepts from provided content using the
      # ConceptFinder persona.
      #
      # @param content [String] text to analyse
      # @return [Array<String>] concept names ([] when content is blank or the
      #   completion did not succeed)
      def self.identify_concepts(content)
        return [] if content.blank?

        # NOTE(review): `Llm.default_llm`, `completion(..., extract_json:)`,
        # `success?` and `parsed_output` are used as-is; confirm they match
        # the completions API available in this plugin version.
        llm = DiscourseAi::Completions::Llm.default_llm
        persona = DiscourseAi::Personas::ConceptFinder.new
        context = DiscourseAi::Personas::BotContext.new(
          messages: [{ type: :user, content: content }],
          user: Discourse.system_user
        )
        prompt = persona.craft_prompt(context)
        response = llm.completion(prompt, extract_json: true)
        return [] unless response.success?

        response.parsed_output["concepts"] || []
      end

      # Creates or finds concepts in the database from provided names.
      #
      # Names are stripped, blank names are dropped and duplicates are
      # collapsed before hitting the database. Records that could not be
      # saved are left out of the result, so every returned concept has an id.
      #
      # @param concept_names [Array<String>] names to look up or create
      # @return [Array<InferredConcept>] persisted concepts, one per distinct name
      def self.create_or_find_concepts(concept_names)
        return [] if concept_names.blank?

        names = Array(concept_names).map { |name| name.to_s.strip }.reject(&:empty?).uniq

        names.filter_map do |name|
          concept = find_or_create_concept(name)
          concept if concept&.persisted?
        end
      end

      # Finds or creates a single concept by name. If another process inserts
      # the same name between the lookup and the insert, the unique index
      # raises RecordNotUnique; in that case the existing row is returned.
      def self.find_or_create_concept(name)
        ::InferredConcept.find_or_create_by(name: name)
      rescue ActiveRecord::RecordNotUnique
        ::InferredConcept.find_by(name: name)
      end
      private_class_method :find_or_create_concept

      # Finds candidate topics to use for concept generation.
      #
      # @param limit [Integer] Maximum number of topics to return
      # @param min_posts [Integer] Minimum number of posts in topic
      # @param min_likes [Integer] Minimum number of likes across all posts
      # @param min_views [Integer] Minimum number of views
      # @param exclude_topic_ids [Array<Integer>] Topic IDs to exclude
      # @param category_ids [Array<Integer>] Only include topics from these categories (optional)
      # @param created_after [DateTime] Only include topics created after this time (optional)
      # @return [ActiveRecord::Relation<Topic>] Topics ordered by engagement
      #   score, highest first; each row also carries `engagement_score`
      def self.find_candidate_topics(
        limit: 100,
        min_posts: 5,
        min_likes: 10,
        min_views: 100,
        exclude_topic_ids: [],
        category_ids: nil,
        created_after: 30.days.ago
      )
        query = Topic.where(
          "topics.posts_count >= ? AND topics.views >= ? AND topics.like_count >= ?",
          min_posts,
          min_views,
          min_likes
        )

        # Optional filters
        query = query.where("topics.id NOT IN (?)", exclude_topic_ids) if exclude_topic_ids.present?
        query = query.where("topics.category_id IN (?)", category_ids) if category_ids.present?
        query = query.where("topics.created_at >= ?", created_after) if created_after.present?

        # Regular topics only (no private messages).
        # NOTE(review): uses core's `Archetype.default` because `Topic` does
        # not appear to define `public_archetype` - confirm against core.
        query = query.where(archetype: Archetype.default)

        # Exclude topics that already have concepts
        topics_with_concepts = <<~SQL
          SELECT DISTINCT topic_id
          FROM topics_inferred_concepts
        SQL
        query = query.where("topics.id NOT IN (#{topics_with_concepts})")

        # Score and order topics by engagement (combination of views, likes, and posts)
        query = query.select(
          "topics.*,
          (topics.like_count * 2 + topics.posts_count * 3 + topics.views * 0.1) AS engagement_score"
        ).order("engagement_score DESC")

        query.limit(limit)
      end
    end
  end
end

View File

@ -0,0 +1,94 @@
# frozen_string_literal: true
module DiscourseAi
  module InferredConcepts
    # Entry point for the inferred concepts feature. Delegates to Finder
    # (identification, persistence, candidate selection) and Applier
    # (content extraction, matching, association).
    class Manager
      # Generate new concepts for a topic and apply them
      # @param topic [Topic] A Topic instance
      # @return [Array<InferredConcept>] The concepts that were applied
      def self.analyze_topic(topic)
        return [] if topic.blank?

        Applier.analyze_and_apply(topic)
      end

      # Extract new concepts from arbitrary content
      # @param content [String] The content to analyze
      # @return [Array<String>] The identified concept names
      def self.identify_concepts(content)
        Finder.identify_concepts(content)
      end

      # Identify and create concepts from content without applying them to any topic
      # @param content [String] The content to analyze
      # @return [Array<InferredConcept>] The created or found concepts
      def self.generate_concepts_from_content(content)
        return [] if content.blank?

        concept_names = Finder.identify_concepts(content)
        return [] if concept_names.blank?

        Finder.create_or_find_concepts(concept_names)
      end

      # Generate concepts from a topic's content without applying them to the topic
      # @param topic [Topic] A Topic instance
      # @return [Array<InferredConcept>] The created or found concepts
      def self.generate_concepts_from_topic(topic)
        return [] if topic.blank?

        content = Applier.topic_content_for_analysis(topic)
        return [] if content.blank?

        generate_concepts_from_content(content)
      end

      # Match a topic against existing concepts
      # @param topic [Topic] A Topic instance
      # @return [Array<InferredConcept>] The concepts that were applied
      def self.match_topic_to_concepts(topic)
        return [] if topic.blank?

        Applier.match_existing_concepts(topic)
      end

      # Find topics that have a specific concept
      # @param concept_name [String] The name of the concept to search for
      # @return [Array<Topic>] Topics that have the specified concept
      def self.search_topics_by_concept(concept_name)
        concept = ::InferredConcept.find_by(name: concept_name)
        return [] unless concept

        concept.topics
      end

      # Match arbitrary content against existing concepts
      # @param content [String] The content to analyze
      # @return [Array<String>] Names of matching concepts
      def self.match_content_to_concepts(content)
        # Nothing to match: skip the database read entirely.
        return [] if content.blank?

        existing_concepts = ::InferredConcept.pluck(:name)
        return [] if existing_concepts.empty?

        Applier.match_concepts_to_content(content, existing_concepts)
      end

      # Find candidate topics that are good for concept generation
      #
      # @param opts [Hash] Options to pass to the finder
      # @option opts [Integer] :limit (100) Maximum number of topics to return
      # @option opts [Integer] :min_posts (5) Minimum number of posts in topic
      # @option opts [Integer] :min_likes (10) Minimum number of likes across all posts
      # @option opts [Integer] :min_views (100) Minimum number of views
      # @option opts [Array<Integer>] :exclude_topic_ids ([]) Topic IDs to exclude
      # @option opts [Array<Integer>] :category_ids (nil) Only include topics from these categories
      # @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time
      # @return [Array<Topic>] Array of Topic objects that are good candidates
      def self.find_candidate_topics(opts = {})
        # The finder accepts keyword arguments only. On Ruby 3 a positional
        # Hash is no longer converted to keywords, so the options must be
        # splatted; keys are symbolized so string-keyed hashes work too.
        Finder.find_candidate_topics(**opts.to_h.transform_keys(&:to_sym))
      end
    end
  end
end

View File

@ -0,0 +1,35 @@
# frozen_string_literal: true
module DiscourseAi
module Personas
# Persona that asks the model to extract concept labels from a piece of text.
# Used by DiscourseAi::InferredConcepts::Finder.identify_concepts.
class ConceptFinder < Persona
# System prompt sent to the model. It asks for up to 7 short concepts and a
# JSON object with a single "concepts" key holding an array of strings.
#
# @return [String] the prompt text with surrounding whitespace stripped
def system_prompt
<<~PROMPT.strip
You are an advanced concept tagging system that identifies key concepts, themes, and topics from provided text.
Your job is to extract meaningful labels that can be used to categorize content.
Guidelines for generating concepts:
- Extract up to 7 concepts from the provided content
- Concepts should be single words or short phrases (1-3 words maximum)
- Focus on substantive topics, themes, technologies, methodologies, or domains
- Avoid overly general terms like "discussion" or "question"
- Ensure concepts are relevant to the core content
- Do not include proper nouns unless they represent key technologies or methodologies
- Maintain the original language of the text being analyzed
Format your response as a JSON object with a single key named "concepts", which has an array of concept strings as the value.
Your output should be in the following format:
<o>
{"concepts": ["concept1", "concept2", "concept3"]}
</o>
Where the concepts are replaced by the actual concepts you've identified.
PROMPT
end
# Declares the structured output: one "concepts" key of type array, the same
# key the system prompt asks the model to produce.
#
# @return [Array<Hash>] response format descriptor
def response_format
[{ key: "concepts", type: "array" }]
end
end
end
end

View File

@ -0,0 +1,36 @@
# frozen_string_literal: true
module DiscourseAi
module Personas
# Persona that asks the model which concepts from a supplied list apply to a
# piece of content. Used by
# DiscourseAi::InferredConcepts::Applier.match_concepts_to_content.
class ConceptMatcher < Persona
# System prompt sent to the model. It restricts the answer to concepts from
# the provided list and asks for a JSON object with a single
# "matching_concepts" key holding an array of strings (empty when nothing
# matches).
#
# @return [String] the prompt text with surrounding whitespace stripped
def system_prompt
<<~PROMPT.strip
You are an advanced concept matching system that determines which concepts from a provided list are relevant to a piece of content.
Your job is to analyze the content and determine which concepts from the list apply to it.
Guidelines for matching concepts:
- Only select concepts that are clearly relevant to the content
- The content must substantially discuss or relate to the concept
- Superficial mentions are not enough to consider a concept relevant
- Be precise and selective - don't match concepts that are only tangentially related
- Consider both explicit mentions and implicit discussions of concepts
- Maintain the original language of the text being analyzed
- IMPORTANT: Only select from the exact concepts in the provided list - do not add new concepts
- If no concepts from the list match the content, return an empty array
Format your response as a JSON object with a single key named "matching_concepts", which has an array of concept strings from the provided list.
Your output should be in the following format:
<o>
{"matching_concepts": ["concept1", "concept3", "concept5"]}
</o>
Only include concepts from the provided list that match the content. If no concepts match, return an empty array.
PROMPT
end
# Declares the structured output: one "matching_concepts" key of type array,
# the same key the system prompt asks the model to produce.
#
# @return [Array<Hash>] response format descriptor
def response_format
[{ key: "matching_concepts", type: "array" }]
end
end
end
end

View File

@ -52,6 +52,8 @@ module DiscourseAi
ShortSummarizer => -12,
Designer => -13,
ForumResearcher => -14,
ConceptFinder => -15,
ConceptMatcher => -16,
}
end

View File

@ -11,6 +11,8 @@ module DiscourseAi
-> { where(summary_type: AiSummary.summary_types[:gist]) },
class_name: "AiSummary",
as: :target
# The join table is named `topics_inferred_concepts`, which differs from the
# name Rails derives by default, so it must be given explicitly.
# NOTE(review): assumes the enclosing model is Topic - confirm.
has_and_belongs_to_many :inferred_concepts, join_table: :topics_inferred_concepts
end
end
end