Dedup concepts

This commit is contained in:
Rafael Silva 2025-05-15 16:38:45 -03:00
parent 1c4806beb6
commit 0664ec512b
8 changed files with 218 additions and 70 deletions

View File

@ -6,76 +6,80 @@ module Jobs
# This job runs daily and generates new concepts from popular topics and posts
# It selects items based on engagement metrics and generates concepts from their content
def execute(args = {})
def execute(_args)
return unless SiteSetting.inferred_concepts_enabled
process_popular_topics
process_popular_posts
end
private
def process_popular_topics
private
def process_popular_topics
# Find candidate topics that are popular and don't have concepts yet
candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
min_posts: SiteSetting.inferred_concepts_min_posts || 5,
min_likes: SiteSetting.inferred_concepts_min_likes || 10,
min_views: SiteSetting.inferred_concepts_min_views || 100,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
)
candidates =
DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
min_posts: SiteSetting.inferred_concepts_min_posts || 5,
min_likes: SiteSetting.inferred_concepts_min_likes || 10,
min_views: SiteSetting.inferred_concepts_min_views || 100,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago,
)
return if candidates.blank?
# Process candidate topics - first generate concepts, then match
Jobs.enqueue(
:generate_inferred_concepts,
item_type: 'topics',
item_ids: candidates.map(&:id),
batch_size: 10
)
# Schedule a follow-up job to match existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: 'topics',
item_type: "topics",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true
)
end
def process_popular_posts
if SiteSetting.inferred_concepts_background_match
# Schedule a follow-up job to match existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: "topics",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true,
)
end
end
def process_popular_posts
# Find candidate posts that are popular and don't have concepts yet
candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_posts(
limit: SiteSetting.inferred_concepts_daily_posts_limit || 30,
min_likes: SiteSetting.inferred_concepts_post_min_likes || 5,
exclude_first_posts: true,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
)
candidates =
DiscourseAi::InferredConcepts::Manager.find_candidate_posts(
limit: SiteSetting.inferred_concepts_daily_posts_limit || 30,
min_likes: SiteSetting.inferred_concepts_post_min_likes || 5,
exclude_first_posts: true,
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago,
)
return if candidates.blank?
# Process candidate posts - first generate concepts, then match
Jobs.enqueue(
:generate_inferred_concepts,
item_type: 'posts',
item_ids: candidates.map(&:id),
batch_size: 10
)
# Schedule a follow-up job to match against existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: 'posts',
item_type: "posts",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true
)
if SiteSetting.inferred_concepts_background_match
# Schedule a follow-up job to match against existing concepts
Jobs.enqueue_in(
1.hour,
:generate_inferred_concepts,
item_type: "posts",
item_ids: candidates.map(&:id),
batch_size: 10,
match_only: true,
)
end
end
end
end
end

View File

@ -405,32 +405,39 @@ discourse_ai:
inferred_concepts_enabled:
default: false
client: true
description: "Enable the inferred concepts system that automatically generates and applies concepts to topics"
inferred_concepts_background_match:
default: false
client: false
inferred_concepts_daily_topics_limit:
default: 20
client: false
description: "Maximum number of topics to process each day for concept generation"
inferred_concepts_min_posts:
default: 5
client: false
description: "Minimum number of posts a topic must have to be considered for concept generation"
inferred_concepts_min_likes:
default: 10
client: false
description: "Minimum number of likes a topic must have to be considered for concept generation"
inferred_concepts_min_views:
default: 100
client: false
description: "Minimum number of views a topic must have to be considered for concept generation"
inferred_concepts_lookback_days:
default: 30
client: false
description: "Only consider topics created within this many days for concept generation"
inferred_concepts_daily_posts_limit:
default: 30
client: false
description: "Maximum number of posts to process each day for concept generation"
inferred_concepts_post_min_likes:
default: 5
client: false
description: "Minimum number of likes a post must have to be considered for concept generation"
inferred_concepts_generate_persona:
default: "-15"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"
inferred_concepts_match_persona:
default: "-16"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"
inferred_concepts_deduplicate_persona:
default: "-17"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"

View File

@ -2,14 +2,12 @@
class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.0]
def change
create_table :topics_inferred_concepts do |t|
create_table :topics_inferred_concepts, primary_key: %i[topic_id inferred_concept_id] do |t|
t.integer :topic_id, null: false
t.integer :inferred_concept_id, null: false
t.timestamps
end
add_index :topics_inferred_concepts, [:topic_id, :inferred_concept_id], unique: true, name: 'idx_unique_topic_inferred_concept'
add_index :topics_inferred_concepts, :topic_id
add_index :topics_inferred_concepts, :inferred_concept_id
end
end
end

View File

@ -2,14 +2,12 @@
class CreatePostsInferredConcepts < ActiveRecord::Migration[7.0]
def change
create_table :posts_inferred_concepts do |t|
create_table :posts_inferred_concepts, primary_key: %i[post_id inferred_concept_id] do |t|
t.integer :post_id, null: false
t.integer :inferred_concept_id, null: false
t.timestamps
end
add_index :posts_inferred_concepts, [:post_id, :inferred_concept_id], unique: true, name: 'idx_unique_post_inferred_concept'
add_index :posts_inferred_concepts, :post_id
add_index :posts_inferred_concepts, :inferred_concept_id
end
end
end

View File

@ -9,8 +9,13 @@ module DiscourseAi
return [] if content.blank?
# Use the ConceptFinder persona to identify concepts
llm = DiscourseAi::Completions::Llm.default_llm
persona = DiscourseAi::Personas::ConceptFinder.new
persona =
AiPersona
.all_personas(enabled_only: false)
.find { |persona| persona.id == SiteSetting.inferred_concepts_generate_persona.to_i }
.new
llm = LlmModel.find(persona.class.default_llm_id)
context =
DiscourseAi::Personas::BotContext.new(
messages: [{ type: :user, content: content }],
@ -18,12 +23,11 @@ module DiscourseAi
inferred_concepts: DiscourseAi::InferredConcepts::Manager.list_concepts,
)
prompt = persona.craft_prompt(context)
response = llm.completion(prompt, extract_json: true)
bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
return [] unless response.success?
response = bot.reply(context)
concepts = response.parsed_output["concepts"]
concepts = JSON.parse(response[0][0]).dig("concepts")
concepts || []
end
@ -68,7 +72,7 @@ module DiscourseAi
query = query.where("topics.created_at >= ?", created_after) if created_after.present?
# Exclude PM topics (if they exist in Discourse)
query = query.where(archetype: Topic.public_archetype)
query = query.where(archetype: Archetype.default)
# Exclude topics that already have concepts
topics_with_concepts = <<~SQL
@ -134,6 +138,34 @@ module DiscourseAi
# Return limited number of posts
query.limit(limit)
end
# Deduplicate and standardize a list of concepts using the configured
# ConceptDeduplicator persona.
#
# @param concept_names [Array<String>] List of concept names to deduplicate
# @return [Array<String>] Streamlined list of concept names; empty when the
#   input is blank or the persona returns no tags
def self.deduplicate_concepts(concept_names)
  # Return an Array here: callers (e.g. Manager.deduplicate_concepts_by_letter)
  # concat the result, so the previous Hash early-return would raise TypeError.
  return [] if concept_names.blank?

  # Use the ConceptDeduplicator persona to deduplicate concepts
  persona =
    AiPersona
      .all_personas(enabled_only: false)
      .find { |persona| persona.id == SiteSetting.inferred_concepts_deduplicate_persona.to_i }
      .new
  llm = LlmModel.find(persona.class.default_llm_id)

  # Create the input for the deduplicator
  input = { type: :user, content: concept_names.join(", ") }

  context =
    DiscourseAi::Personas::BotContext.new(messages: [input], user: Discourse.system_user)

  bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
  response = bot.reply(context)

  # bot.reply returns nested arrays with the raw JSON payload first; guard the
  # dig so a missing key yields [] (mirrors Finder.identify_concepts above).
  concepts = JSON.parse(response[0][0]).dig("streamlined_tags")
  concepts || []
end
end
end
end

View File

@ -14,6 +14,65 @@ module DiscourseAi
query.pluck(:name)
end
# Deduplicate concepts in batches by letter
# This method will:
# 1. Group concepts by first letter
# 2. Process each letter group separately through the deduplicator
# 3. Do a final pass with all deduplicated concepts
#
# @param per_letter_batch [Integer] Max concepts sent per letter-group batch
# @param full_pass_batch [Integer] Max concepts per batch in the final pass
# @return [void]
def self.deduplicate_concepts_by_letter(per_letter_batch: 50, full_pass_batch: 150)
  # Get all concepts
  all_concepts = list_concepts
  return if all_concepts.empty?

  letter_groups = Hash.new { |h, k| h[k] = [] }

  # Group concepts by first letter; non-alphabetic or empty names go in "#"
  all_concepts.each do |concept|
    first_char = concept[0]&.upcase
    if first_char&.match?(/[A-Z]/)
      letter_groups[first_char] << concept
    else
      letter_groups["#"] << concept
    end
  end

  # Process each letter group in bounded batches (letter key itself is unused)
  letter_deduplicated_concepts = []
  letter_groups.each_value do |concepts|
    concepts.each_slice(per_letter_batch) do |batch|
      # Array() guards against a nil/blank result from the LLM-backed call
      letter_deduplicated_concepts.concat(Array(Finder.deduplicate_concepts(batch)))
    end
  end

  return if letter_deduplicated_concepts.blank?

  # Final pass with all deduplicated concepts
  final_result = []
  letter_deduplicated_concepts.each_slice(full_pass_batch) do |batch|
    final_result.concat(Array(Finder.deduplicate_concepts(batch)))
  end

  # Remove duplicates; bail out rather than wiping data when the LLM passes
  # produced nothing (insert_all raises on an empty row list).
  final_result.uniq!
  return if final_result.empty?

  # Replace the stored concepts atomically so a failed insert cannot leave the
  # table empty.
  InferredConcept.transaction do
    InferredConcept.destroy_all
    InferredConcept.insert_all(final_result.map { { name: it } })
  end
end
# Generate new concepts for a topic and apply them
# @param topic [Topic] A Topic instance
# @return [Array<InferredConcept>] The concepts that were applied
@ -139,7 +198,7 @@ module DiscourseAi
# @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time
# @return [Array<Topic>] Array of Topic objects that are good candidates
def self.find_candidate_topics(opts = {})
Finder.find_candidate_topics(opts)
Finder.find_candidate_topics(**opts)
end
# Find candidate posts that are good for concept generation

View File

@ -0,0 +1,49 @@
# frozen_string_literal: true
module DiscourseAi
module Personas
# Persona that merges a machine-generated tag/concept list into a smaller,
# streamlined set. Selected via the `inferred_concepts_deduplicate_persona`
# site setting (default id -17) and invoked from
# InferredConcepts::Finder.deduplicate_concepts.
class ConceptDeduplicator < Persona
# System prompt instructing the LLM to merge/trim the supplied tag list.
# The model is expected to respond with a JSON object containing a
# "streamlined_tags" array (see response_format below).
def system_prompt
<<~PROMPT.strip
You will be given a list of machine-generated tags.
Your task is to streamline this list by merging entries who are similar or related.
Please follow these steps to create a streamlined list of tags:
1. Review the entire list of tags carefully.
2. Identify and remove any exact duplicates.
3. Look for tags that are too specific or niche, and consider removing them or replacing them with more general terms.
4. If there are multiple tags that convey similar concepts, choose the best one and remove the others, or add a new one that covers the missing aspect.
5. Ensure that the remaining tags are relevant and useful for describing the content.
When deciding which tags are "best", consider the following criteria:
- Relevance: How well does the tag describe the core content or theme?
- Generality: Is the tag specific enough to be useful, but not so specific that it's unlikely to be searched for?
- Clarity: Is the tag easy to understand and free from ambiguity?
- Popularity: Would this tag likely be used by people searching for this type of content?
Example Input:
AI Bias, AI Bots, AI Ethics, AI Helper, AI Integration, AI Moderation, AI Search, AI-Driven Moderation, AI-Generated Post Illustrations, AJAX Events, AJAX Requests, AMA Events, API, API Access, API Authentication, API Automation, API Call, API Changes, API Compliance, API Configuration, API Costs, API Documentation, API Endpoint, API Endpoints, API Functions, API Integration, API Key, API Keys, API Limitation, API Limitations, API Permissions, API Rate Limiting, API Request, API Request Optimization, API Requests, API Security, API Suspension, API Token, API Tokens, API Translation, API Versioning, API configuration, API endpoint, API key, APIs, APK, APT Package Manager, ARIA, ARIA Tags, ARM Architecture, ARM-based, AWS, AWS Lightsail, AWS RDS, AWS S3, AWS Translate, AWS costs, AWS t2.micro, Abbreviation Expansion, Abbreviations
Example Output:
AI, AJAX, API, APK, APT Package Manager, ARIA, ARM Architecture, AWS, Abbreviations
Please provide your streamlined list of tags within <streamlined_tags> key.
Remember, the goal is to create a more focused and effective set of tags while maintaining the essence of the original list.
Your output should be in the following format:
<o>
{
"streamlined_tags": ["tag1", "tag3"]
}
</o>
PROMPT
end
# Structured-output schema: a single "streamlined_tags" array, matching the
# key parsed by Finder.deduplicate_concepts.
def response_format
[{ key: "streamlined_tags", type: "array" }]
end
end
end
end

View File

@ -54,6 +54,7 @@ module DiscourseAi
ForumResearcher => -14,
ConceptFinder => -15,
ConceptMatcher => -16,
ConceptDeduplicator => -17,
}
end