diff --git a/app/jobs/scheduled/generate_concepts_from_popular_items.rb b/app/jobs/scheduled/generate_concepts_from_popular_items.rb index a9a03493..74f29484 100644 --- a/app/jobs/scheduled/generate_concepts_from_popular_items.rb +++ b/app/jobs/scheduled/generate_concepts_from_popular_items.rb @@ -6,76 +6,80 @@ module Jobs # This job runs daily and generates new concepts from popular topics and posts # It selects items based on engagement metrics and generates concepts from their content - def execute(args = {}) + def execute(_args) return unless SiteSetting.inferred_concepts_enabled process_popular_topics process_popular_posts end - - private - - def process_popular_topics + private + + def process_popular_topics # Find candidate topics that are popular and don't have concepts yet - candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics( - limit: SiteSetting.inferred_concepts_daily_topics_limit || 20, - min_posts: SiteSetting.inferred_concepts_min_posts || 5, - min_likes: SiteSetting.inferred_concepts_min_likes || 10, - min_views: SiteSetting.inferred_concepts_min_views || 100, - created_after: SiteSetting.inferred_concepts_lookback_days.days.ago - ) + candidates = + DiscourseAi::InferredConcepts::Manager.find_candidate_topics( + limit: SiteSetting.inferred_concepts_daily_topics_limit || 20, + min_posts: SiteSetting.inferred_concepts_min_posts || 5, + min_likes: SiteSetting.inferred_concepts_min_likes || 10, + min_views: SiteSetting.inferred_concepts_min_views || 100, + created_after: SiteSetting.inferred_concepts_lookback_days.days.ago, + ) return if candidates.blank? - + # Process candidate topics - first generate concepts, then match Jobs.enqueue( :generate_inferred_concepts, - item_type: 'topics', - item_ids: candidates.map(&:id), - batch_size: 10 - ) - - # Schedule a follow-up job to match existing concepts - Jobs.enqueue_in( - 1.hour, - :generate_inferred_concepts, - item_type: 'topics', + item_type: "topics", item_ids: candidates.map(&:id), batch_size: 10, - match_only: true ) - end - - def process_popular_posts + if SiteSetting.inferred_concepts_background_match + # Schedule a follow-up job to match existing concepts + Jobs.enqueue_in( + 1.hour, + :generate_inferred_concepts, + item_type: "topics", + item_ids: candidates.map(&:id), + batch_size: 10, + match_only: true, + ) + end + end + + def process_popular_posts # Find candidate posts that are popular and don't have concepts yet - candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_posts( - limit: SiteSetting.inferred_concepts_daily_posts_limit || 30, - min_likes: SiteSetting.inferred_concepts_post_min_likes || 5, - exclude_first_posts: true, - created_after: SiteSetting.inferred_concepts_lookback_days.days.ago - ) + candidates = + DiscourseAi::InferredConcepts::Manager.find_candidate_posts( + limit: SiteSetting.inferred_concepts_daily_posts_limit || 30, + min_likes: SiteSetting.inferred_concepts_post_min_likes || 5, + exclude_first_posts: true, + created_after: SiteSetting.inferred_concepts_lookback_days.days.ago, + ) return if candidates.blank? - + # Process candidate posts - first generate concepts, then match Jobs.enqueue( :generate_inferred_concepts, - item_type: 'posts', - item_ids: candidates.map(&:id), - batch_size: 10 - ) - - # Schedule a follow-up job to match against existing concepts - Jobs.enqueue_in( - 1.hour, - :generate_inferred_concepts, - item_type: 'posts', + item_type: "posts", item_ids: candidates.map(&:id), batch_size: 10, - match_only: true ) + + if SiteSetting.inferred_concepts_background_match + # Schedule a follow-up job to match against existing concepts + Jobs.enqueue_in( + 1.hour, + :generate_inferred_concepts, + item_type: "posts", + item_ids: candidates.map(&:id), + batch_size: 10, + match_only: true, + ) + end end end -end \ No newline at end of file +end diff --git a/config/settings.yml b/config/settings.yml index 92371470..4c10e45d 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -405,32 +405,39 @@ discourse_ai: inferred_concepts_enabled: default: false client: true - description: "Enable the inferred concepts system that automatically generates and applies concepts to topics" + inferred_concepts_background_match: + default: false + client: false inferred_concepts_daily_topics_limit: default: 20 client: false - description: "Maximum number of topics to process each day for concept generation" inferred_concepts_min_posts: default: 5 client: false - description: "Minimum number of posts a topic must have to be considered for concept generation" inferred_concepts_min_likes: default: 10 client: false - description: "Minimum number of likes a topic must have to be considered for concept generation" inferred_concepts_min_views: default: 100 client: false - description: "Minimum number of views a topic must have to be considered for concept generation" inferred_concepts_lookback_days: default: 30 client: false - description: "Only consider topics created within this many days for concept generation" inferred_concepts_daily_posts_limit: default: 30 client: false - description: "Maximum number of posts to process each day for concept generation" inferred_concepts_post_min_likes: default: 5 client: false - description: "Minimum number of likes a post must have to be considered for concept generation" + inferred_concepts_generate_persona: + default: "-15" + type: enum + enum: "DiscourseAi::Configuration::PersonaEnumerator" + inferred_concepts_match_persona: + default: "-16" + type: enum + enum: "DiscourseAi::Configuration::PersonaEnumerator" + inferred_concepts_deduplicate_persona: + default: "-17" + type: enum + enum: "DiscourseAi::Configuration::PersonaEnumerator" diff --git a/db/migrate/20250508183456_create_topics_inferred_concepts.rb b/db/migrate/20250508183456_create_topics_inferred_concepts.rb index 6066bfbb..8ecbcd97 100644 --- a/db/migrate/20250508183456_create_topics_inferred_concepts.rb +++ b/db/migrate/20250508183456_create_topics_inferred_concepts.rb @@ -2,14 +2,12 @@ class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.0] def change - create_table :topics_inferred_concepts do |t| + create_table :topics_inferred_concepts, primary_key: %i[topic_id inferred_concept_id] do |t| t.integer :topic_id, null: false t.integer :inferred_concept_id, null: false t.timestamps end - add_index :topics_inferred_concepts, [:topic_id, :inferred_concept_id], unique: true, name: 'idx_unique_topic_inferred_concept' - add_index :topics_inferred_concepts, :topic_id add_index :topics_inferred_concepts, :inferred_concept_id end -end \ No newline at end of file +end diff --git a/db/migrate/20250509000001_create_posts_inferred_concepts.rb b/db/migrate/20250509000001_create_posts_inferred_concepts.rb index 258d0f14..518d643d 100644 --- a/db/migrate/20250509000001_create_posts_inferred_concepts.rb +++ b/db/migrate/20250509000001_create_posts_inferred_concepts.rb @@ -2,14 +2,12 @@ class CreatePostsInferredConcepts < ActiveRecord::Migration[7.0] def change - create_table :posts_inferred_concepts do |t| + create_table :posts_inferred_concepts, primary_key: %i[post_id inferred_concept_id] do |t| t.integer :post_id, null: false t.integer :inferred_concept_id, null: false t.timestamps end - add_index :posts_inferred_concepts, [:post_id, :inferred_concept_id], unique: true, name: 'idx_unique_post_inferred_concept' - add_index :posts_inferred_concepts, :post_id add_index :posts_inferred_concepts, :inferred_concept_id end -end \ No newline at end of file +end diff --git a/lib/inferred_concepts/finder.rb b/lib/inferred_concepts/finder.rb index 37e2c625..38d9e367 100644 --- a/lib/inferred_concepts/finder.rb +++ b/lib/inferred_concepts/finder.rb @@ -9,8 +9,13 @@ module DiscourseAi return [] if content.blank? # Use the ConceptFinder persona to identify concepts - llm = DiscourseAi::Completions::Llm.default_llm - persona = DiscourseAi::Personas::ConceptFinder.new + persona = + AiPersona + .all_personas(enabled_only: false) + .find { |persona| persona.id == SiteSetting.inferred_concepts_generate_persona.to_i } + .new + + llm = LlmModel.find(persona.class.default_llm_id) context = DiscourseAi::Personas::BotContext.new( messages: [{ type: :user, content: content }], @@ -18,12 +23,11 @@ module DiscourseAi inferred_concepts: DiscourseAi::InferredConcepts::Manager.list_concepts, ) - prompt = persona.craft_prompt(context) - response = llm.completion(prompt, extract_json: true) + bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm) - return [] unless response.success? + response = bot.reply(context) - concepts = response.parsed_output["concepts"] + concepts = JSON.parse(response[0][0]).dig("concepts") concepts || [] end @@ -68,7 +72,7 @@ module DiscourseAi query = query.where("topics.created_at >= ?", created_after) if created_after.present? # Exclude PM topics (if they exist in Discourse) - query = query.where(archetype: Topic.public_archetype) + query = query.where(archetype: Archetype.default) # Exclude topics that already have concepts topics_with_concepts = <<~SQL @@ -134,6 +138,34 @@ module DiscourseAi # Return limited number of posts query.limit(limit) end + + # Deduplicate and standardize a list of concepts + # @param concept_names [Array] List of concept names to deduplicate + # @return [Hash] Hash with deduplicated concepts and mapping + def self.deduplicate_concepts(concept_names) + return { deduplicated_concepts: [], mapping: {} } if concept_names.blank? + + # Use the ConceptDeduplicator persona to deduplicate concepts + persona = + AiPersona + .all_personas(enabled_only: false) + .find { |persona| persona.id == SiteSetting.inferred_concepts_deduplicate_persona.to_i } + .new + + llm = LlmModel.find(persona.class.default_llm_id) + + # Create the input for the deduplicator + input = { type: :user, content: concept_names.join(", ") } + + context = + DiscourseAi::Personas::BotContext.new(messages: [input], user: Discourse.system_user) + + bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm) + + response = bot.reply(context) + + concepts = JSON.parse(response[0][0]).dig("streamlined_tags") + end end end end diff --git a/lib/inferred_concepts/manager.rb b/lib/inferred_concepts/manager.rb index 3037cd15..a543d4c3 100644 --- a/lib/inferred_concepts/manager.rb +++ b/lib/inferred_concepts/manager.rb @@ -14,6 +14,65 @@ module DiscourseAi query.pluck(:name) end + + # Deduplicate concepts in batches by letter + # This method will: + # 1. Group concepts by first letter + # 2. Process each letter group separately through the deduplicator + # 3. Do a final pass with all deduplicated concepts + # @return [Hash] Statistics about the deduplication process + def self.deduplicate_concepts_by_letter(per_letter_batch: 50, full_pass_batch: 150) + # Get all concepts + all_concepts = list_concepts + return if all_concepts.empty? + + letter_groups = Hash.new { |h, k| h[k] = [] } + + # Group concepts by first letter + all_concepts.each do |concept| + first_char = concept[0]&.upcase + + if first_char && first_char.match?(/[A-Z]/) + letter_groups[first_char] << concept + else + # Non-alphabetic or empty concepts go in a special group + letter_groups["#"] << concept + end + end + + # Process each letter group + letter_deduplicated_concepts = [] + + letter_groups.each do |letter, concepts| + next if concepts.empty? + + batches = concepts.each_slice(per_letter_batch).to_a + + batches.each do |batch| + result = Finder.deduplicate_concepts(batch) + letter_deduplicated_concepts.concat(result) + end + end + + # Final pass with all deduplicated concepts + if letter_deduplicated_concepts.present? + final_result = [] + + batches = letter_deduplicated_concepts.each_slice(full_pass_batch).to_a + batches.each do |batch| + dedups = Finder.deduplicate_concepts(batch) + final_result.concat(dedups) + end + + # Remove duplicates + final_result.uniq! + + # Apply the deduplicated concepts + InferredConcept.destroy_all + InferredConcept.insert_all(final_result.map { { name: it } }) + end + end + # Generate new concepts for a topic and apply them # @param topic [Topic] A Topic instance # @return [Array] The concepts that were applied @@ -139,7 +198,7 @@ module DiscourseAi # @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time # @return [Array] Array of Topic objects that are good candidates def self.find_candidate_topics(opts = {}) - Finder.find_candidate_topics(opts) + Finder.find_candidate_topics(**opts) end # Find candidate posts that are good for concept generation diff --git a/lib/personas/concept_deduplicator.rb b/lib/personas/concept_deduplicator.rb new file mode 100644 index 00000000..d5a5048d --- /dev/null +++ b/lib/personas/concept_deduplicator.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +module DiscourseAi + module Personas + class ConceptDeduplicator < Persona + def system_prompt + <<~PROMPT.strip + You will be given a list of machine-generated tags. + Your task is to streamline this list by merging entries who are similar or related. + + Please follow these steps to create a streamlined list of tags: + + 1. Review the entire list of tags carefully. + 2. Identify and remove any exact duplicates. + 3. Look for tags that are too specific or niche, and consider removing them or replacing them with more general terms. + 4. If there are multiple tags that convey similar concepts, choose the best one and remove the others, or add a new one that covers the missing aspect. + 5. Ensure that the remaining tags are relevant and useful for describing the content. + + When deciding which tags are "best", consider the following criteria: + - Relevance: How well does the tag describe the core content or theme? + - Generality: Is the tag specific enough to be useful, but not so specific that it's unlikely to be searched for? + - Clarity: Is the tag easy to understand and free from ambiguity? + - Popularity: Would this tag likely be used by people searching for this type of content? + + Example Input: + AI Bias, AI Bots, AI Ethics, AI Helper, AI Integration, AI Moderation, AI Search, AI-Driven Moderation, AI-Generated Post Illustrations, AJAX Events, AJAX Requests, AMA Events, API, API Access, API Authentication, API Automation, API Call, API Changes, API Compliance, API Configuration, API Costs, API Documentation, API Endpoint, API Endpoints, API Functions, API Integration, API Key, API Keys, API Limitation, API Limitations, API Permissions, API Rate Limiting, API Request, API Request Optimization, API Requests, API Security, API Suspension, API Token, API Tokens, API Translation, API Versioning, API configuration, API endpoint, API key, APIs, APK, APT Package Manager, ARIA, ARIA Tags, ARM Architecture, ARM-based, AWS, AWS Lightsail, AWS RDS, AWS S3, AWS Translate, AWS costs, AWS t2.micro, Abbreviation Expansion, Abbreviations + + Example Output: + AI, AJAX, API, APK, APT Package Manager, ARIA, ARM Architecture, AWS, Abbreviations + + Please provide your streamlined list of tags within key. + + Remember, the goal is to create a more focused and effective set of tags while maintaining the essence of the original list. + + Your output should be in the following format: + + { + "streamlined_tags": ["tag1", "tag3"] + } + + PROMPT + end + + def response_format + [{ key: "streamlined_tags", type: "array" }] + end + end + end +end diff --git a/lib/personas/persona.rb b/lib/personas/persona.rb index ba3d3be6..002e8f4e 100644 --- a/lib/personas/persona.rb +++ b/lib/personas/persona.rb @@ -54,6 +54,7 @@ module DiscourseAi ForumResearcher => -14, ConceptFinder => -15, ConceptMatcher => -16, + ConceptDeduplicator => -17, } end