# frozen_string_literal: true

module DiscourseAi
  module Agents
    module Tools
      # Agent tool that runs a research filter over forum content and asks the
      # LLM to extract information matching the caller-supplied goals.
      #
      # In dry-run mode only the number of matching results is reported; no
      # LLM inference is performed.
      class Researcher < Tool
        # Result limit handed to the filter when the :max_results option is unset.
        DEFAULT_MAX_RESULTS = 1000

        # Per-post token cap handed to the formatter when the
        # :max_tokens_per_post option is unset.
        DEFAULT_MAX_TOKENS_PER_POST = 2000

        # Tokens subtracted from the LLM's max prompt size when sizing a batch.
        # NOTE(review): presumably headroom for the system prompt and goals — confirm.
        PROMPT_TOKEN_RESERVE = 2000

        # Research is refused when the LLM's max prompt size is below this.
        MIN_TOKENS_FOR_RESEARCH = 8000

        attr_reader :filter, :result_count, :goals, :dry_run

        class << self
          # @return [Hash] tool definition (name, description, parameters)
          def signature
            {
              name: name,
              description:
                "Analyze and extract information from content across the forum based on specified filters",
              parameters: [
                { name: "filter", description: filter_description, type: "string" },
                {
                  name: "goals",
                  description:
                    "The specific information you want to extract or analyze from the filtered content, you may specify multiple goals",
                  type: "string",
                },
                {
                  name: "dry_run",
                  description: "When true, only count matching posts without processing data",
                  type: "boolean",
                },
              ],
            }
          end

          # Help text describing the filter syntax. Used both as the "filter"
          # parameter description and appended to invalid-filter errors.
          #
          # @return [String]
          def filter_description
            <<~TEXT
              Filter string to target specific content.
              - Supports user (@username)
              - date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
              - categories (category:category1,category2)
              - tags (tag:tag1,tag2)
              - groups (group:group1,group2).
              - status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
              - keywords (keywords:keyword1,keyword2) - specific words to search for in posts
              - max_results (max_results:10) the maximum number of results to return (optional)
              - order (order:latest, order:oldest, order:latest_topic, order:oldest_topic) - the order of the results (optional)
              - topic (topic:topic_id1,topic_id2) - add specific topics to the filter, topics will unconditionally be included

              If multiple tags or categories are specified, they are treated as OR conditions.

              Multiple filters can be combined with spaces.
              Example: '@sam after:2023-01-01 tag:feature'
            TEXT
          end

          # @return [String] the tool's registered name
          def name
            "researcher"
          end

          # @return [Array] options that can be configured for this tool
          def accepted_options
            [
              option(:max_results, type: :integer),
              option(:include_private, type: :boolean),
              option(:max_tokens_per_post, type: :integer),
            ]
          end
        end

        # Runs the research request described by +parameters+.
        #
        # Calls +blk+ with a progress/description string once the result count
        # is known, and again after each processed chunk (non dry-run only).
        #
        # @return [Hash] one of:
        #   - { error: String } when goals or filter are blank, or the filter
        #     contains invalid fragments
        #   - { dry_run: true, goals:, filter:, number_of_posts: } for dry runs
        #   - { dry_run: false, goals:, filter:, results: Array<String> } otherwise
        def invoke(&blk)
          @filter = parameters[:filter] || ""
          @goals = parameters[:goals] || ""
          @dry_run = parameters[:dry_run] || false

          return { error: "No goals provided" } if @goals.blank?
          return { error: "No filter provided" } if @filter.blank?

          # A nil guardian is passed to the filter unless private content was
          # explicitly opted into via the :include_private option.
          guardian = options[:include_private] ? Guardian.new(context.user) : nil

          research_filter =
            DiscourseAi::Utils::Research::Filter.new(
              @filter,
              limit: options[:max_results] || DEFAULT_MAX_RESULTS,
              guardian: guardian,
            )

          invalid_fragments = research_filter.invalid_filters
          if invalid_fragments.present?
            return(
              {
                error:
                  "Invalid filter fragment: #{invalid_fragments.join(" ")}\n\n#{self.class.filter_description}",
              }
            )
          end

          @result_count = research_filter.search.count
          blk.call details

          if @dry_run
            { dry_run: true, goals: @goals, filter: @filter, number_of_posts: @result_count }
          else
            # May be nil when the context carries no (or a stale) post id;
            # run_inference copes with that.
            post = Post.find_by(id: context.post_id)
            process_filter(research_filter, @goals, post, &blk)
          end
        end

        # @return [String] localized description of the current run, including
        #   result count, filter and goals
        def details
          if @dry_run
            I18n.t("discourse_ai.ai_bot.tool_description.researcher_dry_run", description_args)
          else
            I18n.t("discourse_ai.ai_bot.tool_description.researcher", description_args)
          end
        end

        # @return [String] localized short summary of the tool action
        def summary
          if @dry_run
            I18n.t("discourse_ai.ai_bot.tool_summary.researcher_dry_run")
          else
            I18n.t("discourse_ai.ai_bot.tool_summary.researcher")
          end
        end

        # Interpolation arguments for the description translations. Falls back
        # to neutral values so it is safe to call before #invoke has run.
        #
        # @return [Hash]
        def description_args
          { count: @result_count || 0, filter: @filter || "", goals: @goals || "" }
        end

        protected

        # Feeds the filtered content to the LLM chunk by chunk and collects the
        # per-chunk answers.
        #
        # @param filter [DiscourseAi::Utils::Research::Filter]
        # @param goals [String]
        # @param post [Post, nil] post the request originated from, if any
        # @raise [ArgumentError] when the LLM's max prompt size is below
        #   MIN_TOKENS_FOR_RESEARCH
        # @return [Hash] { dry_run: false, goals:, filter:, results: }
        def process_filter(filter, goals, post, &blk)
          if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
            raise ArgumentError,
                  "LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
          end

          formatter =
            DiscourseAi::Utils::Research::LlmFormatter.new(
              filter,
              max_tokens_per_batch: llm.max_prompt_tokens - PROMPT_TOKEN_RESERVE,
              tokenizer: llm.tokenizer,
              max_tokens_per_post: options[:max_tokens_per_post] || DEFAULT_MAX_TOKENS_PER_POST,
            )

          results = []
          formatter.each_chunk do |chunk|
            results << run_inference(chunk[:text], goals, post, &blk)
          end

          { dry_run: false, goals: goals, filter: @filter, results: results }
        end

        # Runs one LLM completion over a single chunk of formatted content and
        # reports progress through +blk+ (one extra dot per completed chunk).
        #
        # @param chunk_text [String] formatted forum content for this batch
        # @param goals [String]
        # @param post [Post, nil] when nil, the prompt is built without post and
        #   topic ids and the completion is attributed to context.user
        # @return [String] the concatenated streamed completion
        def run_inference(chunk_text, goals, post, &blk)
          system_prompt = goal_system_prompt(goals)
          user_prompt = goal_user_prompt(goals, chunk_text)

          prompt =
            DiscourseAi::Completions::Prompt.new(
              system_prompt,
              messages: [{ type: :user, content: user_prompt }],
              post_id: post&.id,
              topic_id: post&.topic_id,
            )

          partials = []
          llm.generate(
            prompt,
            # Only fall back to the context user when there is no post at all,
            # so behaviour is unchanged whenever the post exists.
            user: post ? post.user : context.user,
            feature_name: context.feature_name,
            cancel_manager: context.cancel_manager,
          ) { |partial| partials << partial }

          @progress_dots = (@progress_dots || 0) + 1
          blk.call(details + "\n\n#{"." * @progress_dots}")

          partials.join
        end

        # @param goals [String]
        # @return [String] system prompt instructing the LLM how to research
        def goal_system_prompt(goals)
          <<~TEXT
            You are a researcher tool designed to analyze and extract information from forum content on #{Discourse.base_url}.
            The current date is #{::Time.zone.now.strftime("%a, %d %b %Y %H:%M %Z")}.
            Your task is to process the provided content and extract relevant information based on the specified goal.

            When extracting content ALWAYS include the following:
            - Multiple citations using Markdown
              - Topic citations: Interesting fact [ref](/t/-/TOPIC_ID)
              - Post citations: Interesting fact [ref](/t/-/TOPIC_ID/POST_NUMBER)
            - Relevant quotes from the direct source content
            - Relevant dates and times from the content

            Your goal is: #{goals}
          TEXT
        end

        # @param goals [String]
        # @param chunk_text [String] content to analyze, wrapped in {{{ }}}
        # @return [String] user message for a single chunk
        def goal_user_prompt(goals, chunk_text)
          <<~TEXT
            Here is the content to analyze:

            {{{
            #{chunk_text}
            }}}

            Your goal is: #{goals}
          TEXT
        end
      end
    end
  end
end