# frozen_string_literal: true
module DiscourseAi
module Agents
module Tools
# Tool that lets the AI bot run filtered research across forum content:
# it resolves a filter expression to a set of posts, then batches them
# through the LLM to extract information matching the caller's goals.
class Researcher < Tool
  attr_reader :filter, :result_count, :goals, :dry_run

  class << self
    # Tool signature exposed to the LLM (name, description, parameters).
    def signature
      {
        name: name,
        description:
          "Analyze and extract information from content across the forum based on specified filters",
        parameters: [
          { name: "filter", description: filter_description, type: "string" },
          {
            name: "goals",
            description:
              "The specific information you want to extract or analyze from the filtered content, you may specify multiple goals",
            type: "string",
          },
          {
            name: "dry_run",
            description: "When true, only count matching posts without processing data",
            type: "boolean",
          },
        ],
      }
    end

    # Human/LLM-readable grammar of the supported filter fragments.
    # Also embedded in error messages when an invalid fragment is supplied.
    def filter_description
      <<~TEXT
        Filter string to target specific content.
        - Supports user (@username)
        - date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
        - categories (category:category1,category2)
        - tags (tag:tag1,tag2)
        - groups (group:group1,group2).
        - status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
        - keywords (keywords:keyword1,keyword2) - specific words to search for in posts
        - max_results (max_results:10) the maximum number of results to return (optional)
        - order (order:latest, order:oldest, order:latest_topic, order:oldest_topic) - the order of the results (optional)
        - topic (topic:topic_id1,topic_id2) - add specific topics to the filter, topics will unconditionally be included
        If multiple tags or categories are specified, they are treated as OR conditions.
        Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'
      TEXT
    end

    def name
      "researcher"
    end

    # Per-persona options an admin can configure for this tool.
    def accepted_options
      [
        option(:max_results, type: :integer),
        option(:include_private, type: :boolean),
        option(:max_tokens_per_post, type: :integer),
      ]
    end
  end

  # Runs the research. Validates parameters, builds the filter, and either
  # reports only the match count (dry run) or processes the matching posts.
  #
  # @param blk [Proc] progress callback, invoked with status text
  # @return [Hash] result payload, or an error hash when validation fails
  def invoke(&blk)
    max_results = options[:max_results] || 1000

    @filter = parameters[:filter] || ""
    @goals = parameters[:goals] || ""
    # Only nil means "not supplied"; an explicit false must stay false.
    @dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]

    # NOTE(review): may be nil if context.post_id is stale/absent;
    # run_inference dereferences post.id without a guard — confirm callers
    # always supply a valid post.
    post = Post.find_by(id: context.post_id)

    return { error: "No goals provided" } if @goals.blank?
    return { error: "No filter provided" } if @filter.blank?

    # Private content is only visible when the option is explicitly enabled;
    # a nil guardian restricts the search to public content.
    guardian = options[:include_private] ? Guardian.new(context.user) : nil

    filter =
      DiscourseAi::Utils::Research::Filter.new(@filter, limit: max_results, guardian: guardian)

    if filter.invalid_filters.present?
      return(
        {
          error:
            "Invalid filter fragment: #{filter.invalid_filters.join(" ")}\n\n#{self.class.filter_description}",
        }
      )
    end

    @result_count = filter.search.count
    blk.call details

    if @dry_run
      { dry_run: true, goals: @goals, filter: @filter, number_of_posts: @result_count }
    else
      process_filter(filter, @goals, post, &blk)
    end
  end

  # Status line shown to the user while the tool runs.
  def details
    if @dry_run
      I18n.t("discourse_ai.ai_bot.tool_description.researcher_dry_run", description_args)
    else
      I18n.t("discourse_ai.ai_bot.tool_description.researcher", description_args)
    end
  end

  # Short label for the tool invocation in the UI.
  def summary
    if @dry_run
      I18n.t("discourse_ai.ai_bot.tool_summary.researcher_dry_run")
    else
      I18n.t("discourse_ai.ai_bot.tool_summary.researcher")
    end
  end

  # Interpolation arguments for the I18n strings above; safe defaults so
  # it can render before invoke has populated the ivars.
  def description_args
    { count: @result_count || 0, filter: @filter || "", goals: @goals || "" }
  end

  protected

  # Research is pointless on tiny context windows: we reserve 2000 tokens of
  # headroom per batch, so anything below this floor can't fit useful content.
  MIN_TOKENS_FOR_RESEARCH = 8000

  # Streams the filtered posts through the LLM in context-window-sized
  # batches and collects one inference result per batch.
  #
  # @raise [ArgumentError] when the LLM context window is below the floor
  # @return [Hash] goals, filter and the per-batch results
  def process_filter(filter, goals, post, &blk)
    if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
      raise ArgumentError,
            "LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
    end

    formatter =
      DiscourseAi::Utils::Research::LlmFormatter.new(
        filter,
        # Leave headroom for the system/user prompt wrapping each chunk.
        max_tokens_per_batch: llm.max_prompt_tokens - 2000,
        tokenizer: llm.tokenizer,
        max_tokens_per_post: options[:max_tokens_per_post] || 2000,
      )

    results = []
    formatter.each_chunk { |chunk| results << run_inference(chunk[:text], goals, post, &blk) }
    { dry_run: false, goals: goals, filter: @filter, results: results }
  end

  # Runs one LLM completion over a single chunk of formatted content and
  # reports progress (one dot per completed chunk) via the callback.
  #
  # @return [String] the concatenated streamed completion for this chunk
  def run_inference(chunk_text, goals, post, &blk)
    system_prompt = goal_system_prompt(goals)
    user_prompt = goal_user_prompt(goals, chunk_text)

    prompt =
      DiscourseAi::Completions::Prompt.new(
        system_prompt,
        messages: [{ type: :user, content: user_prompt }],
        post_id: post.id,
        topic_id: post.topic_id,
      )

    results = []
    llm.generate(
      prompt,
      user: post.user,
      feature_name: context.feature_name,
      cancel_manager: context.cancel_manager,
    ) { |partial| results << partial }

    # Progress indicator: append one more dot for every finished chunk.
    @progress_dots ||= 0
    @progress_dots += 1
    blk.call(details + "\n\n#{"." * @progress_dots}")

    results.join
  end

  # System prompt instructing the model how to extract and cite content.
  def goal_system_prompt(goals)
    <<~TEXT
      You are a researcher tool designed to analyze and extract information from forum content on #{Discourse.base_url}.
      The current date is #{::Time.zone.now.strftime("%a, %d %b %Y %H:%M %Z")}.
      Your task is to process the provided content and extract relevant information based on the specified goal.
      When extracting content ALWAYS include the following:
      - Multiple citations using Markdown
      - Topic citations: Interesting fact [ref](/t/-/TOPIC_ID)
      - Post citations: Interesting fact [ref](/t/-/TOPIC_ID/POST_NUMBER)
      - Relevent quotes from the direct source content
      - Relevant dates and times from the content
      Your goal is: #{goals}
    TEXT
  end

  # User prompt wrapping one chunk of content with the research goal.
  def goal_user_prompt(goals, chunk_text)
    <<~TEXT
      Here is the content to analyze:
      {{{
      #{chunk_text}
      }}}
      Your goal is: #{goals}
    TEXT
  end
end
end
end
end