DEV: Support multiple tests per eval and followups per test (#1199)
See https://github.com/discourse/discourse-ai-evals/pull/9 for the format of prompts.
This commit is contained in:
parent
107f14456b
commit
5bf61ef9e1
|
@ -29,7 +29,6 @@ class DiscourseAi::Evals::Eval
|
||||||
@id = @yaml[:id]
|
@id = @yaml[:id]
|
||||||
@description = @yaml[:description]
|
@description = @yaml[:description]
|
||||||
@vision = @yaml[:vision]
|
@vision = @yaml[:vision]
|
||||||
@args = @yaml[:args]&.symbolize_keys
|
|
||||||
@type = @yaml[:type]
|
@type = @yaml[:type]
|
||||||
@expected_output = @yaml[:expected_output]
|
@expected_output = @yaml[:expected_output]
|
||||||
@expected_output_regex = @yaml[:expected_output_regex]
|
@expected_output_regex = @yaml[:expected_output_regex]
|
||||||
|
@ -39,10 +38,14 @@ class DiscourseAi::Evals::Eval
|
||||||
@expected_tool_call.symbolize_keys! if @expected_tool_call
|
@expected_tool_call.symbolize_keys! if @expected_tool_call
|
||||||
@judge = @yaml[:judge]
|
@judge = @yaml[:judge]
|
||||||
@judge.symbolize_keys! if @judge
|
@judge.symbolize_keys! if @judge
|
||||||
|
if @yaml[:args].is_a?(Array)
|
||||||
@args.each do |key, value|
|
@args = @yaml[:args].map(&:symbolize_keys)
|
||||||
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
|
else
|
||||||
@args[key] = File.expand_path(File.join(File.dirname(path), value))
|
@args = @yaml[:args].symbolize_keys
|
||||||
|
@args.each do |key, value|
|
||||||
|
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
|
||||||
|
@args[key] = File.expand_path(File.join(File.dirname(path), value))
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -57,7 +60,7 @@ class DiscourseAi::Evals::Eval
|
||||||
when "image_to_text"
|
when "image_to_text"
|
||||||
image_to_text(llm, **args)
|
image_to_text(llm, **args)
|
||||||
when "prompt"
|
when "prompt"
|
||||||
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
|
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(args)
|
||||||
when "edit_artifact"
|
when "edit_artifact"
|
||||||
edit_artifact(llm, **args)
|
edit_artifact(llm, **args)
|
||||||
when "summarization"
|
when "summarization"
|
||||||
|
|
|
@ -1,52 +0,0 @@
|
||||||
# frozen_string_literal: true
|
|
||||||
|
|
||||||
# Runs "prompt" evals: every system prompt is combined with every user
# message and sent to the LLM, and the responses are collected.
class DiscourseAi::Evals::PromptEvaluator
  # @param llm [Object] eval LLM wrapper; must respond to `llm_model.to_llm`
  def initialize(llm)
    @llm = llm.llm_model.to_llm
  end

  # Runs the cartesian product of prompts and messages against the LLM while
  # printing a progress counter to stdout.
  #
  # @param prompts [Array<String>] system prompts to evaluate
  # @param messages [Array<String>] user messages sent with each prompt
  # @param temperature [Float, nil] temperature forwarded to the LLM
  # @param tools [Array<Hash>, nil] tool definitions with string keys
  # @param stream [Boolean] when true, collect streamed partials instead of a
  #   single response
  # @return [Array<Hash>] one `{ prompt:, message:, result: }` hash per
  #   prompt/message combination
  # @raise [ArgumentError] if messages is nil
  def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
    # `messages` defaults to nil only for keyword-signature convenience; fail
    # with a clear error instead of a NoMethodError on nil below.
    raise ArgumentError, "messages must be provided" if messages.nil?

    tools = symbolize_tools(tools)
    total = prompts.size * messages.size
    count = 0
    puts ""

    prompts.flat_map do |prompt|
      messages.map do |content|
        count += 1
        print "\rProcessing #{count}/#{total}"

        c_prompt =
          DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
        c_prompt.tools = tools if tools

        { prompt:, message: content, result: generate_result(c_prompt, temperature, stream) }
      end
    end
  ensure
    # Carriage return + ANSI "erase to end of line": wipe the progress counter
    # even when a completion raises.
    print "\r\033[K"
  end

  private

  # Calls the LLM once for the given prompt.
  #
  # @return [Array, Object] the collected partials when streaming, otherwise
  #   whatever `@llm.generate` returns
  def generate_result(c_prompt, temperature, stream)
    options = { user: Discourse.system_user, temperature: temperature }
    return @llm.generate(c_prompt, **options) if !stream

    partials = []
    @llm.generate(c_prompt, **options) { |partial| partials << partial }
    partials
  end

  # Converts string-keyed tool definitions into the symbol-keyed form, keeping
  # only `name` and `parameters` and dropping whichever of them is nil.
  #
  # @param tools [Array<Hash>, nil]
  # @return [Array<Hash>, nil] nil when no tools were given
  def symbolize_tools(tools)
    return nil if tools.nil?

    tools.map do |tool|
      { name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
    end
  end
end
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Runs "prompt" evals. An eval may contain one test or a list of tests; each
# test combines every prompt with every message and may define followup
# messages that are sent after the initial response.
class DiscourseAi::Evals::PromptEvaluator
  # @param llm [Object] eval LLM wrapper; must respond to `llm_model.to_llm`
  def initialize(llm)
    @llm = llm.llm_model.to_llm
  end

  # Runs every prompt/message combination of every test.
  #
  # @param args [Hash, Array<Hash>] a single test definition or a list of
  #   them. Keys read from each test: `:prompts`/`:prompt`,
  #   `:messages`/`:message`, `:followups`/`:followup`, `:output_thinking`,
  #   `:stream`, `:temperature` and `:tools`.
  # @return [Array<Hash>] the results of all tests, flattened into one list
  def prompt_call(args)
    # Not `Array(args)`: that would turn a single test Hash into key/value pairs.
    tests = args.is_a?(Array) ? args : [args]
    runner = DiscourseAi::Evals::PromptSingleTestRunner.new(@llm)

    with_tests_progress(total: tests.size) do |bump_progress|
      tests.flat_map do |test|
        bump_progress.call

        prompts, messages, followups, output_thinking, stream, temperature, tools =
          symbolize_test_args(test)

        prompts.flat_map do |prompt|
          messages.map do |message|
            runner.run_single_test(
              prompt,
              message,
              followups,
              output_thinking,
              stream,
              temperature,
              tools,
            )
          end
        end
      end
    end
  end

  private

  # Normalizes one test definition into the positional values expected by
  # PromptSingleTestRunner#run_single_test. Singular keys (`:prompt`,
  # `:message`) are wrapped into one-element arrays.
  #
  # @param args [Hash] a symbol-keyed test definition
  # @return [Array] prompts, messages, followups, output_thinking, stream,
  #   temperature, tools (in that order)
  def symbolize_test_args(args)
    prompts = args[:prompts] || [args[:prompt]]
    messages = args[:messages] || [args[:message]]
    followups = symbolize_followups(args)
    output_thinking = args[:output_thinking] || false
    stream = args[:stream] || false
    temperature = args[:temperature]
    tools = symbolize_tools(args[:tools])
    [prompts, messages, followups, output_thinking, stream, temperature, tools]
  end

  # Builds symbol-keyed copies of the followups without touching the
  # definitions passed in.
  #
  # @param args [Hash] test definition; reads `:followups` or `:followup`
  # @return [Array<Hash>, nil] nil when the test defines no followups
  # @raise [ArgumentError] if a followup has no message
  def symbolize_followups(args)
    return nil if args[:followups].nil? && args[:followup].nil?

    followups = args[:followups] || [args[:followup]]
    followups.map do |followup|
      followup = followup.symbolize_keys
      raw_message = followup[:message]
      raise ArgumentError, "followup is missing a :message" if raw_message.nil?

      message = raw_message.symbolize_keys
      message[:type] = message[:type].to_sym if message[:type]
      followup[:message] = message
      followup
    end
  end

  # Returns symbol-keyed copies of the tool definitions, including the keys of
  # each entry in `parameters`. Keys whose value is nil are dropped.
  #
  # @param tools [Array<Hash>, nil]
  # @return [Array<Hash>, nil] nil when no tools were given
  def symbolize_tools(tools)
    return nil if tools.nil?

    tools.map do |tool|
      # Non-destructive: the caller's hashes belong to the eval definition.
      tool = tool.symbolize_keys
      tool.merge(
        parameters: tool[:parameters]&.map { |param| param.transform_keys(&:to_sym) },
      ).compact
    end
  end

  # Prints a "Processing test x/total" counter while the block runs and
  # clears that line afterwards.
  #
  # @param total [Integer] number of tests that will be processed
  # @yieldparam bump_progress [Proc] call once per test to advance the counter
  # @return [Object] the value returned by the block
  def with_tests_progress(total:)
    puts ""
    count = 0
    bump_progress = -> do
      count += 1
      print "\rProcessing test #{count}/#{total}"
    end

    yield(bump_progress)
  ensure
    # Carriage return + ANSI "erase to end of line"; in `ensure` so the
    # counter is also wiped when a test raises.
    print "\r\033[K"
  end
end
|
|
@ -0,0 +1,76 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Runs one prompt/message combination against an LLM, optionally continuing
# the conversation with followup messages. The prompt being built and the
# latest response are kept in instance variables that are reset on every
# #run_single_test call.
class DiscourseAi::Evals::PromptSingleTestRunner
  # @param llm [Object] the LLM used to generate completions
  def initialize(llm)
    @llm = llm
  end

  # Run a single test with a prompt and message, and some model settings
  # @param prompt [String] the prompt to use
  # @param message [String] the message to use
  # @param followups [Array<Hash>] an array of followups (messages) to run after the initial prompt
  # @param output_thinking [Boolean] whether to output the thinking state of the model
  # @param stream [Boolean] whether to stream the output of the model
  # @param temperature [Float] the temperature to use when generating completions
  # @param tools [Array<Hash>] an array of tools to use when generating completions
  # @return [Hash] the prompt, message, and result of the test; the result is
  #   the response to the last turn that was generated
  def run_single_test(prompt, message, followups, output_thinking, stream, temperature, tools)
    @c_prompt =
      DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: message }])
    @c_prompt.tools = tools if tools
    generate_result(temperature, output_thinking, stream)

    followups&.each do |followup|
      generate_followup(followup, output_thinking, stream, temperature)
    end

    { prompt:, message:, result: @result }
  end

  private

  # Appends the previous response and the followup message to the prompt,
  # then generates the next response.
  def generate_followup(followup, output_thinking, stream, temperature)
    @c_prompt.push_model_response(@result)
    followup_message = set_followup_tool(followup)
    @c_prompt.push(**followup_message)
    begin
      generate_result(temperature, output_thinking, stream)
    rescue => e
      # should not happen but it helps debugging...
      puts e
      # Record the failure as an empty result so the previous turn's response
      # is not reported as the outcome of this followup.
      @result = []
    end
  end

  # Applies the followup's tools (if any) to the prompt and returns the
  # message to push. An `:id` or `:name` given as a `[type, key]` pair is
  # resolved from the most recent prompt message of that type.
  #
  # @param followup [Hash] symbol-keyed followup with `:message` and
  #   optionally `:tools`
  # @return [Hash] a copy of the followup message with placeholders resolved
  def set_followup_tool(followup)
    @c_prompt.tools = followup[:tools] if followup[:tools]

    # Work on a copy: the same followup hashes are reused for every
    # prompt/message combination, so resolving in place would leak the value
    # resolved in the first run into all later runs.
    followup_message = followup[:message].dup
    %i[id name].each do |key|
      next if !followup_message[key].is_a?(Array)

      type, inner_key = followup_message[key]
      # this allows us to dynamically set the id or name of the tool call
      prev = @c_prompt.messages.reverse_each.find { |m| m[:type] == type.to_sym }
      followup_message[key] = prev[inner_key.to_sym] if prev
    end
    followup_message
  end

  # Calls the LLM with the current prompt and stores the response in @result:
  # the list of streamed partials when streaming, otherwise whatever
  # `@llm.generate` returns.
  def generate_result(temperature, output_thinking, stream)
    options = { user: Discourse.system_user, temperature:, output_thinking: }

    @result =
      if stream
        partials = []
        @llm.generate(@c_prompt, **options) { |partial| partials << partial }
        partials
      else
        @llm.generate(@c_prompt, **options)
      end
  end
end
|
|
@ -6,7 +6,8 @@ require_relative "lib/llm"
|
||||||
require_relative "lib/cli"
|
require_relative "lib/cli"
|
||||||
require_relative "lib/runner"
|
require_relative "lib/runner"
|
||||||
require_relative "lib/eval"
|
require_relative "lib/eval"
|
||||||
require_relative "lib/prompt_evaluator"
|
require_relative "lib/prompts/prompt_evaluator"
|
||||||
|
require_relative "lib/prompts/single_test_runner"
|
||||||
|
|
||||||
options = DiscourseAi::Evals::Cli.parse_options!
|
options = DiscourseAi::Evals::Cli.parse_options!
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue