DEV: Allow prompt-type evals to take in several prompts and messages (#1190)

* DEV: Allow prompt-type evals to take in several prompts and messages

* ❄️
Natalie Tay 2025-03-14 12:46:22 +08:00 committed by GitHub
parent 51ca942d7d
commit 3533cd1acc
4 changed files with 123 additions and 77 deletions
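For orientation, here is a rough sketch of the new entry point from an eval's point of view. The concrete prompt and message strings are made up, and `llm` is assumed to be the eval LLM wrapper that `DiscourseAi::Evals::Eval#run` already receives; only the class and method names come from the diff below.

```ruby
# Sketch only: `llm` is assumed to be the wrapper passed into Eval#run,
# which PromptEvaluator unwraps via llm.llm_model.to_llm.
evaluator = DiscourseAi::Evals::PromptEvaluator.new(llm)

results =
  evaluator.prompt_call(
    prompts: ["You write forum post titles.", "You write terse forum post titles."],
    messages: ["my cat chewed the router cable again", "upgrading postgres broke my theme"],
  )

# Every prompt is paired with every message (2 x 2 = 4 results here), each
# result shaped like { prompt: "...", message: "...", result: "llm response" }.
```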

@@ -57,13 +57,51 @@ class DiscourseAi::Evals::Eval
when "image_to_text"
image_to_text(llm, **args)
when "prompt"
prompt_call(llm, **args)
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
when "edit_artifact"
edit_artifact(llm, **args)
when "summarization"
summarization(llm, **args)
end
classify_results(result)
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end
def print
puts "#{id}: #{description}"
end
def to_json
{
type: @type,
path: @path,
name: @name,
description: @description,
id: @id,
args: @args,
vision: @vision,
expected_output: @expected_output,
expected_output_regex: @expected_output_regex,
}.compact
end
private
# @param result [String, Array<Hash>] the result of the eval, either
# "llm response" or [{ result: "llm response", other_attrs: here }]
# @return [Array<Hash>] an array of hashes with the result classified
# as pass or fail, along with extra attributes
def classify_results(result)
if result.is_a?(Array)
result.each { |r| r.merge!(classify_result_pass_fail(r)) }
else
[classify_result_pass_fail(result)]
end
end
def classify_result_pass_fail(result)
if expected_output
if result == expected_output
{ result: :pass }
@@ -94,34 +132,17 @@ class DiscourseAi::Evals::Eval
else
{ result: :pass }
end
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end
def print
puts "#{id}: #{description}"
end
def to_json
{
type: @type,
path: @path,
name: @name,
description: @description,
id: @id,
args: @args,
vision: @vision,
expected_output: @expected_output,
expected_output_regex: @expected_output_regex,
}.compact
end
private
def judge_result(result)
prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result)
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
if result.is_a?(String)
prompt.sub!("{{output}}", result)
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
else
prompt.sub!("{{output}}", result[:result])
result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
end
prompt += <<~SUFFIX
@@ -220,36 +241,6 @@ class DiscourseAi::Evals::Eval
upload.destroy if upload
end
def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
if tools
tools.each do |tool|
tool.symbolize_keys!
tool[:parameters].symbolize_keys! if tool[:parameters]
end
end
prompt =
DiscourseAi::Completions::Prompt.new(
system_prompt,
messages: [{ type: :user, content: message }],
)
prompt.tools = tools if tools
result = nil
if stream
result = []
llm
.llm_model
.to_llm
.generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
result << partial
end
else
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
end
result
end
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
css = File.read(css_path)
js = File.read(js_path)

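Since `Eval#run` can now get back either a bare response or the per-prompt array above, the two result shapes that `classify_results` normalises are easiest to see as literals. A small sketch with made-up values:

```ruby
# The single-response shape used by most eval types:
single = "llm response"

# The array shape produced by multi-prompt evals, one hash per
# prompt/message pair:
multi = [
  { prompt: "p1", message: "m1", result: "llm response" },
  { prompt: "p2", message: "m1", result: "another response" },
]

# In both cases classify_results returns an array of hashes whose :result
# key is :pass or :fail, plus any extra attributes (message, expected/actual
# output, context) describing a failure.
```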
@@ -0,0 +1,52 @@
# frozen_string_literal: true
class DiscourseAi::Evals::PromptEvaluator
def initialize(llm)
@llm = llm.llm_model.to_llm
end
def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
tools = symbolize_tools(tools)
total = prompts.size * messages.size
count = 0
puts ""
prompts.flat_map do |prompt|
messages.map do |content|
count += 1
print "\rProcessing #{count}/#{total}"
c_prompt =
DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
c_prompt.tools = tools if tools
result = { prompt:, message: content }
result[:result] = generate_result(c_prompt, temperature, stream)
result
end
end
ensure
print "\r\033[K"
end
private
def generate_result(c_prompt, temperature, stream)
if stream
stream_result = []
@llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial|
stream_result << partial
end
stream_result
else
@llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature)
end
end
def symbolize_tools(tools)
return nil if tools.nil?
tools.map do |tool|
{ name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
end
end
end
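One more illustration: tools typically arrive string-keyed (for example from an eval's args file), and `symbolize_tools` normalises them before they are attached to the prompt. The input below is a made-up example; only the transformation itself is from the code above.

```ruby
# Hypothetical string-keyed tool definitions, e.g. as parsed from an eval file:
tools = [
  { "name" => "search", "parameters" => { "query" => "string" } },
  { "name" => "current_time" },
]

# After symbolize_tools: parameter keys become symbols and a missing
# :parameters entry is dropped by #compact.
# [
#   { name: "search", parameters: { query: "string" } },
#   { name: "current_time" },
# ]
```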

@@ -148,31 +148,33 @@ class DiscourseAi::Evals::Runner
structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
logger.info("Evaluating with LLM: #{llm.name}")
print "#{llm.name}: "
result = @eval.run(llm: llm)
results = @eval.run(llm: llm)
step[:args] = result
step[:cname] = result[:result] == :pass ? :good : :bad
results.each do |result|
step[:args] = result
step[:cname] = result[:result] == :pass ? :good : :bad
if result[:result] == :fail
puts "Failed 🔴"
puts "Error: #{result[:message]}" if result[:message]
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
if result[:result] == :fail
puts "Failed 🔴"
puts "Error: #{result[:message]}" if result[:message]
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")
else
STDERR.puts "Error: Unknown result #{eval.inspect}"
logger.error("Unknown result: #{eval.inspect}")
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")
else
STDERR.puts "Error: Unknown result #{eval.inspect}"
logger.error("Unknown result: #{eval.inspect}")
end
end
end

@@ -6,6 +6,7 @@ require_relative "lib/llm"
require_relative "lib/cli"
require_relative "lib/runner"
require_relative "lib/eval"
require_relative "lib/prompt_evaluator"
options = DiscourseAi::Evals::Cli.parse_options!