DEV: Allow prompt-type evals to take in several prompts and messages (#1190)
* DEV: Allow prompt-type evals to take in several prompts and messages
* ❄️
commit 3533cd1acc (parent 51ca942d7d)
@@ -57,13 +57,51 @@ class DiscourseAi::Evals::Eval
       when "image_to_text"
         image_to_text(llm, **args)
       when "prompt"
-        prompt_call(llm, **args)
+        DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
       when "edit_artifact"
         edit_artifact(llm, **args)
       when "summarization"
         summarization(llm, **args)
       end

+    classify_results(result)
+  rescue EvalError => e
+    { result: :fail, message: e.message, context: e.context }
+  end
+
+  def print
+    puts "#{id}: #{description}"
+  end
+
+  def to_json
+    {
+      type: @type,
+      path: @path,
+      name: @name,
+      description: @description,
+      id: @id,
+      args: @args,
+      vision: @vision,
+      expected_output: @expected_output,
+      expected_output_regex: @expected_output_regex,
+    }.compact
+  end
+
+  private
+
+  # @param result [String, Array<Hash>] the result of the eval, either
+  # "llm response" or [{ result: "llm response", other_attrs: here }]
+  # @return [Array<Hash>] an array of hashes with the result classified
+  # as pass or fail, along with extra attributes
+  def classify_results(result)
+    if result.is_a?(Array)
+      result.each { |r| r.merge!(classify_result_pass_fail(r)) }
+    else
+      [classify_result_pass_fail(result)]
+    end
+  end
+
+  def classify_result_pass_fail(result)
     if expected_output
       if result == expected_output
         { result: :pass }
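The doc comment above pins down the two result shapes. A minimal sketch of how classify_results treats each shape (hypothetical values; the :pass verdicts assume the output matches expected_output):

  # A bare string from a single prompt/message is wrapped in a one-element array.
  classify_results("llm response")
  # => [{ result: :pass }]

  # An array from several prompt/message pairs: each hash gets the verdict
  # merged in, replacing the raw :result text with :pass or :fail.
  classify_results([{ result: "llm response", prompt: "p1" }])
  # => [{ result: :pass, prompt: "p1" }]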
@@ -94,34 +132,17 @@ class DiscourseAi::Evals::Eval
       else
         { result: :pass }
       end
-  rescue EvalError => e
-    { result: :fail, message: e.message, context: e.context }
   end

-  def print
-    puts "#{id}: #{description}"
-  end
-
-  def to_json
-    {
-      type: @type,
-      path: @path,
-      name: @name,
-      description: @description,
-      id: @id,
-      args: @args,
-      vision: @vision,
-      expected_output: @expected_output,
-      expected_output_regex: @expected_output_regex,
-    }.compact
-  end
-
-  private
-
   def judge_result(result)
     prompt = judge[:prompt].dup
-    prompt.sub!("{{output}}", result)
-    args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
+    if result.is_a?(String)
+      prompt.sub!("{{output}}", result)
+      args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
+    else
+      prompt.sub!("{{output}}", result[:result])
+      result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
+    end

     prompt += <<~SUFFIX
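Judge results can now be hashes as well as strings; in the hash branch the LLM output comes from result[:result] and the remaining placeholders are filled from the hash itself. A short illustration with a made-up judge template and result hash:

  prompt = "Rate this reply to {{message}}:\n{{output}}" # hypothetical template
  result = { result: "Bonjour!", message: "Say hi in French" }

  prompt.sub!("{{output}}", result[:result])
  result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
  prompt # => "Rate this reply to Say hi in French:\nBonjour!"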
@@ -220,36 +241,6 @@ class DiscourseAi::Evals::Eval
       upload.destroy if upload
     end

-  def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
-    if tools
-      tools.each do |tool|
-        tool.symbolize_keys!
-        tool[:parameters].symbolize_keys! if tool[:parameters]
-      end
-    end
-    prompt =
-      DiscourseAi::Completions::Prompt.new(
-        system_prompt,
-        messages: [{ type: :user, content: message }],
-      )
-
-    prompt.tools = tools if tools
-
-    result = nil
-    if stream
-      result = []
-      llm
-        .llm_model
-        .to_llm
-        .generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
-          result << partial
-        end
-    else
-      result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
-    end
-    result
-  end
-
   def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
     css = File.read(css_path)
     js = File.read(js_path)
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+class DiscourseAi::Evals::PromptEvaluator
+  def initialize(llm)
+    @llm = llm.llm_model.to_llm
+  end
+
+  def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
+    tools = symbolize_tools(tools)
+    total = prompts.size * messages.size
+    count = 0
+    puts ""
+
+    prompts.flat_map do |prompt|
+      messages.map do |content|
+        count += 1
+        print "\rProcessing #{count}/#{total}"
+
+        c_prompt =
+          DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
+        c_prompt.tools = tools if tools
+        result = { prompt:, message: content }
+        result[:result] = generate_result(c_prompt, temperature, stream)
+        result
+      end
+    end
+  ensure
+    print "\r\033[K"
+  end
+
+  private
+
+  def generate_result(c_prompt, temperature, stream)
+    if stream
+      stream_result = []
+      @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial|
+        stream_result << partial
+      end
+      stream_result
+    else
+      @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature)
+    end
+  end
+
+  def symbolize_tools(tools)
+    return nil if tools.nil?
+
+    tools.map do |tool|
+      { name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
+    end
+  end
+end
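A minimal usage sketch of the new evaluator (the prompt and message strings are made up). It fans out every prompt × message combination and tags each response with its originating pair, which is what lets results be reported per combination:

  evaluator = DiscourseAi::Evals::PromptEvaluator.new(llm)
  results =
    evaluator.prompt_call(
      prompts: ["You are a helpful bot", "You are a terse bot"],
      messages: ["Summarize this post", "Translate this post to French"],
    )
  # => 4 hashes, one per pair, e.g.:
  # { prompt: "You are a helpful bot", message: "Summarize this post", result: "..." }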
@@ -148,31 +148,33 @@ class DiscourseAi::Evals::Runner
     structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
       logger.info("Evaluating with LLM: #{llm.name}")
       print "#{llm.name}: "
-      result = @eval.run(llm: llm)
+      results = @eval.run(llm: llm)

-      step[:args] = result
-      step[:cname] = result[:result] == :pass ? :good : :bad
-
-      if result[:result] == :fail
-        puts "Failed 🔴"
-        puts "Error: #{result[:message]}" if result[:message]
-        # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
-        #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
-        if result[:expected_output] && result[:actual_output]
-          puts "---- Expected ----\n#{result[:expected_output]}"
-          puts "---- Actual ----\n#{result[:actual_output]}"
-        end
-        logger.error("Evaluation failed with LLM: #{llm.name}")
-        logger.error("Error: #{result[:message]}") if result[:message]
-        logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
-        logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
-        logger.error("Context: #{result[:context]}") if result[:context]
-      elsif result[:result] == :pass
-        puts "Passed 🟢"
-        logger.info("Evaluation passed with LLM: #{llm.name}")
-      else
-        STDERR.puts "Error: Unknown result #{eval.inspect}"
-        logger.error("Unknown result: #{eval.inspect}")
-      end
+      results.each do |result|
+        step[:args] = result
+        step[:cname] = result[:result] == :pass ? :good : :bad
+
+        if result[:result] == :fail
+          puts "Failed 🔴"
+          puts "Error: #{result[:message]}" if result[:message]
+          # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
+          #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
+          if result[:expected_output] && result[:actual_output]
+            puts "---- Expected ----\n#{result[:expected_output]}"
+            puts "---- Actual ----\n#{result[:actual_output]}"
+          end
+          logger.error("Evaluation failed with LLM: #{llm.name}")
+          logger.error("Error: #{result[:message]}") if result[:message]
+          logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
+          logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
+          logger.error("Context: #{result[:context]}") if result[:context]
+        elsif result[:result] == :pass
+          puts "Passed 🟢"
+          logger.info("Evaluation passed with LLM: #{llm.name}")
+        else
+          STDERR.puts "Error: Unknown result #{eval.inspect}"
+          logger.error("Unknown result: #{eval.inspect}")
+        end
+      end
     end
   end
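Because @eval.run now returns an array of classified results, the runner prints one verdict per prompt/message pair. A hypothetical return value for a two-prompt, one-message eval:

  results = @eval.run(llm: llm)
  # => [
  #   { prompt: "p1", message: "m1", result: :pass },
  #   { prompt: "p2", message: "m1", result: :fail }  # plus failure details when present
  # ]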