From 5bf61ef9e1fc9372322914848775c2f5af780fe9 Mon Sep 17 00:00:00 2001 From: Natalie Tay Date: Tue, 18 Mar 2025 11:42:05 +0800 Subject: [PATCH] DEV: Support multiple tests per eval and followups per test (#1199) See https://github.com/discourse/discourse-ai-evals/pull/9 for format of prompts --- evals/lib/eval.rb | 15 +++-- evals/lib/prompt_evaluator.rb | 52 --------------- evals/lib/prompts/prompt_evaluator.rb | 84 +++++++++++++++++++++++++ evals/lib/prompts/single_test_runner.rb | 76 ++++++++++++++++++++++ evals/run | 3 +- 5 files changed, 171 insertions(+), 59 deletions(-) delete mode 100644 evals/lib/prompt_evaluator.rb create mode 100644 evals/lib/prompts/prompt_evaluator.rb create mode 100644 evals/lib/prompts/single_test_runner.rb diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 4e84ff32..a39c6bd7 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -29,7 +29,6 @@ class DiscourseAi::Evals::Eval @id = @yaml[:id] @description = @yaml[:description] @vision = @yaml[:vision] - @args = @yaml[:args]&.symbolize_keys @type = @yaml[:type] @expected_output = @yaml[:expected_output] @expected_output_regex = @yaml[:expected_output_regex] @@ -39,10 +38,14 @@ class DiscourseAi::Evals::Eval @expected_tool_call.symbolize_keys! if @expected_tool_call @judge = @yaml[:judge] @judge.symbolize_keys! if @judge - - @args.each do |key, value| - if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) - @args[key] = File.expand_path(File.join(File.dirname(path), value)) + if @yaml[:args].is_a?(Array) + @args = @yaml[:args].map(&:symbolize_keys) + else + @args = @yaml[:args].symbolize_keys + @args.each do |key, value| + if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) + @args[key] = File.expand_path(File.join(File.dirname(path), value)) + end end end end @@ -57,7 +60,7 @@ class DiscourseAi::Evals::Eval when "image_to_text" image_to_text(llm, **args) when "prompt" - DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args) + DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(args) when "edit_artifact" edit_artifact(llm, **args) when "summarization" diff --git a/evals/lib/prompt_evaluator.rb b/evals/lib/prompt_evaluator.rb deleted file mode 100644 index d6ca3330..00000000 --- a/evals/lib/prompt_evaluator.rb +++ /dev/null @@ -1,52 +0,0 @@ -# frozen_string_literal: true - -class DiscourseAi::Evals::PromptEvaluator - def initialize(llm) - @llm = llm.llm_model.to_llm - end - - def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false) - tools = symbolize_tools(tools) - total = prompts.size * messages.size - count = 0 - puts "" - - prompts.flat_map do |prompt| - messages.map do |content| - count += 1 - print "\rProcessing #{count}/#{total}" - - c_prompt = - DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }]) - c_prompt.tools = tools if tools - result = { prompt:, message: content } - result[:result] = generate_result(c_prompt, temperature, stream) - result - end - end - ensure - print "\r\033[K" - end - - private - - def generate_result(c_prompt, temperature, stream) - if stream - stream_result = [] - @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial| - stream_result << partial - end - stream_result - else - @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) - end - end - - def symbolize_tools(tools) - return nil if tools.nil? 
- - tools.map do |tool| - { name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact - end - end -end diff --git a/evals/lib/prompts/prompt_evaluator.rb b/evals/lib/prompts/prompt_evaluator.rb new file mode 100644 index 00000000..d526243d --- /dev/null +++ b/evals/lib/prompts/prompt_evaluator.rb @@ -0,0 +1,84 @@ +# frozen_string_literal: true + +class DiscourseAi::Evals::PromptEvaluator + def initialize(llm) + @llm = llm.llm_model.to_llm + end + + def prompt_call(args) + args = [args] if !args.is_a?(Array) + runner = DiscourseAi::Evals::PromptSingleTestRunner.new(@llm) + + with_tests_progress(total: args.size) do |bump_progress| + args.flat_map do |test| + bump_progress.call + + prompts, messages, followups, output_thinking, stream, temperature, tools = + symbolize_test_args(test) + + prompts.flat_map do |prompt| + messages.map do |message| + runner.run_single_test( + prompt, + message, + followups, + output_thinking, + stream, + temperature, + tools, + ) + end + end + end + end + end + + private + + def symbolize_test_args(args) + prompts = args[:prompts] || [args[:prompt]] + messages = args[:messages] || [args[:message]] + followups = symbolize_followups(args) + output_thinking = args[:output_thinking] || false + stream = args[:stream] || false + temperature = args[:temperature] + tools = symbolize_tools(args[:tools]) + [prompts, messages, followups, output_thinking, stream, temperature, tools] + end + + def symbolize_followups(args) + return nil if args[:followups].nil? && args[:followup].nil? + followups = args[:followups] || [args[:followup]] + followups.map do |followup| + followup = followup.dup.symbolize_keys! + message = followup[:message].dup.symbolize_keys! + message[:type] = message[:type].to_sym if message[:type] + followup[:message] = message + followup + end + end + + def symbolize_tools(tools) + return nil if tools.nil? + tools.map do |tool| + tool.symbolize_keys! 
+      tool.merge(
+        parameters: tool[:parameters]&.map { |param| param.transform_keys(&:to_sym) },
+      ).compact
+    end
+  end
+
+  def with_tests_progress(total:)
+    puts ""
+    count = 0
+    result =
+      yield(
+        -> do
+          count += 1
+          print "\rProcessing test #{count}/#{total}"
+        end
+      )
+    print "\r\033[K"
+    result
+  end
+end
diff --git a/evals/lib/prompts/single_test_runner.rb b/evals/lib/prompts/single_test_runner.rb
new file mode 100644
index 00000000..6e7c43f8
--- /dev/null
+++ b/evals/lib/prompts/single_test_runner.rb
@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+
+class DiscourseAi::Evals::PromptSingleTestRunner
+  def initialize(llm)
+    @llm = llm
+  end
+
+  # Run a single test with a prompt and message, and some model settings
+  # @param prompt [String] the prompt to use
+  # @param message [String] the message to use
+  # @param followups [Array] an array of followups (messages) to run after the initial prompt
+  # @param output_thinking [Boolean] whether to output the thinking state of the model
+  # @param stream [Boolean] whether to stream the output of the model
+  # @param temperature [Float] the temperature to use when generating completions
+  # @param tools [Array] an array of tools to use when generating completions
+  # @return [Hash] the prompt, message, and result of the test
+  def run_single_test(prompt, message, followups, output_thinking, stream, temperature, tools)
+    @c_prompt =
+      DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: message }])
+    @c_prompt.tools = tools if tools
+    generate_result(temperature, output_thinking, stream)
+
+    if followups
+      followups.each do |followup|
+        generate_followup(followup, output_thinking, stream, temperature)
+      end
+    end
+
+    { prompt:, message:, result: @result }
+  end
+
+  private
+
+  def generate_followup(followup, output_thinking, stream, temperature)
+    @c_prompt.push_model_response(@result)
+    followup_message = set_followup_tool(followup)
+    @c_prompt.push(**followup_message)
+    begin
+      generate_result(temperature, output_thinking, stream)
+    rescue => e
+      # should not happen, but logging the error helps debugging
+      puts e
+      @result = []
+    end
+  end
+
+  def set_followup_tool(followup)
+    @c_prompt.tools = followup[:tools] if followup[:tools]
+    followup_message = followup[:message]
+    %i[id name].each do |key|
+      if followup_message[key].is_a?(Array)
+        type, inner_key = followup_message[key]
+        # this allows us to dynamically set the id or name of the tool call
+        prev = @c_prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
+        followup_message[key] = prev[inner_key.to_sym] if prev
+      end
+    end
+    followup_message
+  end
+
+  def generate_result(temperature, output_thinking, stream)
+    @result =
+      if stream
+        stream_result = []
+        @llm.generate(
+          @c_prompt,
+          user: Discourse.system_user,
+          temperature:,
+          output_thinking:,
+        ) { |partial| stream_result << partial }
+        stream_result
+      else
+        @llm.generate(@c_prompt, user: Discourse.system_user, temperature:, output_thinking:)
+      end
+  end
+end
diff --git a/evals/run b/evals/run
index 8c133eb2..8aa6c4ba 100755
--- a/evals/run
+++ b/evals/run
@@ -6,7 +6,8 @@ require_relative "lib/llm"
 require_relative "lib/cli"
 require_relative "lib/runner"
 require_relative "lib/eval"
-require_relative "lib/prompt_evaluator"
+require_relative "lib/prompts/prompt_evaluator"
+require_relative "lib/prompts/single_test_runner"
 
 options = DiscourseAi::Evals::Cli.parse_options!
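With this change an eval's args can be either a single hash, as before (with "path"/"_path" values still expanded relative to the eval file), or an array of per-test hashes that eval.rb hands to PromptEvaluator#prompt_call unmodified. A rough sketch of what a multi-test "prompt" eval looks like once its YAML is symbolized into a Ruby hash; the id and texts are invented, fields such as expected_output and judge are omitted, and the authoritative format lives in the discourse-ai-evals PR linked above:

{
  id: "example-multi-test", # illustrative only, not from this patch
  type: "prompt",
  args: [
    # test 1: plural keys fan out, 2 prompts x 2 messages = 4 runs
    {
      prompts: ["Reply tersely.", "Reply verbosely."],
      messages: ["What is Discourse?", "What is Ruby on Rails?"],
      temperature: 0,
    },
    # test 2: singular keys are wrapped into one-element arrays
    { prompt: "You are an echo bot.", message: "hello", stream: true },
  ],
}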
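PromptEvaluator#prompt_call accepts that array (a lone hash is wrapped into one), resolves each test's settings via symbolize_test_args, and runs one PromptSingleTestRunner test per (prompt, message) pair while with_tests_progress prints a counter. A minimal calling sketch, assuming llm stands in for the eval LLM wrapper the constructor expects (it calls llm.llm_model.to_llm):

evaluator = DiscourseAi::Evals::PromptEvaluator.new(llm)

results =
  evaluator.prompt_call({ prompts: ["Reply in French.", "Reply in German."], messages: ["Good morning"] })

# Two result hashes, one per (prompt, message) pair. When stream: true,
# :result holds the array of streamed partials; otherwise the completion itself.
results.each { |r| puts "#{r[:prompt]} | #{r[:message]} | #{r[:result]}" }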
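Followups continue the same conversation: generate_followup pushes the previous model response, then the followup message, and generates again. Via set_followup_tool, a followup message's :id or :name may be a [message_type, key] pair that is resolved at run time from the most recent message of that type, which is how a tool result can echo back a model-chosen tool-call id. A sketch of one such test; the weather tool and payloads are invented, and the "tool_call"/"tool" message types are assumed from DiscourseAi::Completions::Prompt rather than defined in this patch:

{
  prompt: "You can call tools to answer questions.",
  message: "What is the weather in Sydney?",
  tools: [
    { "name" => "get_weather", "parameters" => [{ "name" => "city", "type" => "string" }] },
  ],
  followups: [
    {
      "message" => {
        "type" => "tool",
        # resolved at run time to the id/name of the latest tool_call message
        "id" => ["tool_call", "id"],
        "name" => ["tool_call", "name"],
        "content" => "{\"temperature\": 22}",
      },
    },
  ],
}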