DEV: Support multiple tests per eval and followups per test (#1199)

See https://github.com/discourse/discourse-ai-evals/pull/9 for the format of the prompts
Natalie Tay 2025-03-18 11:42:05 +08:00 committed by GitHub
parent 107f14456b
commit 5bf61ef9e1
5 changed files with 171 additions and 59 deletions

lib/eval.rb

@@ -29,7 +29,6 @@ class DiscourseAi::Evals::Eval
     @id = @yaml[:id]
     @description = @yaml[:description]
     @vision = @yaml[:vision]
-    @args = @yaml[:args]&.symbolize_keys
     @type = @yaml[:type]
     @expected_output = @yaml[:expected_output]
     @expected_output_regex = @yaml[:expected_output_regex]
@@ -39,10 +38,14 @@ class DiscourseAi::Evals::Eval
     @expected_tool_call.symbolize_keys! if @expected_tool_call
     @judge = @yaml[:judge]
     @judge.symbolize_keys! if @judge
-    @args.each do |key, value|
-      if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
-        @args[key] = File.expand_path(File.join(File.dirname(path), value))
+    if @yaml[:args].is_a?(Array)
+      @args = @yaml[:args].map(&:symbolize_keys)
+    else
+      @args = @yaml[:args].symbolize_keys
+      @args.each do |key, value|
+        if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
+          @args[key] = File.expand_path(File.join(File.dirname(path), value))
+        end
       end
     end
   end
@@ -57,7 +60,7 @@ class DiscourseAi::Evals::Eval
     when "image_to_text"
       image_to_text(llm, **args)
     when "prompt"
-      DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
+      DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(args)
     when "edit_artifact"
       edit_artifact(llm, **args)
     when "summarization"

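With this change, an eval's `args` may be either a single hash (one test, the old behaviour) or an array of test hashes. A minimal sketch of the two shapes `Eval#initialize` now accepts, with invented example values; the key names come from the new evaluator code below:

# Hypothetical illustration (not part of the commit): the two `args` shapes
# accepted after the YAML is loaded.

# Single test (hash) - as before, path-like keys are expanded relative to the eval file:
single_test_args = {
  prompt: "You are a calculator",
  message: "2 + 2",
}

# Multiple tests (array) - each entry is symbolized and handed to
# PromptEvaluator#prompt_call as an independent test:
multi_test_args = [
  { prompts: ["You are a calculator"], messages: ["2 + 2", "3 * 3"] },
  {
    prompt: "You are a helpful bot",
    message: "What is the weather?",
    followups: [{ message: { type: :user, content: "And tomorrow?" } }],
  },
]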
lib/prompt_evaluator.rb

@ -1,52 +0,0 @@
# frozen_string_literal: true
class DiscourseAi::Evals::PromptEvaluator
def initialize(llm)
@llm = llm.llm_model.to_llm
end
def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
tools = symbolize_tools(tools)
total = prompts.size * messages.size
count = 0
puts ""
prompts.flat_map do |prompt|
messages.map do |content|
count += 1
print "\rProcessing #{count}/#{total}"
c_prompt =
DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
c_prompt.tools = tools if tools
result = { prompt:, message: content }
result[:result] = generate_result(c_prompt, temperature, stream)
result
end
end
ensure
print "\r\033[K"
end
private
def generate_result(c_prompt, temperature, stream)
if stream
stream_result = []
@llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial|
stream_result << partial
end
stream_result
else
@llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature)
end
end
def symbolize_tools(tools)
return nil if tools.nil?
tools.map do |tool|
{ name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
end
end
end

lib/prompts/prompt_evaluator.rb

@@ -0,0 +1,84 @@
# frozen_string_literal: true

class DiscourseAi::Evals::PromptEvaluator
  def initialize(llm)
    @llm = llm.llm_model.to_llm
  end

  def prompt_call(args)
    args = [args] if !args.is_a?(Array)
    runner = DiscourseAi::Evals::PromptSingleTestRunner.new(@llm)

    with_tests_progress(total: args.size) do |bump_progress|
      args.flat_map do |test|
        bump_progress.call
        prompts, messages, followups, output_thinking, stream, temperature, tools =
          symbolize_test_args(test)

        prompts.flat_map do |prompt|
          messages.map do |message|
            runner.run_single_test(
              prompt,
              message,
              followups,
              output_thinking,
              stream,
              temperature,
              tools,
            )
          end
        end
      end
    end
  end

  private

  def symbolize_test_args(args)
    prompts = args[:prompts] || [args[:prompt]]
    messages = args[:messages] || [args[:message]]
    followups = symbolize_followups(args)
    output_thinking = args[:output_thinking] || false
    stream = args[:stream] || false
    temperature = args[:temperature]
    tools = symbolize_tools(args[:tools])

    [prompts, messages, followups, output_thinking, stream, temperature, tools]
  end

  def symbolize_followups(args)
    return nil if args[:followups].nil? && args[:followup].nil?

    followups = args[:followups] || [args[:followup]]
    followups.map do |followup|
      followup = followup.dup.symbolize_keys!
      message = followup[:message].dup.symbolize_keys!
      message[:type] = message[:type].to_sym if message[:type]
      followup[:message] = message
      followup
    end
  end

  def symbolize_tools(tools)
    return nil if tools.nil?

    tools.map do |tool|
      tool.symbolize_keys!
      tool.merge(
        parameters: tool[:parameters]&.map { |param| param.transform_keys(&:to_sym) },
      ).compact
    end
  end

  def with_tests_progress(total:)
    puts ""
    count = 0
    result =
      yield(
        -> do
          count += 1
          print "\rProcessing test #{count}/#{total}"
        end
      )
    print "\r\033[K"
    result
  end
end

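To make the fan-out concrete: `prompt_call` wraps a lone hash in an array, then runs every prompt × message combination of each test through `PromptSingleTestRunner`, returning one result hash per combination. A hedged sketch with invented values; `llm` is assumed to be the eval harness LLM wrapper (something responding to `#llm_model`):

# Hypothetical example: one test entry with two prompts and two messages
# produces four run_single_test calls and four result hashes.
test = {
  prompts: ["System prompt A", "System prompt B"],
  messages: ["first user message", "second user message"],
  temperature: 0.2,
  stream: false,
}

results = DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(test)
# => [
#   { prompt: "System prompt A", message: "first user message",  result: "..." },
#   { prompt: "System prompt A", message: "second user message", result: "..." },
#   { prompt: "System prompt B", message: "first user message",  result: "..." },
#   { prompt: "System prompt B", message: "second user message", result: "..." },
# ]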
lib/prompts/single_test_runner.rb

@@ -0,0 +1,76 @@
# frozen_string_literal: true

class DiscourseAi::Evals::PromptSingleTestRunner
  def initialize(llm)
    @llm = llm
  end

  # Run a single test with a prompt and message, and some model settings
  # @param prompt [String] the prompt to use
  # @param message [String] the message to use
  # @param followups [Array<Hash>] an array of followups (messages) to run after the initial prompt
  # @param output_thinking [Boolean] whether to output the thinking state of the model
  # @param stream [Boolean] whether to stream the output of the model
  # @param temperature [Float] the temperature to use when generating completions
  # @param tools [Array<Hash>] an array of tools to use when generating completions
  # @return [Hash] the prompt, message, and result of the test
  def run_single_test(prompt, message, followups, output_thinking, stream, temperature, tools)
    @c_prompt =
      DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: message }])
    @c_prompt.tools = tools if tools
    generate_result(temperature, output_thinking, stream)

    if followups
      followups.each do |followup|
        generate_followup(followup, output_thinking, stream, temperature)
      end
    end

    { prompt:, message:, result: @result }
  end

  private

  def generate_followup(followup, output_thinking, stream, temperature)
    @c_prompt.push_model_response(@result)
    followup_message = set_followup_tool(followup)
    @c_prompt.push(**followup_message)

    begin
      generate_result(temperature, output_thinking, stream)
    rescue => e
      # should not happen but it helps debugging...
      puts e
      result = []
    end
  end

  def set_followup_tool(followup)
    @c_prompt.tools = followup[:tools] if followup[:tools]
    followup_message = followup[:message]

    %i[id name].each do |key|
      if followup_message[key].is_a?(Array)
        type, inner_key = followup_message[key]
        # this allows us to dynamically set the id or name of the tool call
        prev = @c_prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
        followup_message[key] = prev[inner_key.to_sym] if prev
      end
    end

    followup_message
  end

  def generate_result(temperature, output_thinking, stream)
    @result =
      if stream
        stream_result = []
        @llm.generate(
          @c_prompt,
          user: Discourse.system_user,
          temperature:,
          output_thinking:,
        ) { |partial| stream_result << partial }
        stream_result
      else
        @llm.generate(@c_prompt, user: Discourse.system_user, temperature:, output_thinking:)
      end
  end
end

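One subtlety worth illustrating is `set_followup_tool`: when a followup message's `:id` or `:name` is a two-element array, the first element names a previous message type and the second the key to copy from it, so a followup tool result can reuse the id of the tool call the model just emitted. A hedged sketch with invented values (the message and tool names are hypothetical):

# Hypothetical followup: the :id below is resolved at run time by copying the
# :id of the most recent :tool_call message in the conversation, so the tool
# result lines up with the model's own call.
followup = {
  message: {
    type: :tool,
    id: %w[tool_call id], # replaced with the previous tool_call's :id
    name: "weather",
    content: { temperature: "22C" }.to_json,
  },
}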

@@ -6,7 +6,8 @@ require_relative "lib/llm"
 require_relative "lib/cli"
 require_relative "lib/runner"
 require_relative "lib/eval"
-require_relative "lib/prompt_evaluator"
+require_relative "lib/prompts/prompt_evaluator"
+require_relative "lib/prompts/single_test_runner"
 
 options = DiscourseAi::Evals::Cli.parse_options!