From 5bf61ef9e1fc9372322914848775c2f5af780fe9 Mon Sep 17 00:00:00 2001 From: Natalie Tay Date: Tue, 18 Mar 2025 11:42:05 +0800 Subject: [PATCH] DEV: Support multiple tests per eval and followups per test (#1199) See https://github.com/discourse/discourse-ai-evals/pull/9 for format of prompts --- evals/lib/eval.rb | 15 +++-- evals/lib/prompt_evaluator.rb | 52 --------------- evals/lib/prompts/prompt_evaluator.rb | 84 +++++++++++++++++++++++++ evals/lib/prompts/single_test_runner.rb | 76 ++++++++++++++++++++++ evals/run | 3 +- 5 files changed, 171 insertions(+), 59 deletions(-) delete mode 100644 evals/lib/prompt_evaluator.rb create mode 100644 evals/lib/prompts/prompt_evaluator.rb create mode 100644 evals/lib/prompts/single_test_runner.rb diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 4e84ff32..a39c6bd7 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -29,7 +29,6 @@ class DiscourseAi::Evals::Eval @id = @yaml[:id] @description = @yaml[:description] @vision = @yaml[:vision] - @args = @yaml[:args]&.symbolize_keys @type = @yaml[:type] @expected_output = @yaml[:expected_output] @expected_output_regex = @yaml[:expected_output_regex] @@ -39,10 +38,14 @@ class DiscourseAi::Evals::Eval @expected_tool_call.symbolize_keys! if @expected_tool_call @judge = @yaml[:judge] @judge.symbolize_keys! if @judge - - @args.each do |key, value| - if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) - @args[key] = File.expand_path(File.join(File.dirname(path), value)) + if @yaml[:args].is_a?(Array) + @args = @yaml[:args].map(&:symbolize_keys) + else + @args = @yaml[:args].symbolize_keys + @args.each do |key, value| + if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) + @args[key] = File.expand_path(File.join(File.dirname(path), value)) + end end end end @@ -57,7 +60,7 @@ class DiscourseAi::Evals::Eval when "image_to_text" image_to_text(llm, **args) when "prompt" - DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args) + DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(args) when "edit_artifact" edit_artifact(llm, **args) when "summarization" diff --git a/evals/lib/prompt_evaluator.rb b/evals/lib/prompt_evaluator.rb deleted file mode 100644 index d6ca3330..00000000 --- a/evals/lib/prompt_evaluator.rb +++ /dev/null @@ -1,52 +0,0 @@ -# frozen_string_literal: true - -class DiscourseAi::Evals::PromptEvaluator - def initialize(llm) - @llm = llm.llm_model.to_llm - end - - def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false) - tools = symbolize_tools(tools) - total = prompts.size * messages.size - count = 0 - puts "" - - prompts.flat_map do |prompt| - messages.map do |content| - count += 1 - print "\rProcessing #{count}/#{total}" - - c_prompt = - DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }]) - c_prompt.tools = tools if tools - result = { prompt:, message: content } - result[:result] = generate_result(c_prompt, temperature, stream) - result - end - end - ensure - print "\r\033[K" - end - - private - - def generate_result(c_prompt, temperature, stream) - if stream - stream_result = [] - @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial| - stream_result << partial - end - stream_result - else - @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) - end - end - - def symbolize_tools(tools) - return nil if tools.nil? 
- - tools.map do |tool| - { name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact - end - end -end diff --git a/evals/lib/prompts/prompt_evaluator.rb b/evals/lib/prompts/prompt_evaluator.rb new file mode 100644 index 00000000..d526243d --- /dev/null +++ b/evals/lib/prompts/prompt_evaluator.rb @@ -0,0 +1,84 @@ +# frozen_string_literal: true + +class DiscourseAi::Evals::PromptEvaluator + def initialize(llm) + @llm = llm.llm_model.to_llm + end + + def prompt_call(args) + args = [args] if !args.is_a?(Array) + runner = DiscourseAi::Evals::PromptSingleTestRunner.new(@llm) + + with_tests_progress(total: args.size) do |bump_progress| + args.flat_map do |test| + bump_progress.call + + prompts, messages, followups, output_thinking, stream, temperature, tools = + symbolize_test_args(test) + + prompts.flat_map do |prompt| + messages.map do |message| + runner.run_single_test( + prompt, + message, + followups, + output_thinking, + stream, + temperature, + tools, + ) + end + end + end + end + end + + private + + def symbolize_test_args(args) + prompts = args[:prompts] || [args[:prompt]] + messages = args[:messages] || [args[:message]] + followups = symbolize_followups(args) + output_thinking = args[:output_thinking] || false + stream = args[:stream] || false + temperature = args[:temperature] + tools = symbolize_tools(args[:tools]) + [prompts, messages, followups, output_thinking, stream, temperature, tools] + end + + def symbolize_followups(args) + return nil if args[:followups].nil? && args[:followup].nil? + followups = args[:followups] || [args[:followup]] + followups.map do |followup| + followup = followup.dup.symbolize_keys! + message = followup[:message].dup.symbolize_keys! + message[:type] = message[:type].to_sym if message[:type] + followup[:message] = message + followup + end + end + + def symbolize_tools(tools) + return nil if tools.nil? + tools.map do |tool| + tool.symbolize_keys! 
+      tool.merge(
+        parameters: tool[:parameters]&.map { |param| param.transform_keys(&:to_sym) },
+      ).compact
+    end
+  end
+
+  def with_tests_progress(total:)
+    puts ""
+    count = 0
+    result =
+      yield(
+        -> do
+          count += 1
+          print "\rProcessing test #{count}/#{total}"
+        end
+      )
+    print "\r\033[K"
+    result
+  end
+end
diff --git a/evals/lib/prompts/single_test_runner.rb b/evals/lib/prompts/single_test_runner.rb
new file mode 100644
index 00000000..6e7c43f8
--- /dev/null
+++ b/evals/lib/prompts/single_test_runner.rb
@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+
+class DiscourseAi::Evals::PromptSingleTestRunner
+  def initialize(llm)
+    @llm = llm
+  end
+
+  # Run a single test with a prompt and message, and some model settings
+  # @param prompt [String] the prompt to use
+  # @param message [String] the message to use
+  # @param followups [Array] an array of followups (messages) to run after the initial prompt
+  # @param output_thinking [Boolean] whether to output the thinking state of the model
+  # @param stream [Boolean] whether to stream the output of the model
+  # @param temperature [Float] the temperature to use when generating completions
+  # @param tools [Array] an array of tools to use when generating completions
+  # @return [Hash] the prompt, message, and result of the test
+  def run_single_test(prompt, message, followups, output_thinking, stream, temperature, tools)
+    @c_prompt =
+      DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: message }])
+    @c_prompt.tools = tools if tools
+    generate_result(temperature, output_thinking, stream)
+
+    if followups
+      followups.each do |followup|
+        generate_followup(followup, output_thinking, stream, temperature)
+      end
+    end
+
+    { prompt:, message:, result: @result }
+  end
+
+  private
+
+  def generate_followup(followup, output_thinking, stream, temperature)
+    @c_prompt.push_model_response(@result)
+    followup_message = set_followup_tool(followup)
+    @c_prompt.push(**followup_message)
+    begin
+      generate_result(temperature, output_thinking, stream)
+    rescue => e
+      # should not happen, but logging the error helps debugging
+      puts e
+      @result = []
+    end
+  end
+
+  def set_followup_tool(followup)
+    @c_prompt.tools = followup[:tools] if followup[:tools]
+    followup_message = followup[:message]
+    %i[id name].each do |key|
+      if followup_message[key].is_a?(Array)
+        type, inner_key = followup_message[key]
+        # this allows us to dynamically set the id or name of the tool call
+        prev = @c_prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
+        followup_message[key] = prev[inner_key.to_sym] if prev
+      end
+    end
+    followup_message
+  end
+
+  def generate_result(temperature, output_thinking, stream)
+    @result =
+      if stream
+        stream_result = []
+        @llm.generate(
+          @c_prompt,
+          user: Discourse.system_user,
+          temperature:,
+          output_thinking:,
+        ) { |partial| stream_result << partial }
+        stream_result
+      else
+        @llm.generate(@c_prompt, user: Discourse.system_user, temperature:, output_thinking:)
+      end
+  end
+end
diff --git a/evals/run b/evals/run
index 8c133eb2..8aa6c4ba 100755
--- a/evals/run
+++ b/evals/run
@@ -6,7 +6,8 @@ require_relative "lib/llm"
 require_relative "lib/cli"
 require_relative "lib/runner"
 require_relative "lib/eval"
-require_relative "lib/prompt_evaluator"
+require_relative "lib/prompts/prompt_evaluator"
+require_relative "lib/prompts/single_test_runner"
 
 options = DiscourseAi::Evals::Cli.parse_options!
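With this change an eval's args can be either a single hash, as before (with "path"/"_path" values still expanded relative to the eval file), or an array of per-test hashes that eval.rb hands to PromptEvaluator#prompt_call unmodified. A rough sketch of what a multi-test "prompt" eval looks like once its YAML is symbolized into a Ruby hash; the id and texts are invented, fields such as expected_output and judge are omitted, and the authoritative format lives in the discourse-ai-evals PR linked above:

{
  id: "example-multi-test", # illustrative only, not from this patch
  type: "prompt",
  args: [
    # test 1: plural keys fan out, 2 prompts x 2 messages = 4 runs
    {
      prompts: ["Reply tersely.", "Reply verbosely."],
      messages: ["What is Discourse?", "What is Ruby on Rails?"],
      temperature: 0,
    },
    # test 2: singular keys are wrapped into one-element arrays
    { prompt: "You are an echo bot.", message: "hello", stream: true },
  ],
}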
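PromptEvaluator#prompt_call accepts that array (a lone hash is wrapped into one), resolves each test's settings via symbolize_test_args, and runs one PromptSingleTestRunner test per (prompt, message) pair while with_tests_progress prints a counter. A minimal calling sketch, assuming llm stands in for the eval LLM wrapper the constructor expects (it calls llm.llm_model.to_llm):

evaluator = DiscourseAi::Evals::PromptEvaluator.new(llm)

results =
  evaluator.prompt_call({ prompts: ["Reply in French.", "Reply in German."], messages: ["Good morning"] })

# Two result hashes, one per (prompt, message) pair. When stream: true,
# :result holds the array of streamed partials; otherwise the completion itself.
results.each { |r| puts "#{r[:prompt]} | #{r[:message]} | #{r[:result]}" }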
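Followups continue the same conversation: generate_followup pushes the previous model response, then the followup message, and generates again. Via set_followup_tool, a followup message's :id or :name may be a [message_type, key] pair that is resolved at run time from the most recent message of that type, which is how a tool result can echo back a model-chosen tool-call id. A sketch of one such test; the weather tool and payloads are invented, and the "tool_call"/"tool" message types are assumed from DiscourseAi::Completions::Prompt rather than defined in this patch:

{
  prompt: "You can call tools to answer questions.",
  message: "What is the weather in Sydney?",
  tools: [
    { "name" => "get_weather", "parameters" => [{ "name" => "city", "type" => "string" }] },
  ],
  followups: [
    {
      "message" => {
        "type" => "tool",
        # resolved at run time to the id/name of the latest tool_call message
        "id" => ["tool_call", "id"],
        "name" => ["tool_call", "name"],
        "content" => "{\"temperature\": 22}",
      },
    },
  ],
}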