discourse-ai/evals/lib/runner.rb

#frozen_string_literal: true

class DiscourseAi::Evals::Runner
  class StructuredLogger
    def initialize
      @log = []
      @current_step = @log
    end

    def log(name, args: nil, start_time: nil, end_time: nil)
      start_time ||= Time.now.utc
      end_time ||= Time.now.utc
      args ||= {}
      object = { name: name, args: args, start_time: start_time, end_time: end_time }
      @current_step << object
    end

    def step(name, args: nil)
      start_time = Time.now.utc
      start_step = @current_step

      new_step = { type: :step, name: name, args: args || {}, log: [], start_time: start_time }

      @current_step << new_step
      @current_step = new_step[:log]
      yield new_step
      @current_step = start_step
      new_step[:end_time] = Time.now.utc
    end

    def to_trace_event_json
      trace_events = []
      process_id = 1
      thread_id = 1

      to_trace_event(@log, process_id, thread_id, trace_events)

      JSON.pretty_generate({ traceEvents: trace_events })
    end

    private

    def to_trace_event(log_items, pid, tid, trace_events, parent_start_time = nil)
      log_items.each do |item|
        if item.is_a?(Hash) && item[:type] == :step
          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "B", # Begin event
            pid: pid,
            tid: tid,
            args: item[:args],
            ts: timestamp_in_microseconds(item[:start_time]),
          }

          to_trace_event(item[:log], pid, tid, trace_events, item[:start_time])

          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "E", # End event
            pid: pid,
            tid: tid,
            ts: timestamp_in_microseconds(item[:end_time]),
          }
        else
          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "B",
            pid: pid,
            tid: tid,
            args: item[:args],
            ts: timestamp_in_microseconds(item[:start_time] || parent_start_time || Time.now.utc),
            s: "p", # Scope: process
          }
          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "E",
            pid: pid,
            tid: tid,
            ts: timestamp_in_microseconds(item[:end_time] || Time.now.utc),
            s: "p",
          }
        end
      end
    end

    def timestamp_in_microseconds(time)
      (time.to_f * 1_000_000).to_i
    end
  end

  attr_reader :llms, :cases

  def self.evals_paths
    @eval_paths ||= Dir.glob(File.join(File.join(__dir__, "../cases"), "*/*.yml"))
  end

  def self.evals
    @evals ||= evals_paths.map { |path| DiscourseAi::Evals::Eval.new(path: path) }
  end

  def self.print
    evals.each(&:print)
  end

  def initialize(eval_name:, llms:)
    @llms = llms
    @eval = self.class.evals.find { |c| c.id == eval_name }

    if !@eval
      puts "Error: Unknown evaluation '#{eval_name}'"
      exit 1
    end

    if @llms.empty?
      puts "Error: Unknown model 'model'"
      exit 1
    end
  end

  def run!
    puts "Running evaluation '#{@eval.id}'"

    structured_log_filename = "#{@eval.id}-#{Time.now.strftime("%Y%m%d-%H%M%S")}.json"
    log_filename = "#{@eval.id}-#{Time.now.strftime("%Y%m%d-%H%M%S")}.log"
    logs_dir = File.join(__dir__, "../log")
    FileUtils.mkdir_p(logs_dir)

    log_path = File.expand_path(File.join(logs_dir, log_filename))
    structured_log_path = File.expand_path(File.join(logs_dir, structured_log_filename))

    logger = Logger.new(File.open(log_path, "a"))
    logger.info("Starting evaluation '#{@eval.id}'")

    Thread.current[:llm_audit_log] = logger
    structured_logger = Thread.current[:llm_audit_structured_log] = StructuredLogger.new

    structured_logger.step("Evaluating #{@eval.id}", args: @eval.to_json) do
      llms.each do |llm|
        if @eval.vision && !llm.vision?
          logger.info("Skipping LLM: #{llm.name} as it does not support vision")
          next
        end

        structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
          logger.info("Evaluating with LLM: #{llm.name}")
          print "#{llm.name}: "
          results = @eval.run(llm: llm)

          results.each do |result|
            step[:args] = result
            step[:cname] = result[:result] == :pass ? :good : :bad

            if result[:result] == :fail
              puts "Failed 🔴"
              puts "Error: #{result[:message]}" if result[:message]
              # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
              #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
              if result[:expected_output] && result[:actual_output]
                puts "---- Expected ----\n#{result[:expected_output]}"
                puts "---- Actual ----\n#{result[:actual_output]}"
              end
              logger.error("Evaluation failed with LLM: #{llm.name}")
              logger.error("Error: #{result[:message]}") if result[:message]
              logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
              logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
              logger.error("Context: #{result[:context]}") if result[:context]
            elsif result[:result] == :pass
              puts "Passed 🟢"
              logger.info("Evaluation passed with LLM: #{llm.name}")
            else
              STDERR.puts "Error: Unknown result #{eval.inspect}"
              logger.error("Unknown result: #{eval.inspect}")
            end
          end
        end
      end
    end

    #structured_logger.save(structured_log_path)

    File.write("#{structured_log_path}", structured_logger.to_trace_event_json)

    puts
    puts "Log file: #{log_path}"
    puts "Structured log file (ui.perfetto.dev): #{structured_log_path}"

    # temp code
    # puts File.read(structured_log_path)
  end
end