# frozen_string_literal: true

# A single evaluation case loaded from a YAML definition file.
#
# The YAML file supplies the eval's metadata (name, id, description, type),
# its arguments, and exactly one way of judging the LLM output:
#   * +expected_output+       — exact string match
#   * +expected_output_regex+ — multiline regex match
#   * +expected_tool_call+    — expected tool name + params
#   * +judge+                 — a second LLM rates the output 1..10
# If none are present the eval always passes.
class DiscourseAi::Evals::Eval
  attr_reader :type,
              :path,
              :name,
              :description,
              :id,
              :args,
              :vision,
              :expected_output,
              :expected_output_regex,
              :expected_tool_call,
              :judge

  # Raised when an eval step fails in a way that carries structured
  # context (e.g. the list of failed artifact searches) for reporting.
  class EvalError < StandardError
    attr_reader :context

    def initialize(message, context)
      super(message)
      @context = context
    end
  end

  # @param path [String] filesystem path to the eval's YAML definition.
  #   Any arg whose key is "path" or contains "_path" is resolved
  #   relative to the YAML file's directory.
  def initialize(path:)
    @yaml = YAML.load_file(path).symbolize_keys
    @path = path
    @name = @yaml[:name]
    @id = @yaml[:id]
    @description = @yaml[:description]
    @vision = @yaml[:vision]
    @type = @yaml[:type]
    @expected_output = @yaml[:expected_output]
    @expected_output_regex = @yaml[:expected_output_regex]
    @expected_output_regex =
      Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
    @expected_tool_call = @yaml[:expected_tool_call]
    @expected_tool_call.symbolize_keys! if @expected_tool_call
    @judge = @yaml[:judge]
    @judge.symbolize_keys! if @judge

    if @yaml[:args].is_a?(Array)
      @args = @yaml[:args].map(&:symbolize_keys)
    else
      @args = @yaml[:args].symbolize_keys
      @args.each do |key, value|
        # Resolve path-like args relative to the YAML file so evals can be
        # run from any working directory.
        if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
          @args[key] = File.expand_path(File.join(File.dirname(path), value))
        end
      end
    end
  end

  # Runs the eval against +llm+ and classifies the output.
  #
  # @param llm [DiscourseAi::Evals::Llm] the model under test
  # @return [Array<Hash>] classified results ({ result: :pass } / :fail),
  #   or a single Hash ({ result: :fail, ... }) when an EvalError occurs —
  #   NOTE(review): callers appear to tolerate both shapes; confirm before
  #   normalizing.
  def run(llm:)
    result =
      case type
      when "helper"
        helper(llm, **args)
      when "pdf_to_text"
        pdf_to_text(llm, **args)
      when "image_to_text"
        image_to_text(llm, **args)
      when "prompt"
        DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(args)
      when "edit_artifact"
        edit_artifact(llm, **args)
      when "summarization"
        summarization(llm, **args)
      end

    classify_results(result)
  rescue EvalError => e
    { result: :fail, message: e.message, context: e.context }
  end

  # Prints a one-line summary. (Deliberately shadows Kernel#print for this
  # object — existing callers rely on the name.)
  def print
    puts "#{id}: #{description}"
  end

  # Despite the name, returns a compacted Hash (not a JSON string);
  # callers serialize it themselves.
  def to_json
    {
      type: @type,
      path: @path,
      name: @name,
      description: @description,
      id: @id,
      args: @args,
      vision: @vision,
      expected_output: @expected_output,
      expected_output_regex: @expected_output_regex,
    }.compact
  end

  private

  # @param result [String, Array] the result of the eval, either
  #   "llm response" or [{ result: "llm response", other_attrs: here }]
  # @return [Array<Hash>] one hash per result, each classified as
  #   pass or fail (extra attributes preserved)
  def classify_results(result)
    if result.is_a?(Array)
      result.each { |r| r.merge!(classify_result_pass_fail(r)) }
    else
      [classify_result_pass_fail(result)]
    end
  end

  # Classifies a single result against whichever expectation the YAML
  # defined; with no expectation configured the result passes.
  def classify_result_pass_fail(result)
    if expected_output
      if result == expected_output
        { result: :pass }
      else
        { result: :fail, expected_output: expected_output, actual_output: result }
      end
    elsif expected_output_regex
      if result.to_s.match?(expected_output_regex)
        { result: :pass }
      else
        { result: :fail, expected_output: expected_output_regex, actual_output: result }
      end
    elsif expected_tool_call
      tool_call = result
      tool_call = result.find { |r| r.is_a?(DiscourseAi::Completions::ToolCall) } if result.is_a?(
        Array,
      )

      if !tool_call.is_a?(DiscourseAi::Completions::ToolCall) ||
           (tool_call.name != expected_tool_call[:name]) ||
           (tool_call.parameters != expected_tool_call[:params])
        { result: :fail, expected_output: expected_tool_call, actual_output: result }
      else
        { result: :pass }
      end
    elsif judge
      judge_result(result)
    else
      { result: :pass }
    end
  end

  # Asks the configured judge LLM to rate the output from 1 to 10 and
  # compares the parsed rating against judge[:pass_rating].
  # An unparseable judge reply counts as rating 0 (fail).
  def judge_result(result)
    prompt = judge[:prompt].dup
    if result.is_a?(String)
      prompt.sub!("{{output}}", result)
      args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
    else
      prompt.sub!("{{output}}", result[:result])
      result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
    end

    prompt += <<~SUFFIX

      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.

      example output:

      [RATING]10[/RATING] perfect output

      example output:

      [RATING]5[/RATING] the following failed to preserve... etc...
    SUFFIX

    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first

    # BUGFIX: this Prompt object was previously constructed and discarded —
    # the raw string was sent instead, so the judge never received the
    # system instruction. Pass the Prompt to generate.
    judge_prompt =
      DiscourseAi::Completions::Prompt.new(
        "You are an expert judge tasked at testing LLM outputs.",
        messages: [{ type: :user, content: prompt }],
      )

    result =
      judge_llm.llm_model.to_llm.generate(judge_prompt, user: Discourse.system_user, temperature: 0)

    rating = nil
    if (match = result.match(%r{\[RATING\](\d+)\[/RATING\]}))
      rating = match[1].to_i
    end

    if rating.to_i >= judge[:pass_rating]
      { result: :pass }
    else
      {
        result: :fail,
        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
        context: result,
      }
    end
  end

  # Runs an AI-helper completion prompt (e.g. proofread/translate) and
  # returns the first suggestion. When +locale+ is given, a throwaway
  # admin user with that effective_locale is used instead of system.
  def helper(llm, input:, name:, locale: nil)
    completion_prompt = CompletionPrompt.find_by(name: name)
    helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
    user = Discourse.system_user
    if locale
      user = User.new
      # Singleton accessor: Assistant reads user.effective_locale.
      class << user
        attr_accessor :effective_locale
      end
      user.effective_locale = locale
      user.admin = true
    end
    result =
      helper.generate_and_send_prompt(
        completion_prompt,
        input,
        user, # current_user
        false, # force_default_locale
      )
    result[:suggestions].first
  end

  # OCRs an image upload via the LLM, returning the concatenated text.
  # The upload is always destroyed, even on failure.
  def image_to_text(llm, path:)
    upload =
      UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)

    text = +""
    DiscourseAi::Utils::ImageToText
      .new(upload: upload, llm_model: llm.llm_model, user: Discourse.system_user)
      .extract_text do |chunk, _error|
        text << chunk if chunk
        text << "\n\n" if chunk
      end
    text
  ensure
    upload.destroy if upload
  end

  # Extracts text from a PDF upload via the LLM, returning the
  # concatenated text. The upload is always destroyed.
  def pdf_to_text(llm, path:)
    upload =
      UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)

    text = +""
    DiscourseAi::Utils::PdfToText
      .new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model)
      .extract_text do |chunk|
        text << chunk if chunk
        text << "\n\n" if chunk
      end
    text
  ensure
    upload.destroy if upload
  end

  # Creates a temporary AiArtifact from the given files, asks the LLM to
  # apply +instructions+ via the Diff strategy, and returns the resulting
  # { css:, js:, html: }.
  #
  # @raise [EvalError] if any diff search failed to apply, or if the
  #   resulting JS does not parse.
  def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
    css = File.read(css_path)
    js = File.read(js_path)
    html = File.read(html_path)
    instructions = File.read(instructions_path)

    artifact =
      AiArtifact.create!(
        css: css,
        js: js,
        html: html,
        user_id: Discourse.system_user.id,
        post_id: 1,
        name: "eval artifact",
      )

    post = Post.new(topic_id: 1, id: 1)
    diff =
      DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
        llm: llm.llm_model.to_llm,
        post: post,
        user: Discourse.system_user,
        artifact: artifact,
        artifact_version: nil,
        instructions: instructions,
      )
    diff.apply

    if diff.failed_searches.present?
      puts "Eval Errors encountered"
      p diff.failed_searches
      raise EvalError.new("Failed to apply all changes", diff.failed_searches)
    end

    version = artifact.versions.last
    raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)

    output = { css: version.css, js: version.js, html: version.html }
    artifact.destroy
    output
  end

  # Syntax-checks +str+ by running `node --check` against a temp file.
  # Returns true/false and never raises.
  # (BUGFIX: removed leftover debug write to the fixed path /tmp/test.js —
  # it duplicated the Tempfile and was a world-writable-path hazard.)
  def valid_javascript?(str)
    require "open3"

    Tempfile.create(%w[test .js]) do |f|
      f.write(str)
      f.flush

      begin
        Discourse::Utils.execute_command(
          "node",
          "--check",
          f.path,
          failure_message: "Invalid JavaScript syntax",
          timeout: 30, # seconds — guards against node hanging
        )
        true
      rescue Discourse::Utils::CommandError
        false
      end
    end
  rescue StandardError
    false
  end

  # Summarizes +input+ as if it were the sole post of an ephemeral topic
  # (negative ids keep it out of the real dataset) and returns the text.
  def summarization(llm, input:)
    topic =
      Topic.new(
        category: Category.last,
        title: "Eval topic for topic summarization",
        id: -99,
        user_id: Discourse.system_user.id,
      )
    Post.new(topic: topic, id: -99, user_id: Discourse.system_user.id, raw: input)

    strategy =
      DiscourseAi::Summarization::FoldContent.new(
        llm.llm_proxy,
        DiscourseAi::Summarization::Strategies::TopicSummary.new(topic),
      )

    summary = DiscourseAi::TopicSummarization.new(strategy, Discourse.system_user).summarize
    summary.summarized_text
  end
end