DEV: eval support for tool calls (#1128)
Also fixes anthropic with no params, streaming calls
This commit is contained in:
parent
5e80f93e4c
commit
9a6aec2cf6
|
@ -9,7 +9,8 @@ class DiscourseAi::Evals::Eval
|
||||||
:args,
|
:args,
|
||||||
:vision,
|
:vision,
|
||||||
:expected_output,
|
:expected_output,
|
||||||
:expected_output_regex
|
:expected_output_regex,
|
||||||
|
:expected_tool_call
|
||||||
|
|
||||||
def initialize(path:)
|
def initialize(path:)
|
||||||
@yaml = YAML.load_file(path).symbolize_keys
|
@yaml = YAML.load_file(path).symbolize_keys
|
||||||
|
@ -24,6 +25,8 @@ class DiscourseAi::Evals::Eval
|
||||||
@expected_output_regex = @yaml[:expected_output_regex]
|
@expected_output_regex = @yaml[:expected_output_regex]
|
||||||
@expected_output_regex =
|
@expected_output_regex =
|
||||||
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
|
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
|
||||||
|
@expected_tool_call = @yaml[:expected_tool_call]
|
||||||
|
@expected_tool_call.symbolize_keys! if @expected_tool_call
|
||||||
|
|
||||||
@args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
|
@args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
|
||||||
:path,
|
:path,
|
||||||
|
@ -39,6 +42,8 @@ class DiscourseAi::Evals::Eval
|
||||||
pdf_to_text(llm, **args)
|
pdf_to_text(llm, **args)
|
||||||
when "image_to_text"
|
when "image_to_text"
|
||||||
image_to_text(llm, **args)
|
image_to_text(llm, **args)
|
||||||
|
when "prompt"
|
||||||
|
prompt_call(llm, **args)
|
||||||
end
|
end
|
||||||
|
|
||||||
if expected_output
|
if expected_output
|
||||||
|
@ -53,6 +58,19 @@ class DiscourseAi::Evals::Eval
|
||||||
else
|
else
|
||||||
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
|
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
|
||||||
end
|
end
|
||||||
|
elsif expected_tool_call
|
||||||
|
tool_call = result
|
||||||
|
|
||||||
|
if result.is_a?(Array)
|
||||||
|
tool_call = result.find { |r| r.is_a?(DiscourseAi::Completions::ToolCall) }
|
||||||
|
end
|
||||||
|
if !tool_call.is_a?(DiscourseAi::Completions::ToolCall) ||
|
||||||
|
(tool_call.name != expected_tool_call[:name]) ||
|
||||||
|
(tool_call.parameters != expected_tool_call[:params])
|
||||||
|
{ result: :fail, expected_output: expected_tool_call, actual_output: result }
|
||||||
|
else
|
||||||
|
{ result: :pass }
|
||||||
|
end
|
||||||
else
|
else
|
||||||
{ result: :unknown, actual_output: result }
|
{ result: :unknown, actual_output: result }
|
||||||
end
|
end
|
||||||
|
@ -133,4 +151,31 @@ class DiscourseAi::Evals::Eval
|
||||||
ensure
|
ensure
|
||||||
upload.destroy if upload
|
upload.destroy if upload
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
|
||||||
|
if tools
|
||||||
|
tools.each do |tool|
|
||||||
|
tool.symbolize_keys!
|
||||||
|
tool[:parameters].symbolize_keys! if tool[:parameters]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
prompt =
|
||||||
|
DiscourseAi::Completions::Prompt.new(
|
||||||
|
system_prompt,
|
||||||
|
messages: [{ type: :user, content: message }],
|
||||||
|
tools: tools,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = nil
|
||||||
|
if stream
|
||||||
|
result = []
|
||||||
|
llm
|
||||||
|
.llm_model
|
||||||
|
.to_llm
|
||||||
|
.generate(prompt, user: Discourse.system_user) { |partial| result << partial }
|
||||||
|
else
|
||||||
|
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
|
||||||
|
end
|
||||||
|
result
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -34,7 +34,8 @@ class DiscourseAi::Completions::AnthropicMessageProcessor
|
||||||
end
|
end
|
||||||
|
|
||||||
def to_tool_call
|
def to_tool_call
|
||||||
parameters = JSON.parse(raw_json, symbolize_names: true)
|
parameters = {}
|
||||||
|
parameters = JSON.parse(raw_json, symbolize_names: true) if raw_json.present?
|
||||||
# we dupe to avoid poisoning the original tool call
|
# we dupe to avoid poisoning the original tool call
|
||||||
@tool_call = @tool_call.dup
|
@tool_call = @tool_call.dup
|
||||||
@tool_call.partial = false
|
@tool_call.partial = false
|
||||||
|
|
Loading…
Reference in New Issue