Dev: eval improvements (#1162)

Adds sonnet 3.7
Adds support for temp in eval framework
This commit is contained in:
Sam 2025-03-04 16:12:25 +11:00 committed by GitHub
parent f6eedf3e0b
commit 28af4434c5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 19 additions and 5 deletions

View File

@@ -39,6 +39,16 @@ llms:
max_prompt_tokens: 200000 max_prompt_tokens: 200000
vision_enabled: true vision_enabled: true
claude-3.7-sonnet:
display_name: Claude 3.7 Sonnet
name: claude-3-7-sonnet-latest
tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
api_key_env: ANTHROPIC_API_KEY
provider: anthropic
url: https://api.anthropic.com/v1/messages
max_prompt_tokens: 200000
vision_enabled: true
gemini-2.0-flash: gemini-2.0-flash:
display_name: Gemini 2.0 Flash display_name: Gemini 2.0 Flash
name: gemini-2-0-flash name: gemini-2-0-flash

View File

@@ -121,7 +121,7 @@ class DiscourseAi::Evals::Eval
def judge_result(result) def judge_result(result)
prompt = judge[:prompt].dup prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result) prompt.sub!("{{output}}", result)
prompt.sub!("{{input}}", args[:input]) args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
prompt += <<~SUFFIX prompt += <<~SUFFIX
@@ -145,7 +145,8 @@ class DiscourseAi::Evals::Eval
messages: [{ type: :user, content: prompt }], messages: [{ type: :user, content: prompt }],
) )
result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user) result =
judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user, temperature: 0)
if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]}) if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
rating = rating[1].to_i rating = rating[1].to_i
@@ -219,7 +220,7 @@ class DiscourseAi::Evals::Eval
upload.destroy if upload upload.destroy if upload
end end
def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false) def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
if tools if tools
tools.each do |tool| tools.each do |tool|
tool.symbolize_keys! tool.symbolize_keys!
@@ -230,16 +231,19 @@ class DiscourseAi::Evals::Eval
DiscourseAi::Completions::Prompt.new( DiscourseAi::Completions::Prompt.new(
system_prompt, system_prompt,
messages: [{ type: :user, content: message }], messages: [{ type: :user, content: message }],
tools: tools,
) )
prompt.tools = tools if tools
result = nil result = nil
if stream if stream
result = [] result = []
llm llm
.llm_model .llm_model
.to_llm .to_llm
.generate(prompt, user: Discourse.system_user) { |partial| result << partial } .generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
result << partial
end
else else
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user) result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
end end