Dev: eval improvements (#1162)
Adds Claude 3.7 Sonnet. Adds support for temperature in the eval framework.
parent f6eedf3e0b
commit 28af4434c5
@@ -39,6 +39,16 @@ llms:
     max_prompt_tokens: 200000
     vision_enabled: true
 
+  claude-3.7-sonnet:
+    display_name: Claude 3.7 Sonnet
+    name: claude-3-7-sonnet-latest
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    api_key_env: ANTHROPIC_API_KEY
+    provider: anthropic
+    url: https://api.anthropic.com/v1/messages
+    max_prompt_tokens: 200000
+    vision_enabled: true
+
   gemini-2.0-flash:
     display_name: Gemini 2.0 Flash
     name: gemini-2-0-flash
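The new entry follows the same shape as the other Anthropic models in the config. As a sanity check, a minimal sketch of reading the entry back out; the file path and this little loader are assumptions for illustration, not the framework's actual loading code:

require "yaml"

# Hypothetical loader: pull the new entry out of the eval LLM config
# and resolve its API key from the environment variable it names.
config = YAML.load_file("config/eval-llms.yml").dig("llms", "claude-3.7-sonnet")

api_key = ENV.fetch(config["api_key_env"]) # reads ANTHROPIC_API_KEY
puts "#{config["display_name"]} -> #{config["url"]} (#{config["name"]})"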
@@ -121,7 +121,7 @@ class DiscourseAi::Evals::Eval
   def judge_result(result)
     prompt = judge[:prompt].dup
     prompt.sub!("{{output}}", result)
-    prompt.sub!("{{input}}", args[:input])
+    args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
 
     prompt += <<~SUFFIX
 
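judge_result now substitutes every eval argument into the judge prompt, not just {{input}}. A self-contained illustration of the same pattern, with a made-up template and args:

# Each {{key}} placeholder is replaced by the matching eval argument,
# stringified first so non-string values (numbers, booleans) work too.
prompt = "Input: {{input}}\nExpected: {{expected_output}}"
args = { input: "2 + 2", expected_output: 4 }

args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
puts prompt
# Input: 2 + 2
# Expected: 4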
@@ -145,7 +145,8 @@ class DiscourseAi::Evals::Eval
       messages: [{ type: :user, content: prompt }],
     )
 
-    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+    result =
+      judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user, temperature: 0)
 
     if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
       rating = rating[1].to_i
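Pinning the judge to temperature: 0 keeps its ratings as repeatable as the provider allows. The rating is then read back out of the response; a standalone sketch of that parse using the same regex, with a made-up judge response:

# The judge wraps its score in [RATING]...[/RATING]; extract the digits.
result = "The answer is accurate and complete.\n[RATING]9[/RATING]"

if (match = result.match(%r{\[RATING\](\d+)\[/RATING\]}))
  rating = match[1].to_i
end
puts rating # => 9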
@@ -219,7 +220,7 @@ class DiscourseAi::Evals::Eval
       upload.destroy if upload
     end
 
-  def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
+  def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
     if tools
       tools.each do |tool|
        tool.symbolize_keys!
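With the new keyword, each eval case can pin sampling explicitly; temperature: nil leaves the provider default in place. A hedged usage sketch (llm and both prompts are placeholders, not values from this commit):

# temperature: 0 requests the most deterministic sampling the provider
# supports, which makes eval runs easier to reproduce.
result = prompt_call(
  llm,
  system_prompt: "You are a careful summarizer.",
  message: "Summarize the release notes in one sentence.",
  temperature: 0,
)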
@@ -230,16 +231,19 @@ class DiscourseAi::Evals::Eval
       DiscourseAi::Completions::Prompt.new(
         system_prompt,
         messages: [{ type: :user, content: message }],
-        tools: tools,
       )
 
+    prompt.tools = tools if tools
+
     result = nil
     if stream
       result = []
       llm
         .llm_model
         .to_llm
-        .generate(prompt, user: Discourse.system_user) { |partial| result << partial }
+        .generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
+          result << partial
+        end
     else
       result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
     end
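Two things change here: tools is now assigned after construction, so a nil value never reaches Prompt.new, and the streaming branch forwards temperature. A hedged sketch of the streaming call shape (llm is a placeholder, and the return value is assumed to be the collected partials):

# With stream: true the block receives partials as they arrive;
# prompt_call accumulates them in the result array.
partials = prompt_call(
  llm,
  system_prompt: "Answer briefly.",
  message: "What is 2 + 2?",
  temperature: 0,
  stream: true,
)
puts partials.join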