mirror of
https://github.com/we-promise/sure.git
synced 2026-04-08 14:54:49 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
256 lines
8.5 KiB
Ruby
256 lines
8.5 KiB
Ruby
class Eval::Runners::ChatRunner < Eval::Runners::Base
|
|
# Chat samples are processed one at a time (not batched)
|
|
# because each has unique context and function calling requirements
|
|
|
|
protected
|
|
|
|
def process_samples
|
|
all_samples = samples.to_a
|
|
log_progress("Processing #{all_samples.size} chat samples")
|
|
|
|
all_samples.each_with_index do |sample, idx|
|
|
log_progress("Processing sample #{idx + 1}/#{all_samples.size}")
|
|
process_sample(sample)
|
|
end
|
|
end
|
|
|
|
def calculate_metrics
|
|
Eval::Metrics::ChatMetrics.new(eval_run).calculate
|
|
end
|
|
|
|
private
|
|
|
|
def process_sample(sample)
|
|
prompt = sample.chat_prompt
|
|
start_time = Time.current
|
|
|
|
begin
|
|
response = provider.chat_response(
|
|
prompt,
|
|
model: model,
|
|
instructions: build_instructions,
|
|
functions: build_function_definitions
|
|
)
|
|
|
|
latency_ms = ((Time.current - start_time) * 1000).to_i
|
|
|
|
if response.success?
|
|
record_chat_result(sample, response.data, latency_ms)
|
|
else
|
|
record_error_result(sample, response.error, latency_ms)
|
|
end
|
|
rescue => e
|
|
latency_ms = ((Time.current - start_time) * 1000).to_i
|
|
record_error_result(sample, e, latency_ms)
|
|
end
|
|
end
|
|
|
|
def record_chat_result(sample, chat_response, latency_ms)
|
|
# Extract function calls from response
|
|
actual_functions = extract_functions_from_response(chat_response)
|
|
|
|
# Extract response text
|
|
response_text = extract_response_text(chat_response)
|
|
|
|
# Evaluate function calling accuracy
|
|
expected_functions = sample.expected_functions
|
|
function_match = evaluate_function_match(actual_functions, expected_functions)
|
|
|
|
# Evaluate response content
|
|
expected_keywords = sample.expected_response_contains
|
|
response_match = evaluate_response_contains(response_text, expected_keywords)
|
|
|
|
# Overall correctness: functions are correct AND response contains expected keywords
|
|
correct = function_match[:correct] && response_match
|
|
|
|
record_result(
|
|
sample: sample,
|
|
actual_output: {
|
|
"functions" => actual_functions,
|
|
"response_text" => response_text,
|
|
"function_match_details" => function_match
|
|
},
|
|
correct: correct,
|
|
exact_match: function_match[:exact_match],
|
|
latency_ms: latency_ms,
|
|
metadata: {
|
|
"function_selection_correct" => function_match[:selection_correct],
|
|
"parameter_accuracy" => function_match[:parameter_accuracy],
|
|
"response_keywords_found" => response_match,
|
|
"expected_functions" => expected_functions,
|
|
"expected_keywords" => expected_keywords
|
|
}
|
|
)
|
|
end
|
|
|
|
def record_error_result(sample, error, latency_ms)
|
|
error_message = error.is_a?(Exception) ? error.message : error.to_s
|
|
|
|
record_result(
|
|
sample: sample,
|
|
actual_output: { "error" => error_message },
|
|
correct: false,
|
|
exact_match: false,
|
|
latency_ms: latency_ms,
|
|
metadata: { "error" => error_message }
|
|
)
|
|
end
|
|
|
|
def extract_functions_from_response(chat_response)
|
|
# ChatResponse has function_requests array
|
|
function_requests = chat_response.function_requests || []
|
|
|
|
function_requests.map do |req|
|
|
{
|
|
"name" => req.function_name,
|
|
"params" => parse_function_args(req.function_args)
|
|
}
|
|
end
|
|
end
|
|
|
|
def parse_function_args(args)
|
|
return {} if args.nil?
|
|
return args if args.is_a?(Hash)
|
|
JSON.parse(args)
|
|
rescue JSON::ParserError
|
|
{}
|
|
end
|
|
|
|
def extract_response_text(chat_response)
|
|
# ChatResponse has messages array with output_text
|
|
messages = chat_response.messages || []
|
|
messages.map(&:output_text).compact.join("\n")
|
|
end
|
|
|
|
def evaluate_function_match(actual_functions, expected_functions)
|
|
return { correct: true, exact_match: true, selection_correct: true, parameter_accuracy: 1.0 } if expected_functions.empty? && actual_functions.empty?
|
|
return { correct: false, exact_match: false, selection_correct: false, parameter_accuracy: 0.0 } if expected_functions.empty? && actual_functions.any?
|
|
|
|
# Check function selection accuracy
|
|
expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact
|
|
actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact
|
|
|
|
selection_correct = expected_names.all? { |name| actual_names.include?(name) }
|
|
|
|
# Check parameter accuracy for matched functions
|
|
param_scores = []
|
|
expected_functions.each do |expected_func|
|
|
expected_name = normalize_function_name(expected_func["name"])
|
|
actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name }
|
|
|
|
if actual_func
|
|
param_score = evaluate_parameters(actual_func["params"], expected_func["params"] || {})
|
|
param_scores << param_score
|
|
else
|
|
param_scores << 0.0
|
|
end
|
|
end
|
|
|
|
parameter_accuracy = param_scores.empty? ? 0.0 : (param_scores.sum / param_scores.size).round(4)
|
|
|
|
# Exact match requires same functions with same parameters
|
|
exact_match = selection_correct && parameter_accuracy == 1.0
|
|
|
|
# Correct if all expected functions were called (parameters don't have to be exact)
|
|
correct = selection_correct
|
|
|
|
{
|
|
correct: correct,
|
|
exact_match: exact_match,
|
|
selection_correct: selection_correct,
|
|
parameter_accuracy: parameter_accuracy
|
|
}
|
|
end
|
|
|
|
def normalize_function_name(name)
|
|
return nil if name.nil?
|
|
# Convert to snake_case and downcase
|
|
name.to_s.underscore.downcase
|
|
end
|
|
|
|
def evaluate_parameters(actual_params, expected_params)
|
|
return 1.0 if expected_params.empty?
|
|
return 0.0 if actual_params.nil?
|
|
|
|
actual_params = actual_params.stringify_keys
|
|
expected_params = expected_params.stringify_keys
|
|
|
|
matches = 0
|
|
total = expected_params.size
|
|
|
|
expected_params.each do |key, expected_value|
|
|
actual_value = actual_params[key]
|
|
|
|
if values_match?(actual_value, expected_value)
|
|
matches += 1
|
|
end
|
|
end
|
|
|
|
(matches.to_f / total).round(4)
|
|
end
|
|
|
|
def values_match?(actual, expected)
|
|
return true if actual == expected
|
|
return true if actual.to_s.downcase == expected.to_s.downcase
|
|
|
|
# For arrays, check if all expected values are present
|
|
if expected.is_a?(Array) && actual.is_a?(Array)
|
|
expected_normalized = expected.map { |v| v.to_s.downcase }
|
|
actual_normalized = actual.map { |v| v.to_s.downcase }
|
|
return expected_normalized.all? { |v| actual_normalized.include?(v) }
|
|
end
|
|
|
|
# For dates, try to parse and compare
|
|
if expected.to_s =~ /^\d{4}-\d{2}-\d{2}$/
|
|
begin
|
|
expected_date = Date.parse(expected.to_s)
|
|
actual_date = Date.parse(actual.to_s)
|
|
return expected_date == actual_date
|
|
rescue
|
|
# Not valid dates, fall through
|
|
end
|
|
end
|
|
|
|
false
|
|
end
|
|
|
|
def evaluate_response_contains(response_text, expected_keywords)
|
|
return true if expected_keywords.empty?
|
|
return false if response_text.nil? || response_text.empty?
|
|
|
|
normalized_response = response_text.downcase
|
|
|
|
expected_keywords.all? do |keyword|
|
|
normalized_response.include?(keyword.to_s.downcase)
|
|
end
|
|
end
|
|
|
|
def build_instructions
|
|
# Simple instructions for evaluation - we don't have a real user/family context
|
|
<<~PROMPT
|
|
You are a financial assistant helping users understand their financial data.
|
|
Use the functions available to answer questions about accounts, transactions, and financial statements.
|
|
Today's date is #{Date.current}.
|
|
PROMPT
|
|
end
|
|
|
|
def build_function_definitions
|
|
# Return the function definitions that the chat would normally have
|
|
[
|
|
build_function_definition("get_transactions", "Get paginated transactions with optional filters"),
|
|
build_function_definition("get_accounts", "Get all accounts with balances and historical data"),
|
|
build_function_definition("get_balance_sheet", "Get current net worth, assets, and liabilities"),
|
|
build_function_definition("get_income_statement", "Get income and expenses by category for a period")
|
|
]
|
|
end
|
|
|
|
def build_function_definition(name, description)
|
|
{
|
|
name: name,
|
|
description: description,
|
|
params_schema: { type: "object", properties: {}, additionalProperties: true },
|
|
strict: false
|
|
}
|
|
end
|
|
end
|