mirror of
https://github.com/we-promise/sure.git
synced 2026-04-09 07:14:47 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
126 lines
4.0 KiB
Ruby
126 lines
4.0 KiB
Ruby
# Aggregates evaluation metrics for chat / function-calling eval runs.
#
# Several helpers used here (+results+, +total_count+, +correct_count+,
# +accuracy+, +avg_latency_ms+, +total_cost+, +cost_per_sample+,
# +metrics_by_difficulty+) are not defined in this file; presumably they are
# inherited from Eval::Metrics::Base -- confirm against that class.
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
  # Builds the complete metrics summary for the run.
  #
  # @return [Hash{Symbol => Object}] one entry per metric; rate-style metrics
  #   computed in this class are percentages rounded to two decimals.
  def calculate
    {
      accuracy: accuracy,
      function_selection_accuracy: function_selection_accuracy,
      parameter_accuracy: parameter_accuracy,
      response_relevance: response_relevance,
      exact_match_rate: exact_match_rate,
      error_rate: error_rate,
      avg_functions_per_response: avg_functions_per_response,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_function: metrics_by_function
    }
  end

  private

    # Results for which `metadata->>'error'` is SQL NULL, loaded into memory
    # once so callers can test emptiness, iterate and take the size without
    # issuing additional queries (and without the denominator drifting from
    # the rows that were actually scored).
    #
    # @return [Array] loaded result records
    def results_without_metadata_error
      results.where("metadata->>'error' IS NULL").to_a
    end

    # Percentage of non-errored results whose metadata flags the function
    # selection as correct (strictly `true`).
    #
    # @return [Float] 0.0 when there are no non-errored results
    def function_selection_accuracy
      rows = results_without_metadata_error
      return 0.0 if rows.empty?

      correct = rows.count { |r| r.metadata&.dig("function_selection_correct") == true }

      (correct.to_f / rows.size * 100).round(2)
    end

    # Mean of the per-result "parameter_accuracy" metadata value across
    # non-errored results, expressed as a percentage. Missing values count
    # as 0.0.
    #
    # @return [Float] 0.0 when there are no non-errored results
    def parameter_accuracy
      rows = results_without_metadata_error
      return 0.0 if rows.empty?

      scores = rows.map { |r| r.metadata&.dig("parameter_accuracy") || 0.0 }

      # to_f guards against integer division when every stored score is a
      # JSON integer (e.g. 0 or 1), which would otherwise truncate the mean.
      (scores.sum.to_f / scores.size * 100).round(2)
    end

    # Percentage of non-errored results whose response contained the
    # expected keywords. A result with no expected keywords is treated as
    # relevant.
    #
    # @return [Float] 0.0 when there are no non-errored results
    def response_relevance
      rows = results_without_metadata_error
      return 0.0 if rows.empty?

      relevant = rows.count do |r|
        expected_keywords = r.metadata&.dig("expected_keywords") || []
        expected_keywords.empty? || r.metadata&.dig("response_keywords_found") == true
      end

      (relevant.to_f / rows.size * 100).round(2)
    end

    # Percentage of all results with `exact_match` set to true.
    #
    # @return [Float] 0.0 when there are no results
    def exact_match_rate
      return 0.0 if total_count.zero?

      (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
    end

    # Percentage of all results that carry an "error" entry in either their
    # metadata or their actual output.
    #
    # @return [Float] 0.0 when there are no results
    def error_rate
      return 0.0 if total_count.zero?

      errors = results.count do |r|
        # Safe navigation: a failed result may have no output/metadata at all.
        r.metadata&.dig("error").present? || r.actual_output&.dig("error").present?
      end

      (errors.to_f / total_count * 100).round(2)
    end

    # Average number of entries under "functions" in the actual output.
    #
    # NOTE: unlike the accuracy metrics above, this filters on
    # `actual_output->>'error'` rather than `metadata->>'error'`.
    #
    # @return [Float] rounded to two decimals; 0.0 when no rows qualify
    def avg_functions_per_response
      rows = results.where("actual_output->>'error' IS NULL").to_a
      return 0.0 if rows.empty?

      # Rows with a NULL actual_output also satisfy the filter above, so the
      # output must be dereferenced nil-safely.
      total_functions = rows.sum { |r| (r.actual_output&.dig("functions") || []).size }

      (total_functions.to_f / rows.size).round(2)
    end

    # Per-function breakdown keyed by expected function name.
    #
    # A function counts as "correct" for a result when any actually-called
    # function has the same normalized name. The result's overall
    # "parameter_accuracy" metadata value is accumulated only for those
    # correct hits.
    #
    # @return [Hash{String => Hash}] per function: :total, :correct,
    #   :accuracy (percent) and :avg_param_accuracy (percent over correct hits)
    def metrics_by_function
      function_metrics = {}

      results.includes(:sample).each do |result|
        actual_names = (result.actual_output&.dig("functions") || []).map { |f| normalize_name(f["name"]) }
        param_accuracy = result.metadata&.dig("parameter_accuracy") || 0.0

        (result.sample.expected_functions || []).each do |func|
          name = func["name"]
          next if name.nil?

          # Float seed keeps the later average from truncating when the
          # stored accuracies are all integers.
          stats = (function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0.0 })
          stats[:total] += 1

          next unless actual_names.include?(normalize_name(name))

          stats[:correct] += 1
          stats[:param_accuracy_sum] += param_accuracy
        end
      end

      function_metrics.transform_values do |stats|
        hits = stats[:correct]

        {
          total: stats[:total],
          correct: hits,
          accuracy: (hits.to_f / stats[:total] * 100).round(2),
          avg_param_accuracy: hits > 0 ? (stats[:param_accuracy_sum] / hits * 100).round(2) : 0.0
        }
      end
    end

    # Canonical form of a function name for comparison: stringified,
    # underscored and lower-cased.
    #
    # @param name [String, Symbol, nil]
    # @return [String, nil] nil when +name+ is nil
    def normalize_name(name)
      return nil if name.nil?

      name.to_s.underscore.downcase
    end
end
|