Mirror of https://github.com/we-promise/sure.git (synced 2026-04-08 14:54:49 +00:00)
* Initial implementation
* FIX keys
* Add langfuse evals support
* FIX trace upload
* Delete .claude/settings.local.json
  Signed-off-by: soky srm <sokysrm@gmail.com>
* Update client.rb
* Small LLMs improvements
* Keep batch size normal
* Update categorizer
* FIX json mode
* Add reasonable alternative to matching
* FIX thinking blocks for llms
* Implement json mode support with AUTO mode
* Make auto default for everyone
* FIX linter
* Address review
* Allow export manual categories
* FIX user export
* FIX oneshot example pollution
* Update categorization_golden_v1.yml
* Update categorization_golden_v1.yml
* Trim to 100 items
* Update auto_categorizer.rb
* FIX for auto retry in auto mode
* Separate the Eval Logic from the Auto-Categorizer
  The expected_null_count parameter conflates eval-specific logic with production categorization logic.
* Force json mode on evals
* Introduce a more mixed dataset
  150 items, performance from a local model:
  By Difficulty:
    easy: 93.22% accuracy (55/59)
    medium: 93.33% accuracy (42/45)
    hard: 92.86% accuracy (26/28)
    edge_case: 100.0% accuracy (18/18)
* Improve datasets
  Remove Data leakage from prompts
* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
102 lines
3.4 KiB
Ruby
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end
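
  # Illustrative only: the values below are made up, and helpers such as
  # accuracy, total_count, correct_count, avg_latency_ms, total_cost,
  # cost_per_sample and metrics_by_difficulty are presumably provided by
  # Eval::Metrics::Base. A #calculate result might look like:
  #   {
  #     accuracy: 93.33, exact_match_accuracy: 90.0, alternative_match_count: 5,
  #     precision: 94.12, recall: 91.43, f1_score: 92.75,
  #     null_accuracy: 100.0, hierarchical_accuracy: 95.33,
  #     samples_processed: 150, samples_correct: 140,
  #     avg_latency_ms: 412.6, total_cost: 0.0, cost_per_sample: 0.0,
  #     by_difficulty: { "easy" => { correct: 55, total: 59, accuracy: 93.22 } },
  #     by_category: { "Groceries" => { correct: 12, total: 13, accuracy: 92.31 } }
  #   }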

  private

  def exact_match_accuracy
    # Percentage of results that exactly match the primary expected category
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  def alternative_match_count
    # Number of results that matched an alternative (but not primary) category
    results.where(alternative_match: true).count
  end

  def null_accuracy
    # Accuracy for samples where null was expected
    null_expected_results = results.where(null_expected: true)
    return 100.0 if null_expected_results.empty?

    correct = null_expected_results.where(null_returned: true).count
    total = null_expected_results.count

    (correct.to_f / total * 100).round(2)
  end

  def hierarchical_accuracy
    # Percentage of results that match at hierarchical level (including exact matches)
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  def precision
    # True positives / (True positives + False positives)
    # TP: Correct non-null predictions
    # FP: Incorrect non-null predictions (predicted wrong category)
    true_positives = results.where(correct: true, null_returned: false).count
    false_positives = results.where(correct: false, null_returned: false).count

    denominator = true_positives + false_positives
    return 0.0 if denominator.zero?

    (true_positives.to_f / denominator * 100).round(2)
  end

  def recall
    # True positives / (True positives + False negatives)
    # TP: Correct non-null predictions
    # FN: Incorrectly returned null when category was expected
    true_positives = results.where(correct: true, null_returned: false).count
    false_negatives = results.where(null_expected: false, null_returned: true).count

    denominator = true_positives + false_negatives
    return 0.0 if denominator.zero?

    (true_positives.to_f / denominator * 100).round(2)
  end

  def f1_score
    return 0.0 if precision.zero? || recall.zero?
    (2 * precision * recall / (precision + recall)).round(2)
  end
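
  # Worked example with hypothetical counts: 80 correct non-null predictions,
  # 15 wrong-category predictions and 5 nulls returned where a category was
  # expected give
  #   precision = 80 / (80 + 15) * 100 ≈ 84.21
  #   recall    = 80 / (80 + 5)  * 100 ≈ 94.12
  #   f1_score  = 2 * 84.21 * 94.12 / (84.21 + 94.12) ≈ 88.89
  # Precision and recall are already percentages here, so the harmonic mean
  # stays on the same 0-100 scale.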

  def metrics_by_category
    # Group results by expected category and calculate accuracy
    category_metrics = {}

    results.includes(:sample).each do |result|
      expected = result.sample.expected_category_name || "null"

      category_metrics[expected] ||= { correct: 0, total: 0 }
      category_metrics[expected][:total] += 1
      category_metrics[expected][:correct] += 1 if result.correct
    end

    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end
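
A minimal usage sketch (not part of the file above): it assumes Eval::Metrics::Base takes an eval run in its constructor and supplies the results relation plus the accuracy, cost and latency helpers referenced in the class; eval_run and the printed report are illustrative only.

# Hypothetical wrap-up step after an eval run finishes.
metrics = Eval::Metrics::CategorizationMetrics.new(eval_run)
report  = metrics.calculate

puts "Accuracy: #{report[:accuracy]}%  F1: #{report[:f1_score]}"
report[:by_category].each do |category, stats|
  puts "  #{category}: #{stats[:accuracy]}% (#{stats[:correct]}/#{stats[:total]})"
end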