Files
sure/app/models/eval/metrics/categorization_metrics.rb
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow exporting manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

102 lines
3.4 KiB
Ruby

# Computes evaluation metrics for a categorization eval run.
#
# NOTE(review): relies on helpers presumably provided by Eval::Metrics::Base
# (not visible in this file): `results` (a relation of per-sample result
# records), `total_count`, `correct_count`, `accuracy`, `avg_latency_ms`,
# `total_cost`, `cost_per_sample`, and `metrics_by_difficulty` — confirm
# against the base class.
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  # Builds the full metrics hash for this eval run.
  #
  # @return [Hash] percentage metrics (rounded to 2 decimals), raw counts,
  #   cost figures, and per-difficulty / per-category breakdowns.
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Percentage of results that exactly match the primary expected category.
  #
  # @return [Float] 0.0..100.0, rounded to 2 decimals
  def exact_match_accuracy
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Number of results that matched an alternative (but not primary) category.
  #
  # @return [Integer]
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy restricted to samples where a null category was expected.
  # Returns 100.0 when no such samples exist (vacuously correct).
  #
  # @return [Float] 0.0..100.0, rounded to 2 decimals
  def null_accuracy
    null_expected_results = results.where(null_expected: true)
    return 100.0 if null_expected_results.empty?
    correct = null_expected_results.where(null_returned: true).count
    total = null_expected_results.count
    (correct.to_f / total * 100).round(2)
  end

  # Percentage of results that match at the hierarchical level
  # (exact matches included).
  #
  # @return [Float] 0.0..100.0, rounded to 2 decimals
  def hierarchical_accuracy
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  # True positives / (true positives + false positives), as a percentage.
  #   TP: correct non-null predictions
  #   FP: incorrect non-null predictions (predicted the wrong category)
  #
  # Memoized: both #calculate and #f1_score reference this method, and each
  # evaluation issues two COUNT queries. `||=` is safe here because the
  # cached value is a Float and 0.0 is truthy in Ruby.
  #
  # @return [Float] 0.0..100.0, rounded to 2 decimals
  def precision
    @precision ||= begin
      true_positives = results.where(correct: true, null_returned: false).count
      false_positives = results.where(correct: false, null_returned: false).count
      denominator = true_positives + false_positives
      denominator.zero? ? 0.0 : (true_positives.to_f / denominator * 100).round(2)
    end
  end

  # True positives / (true positives + false negatives), as a percentage.
  #   TP: correct non-null predictions
  #   FN: incorrectly returned null when a category was expected
  #
  # Memoized for the same reason as #precision.
  #
  # @return [Float] 0.0..100.0, rounded to 2 decimals
  def recall
    @recall ||= begin
      true_positives = results.where(correct: true, null_returned: false).count
      false_negatives = results.where(null_expected: false, null_returned: true).count
      denominator = true_positives + false_negatives
      denominator.zero? ? 0.0 : (true_positives.to_f / denominator * 100).round(2)
    end
  end

  # Harmonic mean of precision and recall (both already in percent),
  # 0.0 when either component is zero.
  #
  # @return [Float] rounded to 2 decimals
  def f1_score
    return 0.0 if precision.zero? || recall.zero?
    (2 * precision * recall / (precision + recall)).round(2)
  end

  # Groups results by their expected category name ("null" when none) and
  # reports correct/total counts plus an accuracy percentage per category.
  #
  # @return [Hash{String => Hash}] e.g. { "Groceries" => { correct: 9,
  #   total: 10, accuracy: 90.0 } }
  def metrics_by_category
    category_metrics = {}
    # Eager-load samples to avoid an N+1 query while iterating results.
    results.includes(:sample).each do |result|
      expected = result.sample.expected_category_name || "null"
      category_metrics[expected] ||= { correct: 0, total: 0 }
      category_metrics[expected][:total] += 1
      category_metrics[expected][:correct] += 1 if result.correct
    end
    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end