mirror of
https://github.com/we-promise/sure.git
synced 2026-04-19 12:04:08 +00:00
Small llms improvements (#400)
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
68
app/models/eval/metrics/base.rb
Normal file
68
app/models/eval/metrics/base.rb
Normal file
@@ -0,0 +1,68 @@
|
||||
# Abstract base class for computing metrics over a completed eval run.
#
# Subclasses implement #calculate and return a hash of metric name => value.
# The query helpers are memoized: an eval run's results are immutable once
# the run has finished, and subclasses call them many times per #calculate.
class Eval::Metrics::Base
  # Difficulty buckets reported by #metrics_by_difficulty.
  DIFFICULTIES = %w[easy medium hard edge_case].freeze

  attr_reader :eval_run

  # @param eval_run [Eval::Run] the run whose results are being scored
  def initialize(eval_run)
    @eval_run = eval_run
  end

  # @return [Hash] metric name => value; implemented by subclasses
  def calculate
    raise NotImplementedError, "Subclasses must implement #calculate"
  end

  protected

  # All results for the run, with samples preloaded to avoid N+1 queries.
  def results
    @results ||= eval_run.results.includes(:sample)
  end

  # Every sample in the dataset the run was executed against.
  def samples
    @samples ||= eval_run.dataset.samples
  end

  # Memoized count of all results (0 is truthy in Ruby, so ||= is safe).
  def total_count
    @total_count ||= results.count
  end

  def correct_count
    @correct_count ||= results.where(correct: true).count
  end

  def incorrect_count
    @incorrect_count ||= results.where(correct: false).count
  end

  # Overall accuracy as a percentage (0.0..100.0), rounded to 2 decimals.
  def accuracy
    return 0.0 if total_count.zero?
    (correct_count.to_f / total_count * 100).round(2)
  end

  # Mean latency in whole milliseconds, or nil when there are no results.
  def avg_latency_ms
    return nil if total_count.zero?
    results.average(:latency_ms)&.round(0)
  end

  # Total spend across all results, rounded to 6 decimal places.
  def total_cost
    results.sum(:cost)&.to_f&.round(6)
  end

  # Average spend per processed sample, or nil when there are no results.
  def cost_per_sample
    return nil if total_count.zero?
    (total_cost / total_count).round(6)
  end

  # Accuracy broken down by sample difficulty. Difficulties with no
  # results are omitted from the returned hash (nil entries compacted).
  def metrics_by_difficulty
    DIFFICULTIES.index_with do |difficulty|
      scoped = results.joins(:sample).where(eval_samples: { difficulty: difficulty })
      next nil if scoped.empty?

      correct = scoped.where(correct: true).count
      total = scoped.count

      {
        count: total,
        correct: correct,
        accuracy: (correct.to_f / total * 100).round(2)
      }
    end.compact
  end
end
|
||||
101
app/models/eval/metrics/categorization_metrics.rb
Normal file
101
app/models/eval/metrics/categorization_metrics.rb
Normal file
@@ -0,0 +1,101 @@
|
||||
# Metrics for transaction-categorization eval runs.
#
# Precision/recall/F1 treat a correct non-null category as a true positive;
# see the individual methods for the false-positive/negative definitions.
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Percentage of results that exactly match the primary expected category.
  def exact_match_accuracy
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Number of results that matched an alternative (but not the primary) category.
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy over samples where a null category was expected.
  # Returns 100.0 when no such samples exist (vacuously correct).
  def null_accuracy
    null_expected = results.where(null_expected: true)
    return 100.0 if null_expected.empty?

    correct = null_expected.where(null_returned: true).count
    (correct.to_f / null_expected.count * 100).round(2)
  end

  # Percentage of results matching at the hierarchical level (includes exact matches).
  def hierarchical_accuracy
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  # TP: correct non-null predictions. Memoized — shared by precision and
  # recall, and the run's results are immutable once it has finished.
  def true_positives
    @true_positives ||= results.where(correct: true, null_returned: false).count
  end

  # Precision = TP / (TP + FP), as a percentage.
  # FP: a non-null category was returned but it was the wrong one.
  # Memoized: #calculate and #f1_score would otherwise re-query repeatedly.
  def precision
    @precision ||= begin
      false_positives = results.where(correct: false, null_returned: false).count
      denominator = true_positives + false_positives
      denominator.zero? ? 0.0 : (true_positives.to_f / denominator * 100).round(2)
    end
  end

  # Recall = TP / (TP + FN), as a percentage.
  # FN: null was returned although a category was expected.
  def recall
    @recall ||= begin
      false_negatives = results.where(null_expected: false, null_returned: true).count
      denominator = true_positives + false_negatives
      denominator.zero? ? 0.0 : (true_positives.to_f / denominator * 100).round(2)
    end
  end

  # Harmonic mean of precision and recall (both already in percent).
  def f1_score
    return 0.0 if precision.zero? || recall.zero?
    (2 * precision * recall / (precision + recall)).round(2)
  end

  # Accuracy grouped by the expected category name ("null" when none).
  def metrics_by_category
    category_metrics = {}

    results.includes(:sample).each do |result|
      expected = result.sample.expected_category_name || "null"

      bucket = (category_metrics[expected] ||= { correct: 0, total: 0 })
      bucket[:total] += 1
      bucket[:correct] += 1 if result.correct
    end

    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end
|
||||
125
app/models/eval/metrics/chat_metrics.rb
Normal file
125
app/models/eval/metrics/chat_metrics.rb
Normal file
@@ -0,0 +1,125 @@
|
||||
# Metrics for AI-chat / function-calling eval runs.
#
# Per-sample scoring details (function selection, parameter accuracy,
# expected keywords) are read from each result's metadata JSON column.
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      function_selection_accuracy: function_selection_accuracy,
      parameter_accuracy: parameter_accuracy,
      response_relevance: response_relevance,
      exact_match_rate: exact_match_rate,
      error_rate: error_rate,
      avg_functions_per_response: avg_functions_per_response,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_function: metrics_by_function
    }
  end

  private

  # Results whose metadata carries no error marker. Memoized (results are
  # immutable once the run has finished) and shared by the three accuracy
  # helpers below, which previously each rebuilt this scope with the
  # double-negative `where.not("metadata->>'error' IS NOT NULL")`.
  def valid_results
    @valid_results ||= results.where("metadata->>'error' IS NULL")
  end

  # Percentage of error-free samples where the correct functions were called.
  def function_selection_accuracy
    return 0.0 if valid_results.empty?

    correct = valid_results.count { |r| r.metadata["function_selection_correct"] == true }
    (correct.to_f / valid_results.count * 100).round(2)
  end

  # Mean per-sample parameter accuracy (stored as 0..1) as a percentage.
  def parameter_accuracy
    return 0.0 if valid_results.empty?

    scores = valid_results.map { |r| r.metadata["parameter_accuracy"] || 0.0 }
    (scores.sum / scores.size * 100).round(2)
  end

  # Percentage of error-free samples whose response contained the expected
  # keywords. Samples with no expected keywords count as relevant.
  def response_relevance
    return 0.0 if valid_results.empty?

    correct = valid_results.count do |r|
      expected_keywords = r.metadata["expected_keywords"] || []
      expected_keywords.empty? || r.metadata["response_keywords_found"] == true
    end

    (correct.to_f / valid_results.count * 100).round(2)
  end

  # Percentage of results flagged as exact matches.
  def exact_match_rate
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Percentage of samples that errored in either metadata or actual_output.
  def error_rate
    return 0.0 if total_count.zero?

    errors = results.count do |r|
      r.metadata["error"].present? || r.actual_output["error"].present?
    end

    (errors.to_f / total_count * 100).round(2)
  end

  # Average number of functions called per error-free response.
  # NOTE(review): filters on actual_output->>'error' while the helpers above
  # filter on metadata->>'error' — confirm this asymmetry is intentional.
  def avg_functions_per_response
    error_free = results.where("actual_output->>'error' IS NULL")
    return 0.0 if error_free.empty?

    total_functions = error_free.sum { |r| (r.actual_output["functions"] || []).size }
    (total_functions.to_f / error_free.count).round(2)
  end

  # Per-function accuracy and average parameter accuracy, grouped by the
  # function name each sample expected to be called.
  def metrics_by_function
    function_metrics = {}

    results.includes(:sample).each do |result|
      # Hoisted out of the inner loop: the called-function names are the
      # same for every expected function of this result.
      actual_names = (result.actual_output["functions"] || []).map { |f| normalize_name(f["name"]) }

      result.sample.expected_functions.each do |func|
        name = func["name"]
        next if name.nil?

        stats = (function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 })
        stats[:total] += 1

        # Credit the function (and its parameter accuracy) when the model
        # called it, matching names style-insensitively.
        if actual_names.include?(normalize_name(name))
          stats[:correct] += 1
          stats[:param_accuracy_sum] += (result.metadata["parameter_accuracy"] || 0.0)
        end
      end
    end

    function_metrics.transform_values do |metrics|
      {
        total: metrics[:total],
        correct: metrics[:correct],
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2),
        avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum] / metrics[:correct] * 100).round(2) : 0.0
      }
    end
  end

  # Canonical form for comparing function names across naming styles
  # (e.g. "getBalance" and "get_balance" compare equal).
  def normalize_name(name)
    return nil if name.nil?
    name.to_s.underscore.downcase
  end
end
|
||||
107
app/models/eval/metrics/merchant_detection_metrics.rb
Normal file
107
app/models/eval/metrics/merchant_detection_metrics.rb
Normal file
@@ -0,0 +1,107 @@
|
||||
# Metrics for merchant-detection eval runs: exact/fuzzy business-name
# matching, URL matching, and null false-positive/negative rates.
class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base
  # Minimum stored fuzzy_score treated as a successful fuzzy name match.
  FUZZY_MATCH_THRESHOLD = 0.8

  def calculate
    {
      accuracy: accuracy,
      name_accuracy: name_accuracy,
      fuzzy_name_accuracy: fuzzy_name_accuracy,
      url_accuracy: url_accuracy,
      false_positive_rate: false_positive_rate,
      false_negative_rate: false_negative_rate,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      avg_fuzzy_score: avg_fuzzy_score,
      by_difficulty: metrics_by_difficulty
    }
  end

  private

  # Results whose sample expects a (non-null) business name. Memoized —
  # shared by name_accuracy and fuzzy_name_accuracy, and the run's results
  # are immutable once it has finished.
  def name_results
    @name_results ||= results.includes(:sample).select do |r|
      r.sample.expected_business_name.present?
    end
  end

  # Exact name-match accuracy over samples expecting a name.
  # Returns 100.0 when no such samples exist (vacuously correct).
  def name_accuracy
    return 100.0 if name_results.empty?

    correct = name_results.count do |r|
      # was: dig("business_name") || [...]["business_name"] — both sides
      # were the identical lookup, so the fallback was dead code.
      r.actual_output["business_name"] == r.sample.expected_business_name
    end

    (correct.to_f / name_results.size * 100).round(2)
  end

  # Fuzzy name-match accuracy: results whose stored fuzzy_score meets
  # FUZZY_MATCH_THRESHOLD, over samples expecting a name.
  def fuzzy_name_accuracy
    return 100.0 if name_results.empty?

    correct = name_results.count { |r| (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD }
    (correct.to_f / name_results.size * 100).round(2)
  end

  # URL match accuracy (scheme/"www."/trailing-slash-insensitive) over
  # samples expecting a URL. 100.0 when no such samples exist.
  def url_accuracy
    url_results = results.includes(:sample).select do |r|
      r.sample.expected_business_url.present?
    end

    return 100.0 if url_results.empty?

    correct = url_results.count do |r|
      normalize_url(r.actual_output["business_url"]) == normalize_url(r.sample.expected_business_url)
    end

    (correct.to_f / url_results.size * 100).round(2)
  end

  # Rate of returning a merchant when null was expected.
  def false_positive_rate
    null_expected = results.where(null_expected: true)
    return 0.0 if null_expected.empty?

    false_positives = null_expected.where(null_returned: false).count
    (false_positives.to_f / null_expected.count * 100).round(2)
  end

  # Rate of returning null when a merchant was expected.
  def false_negative_rate
    merchant_expected = results.where(null_expected: false)
    return 0.0 if merchant_expected.empty?

    false_negatives = merchant_expected.where(null_returned: true).count
    (false_negatives.to_f / merchant_expected.count * 100).round(2)
  end

  # Mean fuzzy score across results that have one, or nil if none do.
  def avg_fuzzy_score
    scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score)
    return nil if scores.empty?

    (scores.sum / scores.size).round(4)
  end

  # Canonicalizes a URL for comparison: trims surrounding whitespace,
  # lowercases, then drops the scheme, a leading "www.", and a trailing "/".
  # FIX: strip now runs FIRST — previously it ran last, so trailing
  # whitespace defeated chomp("/") and leading whitespace defeated the
  # scheme/"www." removal. Also anchored with \A instead of per-line ^.
  def normalize_url(url)
    return nil if url.nil?

    url.to_s.strip.downcase
       .gsub(%r{\A(https?://)?(www\.)?}, "")
       .chomp("/")
  end
end
|
||||
Reference in New Issue
Block a user