Small LLMs improvements (#400)

* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
soky srm
2025-12-07 18:11:34 +01:00
committed by GitHub
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions

View File

@@ -0,0 +1,68 @@
class Eval::Metrics::Base
  # Shared scaffolding for eval-run metric calculators. Subclasses implement
  # #calculate and build on the protected aggregate helpers below.
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  # Subclass hook: must return a hash of computed metrics.
  def calculate
    raise NotImplementedError, "Subclasses must implement #calculate"
  end

  protected

  # Results for this run, eager-loading the associated sample. Memoized.
  def results
    @results ||= eval_run.results.includes(:sample)
  end

  # All samples in the run's dataset. Memoized.
  def samples
    @samples ||= eval_run.dataset.samples
  end

  def total_count
    results.count
  end

  def correct_count
    results.where(correct: true).count
  end

  def incorrect_count
    results.where(correct: false).count
  end

  # Overall accuracy as a percentage, 0.0 when there are no results.
  def accuracy
    total = total_count
    return 0.0 if total.zero?
    ((correct_count.to_f / total) * 100).round(2)
  end

  # Mean latency in ms rounded to an integer; nil when there are no results
  # (or when no latency values are recorded).
  def avg_latency_ms
    return nil if total_count.zero?
    results.average(:latency_ms)&.round(0)
  end

  # Summed cost rounded to 6 decimal places.
  def total_cost
    results.sum(:cost)&.to_f&.round(6)
  end

  # Average cost per result; nil when there are no results.
  def cost_per_sample
    count = total_count
    return nil if count.zero?
    (total_cost / count).round(6)
  end

  # Accuracy breakdown keyed by sample difficulty. Difficulties with no
  # results are omitted from the returned hash.
  def metrics_by_difficulty
    %w[easy medium hard edge_case].each_with_object({}) do |difficulty, breakdown|
      scoped = results.joins(:sample).where(eval_samples: { difficulty: difficulty })
      next if scoped.empty?

      total = scoped.count
      correct = scoped.where(correct: true).count
      breakdown[difficulty] = {
        count: total,
        correct: correct,
        accuracy: (correct.to_f / total * 100).round(2)
      }
    end
  end
end

View File

@@ -0,0 +1,101 @@
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  # Metrics for a categorization eval run: overall/exact/hierarchical
  # accuracy, precision/recall/F1 treating null predictions as negatives,
  # plus per-difficulty and per-category breakdowns.
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Percentage of results that exactly match the primary expected category.
  def exact_match_accuracy
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Number of results that matched an alternative (but not primary) category.
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy restricted to samples where null was the expected answer.
  # Vacuously 100.0 when no sample expects null.
  def null_accuracy
    null_expected_results = results.where(null_expected: true)
    return 100.0 if null_expected_results.empty?
    correct = null_expected_results.where(null_returned: true).count
    total = null_expected_results.count
    (correct.to_f / total * 100).round(2)
  end

  # Percentage of results that match at the hierarchical level
  # (exact matches included).
  def hierarchical_accuracy
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  # TP: correct non-null predictions. Memoized because both #precision and
  # #recall need it (avoids re-issuing the same COUNT query).
  def true_positive_count
    @true_positive_count ||= results.where(correct: true, null_returned: false).count
  end

  # True positives / (True positives + False positives), as a percentage.
  # FP: incorrect non-null predictions (predicted the wrong category).
  def precision
    false_positives = results.where(correct: false, null_returned: false).count
    denominator = true_positive_count + false_positives
    return 0.0 if denominator.zero?
    (true_positive_count.to_f / denominator * 100).round(2)
  end

  # True positives / (True positives + False negatives), as a percentage.
  # FN: returned null when a category was expected.
  def recall
    false_negatives = results.where(null_expected: false, null_returned: true).count
    denominator = true_positive_count + false_negatives
    return 0.0 if denominator.zero?
    (true_positive_count.to_f / denominator * 100).round(2)
  end

  # Harmonic mean of precision and recall. Precision/recall are captured in
  # locals so each is computed (and its queries issued) exactly once.
  def f1_score
    p = precision
    r = recall
    return 0.0 if p.zero? || r.zero?
    (2 * p * r / (p + r)).round(2)
  end

  # Per-expected-category accuracy. Samples with no expected category are
  # grouped under the "null" key.
  def metrics_by_category
    category_metrics = {}
    results.includes(:sample).each do |result|
      expected = result.sample.expected_category_name || "null"
      category_metrics[expected] ||= { correct: 0, total: 0 }
      category_metrics[expected][:total] += 1
      category_metrics[expected][:correct] += 1 if result.correct
    end
    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end

View File

@@ -0,0 +1,125 @@
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
  # Metrics for a chat/function-calling eval run: function selection and
  # parameter accuracy, response relevance, error rates, and a per-function
  # breakdown.
  def calculate
    {
      accuracy: accuracy,
      function_selection_accuracy: function_selection_accuracy,
      parameter_accuracy: parameter_accuracy,
      response_relevance: response_relevance,
      exact_match_rate: exact_match_rate,
      error_rate: error_rate,
      avg_functions_per_response: avg_functions_per_response,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_function: metrics_by_function
    }
  end

  private

  # Results whose metadata records no error (the model call succeeded).
  # Memoized; shared by the three accuracy methods below. Uses a positive
  # `IS NULL` predicate instead of the previous double negative
  # `where.not("... IS NOT NULL")` — same rows, clearer SQL.
  def valid_results
    @valid_results ||= results.where("metadata->>'error' IS NULL")
  end

  # Percentage of error-free samples where the correct functions were called.
  def function_selection_accuracy
    return 0.0 if valid_results.empty?
    correct = valid_results.count do |r|
      r.metadata.dig("function_selection_correct") == true
    end
    (correct.to_f / valid_results.count * 100).round(2)
  end

  # Mean per-sample parameter accuracy across error-free samples, as a
  # percentage. Samples with no recorded score contribute 0.0.
  def parameter_accuracy
    return 0.0 if valid_results.empty?
    scores = valid_results.map do |r|
      r.metadata.dig("parameter_accuracy") || 0.0
    end
    (scores.sum / scores.size * 100).round(2)
  end

  # Percentage of error-free samples whose response contained the expected
  # keywords. Samples expecting no keywords count as relevant.
  def response_relevance
    return 0.0 if valid_results.empty?
    correct = valid_results.count do |r|
      expected_keywords = r.metadata.dig("expected_keywords") || []
      expected_keywords.empty? || r.metadata.dig("response_keywords_found") == true
    end
    (correct.to_f / valid_results.count * 100).round(2)
  end

  def exact_match_rate
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Percentage of results carrying an error in either metadata or output.
  def error_rate
    return 0.0 if total_count.zero?
    errors = results.count do |r|
      r.metadata.dig("error").present? || r.actual_output.dig("error").present?
    end
    (errors.to_f / total_count * 100).round(2)
  end

  # Mean number of functions invoked per error-free response. Note this
  # filters on actual_output's error key, not metadata's.
  def avg_functions_per_response
    scoped = results.where("actual_output->>'error' IS NULL")
    return 0.0 if scoped.empty?
    total_functions = scoped.sum do |r|
      functions = r.actual_output.dig("functions") || []
      functions.size
    end
    (total_functions.to_f / scoped.count).round(2)
  end

  # Per-expected-function accuracy and average parameter accuracy.
  # A function counts as correct when any actually-called function matches
  # its (normalized) name.
  def metrics_by_function
    function_metrics = {}
    results.includes(:sample).each do |result|
      expected_functions = result.sample.expected_functions
      expected_functions.each do |func|
        name = func["name"]
        next if name.nil?
        function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 }
        function_metrics[name][:total] += 1
        actual_functions = result.actual_output.dig("functions") || []
        if actual_functions.any? { |f| normalize_name(f["name"]) == normalize_name(name) }
          function_metrics[name][:correct] += 1
          function_metrics[name][:param_accuracy_sum] += (result.metadata.dig("parameter_accuracy") || 0.0)
        end
      end
    end
    function_metrics.transform_values do |metrics|
      {
        total: metrics[:total],
        correct: metrics[:correct],
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2),
        avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum] / metrics[:correct] * 100).round(2) : 0.0
      }
    end
  end

  # Case-/style-insensitive function-name comparison key.
  def normalize_name(name)
    return nil if name.nil?
    name.to_s.underscore.downcase
  end
end

View File

@@ -0,0 +1,107 @@
class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base
  # Metrics for a merchant-detection eval run: exact and fuzzy name accuracy,
  # URL accuracy, and false positive/negative rates for null handling.

  # Minimum fuzzy score for a name to count as a fuzzy match.
  FUZZY_MATCH_THRESHOLD = 0.8

  def calculate
    {
      accuracy: accuracy,
      name_accuracy: name_accuracy,
      fuzzy_name_accuracy: fuzzy_name_accuracy,
      url_accuracy: url_accuracy,
      false_positive_rate: false_positive_rate,
      false_negative_rate: false_negative_rate,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      avg_fuzzy_score: avg_fuzzy_score,
      by_difficulty: metrics_by_difficulty
    }
  end

  private

  # Results whose sample expects a non-blank business name. Memoized so
  # #name_accuracy and #fuzzy_name_accuracy share one load/filter pass.
  def name_expected_results
    @name_expected_results ||= results.includes(:sample).select do |r|
      r.sample.expected_business_name.present?
    end
  end

  # Exact name match accuracy for samples expecting a name.
  # Vacuously 100.0 when no sample expects one.
  def name_accuracy
    return 100.0 if name_expected_results.empty?
    correct = name_expected_results.count do |r|
      r.actual_output["business_name"] == r.sample.expected_business_name
    end
    (correct.to_f / name_expected_results.size * 100).round(2)
  end

  # Fuzzy name match accuracy (fuzzy_score >= FUZZY_MATCH_THRESHOLD).
  def fuzzy_name_accuracy
    return 100.0 if name_expected_results.empty?
    correct = name_expected_results.count do |r|
      (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD
    end
    (correct.to_f / name_expected_results.size * 100).round(2)
  end

  # URL match accuracy (after normalization) for samples expecting a URL.
  def url_accuracy
    url_results = results.includes(:sample).select do |r|
      r.sample.expected_business_url.present?
    end
    return 100.0 if url_results.empty?
    correct = url_results.count do |r|
      normalize_url(r.actual_output["business_url"]) == normalize_url(r.sample.expected_business_url)
    end
    (correct.to_f / url_results.size * 100).round(2)
  end

  # Rate of returning a merchant when null was expected.
  def false_positive_rate
    null_expected_results = results.where(null_expected: true)
    return 0.0 if null_expected_results.empty?
    false_positives = null_expected_results.where(null_returned: false).count
    (false_positives.to_f / null_expected_results.count * 100).round(2)
  end

  # Rate of returning null when a merchant was expected.
  def false_negative_rate
    merchant_expected_results = results.where(null_expected: false)
    return 0.0 if merchant_expected_results.empty?
    false_negatives = merchant_expected_results.where(null_returned: true).count
    (false_negatives.to_f / merchant_expected_results.count * 100).round(2)
  end

  # Mean fuzzy score over results that have one; nil when none do.
  def avg_fuzzy_score
    scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score)
    return nil if scores.empty?
    (scores.sum / scores.size).round(4)
  end

  # Canonicalize a URL for comparison: trim whitespace FIRST (previously it
  # was trimmed last, so "https://x.com/ " kept its trailing slash), then
  # lowercase, drop scheme and leading "www.", and drop a trailing slash.
  # Anchored with \A so only a leading scheme/www is stripped.
  def normalize_url(url)
    return nil if url.nil?
    url.to_s.strip.downcase
       .gsub(%r{\A(https?://)?(www\.)?}, "")
       .chomp("/")
  end
end