Files
sure/app/models/eval/metrics/chat_metrics.rb
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

126 lines
4.0 KiB
Ruby

# Chat-specific metrics for an eval run.
#
# Aggregates per-sample eval results into accuracy, function-calling and
# cost/latency metrics. The `results` relation plus `accuracy`, `total_count`,
# `correct_count`, `avg_latency_ms`, `total_cost`, `cost_per_sample` and
# `metrics_by_difficulty` are presumably supplied by Eval::Metrics::Base —
# confirm against that class.
#
# Per-sample data layout (as read below):
#   result.metadata      — JSON column: "error", "function_selection_correct",
#                          "parameter_accuracy", "expected_keywords",
#                          "response_keywords_found"
#   result.actual_output — JSON column: "error", "functions" (array of {"name"})
#   result.sample        — associated sample with #expected_functions
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
  # Returns a Hash summarizing every chat metric for the run.
  def calculate
    {
      accuracy: accuracy,
      function_selection_accuracy: function_selection_accuracy,
      parameter_accuracy: parameter_accuracy,
      response_relevance: response_relevance,
      exact_match_rate: exact_match_rate,
      error_rate: error_rate,
      avg_functions_per_response: avg_functions_per_response,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_function: metrics_by_function
    }
  end

  private

    # Results whose eval metadata carries no error marker.
    # Equivalent to, but clearer than, the previous double negation
    # `where.not("metadata->>'error' IS NOT NULL")`.
    def results_without_metadata_errors
      results.where("metadata->>'error' IS NULL")
    end

    # Results whose model output carries no error marker.
    def results_without_output_errors
      results.where("actual_output->>'error' IS NULL")
    end

    # Percentage of error-free samples where the correct functions were called.
    def function_selection_accuracy
      valid_results = results_without_metadata_errors
      return 0.0 if valid_results.empty?

      correct = valid_results.count { |r| r.metadata["function_selection_correct"] == true }
      percentage(correct, valid_results.count)
    end

    # Mean per-sample parameter accuracy across error-free samples, as a
    # percentage.
    def parameter_accuracy
      valid_results = results_without_metadata_errors
      return 0.0 if valid_results.empty?

      scores = valid_results.map { |r| r.metadata["parameter_accuracy"] || 0.0 }
      # `.to_f` guards against integer division when scores arrive as integers
      # (the previous `scores.sum / scores.size` would truncate, e.g. 1/2 == 0).
      (scores.sum.to_f / scores.size * 100).round(2)
    end

    # Percentage of error-free samples whose response contained the expected
    # keywords. Samples that expected no keywords count as relevant.
    def response_relevance
      valid_results = results_without_metadata_errors
      return 0.0 if valid_results.empty?

      correct = valid_results.count do |r|
        expected_keywords = r.metadata["expected_keywords"] || []
        expected_keywords.empty? || r.metadata["response_keywords_found"] == true
      end
      percentage(correct, valid_results.count)
    end

    # Percentage of all samples flagged as an exact match.
    def exact_match_rate
      return 0.0 if total_count.zero?

      percentage(results.where(exact_match: true).count, total_count)
    end

    # Percentage of all samples that errored, either in the eval metadata or
    # in the model output itself (intentionally broader than the per-metric
    # filters above, which each check only one of the two columns).
    def error_rate
      return 0.0 if total_count.zero?

      errored = results.count do |r|
        r.metadata["error"].present? || r.actual_output["error"].present?
      end
      percentage(errored, total_count)
    end

    # Mean number of function calls per error-free response.
    def avg_functions_per_response
      valid_results = results_without_output_errors
      return 0.0 if valid_results.empty?

      total_functions = valid_results.sum { |r| (r.actual_output["functions"] || []).size }
      (total_functions.to_f / valid_results.count).round(2)
    end

    # Per-function accuracy: for each function a sample expected, check whether
    # the model actually called it (names compared via #normalize_name).
    # Parameter accuracy is averaged over correct calls only.
    def metrics_by_function
      function_metrics = {}

      results.includes(:sample).each do |result|
        # Hoisted out of the inner loop — the called-function set is the same
        # for every expected function of this result.
        actual_names = (result.actual_output["functions"] || []).map { |f| normalize_name(f["name"]) }

        result.sample.expected_functions.each do |func|
          name = func["name"]
          next if name.nil?

          # Float initializer avoids integer division in the average below.
          stats = (function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0.0 })
          stats[:total] += 1
          if actual_names.include?(normalize_name(name))
            stats[:correct] += 1
            stats[:param_accuracy_sum] += (result.metadata["parameter_accuracy"] || 0.0)
          end
        end
      end

      function_metrics.transform_values do |stats|
        {
          total: stats[:total],
          correct: stats[:correct],
          accuracy: percentage(stats[:correct], stats[:total]),
          avg_param_accuracy: stats[:correct] > 0 ? (stats[:param_accuracy_sum].to_f / stats[:correct] * 100).round(2) : 0.0
        }
      end
    end

    # Canonical form for function-name comparison: underscored and lowercased.
    def normalize_name(name)
      return nil if name.nil?

      name.to_s.underscore.downcase
    end

    # Shared rounding helper: `part` of `whole` as a percentage, 2 decimals.
    def percentage(part, whole)
      (part.to_f / whole * 100).round(2)
    end
end