Files
sure/app/models/eval/reporters/comparison_reporter.rb
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove Data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

206 lines
6.0 KiB
Ruby

class Eval::Reporters::ComparisonReporter
  attr_reader :runs

  def initialize(runs)
    @runs = Array(runs).sort_by(&:model)
  end

  # Generate a text table for terminal display
  def to_table
    return "No runs to compare" if runs.empty?

    headers = build_headers
    rows = runs.map { |run| build_row(run) }

    # Calculate column widths
    all_rows = [ headers ] + rows
    widths = headers.each_index.map do |i|
      all_rows.map { |row| row[i].to_s.length }.max
    end

    # Build table
    separator = "+" + widths.map { |w| "-" * (w + 2) }.join("+") + "+"

    lines = []
    lines << separator
    lines << "| " + headers.each_with_index.map { |h, i| h.to_s.ljust(widths[i]) }.join(" | ") + " |"
    lines << separator
    rows.each do |row|
      lines << "| " + row.each_with_index.map { |c, i| c.to_s.ljust(widths[i]) }.join(" | ") + " |"
    end
    lines << separator

    lines.join("\n")
  end
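
  # Example output shape (column widths adapt to the data; the model name and
  # values below are illustrative, not from a real run):
  #
  #   +-------------+-----------+----------+ ... +----------+---------+
  #   | Model       | Status    | Accuracy | ... | Cost ($) | Samples |
  #   +-------------+-----------+----------+ ... +----------+---------+
  #   | gpt-4o-mini | completed | 93.3%    | ... | $0.0123  | 150     |
  #   +-------------+-----------+----------+ ... +----------+---------+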

  # Export to CSV file
  def to_csv(file_path)
    require "csv"

    CSV.open(file_path, "wb") do |csv|
      csv << csv_headers
      runs.each { |run| csv << csv_row(run) }
    end

    file_path
  end

  # Generate summary with best model recommendations
  def summary
    return {} if runs.empty?

    completed_runs = runs.select { |r| r.status == "completed" && r.metrics.present? }
    return {} if completed_runs.empty?

    best_accuracy = completed_runs.max_by { |r| r.metrics["accuracy"] || 0 }
    lowest_cost = completed_runs.min_by { |r| r.total_cost || Float::INFINITY }
    fastest = completed_runs.min_by { |r| r.metrics["avg_latency_ms"] || Float::INFINITY }

    {
      best_accuracy: {
        model: best_accuracy.model,
        value: best_accuracy.metrics["accuracy"],
        run_id: best_accuracy.id
      },
      lowest_cost: {
        model: lowest_cost.model,
        value: lowest_cost.total_cost&.to_f,
        run_id: lowest_cost.id
      },
      fastest: {
        model: fastest.model,
        value: fastest.metrics["avg_latency_ms"],
        run_id: fastest.id
      },
      recommendation: generate_recommendation(best_accuracy, lowest_cost, fastest)
    }
  end
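
  # Shape of the hash returned by #summary (model names, ids and numbers are
  # illustrative, not from a real run):
  #
  #   {
  #     best_accuracy: { model: "gpt-4o-mini",  value: 93.3,   run_id: 42 },
  #     lowest_cost:   { model: "llama-3.1-8b", value: 0.0042, run_id: 17 },
  #     fastest:       { model: "llama-3.1-8b", value: 850,    run_id: 17 },
  #     recommendation: "For maximum accuracy, use gpt-4o-mini (93.3% accuracy). ..."
  #   }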

  # Generate detailed comparison between runs
  def detailed_comparison
    return {} if runs.empty?

    {
      runs: runs.map(&:summary),
      comparison: pairwise_comparisons,
      summary: summary
    }
  end

  private

    def build_headers
      [ "Model", "Status", "Accuracy", "Precision", "Recall", "F1", "Latency (ms)", "Cost ($)", "Samples" ]
    end

    def build_row(run)
      metrics = run.metrics || {}

      [
        run.model,
        run.status,
        format_percentage(metrics["accuracy"]),
        format_percentage(metrics["precision"]),
        format_percentage(metrics["recall"]),
        format_percentage(metrics["f1_score"]),
        metrics["avg_latency_ms"]&.round(0) || "-",
        format_cost(run.total_cost),
        run.results.count
      ]
    end

    def csv_headers
      [
        "Run ID", "Model", "Provider", "Dataset", "Status",
        "Accuracy", "Precision", "Recall", "F1 Score",
        "Null Accuracy", "Hierarchical Accuracy",
        "Avg Latency (ms)", "Total Cost", "Cost Per Sample",
        "Samples Processed", "Samples Correct",
        "Duration (s)", "Run Date"
      ]
    end

    def csv_row(run)
      metrics = run.metrics || {}

      [
        run.id,
        run.model,
        run.provider,
        run.dataset.name,
        run.status,
        metrics["accuracy"],
        metrics["precision"],
        metrics["recall"],
        metrics["f1_score"],
        metrics["null_accuracy"],
        metrics["hierarchical_accuracy"],
        metrics["avg_latency_ms"],
        run.total_cost&.to_f,
        metrics["cost_per_sample"],
        metrics["samples_processed"],
        metrics["samples_correct"],
        run.duration_seconds,
        run.completed_at&.iso8601
      ]
    end

    def format_percentage(value)
      return "-" if value.nil?
      "#{value}%"
    end

    def format_cost(value)
      return "-" if value.nil?
      "$#{value.to_f.round(4)}"
    end

    def pairwise_comparisons
      return [] if runs.size < 2

      comparisons = []
      runs.combination(2).each do |run1, run2|
        # Guard against runs that have not produced metrics yet (e.g. pending or failed runs)
        metrics1 = run1.metrics || {}
        metrics2 = run2.metrics || {}

        comparisons << {
          models: [ run1.model, run2.model ],
          accuracy_diff: ((metrics1["accuracy"] || 0) - (metrics2["accuracy"] || 0)).round(2),
          cost_diff: ((run1.total_cost || 0) - (run2.total_cost || 0)).to_f.round(6),
          latency_diff: ((metrics1["avg_latency_ms"] || 0) - (metrics2["avg_latency_ms"] || 0)).round(0)
        }
      end
      comparisons
    end

    def generate_recommendation(best_accuracy, lowest_cost, fastest)
      parts = []

      # If one model wins all categories
      if best_accuracy.id == lowest_cost.id && lowest_cost.id == fastest.id
        return "#{best_accuracy.model} is the best choice overall (highest accuracy, lowest cost, fastest)."
      end

      # Accuracy recommendation
      if best_accuracy.metrics["accuracy"] && best_accuracy.metrics["accuracy"] >= 90
        parts << "For maximum accuracy, use #{best_accuracy.model} (#{best_accuracy.metrics['accuracy']}% accuracy)"
      end

      # Cost recommendation if significantly cheaper
      if lowest_cost.total_cost && lowest_cost.total_cost > 0
        cost_ratio = (best_accuracy.total_cost || 0) / lowest_cost.total_cost
        if cost_ratio > 1.5
          parts << "For cost efficiency, consider #{lowest_cost.model} (#{format_cost(lowest_cost.total_cost)} vs #{format_cost(best_accuracy.total_cost)})"
        end
      end

      # Speed recommendation
      if fastest.metrics["avg_latency_ms"] && fastest.id != best_accuracy.id
        latency_ratio = (best_accuracy.metrics["avg_latency_ms"] || 0) / (fastest.metrics["avg_latency_ms"] || 1)
        if latency_ratio > 1.5
          parts << "For speed, consider #{fastest.model} (#{fastest.metrics['avg_latency_ms']}ms vs #{best_accuracy.metrics['avg_latency_ms']}ms)"
        end
      end

      parts.empty? ? "All models perform similarly." : parts.join(". ")
    end
end
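
A minimal usage sketch, assuming the run records are Eval::Run ActiveRecord models (the class name, the query, and the output path below are assumptions inferred from context, not taken from this file):

runs = Eval::Run.where(status: "completed")           # assumed model/scope; any enumerable of runs works
reporter = Eval::Reporters::ComparisonReporter.new(runs)

puts reporter.to_table                                 # ASCII comparison table for the terminal
reporter.to_csv("tmp/model_comparison.csv")            # one CSV row per run (path is illustrative)
pp reporter.summary                                    # best accuracy / lowest cost / fastest + recommendation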