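# Compares Eval::Run records across models and reports the results as a
# terminal table (#to_table), a CSV file (#to_csv), or summary hashes with
# best-model recommendations (#summary, #detailed_comparison).
#
# Usage sketch (the `runs` lookup below is illustrative; any collection of
# run records responding to #model, #status, #metrics, #total_cost, etc.
# will work):
#
#   runs = Eval::Run.where(dataset: dataset)
#   reporter = Eval::Reporters::ComparisonReporter.new(runs)
#   puts reporter.to_table
#   reporter.to_csv("tmp/model_comparison.csv")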
class Eval::Reporters::ComparisonReporter
  attr_reader :runs

  def initialize(runs)
    @runs = Array(runs).sort_by(&:model)
  end

  # Generate a text table for terminal display
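  #
  # Illustrative output (column widths adapt to the data):
  #
  #   +-------+--------+----------+ ...
  #   | Model | Status | Accuracy | ...
  #   +-------+--------+----------+ ...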
  def to_table
    return "No runs to compare" if runs.empty?

    headers = build_headers
    rows = runs.map { |run| build_row(run) }

    # Calculate column widths
    all_rows = [ headers ] + rows
    widths = headers.each_index.map do |i|
      all_rows.map { |row| row[i].to_s.length }.max
    end

    # Build table
    separator = "+" + widths.map { |w| "-" * (w + 2) }.join("+") + "+"

    lines = []
    lines << separator
    lines << "| " + headers.each_with_index.map { |h, i| h.to_s.ljust(widths[i]) }.join(" | ") + " |"
    lines << separator

    rows.each do |row|
      lines << "| " + row.each_with_index.map { |c, i| c.to_s.ljust(widths[i]) }.join(" | ") + " |"
    end

    lines << separator
    lines.join("\n")
  end

  # Export to CSV file
  def to_csv(file_path)
    require "csv"

    CSV.open(file_path, "wb") do |csv|
      csv << csv_headers
      runs.each { |run| csv << csv_row(run) }
    end

    file_path
  end

  # Generate summary with best model recommendations
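  #
  # Illustrative return shape (keys mirror the hash built below; values are
  # hypothetical):
  #
  #   {
  #     best_accuracy: { model: "...", value: 94.2, run_id: "..." },
  #     lowest_cost: { model: "...", value: 0.0312, run_id: "..." },
  #     fastest: { model: "...", value: 850, run_id: "..." },
  #     recommendation: "For maximum accuracy, use ..."
  #   }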
  def summary
    return {} if runs.empty?

    completed_runs = runs.select { |r| r.status == "completed" && r.metrics.present? }
    return {} if completed_runs.empty?

    best_accuracy = completed_runs.max_by { |r| r.metrics["accuracy"] || 0 }
    lowest_cost = completed_runs.min_by { |r| r.total_cost || Float::INFINITY }
    fastest = completed_runs.min_by { |r| r.metrics["avg_latency_ms"] || Float::INFINITY }

    {
      best_accuracy: {
        model: best_accuracy.model,
        value: best_accuracy.metrics["accuracy"],
        run_id: best_accuracy.id
      },
      lowest_cost: {
        model: lowest_cost.model,
        value: lowest_cost.total_cost&.to_f,
        run_id: lowest_cost.id
      },
      fastest: {
        model: fastest.model,
        value: fastest.metrics["avg_latency_ms"],
        run_id: fastest.id
      },
      recommendation: generate_recommendation(best_accuracy, lowest_cost, fastest)
    }
  end

  # Generate detailed comparison between runs
  def detailed_comparison
    return {} if runs.empty?

    {
      runs: runs.map(&:summary),
      comparison: pairwise_comparisons,
      summary: summary
    }
  end

  private

  def build_headers
    [ "Model", "Status", "Accuracy", "Precision", "Recall", "F1", "Latency (ms)", "Cost ($)", "Samples" ]
  end
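
  # Build one display row per run. Metrics can be nil for runs that have not
  # completed, so missing values render as "-".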
  def build_row(run)
    metrics = run.metrics || {}

    [
      run.model,
      run.status,
      format_percentage(metrics["accuracy"]),
      format_percentage(metrics["precision"]),
      format_percentage(metrics["recall"]),
      format_percentage(metrics["f1_score"]),
      metrics["avg_latency_ms"]&.round(0) || "-",
      format_cost(run.total_cost),
      run.results.count
    ]
  end

  def csv_headers
    [
      "Run ID", "Model", "Provider", "Dataset", "Status",
      "Accuracy", "Precision", "Recall", "F1 Score",
      "Null Accuracy", "Hierarchical Accuracy",
      "Avg Latency (ms)", "Total Cost", "Cost Per Sample",
      "Samples Processed", "Samples Correct",
      "Duration (s)", "Run Date"
    ]
  end
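
  # Raw values for one CSV row. Unlike build_row, values are left unformatted
  # (no "%" or "$" strings) so the export stays machine-readable.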
  def csv_row(run)
    metrics = run.metrics || {}

    [
      run.id,
      run.model,
      run.provider,
      run.dataset.name,
      run.status,
      metrics["accuracy"],
      metrics["precision"],
      metrics["recall"],
      metrics["f1_score"],
      metrics["null_accuracy"],
      metrics["hierarchical_accuracy"],
      metrics["avg_latency_ms"],
      run.total_cost&.to_f,
      metrics["cost_per_sample"],
      metrics["samples_processed"],
      metrics["samples_correct"],
      run.duration_seconds,
      run.completed_at&.iso8601
    ]
  end

  def format_percentage(value)
    return "-" if value.nil?
    "#{value}%"
  end

  def format_cost(value)
    return "-" if value.nil?
    "$#{value.to_f.round(4)}"
  end
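
  # Metric deltas for every pair of runs, computed as the first run minus the
  # second (pairs follow model-name order, since runs are sorted by model).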
  def pairwise_comparisons
    return [] if runs.size < 2

    comparisons = []
    runs.combination(2).each do |run1, run2|
      # Guard against nil metrics on runs that have not completed
      metrics1 = run1.metrics || {}
      metrics2 = run2.metrics || {}

      comparisons << {
        models: [ run1.model, run2.model ],
        accuracy_diff: ((metrics1["accuracy"] || 0) - (metrics2["accuracy"] || 0)).round(2),
        cost_diff: ((run1.total_cost || 0) - (run2.total_cost || 0)).to_f.round(6),
        latency_diff: ((metrics1["avg_latency_ms"] || 0) - (metrics2["avg_latency_ms"] || 0)).round(0)
      }
    end
    comparisons
  end
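
  # Compose a recommendation string. Accuracy is highlighted when it reaches
  # 90%; a cheaper or faster alternative is suggested only when it beats the
  # most accurate model by more than a 1.5x ratio.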
  def generate_recommendation(best_accuracy, lowest_cost, fastest)
    parts = []

    # If one model wins all categories
    if best_accuracy.id == lowest_cost.id && lowest_cost.id == fastest.id
      return "#{best_accuracy.model} is the best choice overall (highest accuracy, lowest cost, fastest)."
    end

    # Accuracy recommendation
    if best_accuracy.metrics["accuracy"] && best_accuracy.metrics["accuracy"] >= 90
      parts << "For maximum accuracy, use #{best_accuracy.model} (#{best_accuracy.metrics['accuracy']}% accuracy)"
    end

    # Cost recommendation if significantly cheaper
    if lowest_cost.total_cost && lowest_cost.total_cost > 0
      cost_ratio = (best_accuracy.total_cost || 0) / lowest_cost.total_cost
      if cost_ratio > 1.5
        parts << "For cost efficiency, consider #{lowest_cost.model} (#{format_cost(lowest_cost.total_cost)} vs #{format_cost(best_accuracy.total_cost)})"
      end
    end

    # Speed recommendation
    if fastest.metrics["avg_latency_ms"] && fastest.id != best_accuracy.id
      latency_ratio = (best_accuracy.metrics["avg_latency_ms"] || 0) / (fastest.metrics["avg_latency_ms"] || 1)
      if latency_ratio > 1.5
        parts << "For speed, consider #{fastest.model} (#{fastest.metrics['avg_latency_ms']}ms vs #{best_accuracy.metrics['avg_latency_ms']}ms)"
      end
    end

    parts.empty? ? "All models perform similarly." : parts.join(". ") + "."
  end
end