mirror of
https://github.com/we-promise/sure.git
synced 2026-04-07 22:34:47 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
89 lines
2.4 KiB
Ruby
89 lines
2.4 KiB
Ruby
class Eval::Run < ApplicationRecord
|
|
self.table_name = "eval_runs"
|
|
|
|
belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
|
|
has_many :results, class_name: "Eval::Result", foreign_key: :eval_run_id, dependent: :destroy
|
|
|
|
validates :provider, :model, :status, presence: true
|
|
validates :status, inclusion: { in: %w[pending running completed failed] }
|
|
|
|
scope :pending, -> { where(status: "pending") }
|
|
scope :running, -> { where(status: "running") }
|
|
scope :completed, -> { where(status: "completed") }
|
|
scope :failed, -> { where(status: "failed") }
|
|
scope :for_model, ->(model) { where(model: model) }
|
|
scope :for_provider, ->(provider) { where(provider: provider) }
|
|
|
|
# Calculate duration in seconds
|
|
def duration_seconds
|
|
return nil unless started_at && completed_at
|
|
(completed_at - started_at).to_i
|
|
end
|
|
|
|
# Get accuracy from metrics or calculate
|
|
def accuracy
|
|
metrics.dig("accuracy") || calculate_accuracy
|
|
end
|
|
|
|
# Start the evaluation run
|
|
def start!
|
|
update!(status: "running", started_at: Time.current)
|
|
end
|
|
|
|
# Complete the evaluation run with metrics
|
|
def complete!(calculated_metrics)
|
|
update!(
|
|
status: "completed",
|
|
completed_at: Time.current,
|
|
metrics: calculated_metrics,
|
|
total_prompt_tokens: results.sum(:prompt_tokens),
|
|
total_completion_tokens: results.sum(:completion_tokens),
|
|
total_cost: results.sum(:cost)
|
|
)
|
|
end
|
|
|
|
# Fail the evaluation run
|
|
def fail!(error)
|
|
update!(
|
|
status: "failed",
|
|
completed_at: Time.current,
|
|
error_message: error.is_a?(Exception) ? "#{error.class}: #{error.message}" : error.to_s
|
|
)
|
|
end
|
|
|
|
# Summary for display
|
|
def summary
|
|
{
|
|
id: id,
|
|
name: name,
|
|
dataset: dataset.name,
|
|
model: model,
|
|
provider: provider,
|
|
status: status,
|
|
accuracy: accuracy,
|
|
total_cost: total_cost&.to_f,
|
|
duration: duration_seconds,
|
|
samples_processed: results.count,
|
|
samples_correct: results.where(correct: true).count,
|
|
created_at: created_at
|
|
}
|
|
end
|
|
|
|
# Compare this run to another
|
|
def compare_to(other_run)
|
|
{
|
|
accuracy_diff: (accuracy || 0) - (other_run.accuracy || 0),
|
|
cost_diff: (total_cost || 0) - (other_run.total_cost || 0),
|
|
this_model: model,
|
|
other_model: other_run.model
|
|
}
|
|
end
|
|
|
|
private
|
|
|
|
def calculate_accuracy
|
|
return 0.0 if results.empty?
|
|
(results.where(correct: true).count.to_f / results.count * 100).round(2)
|
|
end
|
|
end
|