# sure/app/models/eval/run.rb

# One evaluation run of a provider/model pair against an Eval::Dataset.
#
# Records are stored in the "eval_runs" table. #start!, #complete! and #fail!
# set the status to "running", "completed" and "failed" respectively, and the
# run owns the Eval::Result rows linked to it through eval_run_id (they are
# destroyed together with the run).
class Eval::Run < ApplicationRecord
  self.table_name = "eval_runs"

  # Every value the status column may hold. Kept in one frozen constant so the
  # inclusion validation does not carry its own copy of the list.
  STATUSES = %w[pending running completed failed].freeze

  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_run_id, dependent: :destroy

  validates :provider, :model, :status, presence: true
  validates :status, inclusion: { in: STATUSES }

  # One scope per status value, plus filters on the model and provider columns.
  scope :pending, -> { where(status: "pending") }
  scope :running, -> { where(status: "running") }
  scope :completed, -> { where(status: "completed") }
  scope :failed, -> { where(status: "failed") }
  scope :for_model, ->(model) { where(model: model) }
  scope :for_provider, ->(provider) { where(provider: provider) }

  # Whole seconds between started_at and completed_at (fraction truncated).
  #
  # Returns nil while either timestamp is missing.
  def duration_seconds
    return nil unless started_at && completed_at
    (completed_at - started_at).to_i
  end

  # Accuracy of the run.
  #
  # Prefers the "accuracy" entry stored in metrics. When metrics is nil or has
  # no such entry, falls back to a percentage computed from the results.
  # NOTE(review): the stored value is presumably on the same 0-100 scale as the
  # computed fallback - confirm against whatever builds the metrics.
  def accuracy
    # Safe navigation: a run without metrics must fall through to the
    # computed value instead of raising NoMethodError on nil.
    metrics&.dig("accuracy") || calculate_accuracy
  end

  # Mark the run as running and stamp started_at with the current time.
  #
  # Uses update!, so a record that fails to save raises.
  def start!
    update!(status: "running", started_at: Time.current)
  end

  # Mark the run as completed.
  #
  # Stores calculated_metrics in the metrics column, stamps completed_at, and
  # rolls the results' prompt_tokens, completion_tokens and cost columns up
  # into the run's total_* columns.
  #
  # Uses update!, so a record that fails to save raises.
  def complete!(calculated_metrics)
    update!(
      status: "completed",
      completed_at: Time.current,
      metrics: calculated_metrics,
      total_prompt_tokens: results.sum(:prompt_tokens),
      total_completion_tokens: results.sum(:completion_tokens),
      total_cost: results.sum(:cost)
    )
  end

  # Mark the run as failed and stamp completed_at.
  #
  # error may be an Exception, recorded as "ClassName: message", or any other
  # object, recorded via to_s.
  #
  # Uses update!, so a record that fails to save raises.
  def fail!(error)
    update!(
      status: "failed",
      completed_at: Time.current,
      error_message: error.is_a?(Exception) ? "#{error.class}: #{error.message}" : error.to_s
    )
  end

  # Hash of the run's headline attributes for display.
  #
  # samples_processed and samples_correct are counted from the results at call
  # time; total_cost is converted to a Float when present.
  def summary
    {
      id: id,
      name: name,
      dataset: dataset.name,
      model: model,
      provider: provider,
      status: status,
      accuracy: accuracy,
      total_cost: total_cost&.to_f,
      duration: duration_seconds,
      samples_processed: results.count,
      samples_correct: results.where(correct: true).count,
      created_at: created_at
    }
  end

  # Compare this run to other_run.
  #
  # Both diffs are "this run minus other_run"; a missing accuracy or
  # total_cost on either side is treated as 0.
  def compare_to(other_run)
    {
      accuracy_diff: (accuracy || 0) - (other_run.accuracy || 0),
      cost_diff: (total_cost || 0) - (other_run.total_cost || 0),
      this_model: model,
      other_model: other_run.model
    }
  end

  private

    # Percentage (0-100, rounded to two decimals) of results flagged correct.
    #
    # Returns 0.0 when the run has no results. The total is counted once and
    # reused as both the guard and the denominator, so the division can never
    # be by zero and only two COUNT queries are issued.
    def calculate_accuracy
      total = results.count
      return 0.0 if total.zero?

      correct = results.where(correct: true).count
      (correct.to_f / total * 100).round(2)
    end
end