Mirror of https://github.com/we-promise/sure.git (synced 2026-04-08 14:54:49 +00:00)
* Initial implementation
* FIX keys
* Add langfuse evals support
* FIX trace upload
* Delete .claude/settings.local.json
  Signed-off-by: soky srm <sokysrm@gmail.com>
* Update client.rb
* Small LLMs improvements
* Keep batch size normal
* Update categorizer
* FIX json mode
* Add reasonable alternative to matching
* FIX thinking blocks for llms
* Implement json mode support with AUTO mode
* Make auto default for everyone
* FIX linter
* Address review
* Allow export manual categories
* FIX user export
* FIX oneshot example pollution
* Update categorization_golden_v1.yml
* Update categorization_golden_v1.yml
* Trim to 100 items
* Update auto_categorizer.rb
* FIX for auto retry in auto mode
* Separate the Eval Logic from the Auto-Categorizer
  The expected_null_count parameter conflates eval-specific logic with production categorization logic.
* Force json mode on evals
* Introduce a more mixed dataset
  150 items, performance from a local model:
  By Difficulty:
    easy: 93.22% accuracy (55/59)
    medium: 93.33% accuracy (42/45)
    hard: 92.86% accuracy (26/28)
    edge_case: 100.0% accuracy (18/18)
* Improve datasets
  Remove Data leakage from prompts
* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
102 lines
3.4 KiB
Ruby
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end
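
  # Illustrative only: the values below are made up, and helpers such as
  # accuracy, total_count, correct_count, avg_latency_ms, total_cost,
  # cost_per_sample and metrics_by_difficulty are presumably provided by
  # Eval::Metrics::Base. A #calculate result might look like:
  #   {
  #     accuracy: 93.33, exact_match_accuracy: 90.0, alternative_match_count: 5,
  #     precision: 94.12, recall: 91.43, f1_score: 92.75,
  #     null_accuracy: 100.0, hierarchical_accuracy: 95.33,
  #     samples_processed: 150, samples_correct: 140,
  #     avg_latency_ms: 412.6, total_cost: 0.0, cost_per_sample: 0.0,
  #     by_difficulty: { "easy" => { correct: 55, total: 59, accuracy: 93.22 } },
  #     by_category: { "Groceries" => { correct: 12, total: 13, accuracy: 92.31 } }
  #   }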

  private

  def exact_match_accuracy
    # Percentage of results that exactly match the primary expected category
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  def alternative_match_count
    # Number of results that matched an alternative (but not primary) category
    results.where(alternative_match: true).count
  end

  def null_accuracy
    # Accuracy for samples where null was expected
    null_expected_results = results.where(null_expected: true)
    return 100.0 if null_expected_results.empty?

    correct = null_expected_results.where(null_returned: true).count
    total = null_expected_results.count

    (correct.to_f / total * 100).round(2)
  end

  def hierarchical_accuracy
    # Percentage of results that match at hierarchical level (including exact matches)
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  def precision
    # True positives / (True positives + False positives)
    # TP: Correct non-null predictions
    # FP: Incorrect non-null predictions (predicted wrong category)
    true_positives = results.where(correct: true, null_returned: false).count
    false_positives = results.where(correct: false, null_returned: false).count

    denominator = true_positives + false_positives
    return 0.0 if denominator.zero?

    (true_positives.to_f / denominator * 100).round(2)
  end

  def recall
    # True positives / (True positives + False negatives)
    # TP: Correct non-null predictions
    # FN: Incorrectly returned null when category was expected
    true_positives = results.where(correct: true, null_returned: false).count
    false_negatives = results.where(null_expected: false, null_returned: true).count

    denominator = true_positives + false_negatives
    return 0.0 if denominator.zero?

    (true_positives.to_f / denominator * 100).round(2)
  end

  def f1_score
    return 0.0 if precision.zero? || recall.zero?
    (2 * precision * recall / (precision + recall)).round(2)
  end
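
  # Worked example with hypothetical counts: 80 correct non-null predictions,
  # 15 wrong-category predictions and 5 nulls returned where a category was
  # expected give
  #   precision = 80 / (80 + 15) * 100 ≈ 84.21
  #   recall    = 80 / (80 + 5)  * 100 ≈ 94.12
  #   f1_score  = 2 * 84.21 * 94.12 / (84.21 + 94.12) ≈ 88.89
  # Precision and recall are already percentages here, so the harmonic mean
  # stays on the same 0-100 scale.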

  def metrics_by_category
    # Group results by expected category and calculate accuracy
    category_metrics = {}

    results.includes(:sample).each do |result|
      expected = result.sample.expected_category_name || "null"

      category_metrics[expected] ||= { correct: 0, total: 0 }
      category_metrics[expected][:total] += 1
      category_metrics[expected][:correct] += 1 if result.correct
    end

    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end
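
A minimal usage sketch (not part of the file above): it assumes Eval::Metrics::Base takes an eval run in its constructor and supplies the results relation plus the accuracy, cost and latency helpers referenced in the class; eval_run and the printed report are illustrative only.

# Hypothetical wrap-up step after an eval run finishes.
metrics = Eval::Metrics::CategorizationMetrics.new(eval_run)
report  = metrics.calculate

puts "Accuracy: #{report[:accuracy]}%  F1: #{report[:f1_score]}"
report[:by_category].each do |category, stats|
  puts "  #{category}: #{stats[:accuracy]}% (#{stats[:correct]}/#{stats[:total]})"
end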