Small LLMs improvements (#400)

* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
soky srm
2025-12-07 18:11:34 +01:00
committed by GitHub
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions

View File

@@ -0,0 +1,68 @@
class Eval::Metrics::Base
  # Shared scaffolding for eval-run metric calculators. Subclasses implement
  # #calculate and build on the protected aggregate helpers below.
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  # Subclass hook: must return a hash of computed metrics.
  def calculate
    raise NotImplementedError, "Subclasses must implement #calculate"
  end

  protected

  # Results for this run, eager-loading the associated sample. Memoized.
  def results
    @results ||= eval_run.results.includes(:sample)
  end

  # All samples in the run's dataset. Memoized.
  def samples
    @samples ||= eval_run.dataset.samples
  end

  def total_count
    results.count
  end

  def correct_count
    results.where(correct: true).count
  end

  def incorrect_count
    results.where(correct: false).count
  end

  # Overall accuracy as a percentage, 0.0 when there are no results.
  def accuracy
    total = total_count
    return 0.0 if total.zero?
    ((correct_count.to_f / total) * 100).round(2)
  end

  # Mean latency in ms rounded to an integer; nil when there are no results
  # (or when no latency values are recorded).
  def avg_latency_ms
    return nil if total_count.zero?
    results.average(:latency_ms)&.round(0)
  end

  # Summed cost rounded to 6 decimal places.
  def total_cost
    results.sum(:cost)&.to_f&.round(6)
  end

  # Average cost per result; nil when there are no results.
  def cost_per_sample
    count = total_count
    return nil if count.zero?
    (total_cost / count).round(6)
  end

  # Accuracy breakdown keyed by sample difficulty. Difficulties with no
  # results are omitted from the returned hash.
  def metrics_by_difficulty
    %w[easy medium hard edge_case].each_with_object({}) do |difficulty, breakdown|
      scoped = results.joins(:sample).where(eval_samples: { difficulty: difficulty })
      next if scoped.empty?

      total = scoped.count
      correct = scoped.where(correct: true).count
      breakdown[difficulty] = {
        count: total,
        correct: correct,
        accuracy: (correct.to_f / total * 100).round(2)
      }
    end
  end
end

View File

@@ -0,0 +1,101 @@
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  # Metrics for a categorization eval run: overall/exact/hierarchical
  # accuracy, precision/recall/F1 treating null predictions as negatives,
  # plus per-difficulty and per-category breakdowns.
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Percentage of results that exactly match the primary expected category.
  def exact_match_accuracy
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Number of results that matched an alternative (but not primary) category.
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy restricted to samples where null was the expected answer.
  # Vacuously 100.0 when no sample expects null.
  def null_accuracy
    null_expected_results = results.where(null_expected: true)
    return 100.0 if null_expected_results.empty?
    correct = null_expected_results.where(null_returned: true).count
    total = null_expected_results.count
    (correct.to_f / total * 100).round(2)
  end

  # Percentage of results that match at the hierarchical level
  # (exact matches included).
  def hierarchical_accuracy
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  # TP: correct non-null predictions. Memoized because both #precision and
  # #recall need it (avoids re-issuing the same COUNT query).
  def true_positive_count
    @true_positive_count ||= results.where(correct: true, null_returned: false).count
  end

  # True positives / (True positives + False positives), as a percentage.
  # FP: incorrect non-null predictions (predicted the wrong category).
  def precision
    false_positives = results.where(correct: false, null_returned: false).count
    denominator = true_positive_count + false_positives
    return 0.0 if denominator.zero?
    (true_positive_count.to_f / denominator * 100).round(2)
  end

  # True positives / (True positives + False negatives), as a percentage.
  # FN: returned null when a category was expected.
  def recall
    false_negatives = results.where(null_expected: false, null_returned: true).count
    denominator = true_positive_count + false_negatives
    return 0.0 if denominator.zero?
    (true_positive_count.to_f / denominator * 100).round(2)
  end

  # Harmonic mean of precision and recall. Precision/recall are captured in
  # locals so each is computed (and its queries issued) exactly once.
  def f1_score
    p = precision
    r = recall
    return 0.0 if p.zero? || r.zero?
    (2 * p * r / (p + r)).round(2)
  end

  # Per-expected-category accuracy. Samples with no expected category are
  # grouped under the "null" key.
  def metrics_by_category
    category_metrics = {}
    results.includes(:sample).each do |result|
      expected = result.sample.expected_category_name || "null"
      category_metrics[expected] ||= { correct: 0, total: 0 }
      category_metrics[expected][:total] += 1
      category_metrics[expected][:correct] += 1 if result.correct
    end
    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end

View File

@@ -0,0 +1,125 @@
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
  # Metrics for a chat/function-calling eval run: function selection and
  # parameter accuracy, response relevance, error rates, and a per-function
  # breakdown.
  def calculate
    {
      accuracy: accuracy,
      function_selection_accuracy: function_selection_accuracy,
      parameter_accuracy: parameter_accuracy,
      response_relevance: response_relevance,
      exact_match_rate: exact_match_rate,
      error_rate: error_rate,
      avg_functions_per_response: avg_functions_per_response,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_function: metrics_by_function
    }
  end

  private

  # Results whose metadata records no error (the model call succeeded).
  # Memoized; shared by the three accuracy methods below. Uses a positive
  # `IS NULL` predicate instead of the previous double negative
  # `where.not("... IS NOT NULL")` — same rows, clearer SQL.
  def valid_results
    @valid_results ||= results.where("metadata->>'error' IS NULL")
  end

  # Percentage of error-free samples where the correct functions were called.
  def function_selection_accuracy
    return 0.0 if valid_results.empty?
    correct = valid_results.count do |r|
      r.metadata.dig("function_selection_correct") == true
    end
    (correct.to_f / valid_results.count * 100).round(2)
  end

  # Mean per-sample parameter accuracy across error-free samples, as a
  # percentage. Samples with no recorded score contribute 0.0.
  def parameter_accuracy
    return 0.0 if valid_results.empty?
    scores = valid_results.map do |r|
      r.metadata.dig("parameter_accuracy") || 0.0
    end
    (scores.sum / scores.size * 100).round(2)
  end

  # Percentage of error-free samples whose response contained the expected
  # keywords. Samples expecting no keywords count as relevant.
  def response_relevance
    return 0.0 if valid_results.empty?
    correct = valid_results.count do |r|
      expected_keywords = r.metadata.dig("expected_keywords") || []
      expected_keywords.empty? || r.metadata.dig("response_keywords_found") == true
    end
    (correct.to_f / valid_results.count * 100).round(2)
  end

  def exact_match_rate
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Percentage of results carrying an error in either metadata or output.
  def error_rate
    return 0.0 if total_count.zero?
    errors = results.count do |r|
      r.metadata.dig("error").present? || r.actual_output.dig("error").present?
    end
    (errors.to_f / total_count * 100).round(2)
  end

  # Mean number of functions invoked per error-free response. Note this
  # filters on actual_output's error key, not metadata's.
  def avg_functions_per_response
    scoped = results.where("actual_output->>'error' IS NULL")
    return 0.0 if scoped.empty?
    total_functions = scoped.sum do |r|
      functions = r.actual_output.dig("functions") || []
      functions.size
    end
    (total_functions.to_f / scoped.count).round(2)
  end

  # Per-expected-function accuracy and average parameter accuracy.
  # A function counts as correct when any actually-called function matches
  # its (normalized) name.
  def metrics_by_function
    function_metrics = {}
    results.includes(:sample).each do |result|
      expected_functions = result.sample.expected_functions
      expected_functions.each do |func|
        name = func["name"]
        next if name.nil?
        function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 }
        function_metrics[name][:total] += 1
        actual_functions = result.actual_output.dig("functions") || []
        if actual_functions.any? { |f| normalize_name(f["name"]) == normalize_name(name) }
          function_metrics[name][:correct] += 1
          function_metrics[name][:param_accuracy_sum] += (result.metadata.dig("parameter_accuracy") || 0.0)
        end
      end
    end
    function_metrics.transform_values do |metrics|
      {
        total: metrics[:total],
        correct: metrics[:correct],
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2),
        avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum] / metrics[:correct] * 100).round(2) : 0.0
      }
    end
  end

  # Case-/style-insensitive function-name comparison key.
  def normalize_name(name)
    return nil if name.nil?
    name.to_s.underscore.downcase
  end
end

View File

@@ -0,0 +1,107 @@
class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base
  # Metrics for a merchant-detection eval run: exact and fuzzy name accuracy,
  # URL accuracy, and false positive/negative rates for null handling.

  # Minimum fuzzy score for a name to count as a fuzzy match.
  FUZZY_MATCH_THRESHOLD = 0.8

  def calculate
    {
      accuracy: accuracy,
      name_accuracy: name_accuracy,
      fuzzy_name_accuracy: fuzzy_name_accuracy,
      url_accuracy: url_accuracy,
      false_positive_rate: false_positive_rate,
      false_negative_rate: false_negative_rate,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      avg_fuzzy_score: avg_fuzzy_score,
      by_difficulty: metrics_by_difficulty
    }
  end

  private

  # Results whose sample expects a non-blank business name. Memoized so
  # #name_accuracy and #fuzzy_name_accuracy share one load/filter pass.
  def name_expected_results
    @name_expected_results ||= results.includes(:sample).select do |r|
      r.sample.expected_business_name.present?
    end
  end

  # Exact name match accuracy for samples expecting a name.
  # Vacuously 100.0 when no sample expects one.
  def name_accuracy
    return 100.0 if name_expected_results.empty?
    correct = name_expected_results.count do |r|
      r.actual_output["business_name"] == r.sample.expected_business_name
    end
    (correct.to_f / name_expected_results.size * 100).round(2)
  end

  # Fuzzy name match accuracy (fuzzy_score >= FUZZY_MATCH_THRESHOLD).
  def fuzzy_name_accuracy
    return 100.0 if name_expected_results.empty?
    correct = name_expected_results.count do |r|
      (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD
    end
    (correct.to_f / name_expected_results.size * 100).round(2)
  end

  # URL match accuracy (after normalization) for samples expecting a URL.
  def url_accuracy
    url_results = results.includes(:sample).select do |r|
      r.sample.expected_business_url.present?
    end
    return 100.0 if url_results.empty?
    correct = url_results.count do |r|
      normalize_url(r.actual_output["business_url"]) == normalize_url(r.sample.expected_business_url)
    end
    (correct.to_f / url_results.size * 100).round(2)
  end

  # Rate of returning a merchant when null was expected.
  def false_positive_rate
    null_expected_results = results.where(null_expected: true)
    return 0.0 if null_expected_results.empty?
    false_positives = null_expected_results.where(null_returned: false).count
    (false_positives.to_f / null_expected_results.count * 100).round(2)
  end

  # Rate of returning null when a merchant was expected.
  def false_negative_rate
    merchant_expected_results = results.where(null_expected: false)
    return 0.0 if merchant_expected_results.empty?
    false_negatives = merchant_expected_results.where(null_returned: true).count
    (false_negatives.to_f / merchant_expected_results.count * 100).round(2)
  end

  # Mean fuzzy score over results that have one; nil when none do.
  def avg_fuzzy_score
    scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score)
    return nil if scores.empty?
    (scores.sum / scores.size).round(4)
  end

  # Canonicalize a URL for comparison: trim whitespace FIRST (previously it
  # was trimmed last, so "https://x.com/ " kept its trailing slash), then
  # lowercase, drop scheme and leading "www.", and drop a trailing slash.
  # Anchored with \A so only a leading scheme/www is stripped.
  def normalize_url(url)
    return nil if url.nil?
    url.to_s.strip.downcase
       .gsub(%r{\A(https?://)?(www\.)?}, "")
       .chomp("/")
  end
end