Small llms improvements (#400)

* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2026-04-20 04:24:06 +00:00 · 2025-12-07 18:11:34 +01:00
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions
--- a/app/models/eval/runners/merchant_detection_runner.rb
+++ b/app/models/eval/runners/merchant_detection_runner.rb
@@ -0,0 +1,199 @@
+# Eval runner for merchant detection: sends eval samples to the provider's
+# `auto_detect_merchants` in batches and scores each returned business
+# name/URL against the values expected by the sample.
+#
+# `samples`, `provider`, `model`, `eval_run`, `log_progress` and `record_result`
+# are not defined in this class; presumably they come from Eval::Runners::Base.
+class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base
+  BATCH_SIZE = 25  # Matches Provider::Openai limit
+  FUZZY_MATCH_THRESHOLD = 0.8 # Minimum name similarity (0.0-1.0) counted as a match
+  # Optional scheme and "www." prefix, anchored to the start of the whole string.
+  URL_PREFIX = %r{\A(?:https?://)?(?:www\.)?}
+
+  protected
+
+    # Runs every sample through the provider in slices of BATCH_SIZE,
+    # logging progress before each batch.
+    def process_samples
+      all_samples = samples.to_a
+      total_batches = (all_samples.size.to_f / BATCH_SIZE).ceil
+      log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}")
+
+      all_samples.each_slice(BATCH_SIZE).with_index(1) do |batch, batch_number|
+        log_progress("Processing batch #{batch_number}/#{total_batches}")
+        process_batch(batch)
+      end
+    end
+
+    # Delegates metric calculation for this run to MerchantDetectionMetrics.
+    def calculate_metrics
+      Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate
+    end
+
+  private
+
+    # Sends one batch to the provider and records a result per sample.
+    #
+    # batch_samples - non-empty Array of samples (each_slice never yields an
+    #                 empty slice, so the latency division below is safe).
+    #
+    # A failed response or a raised StandardError is recorded as an error
+    # result for every sample in the batch.
+    def process_batch(batch_samples)
+      # Build inputs for the provider
+      transactions = batch_samples.map { |sample| sample.to_transaction_input.merge(id: sample.id) }
+
+      # Get merchants from first sample's context (should be shared)
+      # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys
+      # A missing context is treated as "no known merchants" instead of raising.
+      merchants = (batch_samples.first.merchants_context || []).map(&:deep_symbolize_keys)
+
+      started_at = monotonic_ms
+
+      # NOTE(review): the rescue also covers record_batch_results, so a failure
+      # while recording mid-batch re-records the whole batch as errors -
+      # confirm record_result tolerates that.
+      begin
+        response = provider.auto_detect_merchants(
+          transactions: transactions,
+          user_merchants: merchants,
+          model: model
+        )
+
+        per_sample_latency = per_sample_latency_since(started_at, batch_samples.size)
+
+        if response.success?
+          record_batch_results(batch_samples, response.data, per_sample_latency)
+        else
+          record_batch_errors(batch_samples, response.error, per_sample_latency)
+        end
+      rescue => e
+        record_batch_errors(batch_samples, e, per_sample_latency_since(started_at, batch_samples.size))
+      end
+    end
+
+    # Current monotonic clock reading in (fractional) milliseconds. Unlike the
+    # wall clock it never jumps, so differences are always valid durations.
+    def monotonic_ms
+      Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
+    end
+
+    # Whole milliseconds elapsed since `started_at_ms`, split evenly (integer
+    # division) across `sample_count` samples.
+    def per_sample_latency_since(started_at_ms, sample_count)
+      (monotonic_ms - started_at_ms).to_i / sample_count
+    end
+
+    # Expected [name, url] for a sample, with blank / "null" values folded to
+    # nil exactly like the actual values, so both sides compare consistently.
+    def expected_merchant(sample)
+      [ normalize_null(sample.expected_business_name), normalize_null(sample.expected_business_url) ]
+    end
+
+    # Scores every sample of a successful batch and records the outcome.
+    #
+    # merchants_detected - Enumerable of detections responding to
+    #                      transaction_id, business_name and business_url.
+    # A sample without a matching detection is scored as "nothing returned".
+    def record_batch_results(batch_samples, merchants_detected, per_sample_latency)
+      # Index detections once; `||=` keeps the first detection per id, which is
+      # what a linear `find` would have returned.
+      detections_by_id = merchants_detected.each_with_object({}) do |detection, index|
+        index[detection.transaction_id.to_s] ||= detection
+      end
+
+      batch_samples.each do |sample|
+        detection = detections_by_id[sample.id.to_s]
+
+        actual_name = normalize_null(detection&.business_name)
+        actual_url = normalize_null(detection&.business_url)
+        expected_name, expected_url = expected_merchant(sample)
+
+        # Overall correct if both name and URL match expectations
+        correct = evaluate_name_match(actual_name, expected_name) &&
+          evaluate_url_match(actual_url, expected_url)
+
+        # Exact match: identical name and identical URL after URL normalization
+        exact_match = actual_name == expected_name &&
+          normalize_url(actual_url) == normalize_url(expected_url)
+
+        record_result(
+          sample: sample,
+          actual_output: { "business_name" => actual_name, "business_url" => actual_url },
+          correct: correct,
+          exact_match: exact_match,
+          fuzzy_score: calculate_fuzzy_score(actual_name, expected_name),
+          null_expected: expected_name.nil? && expected_url.nil?,
+          null_returned: actual_name.nil? && actual_url.nil?,
+          latency_ms: per_sample_latency
+        )
+      end
+    end
+
+    # Records a failed result for every sample of a batch.
+    #
+    # error - an Exception (its message is stored) or any object (its to_s is stored).
+    def record_batch_errors(batch_samples, error, per_sample_latency)
+      error_message = error.is_a?(Exception) ? error.message : error.to_s
+
+      batch_samples.each do |sample|
+        expected_name, expected_url = expected_merchant(sample)
+
+        record_result(
+          sample: sample,
+          actual_output: { "error" => error_message },
+          correct: false,
+          exact_match: false,
+          fuzzy_score: 0.0,
+          # Same definition as the success path: nothing expected at all.
+          null_expected: expected_name.nil? && expected_url.nil?,
+          null_returned: true,
+          latency_ms: per_sample_latency,
+          metadata: { "error" => error_message }
+        )
+      end
+    end
+
+    # Folds nil, the literal string "null" and blank strings into nil;
+    # any other value is returned unchanged.
+    def normalize_null(value)
+      return nil if value.nil? || value == "null"
+      return nil if value.to_s.strip.empty?
+
+      value
+    end
+
+    # True when both names are nil, or when both are present and their
+    # similarity reaches FUZZY_MATCH_THRESHOLD. A nil on only one side is a
+    # false positive / false negative and never matches.
+    def evaluate_name_match(actual, expected)
+      return true if actual.nil? && expected.nil?
+
+      # fuzzy_match? already rejects a nil on either side
+      fuzzy_match?(actual, expected)
+    end
+
+    # True when both URLs are nil, or when both are present and equal after
+    # normalization. A nil on only one side never matches.
+    def evaluate_url_match(actual, expected)
+      return true if actual.nil? && expected.nil?
+      return false if actual.nil? || expected.nil?
+
+      normalize_url(actual) == normalize_url(expected)
+    end
+
+    # Canonical form used for URL comparison: surrounding whitespace removed,
+    # lowercased, without scheme / "www." prefix and without one trailing "/".
+    # Returns nil for nil.
+    def normalize_url(url)
+      return nil if url.nil?
+
+      # Strip first so padding cannot hide the prefix or the trailing slash.
+      url.to_s.strip.downcase.sub(URL_PREFIX, "").chomp("/")
+    end
+
+    # Whether two non-nil names are similar enough to count as the same merchant.
+    def fuzzy_match?(actual, expected)
+      return false if actual.nil? || expected.nil?
+
+      calculate_fuzzy_score(actual, expected) >= FUZZY_MATCH_THRESHOLD
+    end
+
+    # Similarity between two names in 0.0..1.0, rounded to 4 decimals.
+    # Identical values (including nil/nil) score 1.0; a nil on one side scores
+    # 0.0; otherwise the score is 1 - levenshtein / longer length, computed on
+    # the downcased, stripped strings.
+    def calculate_fuzzy_score(actual, expected)
+      return 1.0 if actual == expected
+      return 0.0 if actual.nil? || expected.nil?
+
+      a = actual.to_s.downcase.strip
+      b = expected.to_s.downcase.strip
+
+      return 1.0 if a == b
+
+      # a != b here, so the longer string is non-empty and the division is safe.
+      max_length = [ a.length, b.length ].max
+
+      (1.0 - (levenshtein_distance(a, b).to_f / max_length)).round(4)
+    end
+
+    # Levenshtein edit distance between two strings (insertions, deletions and
+    # substitutions all cost 1). Keeps only two rows of the distance table.
+    def levenshtein_distance(s1, s2)
+      # Character arrays avoid repeated String#[] lookups on multibyte text.
+      source = s1.chars
+      target = s2.chars
+
+      return source.length if target.empty?
+      return target.length if source.empty?
+
+      # previous[j] = distance between the processed source prefix and target[0, j]
+      previous = (0..target.length).to_a
+
+      source.each_with_index do |source_char, i|
+        current = [ i + 1 ]
+
+        target.each_with_index do |target_char, j|
+          cost = source_char == target_char ? 0 : 1
+          current << [
+            previous[j + 1] + 1, # deletion
+            current[j] + 1,      # insertion
+            previous[j] + cost   # substitution
+          ].min
+        end
+
+        previous = current
+      end
+
+      previous.last
+    end
+end