# sure/app/models/eval/runners/merchant_detection_runner.rb

# Eval runner for merchant detection.
#
# Samples are sent to the provider in batches of BATCH_SIZE via
# `provider.auto_detect_merchants`. For every sample the detected business
# name / URL is compared against the sample's expected values and one result
# is stored through `record_result`.
#
# `samples`, `provider`, `model`, `eval_run`, `log_progress` and
# `record_result` are not defined here; presumably they are supplied by
# Eval::Runners::Base.
class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base
  BATCH_SIZE = 25  # Matches Provider::Openai limit
  FUZZY_MATCH_THRESHOLD = 0.8

  protected

    # Runs every sample through the provider, one batch at a time.
    def process_samples
      all_samples = samples.to_a
      total_batches = (all_samples.size.to_f / BATCH_SIZE).ceil
      log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}")

      all_samples.each_slice(BATCH_SIZE).with_index(1) do |batch, batch_number|
        log_progress("Processing batch #{batch_number}/#{total_batches}")
        process_batch(batch)
      end
    end

    # Delegates metric calculation for this run to MerchantDetectionMetrics.
    def calculate_metrics
      Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate
    end

  private

    # Sends one batch to the provider and records a result for each sample.
    #
    # A failed response or a raised error marks every sample in the batch as
    # failed. Latency is the batch round-trip time divided evenly (integer
    # division) across the samples in the batch.
    #
    # NOTE(review): errors raised while *recording* results are caught by the
    # same rescue as provider errors, so samples recorded before the failure
    # could receive a second (error) result - confirm whether that is intended.
    def process_batch(batch_samples)
      # Build inputs for the provider
      transactions = batch_samples.map do |sample|
        sample.to_transaction_input.merge(id: sample.id)
      end

      # Get merchants from first sample's context (should be shared)
      # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys
      merchants = batch_samples.first.merchants_context.map(&:deep_symbolize_keys)

      started_at = monotonic_now

      begin
        response = provider.auto_detect_merchants(
          transactions: transactions,
          user_merchants: merchants,
          model: model
        )

        per_sample_latency = per_sample_latency_ms(started_at, batch_samples.size)

        if response.success?
          record_batch_results(batch_samples, response.data, per_sample_latency)
        else
          record_batch_errors(batch_samples, response.error, per_sample_latency)
        end
      rescue => e
        record_batch_errors(batch_samples, e, per_sample_latency_ms(started_at, batch_samples.size))
      end
    end

    # Current reading of the monotonic clock, in seconds. Used for durations
    # because it is unaffected by wall-clock adjustments.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    # Whole milliseconds elapsed since +started_at+ (a monotonic_now reading),
    # split evenly across +sample_count+ samples using integer division.
    def per_sample_latency_ms(started_at, sample_count)
      elapsed_ms = ((monotonic_now - started_at) * 1000).to_i
      elapsed_ms / sample_count
    end

    # Compares each sample's expected merchant with the detection returned for
    # it and records one result per sample. A sample without a matching
    # detection is treated as if the provider returned nothing (nil name/URL).
    def record_batch_results(batch_samples, merchants_detected, per_sample_latency)
      # Index detections once per batch; `||=` keeps the first detection for a
      # given transaction id when the provider returns duplicates.
      detections_by_id = merchants_detected.each_with_object({}) do |detection, index|
        index[detection.transaction_id.to_s] ||= detection
      end

      batch_samples.each do |sample|
        detection = detections_by_id[sample.id.to_s]

        actual_name = normalize_null(detection&.business_name)
        actual_url = normalize_null(detection&.business_url)

        expected_name = sample.expected_business_name
        expected_url = sample.expected_business_url

        # Evaluate correctness
        name_match = evaluate_name_match(actual_name, expected_name)
        url_match = evaluate_url_match(actual_url, expected_url)
        fuzzy_score = calculate_fuzzy_score(actual_name, expected_name)

        # Overall correct if both name and URL match expectations
        correct = name_match && url_match

        # Exact match requires both to be exactly equal
        exact_match = actual_name == expected_name && normalize_url(actual_url) == normalize_url(expected_url)

        record_result(
          sample: sample,
          actual_output: { "business_name" => actual_name, "business_url" => actual_url },
          correct: correct,
          exact_match: exact_match,
          fuzzy_score: fuzzy_score,
          null_expected: expected_name.nil? && expected_url.nil?,
          null_returned: actual_name.nil? && actual_url.nil?,
          latency_ms: per_sample_latency
        )
      end
    end

    # Records a failed result for every sample in the batch.
    #
    # +error+ may be an Exception or any object responding to #to_s (e.g. the
    # provider response's error value).
    def record_batch_errors(batch_samples, error, per_sample_latency)
      error_message = error.is_a?(Exception) ? error.message : error.to_s

      batch_samples.each do |sample|
        record_result(
          sample: sample,
          actual_output: { "error" => error_message },
          correct: false,
          exact_match: false,
          fuzzy_score: 0.0,
          # Same definition as in record_batch_results: nothing is expected
          # only when both the name and the URL are nil.
          null_expected: sample.expected_business_name.nil? && sample.expected_business_url.nil?,
          null_returned: true,
          latency_ms: per_sample_latency,
          metadata: { "error" => error_message }
        )
      end
    end

    # Maps nil, the literal string "null" and blank values to nil; any other
    # value is returned unchanged.
    def normalize_null(value)
      return nil if value.nil?
      return nil if value == "null"
      return nil if value.to_s.strip.empty?
      value
    end

    # True when the detected name agrees with the expected one: both nil, or
    # both present and similar enough per FUZZY_MATCH_THRESHOLD.
    def evaluate_name_match(actual, expected)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = false positive
      return false if expected.nil? && actual.present?
      # Expected value but got null = false negative
      return false if actual.nil? && expected.present?
      # Use fuzzy matching for name comparison
      fuzzy_match?(actual, expected)
    end

    # True when the detected URL agrees with the expected one: both nil, or
    # equal after normalize_url.
    def evaluate_url_match(actual, expected)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = false positive
      return false if expected.nil? && actual.present?
      # Expected value but got null = false negative
      return false if actual.nil? && expected.present?
      # Normalize and compare URLs
      normalize_url(actual) == normalize_url(expected)
    end

    # Canonical form of a URL for comparison: surrounding whitespace removed,
    # lowercased, leading "http(s)://" and "www." dropped, and one trailing
    # "/" removed. Returns nil for nil.
    #
    # Whitespace is stripped first so that it cannot shield the scheme prefix
    # or the trailing slash from removal.
    def normalize_url(url)
      return nil if url.nil?
      url.to_s.strip.downcase
         .sub(%r{\A(https?://)?(www\.)?}, "")
         .chomp("/")
    end

    # True when both values are present and their similarity score reaches
    # FUZZY_MATCH_THRESHOLD.
    def fuzzy_match?(actual, expected)
      return false if actual.nil? || expected.nil?
      calculate_fuzzy_score(actual, expected) >= FUZZY_MATCH_THRESHOLD
    end

    # Similarity between two names as a Float in 0.0..1.0 (rounded to 4
    # decimals). Identical values (including nil/nil) score 1.0; exactly one
    # nil scores 0.0. Otherwise the score is 1 - (edit distance / longer
    # length), computed on the lowercased, stripped strings.
    def calculate_fuzzy_score(actual, expected)
      return 1.0 if actual == expected
      return 0.0 if actual.nil? || expected.nil?

      # Simple Levenshtein distance-based similarity
      # Normalize strings for comparison
      a = actual.to_s.downcase.strip
      b = expected.to_s.downcase.strip

      return 1.0 if a == b

      # Calculate Levenshtein distance
      distance = levenshtein_distance(a, b)
      max_length = [ a.length, b.length ].max

      return 0.0 if max_length == 0

      # Convert distance to similarity score (0.0 to 1.0)
      (1.0 - (distance.to_f / max_length)).round(4)
    end

    # Levenshtein (edit) distance between two strings: the minimum number of
    # single-character insertions, deletions and substitutions turning +s1+
    # into +s2+.
    def levenshtein_distance(s1, s2)
      m = s1.length
      n = s2.length

      return m if n == 0
      return n if m == 0

      # Compare pre-split character arrays rather than indexing the strings
      # on every cell of the matrix.
      chars1 = s1.chars
      chars2 = s2.chars

      # Create distance matrix
      d = Array.new(m + 1) { Array.new(n + 1) }

      (0..m).each { |i| d[i][0] = i }
      (0..n).each { |j| d[0][j] = j }

      (1..n).each do |j|
        (1..m).each do |i|
          cost = chars1[i - 1] == chars2[j - 1] ? 0 : 1
          d[i][j] = [
            d[i - 1][j] + 1,      # deletion
            d[i][j - 1] + 1,      # insertion
            d[i - 1][j - 1] + cost # substitution
          ].min
        end
      end

      d[m][n]
    end
end