# sure/app/models/eval/runners/chat_runner.rb

class Eval::Runners::ChatRunner < Eval::Runners::Base
  # Chat samples are processed one at a time (not batched)
  # because each has unique context and function calling requirements

  protected

    # Runs every sample of this eval run sequentially, logging the overall
    # count first and a "current/total" progress line before each sample.
    #
    # @return [Array] the materialized list of samples that was iterated
    def process_samples
      batch = samples.to_a
      total = batch.size
      log_progress("Processing #{total} chat samples")

      # with_index(1) yields 1-based positions, matching the human-readable log line.
      batch.each.with_index(1) do |sample, position|
        log_progress("Processing sample #{position}/#{total}")
        process_sample(sample)
      end
    end

    # Delegates metric computation for this eval run to Eval::Metrics::ChatMetrics.
    #
    # @return whatever ChatMetrics#calculate returns
    def calculate_metrics
      metrics = Eval::Metrics::ChatMetrics.new(eval_run)
      metrics.calculate
    end

  private

    # Sends one sample's prompt to the provider and records either the chat
    # result or an error result, together with the elapsed time in milliseconds.
    #
    # Latency is measured with the monotonic clock so that wall-clock
    # adjustments during the request cannot produce negative or inflated values.
    #
    # @param sample [#chat_prompt] the eval sample to run
    # @return the value returned by record_chat_result / record_error_result
    def process_sample(sample)
      prompt = sample.chat_prompt
      started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)

      begin
        response = provider.chat_response(
          prompt,
          model: model,
          instructions: build_instructions,
          functions: build_function_definitions
        )

        latency_ms = elapsed_ms_since(started_at)

        if response.success?
          record_chat_result(sample, response.data, latency_ms)
        else
          record_error_result(sample, response.error, latency_ms)
        end
      rescue => e
        # Any failure while calling the provider or scoring the response is
        # recorded against the sample instead of aborting the run.
        record_error_result(sample, e, elapsed_ms_since(started_at))
      end
    end

    # Whole milliseconds elapsed since a monotonic float-millisecond timestamp.
    def elapsed_ms_since(started_at)
      (Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - started_at).to_i
    end

    # Scores a successful chat response against the sample's expectations and
    # persists the outcome through record_result.
    #
    # A sample counts as correct only when the function-call evaluation is
    # correct AND every expected keyword is present in the response text.
    #
    # @param sample [#expected_functions, #expected_response_contains]
    # @param chat_response the provider's response payload
    # @param latency_ms [Integer] elapsed time of the provider call
    # @return the value returned by record_result
    def record_chat_result(sample, chat_response, latency_ms)
      called_functions = extract_functions_from_response(chat_response)
      text = extract_response_text(chat_response)

      wanted_functions = sample.expected_functions
      function_report = evaluate_function_match(called_functions, wanted_functions)

      wanted_keywords = sample.expected_response_contains
      keywords_found = evaluate_response_contains(text, wanted_keywords)

      output = {
        "functions" => called_functions,
        "response_text" => text,
        "function_match_details" => function_report
      }

      details = {
        "function_selection_correct" => function_report[:selection_correct],
        "parameter_accuracy" => function_report[:parameter_accuracy],
        "response_keywords_found" => keywords_found,
        "expected_functions" => wanted_functions,
        "expected_keywords" => wanted_keywords
      }

      record_result(
        sample: sample,
        actual_output: output,
        correct: function_report[:correct] && keywords_found,
        exact_match: function_report[:exact_match],
        latency_ms: latency_ms,
        metadata: details
      )
    end

    # Records a failed sample. The error text is stored both as the actual
    # output and in the metadata; the sample is marked incorrect.
    #
    # @param sample the eval sample that failed
    # @param error [Exception, #to_s] an exception (its message is used) or any
    #   other error object (its string form is used)
    # @param latency_ms [Integer] elapsed time before the failure was observed
    # @return the value returned by record_result
    def record_error_result(sample, error, latency_ms)
      message =
        case error
        when Exception then error.message
        else error.to_s
        end

      record_result(
        sample: sample,
        actual_output: { "error" => message },
        correct: false,
        exact_match: false,
        latency_ms: latency_ms,
        metadata: { "error" => message }
      )
    end

    # Converts the response's function requests into plain hashes with
    # "name" and "params" keys; a missing request list yields an empty array.
    #
    # @param chat_response [#function_requests]
    # @return [Array<Hash>]
    def extract_functions_from_response(chat_response)
      requests = chat_response.function_requests
      return [] unless requests

      requests.map do |request|
        { "name" => request.function_name, "params" => parse_function_args(request.function_args) }
      end
    end

    # Normalizes raw function-call arguments into a Hash.
    #
    # Hashes are returned as-is; anything else is parsed as JSON. The result
    # is always a Hash: nil, malformed JSON, input that cannot be treated as a
    # string, and valid JSON that is not an object (e.g. "[1,2]" or "null")
    # all yield an empty Hash, so parameter scoring never receives a non-Hash.
    #
    # @param args [Hash, String, nil]
    # @return [Hash]
    def parse_function_args(args)
      return {} if args.nil?
      return args if args.is_a?(Hash)

      parsed = JSON.parse(args)
      parsed.is_a?(Hash) ? parsed : {}
    rescue JSON::ParserError, TypeError
      # TypeError: args has no implicit String conversion (e.g. an Integer).
      {}
    end

    # Joins the output text of every message in the response with newlines,
    # skipping messages whose output text is nil.
    #
    # @param chat_response [#messages]
    # @return [String] empty when there are no messages
    def extract_response_text(chat_response)
      texts = (chat_response.messages || []).map { |message| message.output_text }
      texts.reject(&:nil?).join("\n")
    end

    # Scores the functions the model called against the functions a sample expects.
    #
    # * :selection_correct / :correct - every expected function name was called
    #   (parameters do not have to match).
    # * :parameter_accuracy - mean parameter score over the expected functions;
    #   an expected function that was not called scores 0.0.
    # * :exact_match - every expected function was called, no function outside
    #   the expected set was called, and all parameters matched.
    #
    # When nothing is expected, the result is all-true only if nothing was called.
    # nil for either argument is treated as an empty list.
    #
    # @param actual_functions [Array<Hash>, nil] calls with "name" and "params" keys
    # @param expected_functions [Array<Hash>, nil] expectations with "name" and optional "params"
    # @return [Hash] :correct, :exact_match, :selection_correct, :parameter_accuracy
    def evaluate_function_match(actual_functions, expected_functions)
      actual_functions ||= []
      expected_functions ||= []

      if expected_functions.empty?
        # No call expected: any call at all is a wrong call.
        nothing_called = actual_functions.empty?
        return {
          correct: nothing_called,
          exact_match: nothing_called,
          selection_correct: nothing_called,
          parameter_accuracy: nothing_called ? 1.0 : 0.0
        }
      end

      expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact
      actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact

      selection_correct = expected_names.all? { |name| actual_names.include?(name) }

      # One score per expected function, compared against the first actual call with the same name.
      param_scores = expected_functions.map do |expected_func|
        expected_name = normalize_function_name(expected_func["name"])
        actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name }

        actual_func ? evaluate_parameters(actual_func["params"], expected_func["params"] || {}) : 0.0
      end

      # expected_functions is non-empty here, so param_scores is never empty.
      parameter_accuracy = (param_scores.sum / param_scores.size).round(4)

      # "Same functions" also means the model made no calls outside the expected set.
      unexpected_calls = actual_names - expected_names
      exact_match = selection_correct && unexpected_calls.empty? && parameter_accuracy == 1.0

      {
        correct: selection_correct,
        exact_match: exact_match,
        selection_correct: selection_correct,
        parameter_accuracy: parameter_accuracy
      }
    end

    # Canonical form of a function name for comparison: its string form run
    # through `underscore` and then downcased. nil stays nil.
    #
    # @param name [#to_s, nil]
    # @return [String, nil]
    def normalize_function_name(name)
      name.to_s.underscore.downcase unless name.nil?
    end

    # Fraction of expected parameters whose value is matched by the actual
    # parameters, rounded to 4 decimals. Keys are compared as strings.
    #
    # @param actual_params [Hash, nil] parameters the model supplied
    # @param expected_params [Hash] parameters the sample expects
    # @return [Float] 1.0 when nothing is expected, 0.0 when nothing was supplied
    def evaluate_parameters(actual_params, expected_params)
      if expected_params.empty?
        1.0
      elsif actual_params.nil?
        0.0
      else
        given = actual_params.stringify_keys
        wanted = expected_params.stringify_keys

        hits = wanted.count { |key, wanted_value| values_match?(given[key], wanted_value) }
        (hits.to_f / wanted.size).round(4)
      end
    end

    # Lenient comparison of an actual parameter value against an expected one.
    #
    # Values match when any of the following holds:
    # * they are equal, or equal as case-insensitive strings;
    # * both are arrays and every expected element appears in the actual
    #   array (case-insensitive string comparison, extra elements allowed);
    # * the expected value is exactly a YYYY-MM-DD string and the actual value
    #   parses to the same calendar date.
    #
    # @param actual [Object] value supplied by the model
    # @param expected [Object] value the sample expects
    # @return [Boolean]
    def values_match?(actual, expected)
      return true if actual == expected
      return true if actual.to_s.downcase == expected.to_s.downcase

      if expected.is_a?(Array) && actual.is_a?(Array)
        actual_normalized = actual.map { |v| v.to_s.downcase }
        return expected.all? { |v| actual_normalized.include?(v.to_s.downcase) }
      end

      # \A/\z anchor the whole string; ^/$ would accept a multi-line value
      # whose first line merely looks like a date.
      return false unless expected.to_s.match?(/\A\d{4}-\d{2}-\d{2}\z/)

      begin
        Date.parse(expected.to_s) == Date.parse(actual.to_s)
      rescue StandardError
        # Either side is not a parseable date, so it cannot be a date match.
        false
      end
    end

    # Checks that the response text contains every expected keyword,
    # ignoring case.
    #
    # @param response_text [String, nil]
    # @param expected_keywords [Array<#to_s>]
    # @return [Boolean] true when no keywords are expected; false when
    #   keywords are expected but the response text is nil or empty
    def evaluate_response_contains(response_text, expected_keywords)
      return true if expected_keywords.empty?
      return false if response_text.nil? || response_text.empty?

      haystack = response_text.downcase
      needles = expected_keywords.map { |keyword| keyword.to_s.downcase }

      needles.all? { |needle| haystack.include?(needle) }
    end

    # Minimal system prompt used for evaluation runs (there is no real
    # user/family context here); it embeds the current date.
    #
    # @return [String] three newline-terminated lines
    def build_instructions
      today = Date.current

      <<~PROMPT
        You are a financial assistant helping users understand their financial data.
        Use the functions available to answer questions about accounts, transactions, and financial statements.
        Today's date is #{today}.
      PROMPT
    end

    # Function definitions exposed to the model during evaluation, mirroring
    # what the chat would normally have.
    #
    # @return [Array<Hash>] one definition per function, in declaration order
    def build_function_definitions
      descriptions_by_name = {
        "get_transactions" => "Get paginated transactions with optional filters",
        "get_accounts" => "Get all accounts with balances and historical data",
        "get_balance_sheet" => "Get current net worth, assets, and liabilities",
        "get_income_statement" => "Get income and expenses by category for a period"
      }

      descriptions_by_name.map do |function_name, summary|
        build_function_definition(function_name, summary)
      end
    end

    # Builds one non-strict function definition whose parameter schema is an
    # open object (no declared properties, additional properties allowed).
    #
    # @param name [String] function name
    # @param description [String] human-readable summary
    # @return [Hash] with :name, :description, :params_schema and :strict
    def build_function_definition(name, description)
      open_schema = { type: "object", properties: {}, additionalProperties: true }

      { name: name, description: description, params_schema: open_schema, strict: false }
    end
end