# A named, versioned collection of evaluation samples for one eval type
# ("categorization", "merchant_detection" or "chat"), stored in "eval_datasets".
class Eval::Dataset < ApplicationRecord
  self.table_name = "eval_datasets"

  has_many :samples, class_name: "Eval::Sample", foreign_key: :eval_dataset_id, dependent: :destroy
  has_many :runs, class_name: "Eval::Run", foreign_key: :eval_dataset_id, dependent: :destroy

  validates :name, presence: true, uniqueness: true
  validates :eval_type, presence: true, inclusion: { in: %w[categorization merchant_detection chat] }
  validates :version, presence: true

  scope :active, -> { where(active: true) }
  scope :for_categorization, -> { where(eval_type: "categorization") }
  scope :for_merchant_detection, -> { where(eval_type: "merchant_detection") }
  scope :for_chat, -> { where(eval_type: "chat") }

  # Import dataset from a YAML file.
  #
  # Finds or creates the dataset by its "name" key, overwrites its attributes,
  # deletes every existing sample and recreates them from the "samples" list.
  # Everything runs in one transaction.
  #
  # @param file_path [String] path of the YAML file to load
  # @return [Eval::Dataset] the saved dataset
  def self.import_from_yaml(file_path)
    data = YAML.load_file(file_path, permitted_classes: [ Symbol, Date, Time ])

    transaction do
      dataset = find_or_initialize_by(name: data["name"])
      dataset.assign_attributes(
        description: data["description"],
        eval_type: data["eval_type"],
        version: data["version"] || "1.0",
        metadata: data["metadata"] || {},
        active: true
      )
      dataset.save!

      # Clear existing samples if reimporting
      dataset.samples.destroy_all

      # Shared context for all samples; a sample's own "context" takes precedence
      shared_context = data["context"] || {}

      # Import samples
      samples_data = data["samples"] || []
      samples_data.each do |sample_data|
        dataset.samples.create!(
          input_data: sample_data["input"],
          expected_output: sample_data["expected"],
          context_data: sample_data["context"] || shared_context,
          difficulty: sample_data["difficulty"] || "medium",
          tags: sample_data["tags"] || [],
          metadata: sample_data["metadata"] || {}
        )
      end

      dataset.update!(sample_count: dataset.samples.count)
      dataset
    end
  end

  # Export dataset to YAML format.
  #
  # The first sample's context is written once as the shared top-level
  # "context". A sample whose context differs from it also gets its own
  # "context" entry, which import_from_yaml reads back, so differing contexts
  # are no longer lost on an export/import round trip.
  #
  # @return [String] YAML document
  def export_to_yaml
    shared_context = samples.first&.context_data || {}

    {
      "name" => name,
      "description" => description,
      "eval_type" => eval_type,
      "version" => version,
      "metadata" => metadata,
      "context" => shared_context,
      "samples" => samples.map do |sample|
        own_context = sample.context_data

        {
          "id" => sample.id,
          "difficulty" => sample.difficulty,
          "tags" => sample.tags,
          "input" => sample.input_data,
          "expected" => sample.expected_output,
          # nil when identical to the shared context; dropped by #compact below
          "context" => (own_context unless own_context == shared_context),
          "metadata" => sample.metadata
        }.compact
      end
    }.to_yaml
  end

  # Generate summary statistics: total sample count, counts per difficulty,
  # and tag frequencies sorted from most to least used.
  def statistics
    {
      total_samples: samples.count,
      by_difficulty: samples.group(:difficulty).count,
      by_tags: samples.flat_map(&:tags).tally.sort_by { |_, v| -v }.to_h
    }
  end

  # Get the appropriate runner class for this dataset type.
  # Raises RuntimeError for an eval_type outside the three known values.
  def runner_class
    case eval_type
    when "categorization"
      Eval::Runners::CategorizationRunner
    when "merchant_detection"
      Eval::Runners::MerchantDetectionRunner
    when "chat"
      Eval::Runners::ChatRunner
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  # Get the appropriate metrics class for this dataset type.
  # Raises RuntimeError for an eval_type outside the three known values.
  def metrics_class
    case eval_type
    when "categorization"
      Eval::Metrics::CategorizationMetrics
    when "merchant_detection"
      Eval::Metrics::MerchantDetectionMetrics
    when "chat"
      Eval::Metrics::ChatMetrics
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end
end
# Trace operations
#
# Sends a single "trace-create" event to the ingestion endpoint and returns
# the trace id, which is generated locally before the request is made.
# Attributes left as nil are omitted from the trace body.
def create_trace(name:, input: nil, output: nil, metadata: {}, session_id: nil, user_id: nil)
  SecureRandom.uuid.tap do |trace_id|
    trace_body = {
      id: trace_id,
      name: name,
      input: input,
      output: output,
      metadata: metadata,
      sessionId: session_id,
      userId: user_id
    }.compact

    event = {
      id: SecureRandom.uuid,
      type: "trace-create",
      timestamp: Time.current.iso8601,
      body: trace_body
    }

    post("/ingestion", { batch: [ event ] })
  end
end
# Resolves the API base URL for this client.
#
# Priority: explicit host > LANGFUSE_HOST env > region > LANGFUSE_REGION env > default (eu).
# A host has one trailing "/" removed and "/api/public" appended.
# Regions are matched against BASE_URLS ignoring case and surrounding
# whitespace, so "US" or " us " select the US endpoint. An unknown region
# still falls back to the EU endpoint.
#
# @param region [String, Symbol, nil] region name, e.g. "us" or :eu
# @param host [String, nil] base URL of a custom host
# @return [String] base URL ending in "/api/public"
def determine_base_url(region, host)
  custom_host = [ host, ENV["LANGFUSE_HOST"] ].find(&:present?)
  return "#{custom_host.chomp('/')}/api/public" if custom_host

  requested_region = [ region, ENV["LANGFUSE_REGION"] ].find(&:present?)
  region_name = requested_region.to_s.strip.downcase

  # Compare as strings instead of calling to_sym on caller/env supplied input.
  region_key = BASE_URLS.keys.find { |key| key.to_s == region_name }

  BASE_URLS.fetch(region_key, BASE_URLS[:eu])
end
# Sends +request+ to +uri+ with HTTP basic auth and interprets the response.
#
# * 2xx and 409 return the parsed JSON body ({} when the body is not JSON).
#   409 is treated as success so that create calls stay idempotent.
# * 401, 404 and any other status raise ApiError carrying status and body.
# * 429 sleeps, then retries with a rebuilt request until +retries+ is used
#   up, after which ApiError is raised.
#
# TLS is enabled only for https URIs, so a plain http:// host is reachable
# instead of failing the handshake.
#
# @param uri [URI] target URI
# @param request [Net::HTTPRequest] request to send
# @param retries [Integer] remaining retries for rate-limited responses
# @return [Hash, Array] parsed JSON body
def execute_request(uri, request, retries: 3)
  request.basic_auth(@public_key, @secret_key)

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == "https"
  http.read_timeout = 30
  http.open_timeout = 10
  http.verify_mode = OpenSSL::SSL::VERIFY_PEER

  # Workaround for OpenSSL 3.x CRL checking issues: tolerate ONLY the
  # "unable to get CRL" failure. Every other verification error (expired,
  # self-signed, untrusted issuer, ...) must still reject the certificate;
  # returning true unconditionally would disable verification entirely.
  if OpenSSL::OPENSSL_VERSION_NUMBER >= 0x30000000
    http.verify_callback = lambda do |preverify_ok, store_ctx|
      preverify_ok || store_ctx.error == OpenSSL::X509::V_ERR_UNABLE_TO_GET_CRL
    end
  end

  response = http.request(request)

  case response.code.to_i
  when 200..299, 409
    parse_json_body(response.body)
  when 401
    raise ApiError.new("Unauthorized - check your Langfuse API keys", status: 401, body: response.body)
  when 404
    raise ApiError.new("Resource not found", status: 404, body: response.body)
  when 429
    if retries > 0
      retry_after = retry_delay(response, retries)
      Rails.logger.info("[Langfuse] Rate limited, waiting #{retry_after}s before retry...")
      sleep(retry_after)
      execute_request(uri, rebuild_request(request), retries: retries - 1)
    else
      raise ApiError.new("Rate limit exceeded after retries", status: 429, body: response.body)
    end
  else
    raise ApiError.new("API error: #{response.code} - #{response.body}", status: response.code.to_i, body: response.body)
  end
end

# Parses a response body as JSON; returns {} for an empty, nil or non-JSON body.
def parse_json_body(body)
  JSON.parse(body)
rescue JSON::ParserError, TypeError
  {}
end

# Seconds to wait before retrying a 429: the Retry-After header when it is a
# positive integer, otherwise exponential backoff from the retries left.
# A missing, zero or non-numeric header (e.g. an HTTP date) uses the backoff
# instead of becoming an immediate retry.
def retry_delay(response, retries)
  header_seconds = Integer(response["Retry-After"].to_s.strip, 10, exception: false)
  return header_seconds if header_seconds&.positive?

  2**(3 - retries)
end
# Pushes the dataset to Langfuse: creates the remote dataset first, then
# exports every sample as a dataset item, logging before and after.
#
# @return [Hash] { dataset_name:, items_exported: }
def export
  log = Rails.logger
  log.info("[Langfuse] Exporting dataset '#{dataset.name}' to Langfuse...")

  create_langfuse_dataset
  item_total = export_samples
  target_name = langfuse_dataset_name

  log.info("[Langfuse] Exported #{item_total} items to dataset '#{target_name}'")

  { dataset_name: target_name, items_exported: item_total }
end
# Builds the "input" payload of a Langfuse dataset item for +sample+.
# The shape depends on the dataset's eval_type; an unrecognised type falls
# back to the sample's raw input_data.
def build_input(sample)
  kind = dataset.eval_type

  return { transaction: sample.input_data, categories: sample.categories_context } if kind == "categorization"
  return { transaction: sample.input_data, merchants: sample.merchants_context } if kind == "merchant_detection"
  return { prompt: sample.chat_prompt, mock_data: sample.mock_data } if kind == "chat"

  sample.input_data
end
# Fetches every item of the Langfuse dataset, one page at a time.
#
# Paging stops at the first page that holds fewer than +page_size+ items.
# A response without a "data" key counts as an empty page. The same value is
# used for the request limit and the stop condition so the two cannot drift
# apart.
#
# @param page_size [Integer] items requested per page (must be positive)
# @return [Array<Hash>] all dataset items in page order
# @raise [ArgumentError] if page_size is not positive, which would never terminate
def fetch_langfuse_items(page_size: 50)
  raise ArgumentError, "page_size must be positive" unless page_size.positive?

  items = []
  page = 1

  loop do
    response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: page_size)
    batch = response["data"] || []
    items.concat(batch)

    break if batch.size < page_size

    page += 1
  end

  Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
  items
end
# Categorizes one batch of Langfuse dataset items with a single LLM call and
# scores every item against its expected category.
#
# The category list is read from the first item's input only. For each item
# a trace and a score are recorded in Langfuse. The trace carries the latency
# of the whole batch call, while the returned result carries that latency
# divided by the batch size. A failed response or any raised error is turned
# into per-item failures through handle_batch_error.
#
# @param items [Array<Hash>] dataset items with "id", "input", "expectedOutput"
# @return [Array<Hash>] one result per item (item_id, expected, actual, correct, latency_ms)
def process_categorization_batch(items)
  transactions = items.map do |item|
    input = item["input"]
    txn = input["transaction"] || input
    txn.deep_symbolize_keys.merge(id: item["id"])
  end

  categories = (items.first.dig("input", "categories") || []).map(&:deep_symbolize_keys)

  # Determine effective JSON mode for this batch
  # If the batch has many expected nulls, force strict mode to prevent false retries
  effective_json_mode = json_mode_for_batch(items)

  # Monotonic clock: a duration must not be affected by wall-clock adjustments.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)

  response = llm_provider.auto_categorize(
    transactions: transactions,
    user_categories: categories,
    model: model,
    json_mode: effective_json_mode
  )

  latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - started_at

  return handle_batch_error(items, response.error) unless response.success?

  # Index once instead of a linear search per item; ||= keeps the first entry
  # for a transaction id, which is what a sequential find would return.
  by_transaction = response.data.each_with_object({}) do |categorization, index|
    index[categorization.transaction_id.to_s] ||= categorization
  end

  items.map do |item|
    categorization = by_transaction[item["id"].to_s]
    actual_category = normalize_null(categorization&.category_name)
    expected_category = item.dig("expectedOutput", "category_name")

    correct = actual_category == expected_category
    score_value = correct ? 1.0 : 0.0

    # Create trace and score in Langfuse
    trace_id = create_trace_for_item(item, actual_category, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)

    {
      item_id: item["id"],
      expected: expected_category,
      actual: actual_category,
      correct: correct,
      latency_ms: latency_ms / items.size
    }
  end
rescue => e
  handle_batch_error(items, e)
end
# Evaluates chat items sequentially (one provider call per item, because of
# function calling complexity) and returns the per-item results in input order.
def process_chat_batch(items)
  items.each_with_object([]) do |item, outcomes|
    outcomes << process_chat_item(item)
  end
end
# Records a Langfuse trace for one evaluated dataset item and returns the
# trace id. The trace is named "<eval_type>_eval" and its metadata holds the
# run name, model, latency and dataset item id.
def create_trace_for_item(item, output, latency_ms)
  item_id = item["id"]
  trace_metadata = {
    run_name: @run_name,
    model: model,
    latency_ms: latency_ms,
    dataset_item_id: item_id
  }

  client.create_trace(
    name: "#{dataset.eval_type}_eval",
    input: item["input"],
    output: output,
    metadata: trace_metadata
  ).tap do |trace_id|
    Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item_id}")
  end
end
# Summarises experiment results.
#
# @param results [Array<Hash>] entries with :correct and :latency_ms
# @return [Hash] accuracy (percent, 2 decimals), total, correct, incorrect and
#   avg_latency_ms (rounded); all zero for an empty result list, which also
#   avoids dividing by zero
def calculate_metrics(results)
  sample_total = results.size
  return { accuracy: 0.0, total: 0, correct: 0, incorrect: 0, avg_latency_ms: 0 } if sample_total.zero?

  hits = results.count { |entry| entry[:correct] }
  latency_sum = results.sum { |entry| entry[:latency_ms] }

  {
    accuracy: (hits.to_f / sample_total * 100).round(2),
    total: sample_total,
    correct: hits,
    incorrect: sample_total - hits,
    avg_latency_ms: (latency_sum / sample_total.to_f).round(0)
  }
end
# Canonicalises a URL for comparison: trims surrounding whitespace, lowercases,
# removes a leading "http://" or "https://" and a leading "www.", and drops one
# trailing "/".
#
# Whitespace is trimmed first; trimming last would let a leading space block
# the prefix removal and a trailing space block the slash removal. \A anchors
# the pattern to the start of the string rather than the start of any line.
#
# @param url [#to_s, nil]
# @return [String, nil] nil only when +url+ is nil
def normalize_url(url)
  return nil if url.nil?

  url.to_s
     .strip
     .downcase
     .sub(%r{\A(https?://)?(www\.)?}, "")
     .chomp("/")
end
# Decides whether the functions the model called match the expected ones.
# Only names are compared, as sorted lists, so order is ignored but repeated
# calls count. Entries may use "name" or :name as the key.
#
# @param actual [Array<Hash>] function calls extracted from the response
# @param expected [Array<Hash>] function calls the sample expects
# @return [Boolean]
def evaluate_function_match(actual, expected)
  none_expected = expected.empty?
  none_called = actual.empty?

  # With either side empty, the lists match only if both are empty.
  return none_expected && none_called if none_expected || none_called

  name_of = ->(fn) { fn["name"] || fn[:name] }
  expected.map(&name_of).sort == actual.map(&name_of).sort
end
# Accuracy, in percent, over the results where null was the expected answer:
# the share of those results that actually returned null. Returns 100.0 when
# no result expected null.
def null_accuracy
  expecting_null = results.where(null_expected: true)
  return 100.0 if expecting_null.empty?

  returned_null = expecting_null.where(null_returned: true).count
  expected_total = expecting_null.count

  (returned_null.fdiv(expected_total) * 100).round(2)
end
# Harmonic mean of precision and recall (both already percentages), rounded
# to 2 decimals; 0.0 when either is zero.
#
# precision and recall are each read at most once: both run COUNT queries, so
# repeating them in the guard and in the formula multiplies database work.
# recall is still skipped when precision is zero.
def f1_score
  precision_pct = precision
  return 0.0 if precision_pct.zero?

  recall_pct = recall
  return 0.0 if recall_pct.zero?

  (2 * precision_pct * recall_pct / (precision_pct + recall_pct)).round(2)
end
+ + correct = valid_results.count do |r| + r.metadata.dig("function_selection_correct") == true + end + + (correct.to_f / valid_results.count * 100).round(2) + end + + def parameter_accuracy + # Average parameter accuracy across all samples + valid_results = results.where.not("metadata->>'error' IS NOT NULL") + return 0.0 if valid_results.empty? + + scores = valid_results.map do |r| + r.metadata.dig("parameter_accuracy") || 0.0 + end + + (scores.sum / scores.size * 100).round(2) + end + + def response_relevance + # Percentage of samples where response contained expected keywords + valid_results = results.where.not("metadata->>'error' IS NOT NULL") + return 0.0 if valid_results.empty? + + correct = valid_results.count do |r| + # If no keywords expected, consider it relevant + expected_keywords = r.metadata.dig("expected_keywords") || [] + expected_keywords.empty? || r.metadata.dig("response_keywords_found") == true + end + + (correct.to_f / valid_results.count * 100).round(2) + end + + def exact_match_rate + return 0.0 if total_count.zero? + (results.where(exact_match: true).count.to_f / total_count * 100).round(2) + end + + def error_rate + return 0.0 if total_count.zero? + + errors = results.count do |r| + r.metadata.dig("error").present? || r.actual_output.dig("error").present? + end + + (errors.to_f / total_count * 100).round(2) + end + + def avg_functions_per_response + valid_results = results.where.not("actual_output->>'error' IS NOT NULL") + return 0.0 if valid_results.empty? + + total_functions = valid_results.sum do |r| + functions = r.actual_output.dig("functions") || [] + functions.size + end + + (total_functions.to_f / valid_results.count).round(2) + end + + def metrics_by_function + # Group results by expected function and calculate accuracy + function_metrics = {} + + results.includes(:sample).each do |result| + expected_functions = result.sample.expected_functions + + expected_functions.each do |func| + name = func["name"] + next if name.nil? 
+ + function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 } + function_metrics[name][:total] += 1 + + # Check if this specific function was called correctly + actual_functions = result.actual_output.dig("functions") || [] + if actual_functions.any? { |f| normalize_name(f["name"]) == normalize_name(name) } + function_metrics[name][:correct] += 1 + function_metrics[name][:param_accuracy_sum] += (result.metadata.dig("parameter_accuracy") || 0.0) + end + end + end + + function_metrics.transform_values do |metrics| + { + total: metrics[:total], + correct: metrics[:correct], + accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2), + avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum] / metrics[:correct] * 100).round(2) : 0.0 + } + end + end + + def normalize_name(name) + return nil if name.nil? + name.to_s.underscore.downcase + end +end diff --git a/app/models/eval/metrics/merchant_detection_metrics.rb b/app/models/eval/metrics/merchant_detection_metrics.rb new file mode 100644 index 000000000..d7042d52d --- /dev/null +++ b/app/models/eval/metrics/merchant_detection_metrics.rb @@ -0,0 +1,107 @@ +class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base + FUZZY_MATCH_THRESHOLD = 0.8 + + def calculate + { + accuracy: accuracy, + name_accuracy: name_accuracy, + fuzzy_name_accuracy: fuzzy_name_accuracy, + url_accuracy: url_accuracy, + false_positive_rate: false_positive_rate, + false_negative_rate: false_negative_rate, + samples_processed: total_count, + samples_correct: correct_count, + avg_latency_ms: avg_latency_ms, + total_cost: total_cost, + cost_per_sample: cost_per_sample, + avg_fuzzy_score: avg_fuzzy_score, + by_difficulty: metrics_by_difficulty + } + end + + private + + def name_accuracy + # Exact name match accuracy for non-null expected names + name_results = results.includes(:sample).select do |r| + r.sample.expected_business_name.present? + end + + return 100.0 if name_results.empty? 
+ + correct = name_results.count do |r| + actual = r.actual_output.dig("business_name") || r.actual_output["business_name"] + expected = r.sample.expected_business_name + actual == expected + end + + (correct.to_f / name_results.size * 100).round(2) + end + + def fuzzy_name_accuracy + # Fuzzy name match accuracy (using fuzzy_score >= threshold) + name_results = results.includes(:sample).select do |r| + r.sample.expected_business_name.present? + end + + return 100.0 if name_results.empty? + + correct = name_results.count do |r| + (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD + end + + (correct.to_f / name_results.size * 100).round(2) + end + + def url_accuracy + # URL match accuracy for non-null expected URLs + url_results = results.includes(:sample).select do |r| + r.sample.expected_business_url.present? + end + + return 100.0 if url_results.empty? + + correct = url_results.count do |r| + actual = r.actual_output.dig("business_url") || r.actual_output["business_url"] + expected = r.sample.expected_business_url + normalize_url(actual) == normalize_url(expected) + end + + (correct.to_f / url_results.size * 100).round(2) + end + + def false_positive_rate + # Rate of returning a merchant when null was expected + null_expected_results = results.where(null_expected: true) + return 0.0 if null_expected_results.empty? + + false_positives = null_expected_results.where(null_returned: false).count + + (false_positives.to_f / null_expected_results.count * 100).round(2) + end + + def false_negative_rate + # Rate of returning null when a merchant was expected + merchant_expected_results = results.where(null_expected: false) + return 0.0 if merchant_expected_results.empty? + + false_negatives = merchant_expected_results.where(null_returned: true).count + + (false_negatives.to_f / merchant_expected_results.count * 100).round(2) + end + + def avg_fuzzy_score + scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score) + return nil if scores.empty? 
+ + (scores.sum / scores.size).round(4) + end + + def normalize_url(url) + return nil if url.nil? + url.to_s.downcase + .gsub(/^(https?:\/\/)?(www\.)?/, "") + .chomp("/") + .strip + end +end diff --git a/app/models/eval/reporters/comparison_reporter.rb b/app/models/eval/reporters/comparison_reporter.rb new file mode 100644 index 000000000..eda8b0a68 --- /dev/null +++ b/app/models/eval/reporters/comparison_reporter.rb @@ -0,0 +1,205 @@ +class Eval::Reporters::ComparisonReporter + attr_reader :runs + + def initialize(runs) + @runs = Array(runs).sort_by(&:model) + end + + # Generate a text table for terminal display + def to_table + return "No runs to compare" if runs.empty? + + headers = build_headers + rows = runs.map { |run| build_row(run) } + + # Calculate column widths + all_rows = [ headers ] + rows + widths = headers.each_index.map do |i| + all_rows.map { |row| row[i].to_s.length }.max + end + + # Build table + separator = "+" + widths.map { |w| "-" * (w + 2) }.join("+") + "+" + + lines = [] + lines << separator + lines << "| " + headers.each_with_index.map { |h, i| h.to_s.ljust(widths[i]) }.join(" | ") + " |" + lines << separator + + rows.each do |row| + lines << "| " + row.each_with_index.map { |c, i| c.to_s.ljust(widths[i]) }.join(" | ") + " |" + end + + lines << separator + lines.join("\n") + end + + # Export to CSV file + def to_csv(file_path) + require "csv" + + CSV.open(file_path, "wb") do |csv| + csv << csv_headers + runs.each { |run| csv << csv_row(run) } + end + + file_path + end + + # Generate summary with best model recommendations + def summary + return {} if runs.empty? + + completed_runs = runs.select { |r| r.status == "completed" && r.metrics.present? } + return {} if completed_runs.empty? 
+ + best_accuracy = completed_runs.max_by { |r| r.metrics["accuracy"] || 0 } + lowest_cost = completed_runs.min_by { |r| r.total_cost || Float::INFINITY } + fastest = completed_runs.min_by { |r| r.metrics["avg_latency_ms"] || Float::INFINITY } + + { + best_accuracy: { + model: best_accuracy.model, + value: best_accuracy.metrics["accuracy"], + run_id: best_accuracy.id + }, + lowest_cost: { + model: lowest_cost.model, + value: lowest_cost.total_cost&.to_f, + run_id: lowest_cost.id + }, + fastest: { + model: fastest.model, + value: fastest.metrics["avg_latency_ms"], + run_id: fastest.id + }, + recommendation: generate_recommendation(best_accuracy, lowest_cost, fastest) + } + end + + # Generate detailed comparison between runs + def detailed_comparison + return {} if runs.empty? + + { + runs: runs.map(&:summary), + comparison: pairwise_comparisons, + summary: summary + } + end + + private + + def build_headers + [ "Model", "Status", "Accuracy", "Precision", "Recall", "F1", "Latency (ms)", "Cost ($)", "Samples" ] + end + + def build_row(run) + metrics = run.metrics || {} + + [ + run.model, + run.status, + format_percentage(metrics["accuracy"]), + format_percentage(metrics["precision"]), + format_percentage(metrics["recall"]), + format_percentage(metrics["f1_score"]), + metrics["avg_latency_ms"]&.round(0) || "-", + format_cost(run.total_cost), + run.results.count + ] + end + + def csv_headers + [ + "Run ID", "Model", "Provider", "Dataset", "Status", + "Accuracy", "Precision", "Recall", "F1 Score", + "Null Accuracy", "Hierarchical Accuracy", + "Avg Latency (ms)", "Total Cost", "Cost Per Sample", + "Samples Processed", "Samples Correct", + "Duration (s)", "Run Date" + ] + end + + def csv_row(run) + metrics = run.metrics || {} + + [ + run.id, + run.model, + run.provider, + run.dataset.name, + run.status, + metrics["accuracy"], + metrics["precision"], + metrics["recall"], + metrics["f1_score"], + metrics["null_accuracy"], + metrics["hierarchical_accuracy"], + 
metrics["avg_latency_ms"], + run.total_cost&.to_f, + metrics["cost_per_sample"], + metrics["samples_processed"], + metrics["samples_correct"], + run.duration_seconds, + run.completed_at&.iso8601 + ] + end + + def format_percentage(value) + return "-" if value.nil? + "#{value}%" + end + + def format_cost(value) + return "-" if value.nil? + "$#{value.to_f.round(4)}" + end + + def pairwise_comparisons + return [] if runs.size < 2 + + comparisons = [] + runs.combination(2).each do |run1, run2| + comparisons << { + models: [ run1.model, run2.model ], + accuracy_diff: ((run1.metrics["accuracy"] || 0) - (run2.metrics["accuracy"] || 0)).round(2), + cost_diff: ((run1.total_cost || 0) - (run2.total_cost || 0)).to_f.round(6), + latency_diff: ((run1.metrics["avg_latency_ms"] || 0) - (run2.metrics["avg_latency_ms"] || 0)).round(0) + } + end + comparisons + end + + def generate_recommendation(best_accuracy, lowest_cost, fastest) + parts = [] + + # If one model wins all categories + if best_accuracy.id == lowest_cost.id && lowest_cost.id == fastest.id + return "#{best_accuracy.model} is the best choice overall (highest accuracy, lowest cost, fastest)." 
+ end + + # Accuracy recommendation + if best_accuracy.metrics["accuracy"] && best_accuracy.metrics["accuracy"] >= 90 + parts << "For maximum accuracy, use #{best_accuracy.model} (#{best_accuracy.metrics['accuracy']}% accuracy)" + end + + # Cost recommendation if significantly cheaper + if lowest_cost.total_cost && lowest_cost.total_cost > 0 + cost_ratio = (best_accuracy.total_cost || 0) / lowest_cost.total_cost + if cost_ratio > 1.5 + parts << "For cost efficiency, consider #{lowest_cost.model} (#{format_cost(lowest_cost.total_cost)} vs #{format_cost(best_accuracy.total_cost)})" + end + end + + # Speed recommendation + if fastest.metrics["avg_latency_ms"] && fastest.id != best_accuracy.id + latency_ratio = (best_accuracy.metrics["avg_latency_ms"] || 0) / (fastest.metrics["avg_latency_ms"] || 1) + if latency_ratio > 1.5 + parts << "For speed, consider #{fastest.model} (#{fastest.metrics['avg_latency_ms']}ms vs #{best_accuracy.metrics['avg_latency_ms']}ms)" + end + end + + parts.empty? ? "All models perform similarly." : parts.join(". 
") + end +end diff --git a/app/models/eval/result.rb b/app/models/eval/result.rb new file mode 100644 index 000000000..61017f64c --- /dev/null +++ b/app/models/eval/result.rb @@ -0,0 +1,70 @@ +class Eval::Result < ApplicationRecord + self.table_name = "eval_results" + + belongs_to :run, class_name: "Eval::Run", foreign_key: :eval_run_id + belongs_to :sample, class_name: "Eval::Sample", foreign_key: :eval_sample_id + + validates :actual_output, presence: true + validates :correct, inclusion: { in: [ true, false ] } + + scope :correct, -> { where(correct: true) } + scope :incorrect, -> { where(correct: false) } + scope :with_nulls_returned, -> { where(null_returned: true) } + scope :with_nulls_expected, -> { where(null_expected: true) } + scope :exact_matches, -> { where(exact_match: true) } + scope :hierarchical_matches, -> { where(hierarchical_match: true) } + + # Get actual category (for categorization results) + def actual_category_name + actual_output.dig("category_name") || actual_output["category_name"] + end + + # Get actual merchant info (for merchant detection results) + def actual_business_name + actual_output.dig("business_name") || actual_output["business_name"] + end + + def actual_business_url + actual_output.dig("business_url") || actual_output["business_url"] + end + + # Get actual functions called (for chat results) + def actual_functions + actual_output.dig("functions") || actual_output["functions"] || [] + end + + # Get actual response text (for chat results) + def actual_response_text + actual_output.dig("response_text") || actual_output["response_text"] + end + + # Summary for display + def summary + { + sample_id: sample_id, + correct: correct, + exact_match: exact_match, + expected: sample.expected_output, + actual: actual_output, + latency_ms: latency_ms, + cost: cost&.to_f + } + end + + # Detailed comparison with expected + def detailed_comparison + { + sample_difficulty: sample.difficulty, + sample_tags: sample.tags, + input: 
sample.input_data, + expected: sample.expected_output, + actual: actual_output, + correct: correct, + exact_match: exact_match, + hierarchical_match: hierarchical_match, + null_expected: null_expected, + null_returned: null_returned, + fuzzy_score: fuzzy_score + } + end +end diff --git a/app/models/eval/run.rb b/app/models/eval/run.rb new file mode 100644 index 000000000..c9cbdc988 --- /dev/null +++ b/app/models/eval/run.rb @@ -0,0 +1,88 @@ +class Eval::Run < ApplicationRecord + self.table_name = "eval_runs" + + belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id + has_many :results, class_name: "Eval::Result", foreign_key: :eval_run_id, dependent: :destroy + + validates :provider, :model, :status, presence: true + validates :status, inclusion: { in: %w[pending running completed failed] } + + scope :pending, -> { where(status: "pending") } + scope :running, -> { where(status: "running") } + scope :completed, -> { where(status: "completed") } + scope :failed, -> { where(status: "failed") } + scope :for_model, ->(model) { where(model: model) } + scope :for_provider, ->(provider) { where(provider: provider) } + + # Calculate duration in seconds + def duration_seconds + return nil unless started_at && completed_at + (completed_at - started_at).to_i + end + + # Get accuracy from metrics or calculate + def accuracy + metrics.dig("accuracy") || calculate_accuracy + end + + # Start the evaluation run + def start! 
+ update!(status: "running", started_at: Time.current) + end + + # Complete the evaluation run with metrics + def complete!(calculated_metrics) + update!( + status: "completed", + completed_at: Time.current, + metrics: calculated_metrics, + total_prompt_tokens: results.sum(:prompt_tokens), + total_completion_tokens: results.sum(:completion_tokens), + total_cost: results.sum(:cost) + ) + end + + # Fail the evaluation run + def fail!(error) + update!( + status: "failed", + completed_at: Time.current, + error_message: error.is_a?(Exception) ? "#{error.class}: #{error.message}" : error.to_s + ) + end + + # Summary for display + def summary + { + id: id, + name: name, + dataset: dataset.name, + model: model, + provider: provider, + status: status, + accuracy: accuracy, + total_cost: total_cost&.to_f, + duration: duration_seconds, + samples_processed: results.count, + samples_correct: results.where(correct: true).count, + created_at: created_at + } + end + + # Compare this run to another + def compare_to(other_run) + { + accuracy_diff: (accuracy || 0) - (other_run.accuracy || 0), + cost_diff: (total_cost || 0) - (other_run.total_cost || 0), + this_model: model, + other_model: other_run.model + } + end + + private + + def calculate_accuracy + return 0.0 if results.empty? + (results.where(correct: true).count.to_f / results.count * 100).round(2) + end +end diff --git a/app/models/eval/runners/base.rb b/app/models/eval/runners/base.rb new file mode 100644 index 000000000..42018c80d --- /dev/null +++ b/app/models/eval/runners/base.rb @@ -0,0 +1,82 @@ +class Eval::Runners::Base + attr_reader :eval_run + + def initialize(eval_run) + @eval_run = eval_run + end + + def run + eval_run.start! 
+ + begin + process_samples + metrics = calculate_metrics + eval_run.complete!(metrics) + rescue => e + eval_run.fail!(e) + raise + end + + eval_run + end + + protected + + def process_samples + raise NotImplementedError, "Subclasses must implement #process_samples" + end + + def calculate_metrics + raise NotImplementedError, "Subclasses must implement #calculate_metrics" + end + + def samples + eval_run.dataset.samples + end + + def provider + @provider ||= build_provider + end + + def model + eval_run.model + end + + private + + def build_provider + case eval_run.provider + when "openai" + build_openai_provider + else + raise "Unsupported provider: #{eval_run.provider}" + end + end + + def build_openai_provider + access_token = eval_run.provider_config["access_token"].presence || + ENV["OPENAI_ACCESS_TOKEN"].presence || + Setting.openai_access_token + + raise "OpenAI access token not configured" unless access_token.present? + + uri_base = eval_run.provider_config["uri_base"].presence || + ENV["OPENAI_URI_BASE"].presence || + Setting.openai_uri_base + + Provider::Openai.new(access_token, uri_base: uri_base, model: model) + end + + def record_result(sample:, actual_output:, correct:, **attributes) + eval_run.results.create!( + sample: sample, + actual_output: actual_output, + correct: correct, + **attributes + ) + end + + def log_progress(message) + Rails.logger.info("[Eval::Runner] #{message}") + end +end diff --git a/app/models/eval/runners/categorization_runner.rb b/app/models/eval/runners/categorization_runner.rb new file mode 100644 index 000000000..3bc6994c4 --- /dev/null +++ b/app/models/eval/runners/categorization_runner.rb @@ -0,0 +1,199 @@ +class Eval::Runners::CategorizationRunner < Eval::Runners::Base + DEFAULT_BATCH_SIZE = 25 # Matches Provider::Openai limit + + protected + + def process_samples + all_samples = samples.to_a + batch_size = effective_batch_size + log_progress("Processing #{all_samples.size} samples in batches of #{batch_size}") + + 
all_samples.each_slice(batch_size).with_index do |batch, batch_idx| + log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / batch_size).ceil}") + process_batch(batch) + end + end + + # Use smaller batches for custom providers (local LLMs) to reduce context length + def effective_batch_size + eval_run.provider_config["batch_size"]&.to_i || DEFAULT_BATCH_SIZE + end + + # Get JSON mode from provider config (optional override) + # Valid values: "strict", "json_object", "none" + def json_mode + eval_run.provider_config["json_mode"] + end + + def calculate_metrics + Eval::Metrics::CategorizationMetrics.new(eval_run).calculate + end + + private + + def process_batch(batch_samples) + return if batch_samples.empty? + + # Build inputs for the provider + transactions = batch_samples.map do |sample| + sample.to_transaction_input.merge(id: sample.id) + end + + # Get categories from first sample's context (should be shared) + # Symbolize keys since Provider::Openai::AutoCategorizer expects symbol keys + categories = batch_samples.first.categories_context.map(&:deep_symbolize_keys) + + # Determine effective JSON mode for this batch + # If the batch has many expected nulls and we're using auto mode, force strict mode + # to prevent the auto-categorizer from incorrectly retrying (it would see many nulls + # and think strict mode is broken, when actually the nulls are expected) + effective_json_mode = json_mode_for_batch(batch_samples) + + start_time = Time.current + + begin + response = provider.auto_categorize( + transactions: transactions, + user_categories: categories, + model: model, + json_mode: effective_json_mode + ) + + latency_ms = ((Time.current - start_time) * 1000).to_i + per_sample_latency = latency_ms / batch_samples.size + + if response.success? 
+ record_batch_results(batch_samples, response.data, per_sample_latency) + else + record_batch_errors(batch_samples, response.error, per_sample_latency) + end + rescue => e + latency_ms = ((Time.current - start_time) * 1000).to_i + per_sample_latency = latency_ms / batch_samples.size + record_batch_errors(batch_samples, e, per_sample_latency) + end + end + + def record_batch_results(batch_samples, categorizations, per_sample_latency) + batch_samples.each do |sample| + # Find the categorization result for this sample + categorization = categorizations.find { |c| c.transaction_id.to_s == sample.id.to_s } + actual_category = categorization&.category_name + + # Normalize "null" string to nil + actual_category = nil if actual_category == "null" + + expected_category = sample.expected_category_name + acceptable_categories = sample.all_acceptable_categories + + # Evaluate correctness - check primary expected and alternatives + correct = evaluate_correctness_with_alternatives(actual_category, expected_category, acceptable_categories) + exact_match = actual_category == expected_category + alternative_match = acceptable_categories.include?(actual_category) && !exact_match + hierarchical = evaluate_hierarchical_match(actual_category, expected_category, sample) + + record_result( + sample: sample, + actual_output: { "category_name" => actual_category }, + correct: correct, + exact_match: exact_match, + alternative_match: alternative_match, + hierarchical_match: hierarchical, + null_expected: expected_category.nil?, + null_returned: actual_category.nil?, + latency_ms: per_sample_latency + ) + end + end + + def record_batch_errors(batch_samples, error, per_sample_latency) + error_message = error.is_a?(Exception) ? 
error.message : error.to_s + + batch_samples.each do |sample| + record_result( + sample: sample, + actual_output: { "error" => error_message }, + correct: false, + exact_match: false, + hierarchical_match: false, + null_expected: sample.expected_category_name.nil?, + null_returned: true, + latency_ms: per_sample_latency, + metadata: { "error" => error_message } + ) + end + end + + # Determine the effective JSON mode for a batch based on expected null ratio + # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected + def json_mode_for_batch(batch_samples) + # If a specific mode is configured (not "auto"), always use it + return json_mode if json_mode.present? && json_mode != "auto" + + # Calculate expected null ratio for this batch + expected_null_count = batch_samples.count { |s| s.expected_category_name.nil? } + expected_null_ratio = expected_null_count.to_f / batch_samples.size + + # If >50% of the batch is expected to return null, force strict mode + # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer + # and prevents unnecessary retries when nulls are legitimate + if expected_null_ratio > 0.5 + log_progress("Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode to prevent false retry") + "strict" + else + # Use auto mode - let the auto-categorizer decide + "auto" + end + end + + def evaluate_correctness(actual, expected) + # Both null = correct + return true if actual.nil? && expected.nil? + # Expected null but got value = incorrect + return false if expected.nil? && actual.present? + # Expected value but got null = incorrect + return false if actual.nil? && expected.present? + # Compare values + actual == expected + end + + def evaluate_correctness_with_alternatives(actual, expected, acceptable_categories) + # Both null = correct + return true if actual.nil? && expected.nil? + # Expected null but got value = incorrect + return false if expected.nil? && actual.present? 
+ # Expected value but got null = incorrect + return false if actual.nil? && expected.present? + # Check if actual matches any acceptable category (primary or alternatives) + acceptable_categories.include?(actual) + end + + def evaluate_hierarchical_match(actual, expected, sample) + return false if actual.nil? || expected.nil? + return true if actual == expected + + # Check if actual matches parent of expected category + categories = sample.categories_context + + # Find the expected category + expected_cat = categories.find { |c| c["name"] == expected } + return false unless expected_cat + + # If expected has a parent, check if actual matches the parent + if expected_cat["parent_id"] + parent = categories.find { |c| c["id"].to_s == expected_cat["parent_id"].to_s } + return parent && parent["name"] == actual + end + + # Also check if actual is a subcategory of expected (reverse direction) + actual_cat = categories.find { |c| c["name"] == actual } + return false unless actual_cat + + if actual_cat["parent_id"] + parent = categories.find { |c| c["id"].to_s == actual_cat["parent_id"].to_s } + return parent && parent["name"] == expected + end + + false + end +end diff --git a/app/models/eval/runners/chat_runner.rb b/app/models/eval/runners/chat_runner.rb new file mode 100644 index 000000000..a30d7b657 --- /dev/null +++ b/app/models/eval/runners/chat_runner.rb @@ -0,0 +1,255 @@ +class Eval::Runners::ChatRunner < Eval::Runners::Base + # Chat samples are processed one at a time (not batched) + # because each has unique context and function calling requirements + + protected + + def process_samples + all_samples = samples.to_a + log_progress("Processing #{all_samples.size} chat samples") + + all_samples.each_with_index do |sample, idx| + log_progress("Processing sample #{idx + 1}/#{all_samples.size}") + process_sample(sample) + end + end + + def calculate_metrics + Eval::Metrics::ChatMetrics.new(eval_run).calculate + end + + private + + def process_sample(sample) + prompt = 
sample.chat_prompt + start_time = Time.current + + begin + response = provider.chat_response( + prompt, + model: model, + instructions: build_instructions, + functions: build_function_definitions + ) + + latency_ms = ((Time.current - start_time) * 1000).to_i + + if response.success? + record_chat_result(sample, response.data, latency_ms) + else + record_error_result(sample, response.error, latency_ms) + end + rescue => e + latency_ms = ((Time.current - start_time) * 1000).to_i + record_error_result(sample, e, latency_ms) + end + end + + def record_chat_result(sample, chat_response, latency_ms) + # Extract function calls from response + actual_functions = extract_functions_from_response(chat_response) + + # Extract response text + response_text = extract_response_text(chat_response) + + # Evaluate function calling accuracy + expected_functions = sample.expected_functions + function_match = evaluate_function_match(actual_functions, expected_functions) + + # Evaluate response content + expected_keywords = sample.expected_response_contains + response_match = evaluate_response_contains(response_text, expected_keywords) + + # Overall correctness: functions are correct AND response contains expected keywords + correct = function_match[:correct] && response_match + + record_result( + sample: sample, + actual_output: { + "functions" => actual_functions, + "response_text" => response_text, + "function_match_details" => function_match + }, + correct: correct, + exact_match: function_match[:exact_match], + latency_ms: latency_ms, + metadata: { + "function_selection_correct" => function_match[:selection_correct], + "parameter_accuracy" => function_match[:parameter_accuracy], + "response_keywords_found" => response_match, + "expected_functions" => expected_functions, + "expected_keywords" => expected_keywords + } + ) + end + + def record_error_result(sample, error, latency_ms) + error_message = error.is_a?(Exception) ? 
error.message : error.to_s + + record_result( + sample: sample, + actual_output: { "error" => error_message }, + correct: false, + exact_match: false, + latency_ms: latency_ms, + metadata: { "error" => error_message } + ) + end + + def extract_functions_from_response(chat_response) + # ChatResponse has function_requests array + function_requests = chat_response.function_requests || [] + + function_requests.map do |req| + { + "name" => req.function_name, + "params" => parse_function_args(req.function_args) + } + end + end + + def parse_function_args(args) + return {} if args.nil? + return args if args.is_a?(Hash) + JSON.parse(args) + rescue JSON::ParserError + {} + end + + def extract_response_text(chat_response) + # ChatResponse has messages array with output_text + messages = chat_response.messages || [] + messages.map(&:output_text).compact.join("\n") + end + + def evaluate_function_match(actual_functions, expected_functions) + return { correct: true, exact_match: true, selection_correct: true, parameter_accuracy: 1.0 } if expected_functions.empty? && actual_functions.empty? + return { correct: false, exact_match: false, selection_correct: false, parameter_accuracy: 0.0 } if expected_functions.empty? && actual_functions.any? + + # Check function selection accuracy + expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact + actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact + + selection_correct = expected_names.all? 
{ |name| actual_names.include?(name) } + + # Check parameter accuracy for matched functions + param_scores = [] + expected_functions.each do |expected_func| + expected_name = normalize_function_name(expected_func["name"]) + actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name } + + if actual_func + param_score = evaluate_parameters(actual_func["params"], expected_func["params"] || {}) + param_scores << param_score + else + param_scores << 0.0 + end + end + + parameter_accuracy = param_scores.empty? ? 0.0 : (param_scores.sum / param_scores.size).round(4) + + # Exact match requires same functions with same parameters + exact_match = selection_correct && parameter_accuracy == 1.0 + + # Correct if all expected functions were called (parameters don't have to be exact) + correct = selection_correct + + { + correct: correct, + exact_match: exact_match, + selection_correct: selection_correct, + parameter_accuracy: parameter_accuracy + } + end + + def normalize_function_name(name) + return nil if name.nil? + # Convert to snake_case and downcase + name.to_s.underscore.downcase + end + + def evaluate_parameters(actual_params, expected_params) + return 1.0 if expected_params.empty? + return 0.0 if actual_params.nil? 
+ + actual_params = actual_params.stringify_keys + expected_params = expected_params.stringify_keys + + matches = 0 + total = expected_params.size + + expected_params.each do |key, expected_value| + actual_value = actual_params[key] + + if values_match?(actual_value, expected_value) + matches += 1 + end + end + + (matches.to_f / total).round(4) + end + + def values_match?(actual, expected) + return true if actual == expected + return true if actual.to_s.downcase == expected.to_s.downcase + + # For arrays, check if all expected values are present + if expected.is_a?(Array) && actual.is_a?(Array) + expected_normalized = expected.map { |v| v.to_s.downcase } + actual_normalized = actual.map { |v| v.to_s.downcase } + return expected_normalized.all? { |v| actual_normalized.include?(v) } + end + + # For dates, try to parse and compare + if expected.to_s =~ /^\d{4}-\d{2}-\d{2}$/ + begin + expected_date = Date.parse(expected.to_s) + actual_date = Date.parse(actual.to_s) + return expected_date == actual_date + rescue + # Not valid dates, fall through + end + end + + false + end + + def evaluate_response_contains(response_text, expected_keywords) + return true if expected_keywords.empty? + return false if response_text.nil? || response_text.empty? + + normalized_response = response_text.downcase + + expected_keywords.all? do |keyword| + normalized_response.include?(keyword.to_s.downcase) + end + end + + def build_instructions + # Simple instructions for evaluation - we don't have a real user/family context + <<~PROMPT + You are a financial assistant helping users understand their financial data. + Use the functions available to answer questions about accounts, transactions, and financial statements. + Today's date is #{Date.current}. 
+ PROMPT + end + + def build_function_definitions + # Return the function definitions that the chat would normally have + [ + build_function_definition("get_transactions", "Get paginated transactions with optional filters"), + build_function_definition("get_accounts", "Get all accounts with balances and historical data"), + build_function_definition("get_balance_sheet", "Get current net worth, assets, and liabilities"), + build_function_definition("get_income_statement", "Get income and expenses by category for a period") + ] + end + + def build_function_definition(name, description) + { + name: name, + description: description, + params_schema: { type: "object", properties: {}, additionalProperties: true }, + strict: false + } + end +end diff --git a/app/models/eval/runners/merchant_detection_runner.rb b/app/models/eval/runners/merchant_detection_runner.rb new file mode 100644 index 000000000..9540f5cba --- /dev/null +++ b/app/models/eval/runners/merchant_detection_runner.rb @@ -0,0 +1,199 @@ +class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base + BATCH_SIZE = 25 # Matches Provider::Openai limit + FUZZY_MATCH_THRESHOLD = 0.8 + + protected + + def process_samples + all_samples = samples.to_a + log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}") + + all_samples.each_slice(BATCH_SIZE).with_index do |batch, batch_idx| + log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / BATCH_SIZE).ceil}") + process_batch(batch) + end + end + + def calculate_metrics + Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate + end + + private + + def process_batch(batch_samples) + # Build inputs for the provider + transactions = batch_samples.map do |sample| + sample.to_transaction_input.merge(id: sample.id) + end + + # Get merchants from first sample's context (should be shared) + # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys + merchants = 
batch_samples.first.merchants_context.map(&:deep_symbolize_keys) + + start_time = Time.current + + begin + response = provider.auto_detect_merchants( + transactions: transactions, + user_merchants: merchants, + model: model + ) + + latency_ms = ((Time.current - start_time) * 1000).to_i + per_sample_latency = latency_ms / batch_samples.size + + if response.success? + record_batch_results(batch_samples, response.data, per_sample_latency) + else + record_batch_errors(batch_samples, response.error, per_sample_latency) + end + rescue => e + latency_ms = ((Time.current - start_time) * 1000).to_i + per_sample_latency = latency_ms / batch_samples.size + record_batch_errors(batch_samples, e, per_sample_latency) + end + end + + def record_batch_results(batch_samples, merchants_detected, per_sample_latency) + batch_samples.each do |sample| + # Find the merchant detection result for this sample + detection = merchants_detected.find { |m| m.transaction_id.to_s == sample.id.to_s } + + actual_name = normalize_null(detection&.business_name) + actual_url = normalize_null(detection&.business_url) + + expected_name = sample.expected_business_name + expected_url = sample.expected_business_url + + # Evaluate correctness + name_match = evaluate_name_match(actual_name, expected_name) + url_match = evaluate_url_match(actual_url, expected_url) + fuzzy_score = calculate_fuzzy_score(actual_name, expected_name) + + # Overall correct if both name and URL match expectations + correct = name_match && url_match + + # Exact match requires both to be exactly equal + exact_match = actual_name == expected_name && normalize_url(actual_url) == normalize_url(expected_url) + + record_result( + sample: sample, + actual_output: { "business_name" => actual_name, "business_url" => actual_url }, + correct: correct, + exact_match: exact_match, + fuzzy_score: fuzzy_score, + null_expected: expected_name.nil? && expected_url.nil?, + null_returned: actual_name.nil? 
&& actual_url.nil?, + latency_ms: per_sample_latency + ) + end + end + + def record_batch_errors(batch_samples, error, per_sample_latency) + error_message = error.is_a?(Exception) ? error.message : error.to_s + + batch_samples.each do |sample| + record_result( + sample: sample, + actual_output: { "error" => error_message }, + correct: false, + exact_match: false, + fuzzy_score: 0.0, + null_expected: sample.expected_business_name.nil?, + null_returned: true, + latency_ms: per_sample_latency, + metadata: { "error" => error_message } + ) + end + end + + def normalize_null(value) + return nil if value.nil? + return nil if value == "null" + return nil if value.to_s.strip.empty? + value + end + + def evaluate_name_match(actual, expected) + # Both null = correct + return true if actual.nil? && expected.nil? + # Expected null but got value = false positive + return false if expected.nil? && actual.present? + # Expected value but got null = false negative + return false if actual.nil? && expected.present? + # Use fuzzy matching for name comparison + fuzzy_match?(actual, expected) + end + + def evaluate_url_match(actual, expected) + # Both null = correct + return true if actual.nil? && expected.nil? + # Expected null but got value = false positive + return false if expected.nil? && actual.present? + # Expected value but got null = false negative + return false if actual.nil? && expected.present? + # Normalize and compare URLs + normalize_url(actual) == normalize_url(expected) + end + + def normalize_url(url) + return nil if url.nil? + url.to_s.downcase + .gsub(/^(https?:\/\/)?(www\.)?/, "") + .chomp("/") + .strip + end + + def fuzzy_match?(actual, expected) + return false if actual.nil? || expected.nil? + calculate_fuzzy_score(actual, expected) >= FUZZY_MATCH_THRESHOLD + end + + def calculate_fuzzy_score(actual, expected) + return 1.0 if actual == expected + return 0.0 if actual.nil? || expected.nil? 
+ + # Simple Levenshtein distance-based similarity + # Normalize strings for comparison + a = actual.to_s.downcase.strip + b = expected.to_s.downcase.strip + + return 1.0 if a == b + + # Calculate Levenshtein distance + distance = levenshtein_distance(a, b) + max_length = [ a.length, b.length ].max + + return 0.0 if max_length == 0 + + # Convert distance to similarity score (0.0 to 1.0) + (1.0 - (distance.to_f / max_length)).round(4) + end + + def levenshtein_distance(s1, s2) + m = s1.length + n = s2.length + + return m if n == 0 + return n if m == 0 + + # Create distance matrix + d = Array.new(m + 1) { Array.new(n + 1) } + + (0..m).each { |i| d[i][0] = i } + (0..n).each { |j| d[0][j] = j } + + (1..n).each do |j| + (1..m).each do |i| + cost = s1[i - 1] == s2[j - 1] ? 0 : 1 + d[i][j] = [ + d[i - 1][j] + 1, # deletion + d[i][j - 1] + 1, # insertion + d[i - 1][j - 1] + cost # substitution + ].min + end + end + + d[m][n] + end +end diff --git a/app/models/eval/sample.rb b/app/models/eval/sample.rb new file mode 100644 index 000000000..ccee671a9 --- /dev/null +++ b/app/models/eval/sample.rb @@ -0,0 +1,88 @@ +class Eval::Sample < ApplicationRecord + self.table_name = "eval_samples" + + belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id + has_many :results, class_name: "Eval::Result", foreign_key: :eval_sample_id, dependent: :destroy + + validates :input_data, :expected_output, presence: true + validates :difficulty, inclusion: { in: %w[easy medium hard manual edge_case] } + + scope :easy, -> { where(difficulty: "easy") } + scope :medium, -> { where(difficulty: "medium") } + scope :hard, -> { where(difficulty: "hard") } + scope :edge_cases, -> { where(difficulty: "edge_case") } + scope :with_tag, ->(tag) { where("? 
= ANY(tags)", tag) } + scope :with_any_tags, ->(tags) { where("tags && ARRAY[?]::varchar[]", tags) } + + # Convert to format expected by AutoCategorizer + def to_transaction_input + input_data.deep_symbolize_keys + end + + # Get categories from context (for categorization evals) + def categories_context + context_data.dig("categories") || [] + end + + # Get merchants from context (for merchant detection evals) + def merchants_context + context_data.dig("merchants") || [] + end + + # Get mock data from context (for chat evals) + def mock_data + context_data.dig("mock_data") || input_data.dig("mock_data") || {} + end + + # Get the chat prompt (for chat evals) + def chat_prompt + input_data.dig("prompt") || input_data["prompt"] + end + + # Get expected functions (for chat evals) + def expected_functions + expected_output.dig("functions") || expected_output["functions"] || [] + end + + # Get expected response keywords (for chat evals) + def expected_response_contains + expected_output.dig("response_contains") || expected_output["response_contains"] || [] + end + + # Get expected category name (for categorization evals) + def expected_category_name + expected_output.dig("category_name") || expected_output["category_name"] + end + + # Get acceptable alternative category names (for categorization evals) + # These are categories that are also considered correct answers + def acceptable_alternatives + expected_output.dig("acceptable_alternatives") || expected_output["acceptable_alternatives"] || [] + end + + # Get all acceptable category names (primary + alternatives) + def all_acceptable_categories + [ expected_category_name, *acceptable_alternatives ].compact + end + + # Get expected merchant info (for merchant detection evals) + def expected_business_name + expected_output.dig("business_name") || expected_output["business_name"] + end + + def expected_business_url + expected_output.dig("business_url") || expected_output["business_url"] + end + + # Check if null is 
expected + def expects_null? + case dataset.eval_type + when "categorization" + expected_category_name.nil? + when "merchant_detection" + expected_business_name.nil? && expected_business_url.nil? + else + false + end + end +end diff --git a/app/models/provider/openai.rb b/app/models/provider/openai.rb index a732a1cd6..17a1e0bd7 100644 --- a/app/models/provider/openai.rb +++ b/app/models/provider/openai.rb @@ -51,7 +51,7 @@ class Provider::Openai < Provider @uri_base.present? end - def auto_categorize(transactions: [], user_categories: [], model: "", family: nil) + def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil) with_provider_response do raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25 if user_categories.blank? @@ -74,7 +74,8 @@ class Provider::Openai < Provider user_categories: user_categories, custom_provider: custom_provider?, langfuse_trace: trace, - family: family + family: family, + json_mode: json_mode ).auto_categorize trace&.update(output: result.map(&:to_h)) @@ -83,7 +84,7 @@ class Provider::Openai < Provider end end - def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil) + def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil) with_provider_response do raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." 
if transactions.size > 25 @@ -101,7 +102,8 @@ class Provider::Openai < Provider user_merchants: user_merchants, custom_provider: custom_provider?, langfuse_trace: trace, - family: family + family: family, + json_mode: json_mode ).auto_detect_merchants trace&.update(output: result.map(&:to_h)) diff --git a/app/models/provider/openai/auto_categorizer.rb b/app/models/provider/openai/auto_categorizer.rb index ff3948784..1d369e4f3 100644 --- a/app/models/provider/openai/auto_categorizer.rb +++ b/app/models/provider/openai/auto_categorizer.rb @@ -1,9 +1,22 @@ class Provider::Openai::AutoCategorizer include Provider::Openai::Concerns::UsageRecorder - attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family + # JSON response format modes for custom providers + # - "strict": Use strict JSON schema (requires full OpenAI API compatibility) + # - "json_object": Use json_object response format (broader compatibility) + # - "none": No response format constraint (maximum compatibility with local LLMs) + JSON_MODE_STRICT = "strict" + JSON_MODE_OBJECT = "json_object" + JSON_MODE_NONE = "none" + JSON_MODE_AUTO = "auto" - def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil) + # Threshold for auto mode: if more than this percentage returns null, retry with none mode + # This is a heuristic to detect when strict JSON mode is breaking the model's ability to reason + AUTO_MODE_NULL_THRESHOLD = 0.5 + + attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family, :json_mode + + def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil) @client = client @model = model @transactions = transactions @@ -11,6 +24,32 @@ class Provider::Openai::AutoCategorizer @custom_provider = custom_provider @langfuse_trace = langfuse_trace @family = 
family + @json_mode = json_mode || default_json_mode + end + + VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze + + # Determine default JSON mode based on configuration hierarchy: + # 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override + # 2. Setting.openai_json_mode - user-configured in app settings + # 3. Default: auto mode (recommended for all providers) + # + # Mode descriptions: + # - "auto": Tries strict first, falls back to none if >50% fail (recommended default) + # - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose tags + # - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output + # - "json_object": Middle ground, broader compatibility than strict + def default_json_mode + # 1. Check environment variable first (allows runtime override for testing) + env_mode = ENV["LLM_JSON_MODE"] + return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode) + + # 2. Check app settings (user-configured) + setting_mode = Setting.openai_json_mode + return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode) + + # 3. Default: auto mode for all providers (tries strict first, falls back to none if needed) + JSON_MODE_AUTO end def auto_categorize @@ -22,6 +61,40 @@ class Provider::Openai::AutoCategorizer end def instructions + if custom_provider + simple_instructions + else + detailed_instructions + end + end + + # Simplified instructions for smaller/local LLMs + def simple_instructions + <<~INSTRUCTIONS.strip_heredoc + Categorize transactions into the given categories. Return JSON only. Do not explain your reasoning. + + CRITICAL RULES: + 1. Match transaction_id exactly from input + 2. Use EXACT category_name from the provided list, or "null" if unsure + 3. Match expense transactions to expense categories only + 4. Match income transactions to income categories only + 5. 
Return "null" if the description is generic/ambiguous (e.g., "POS DEBIT", "ACH WITHDRAWAL", "CHECK #1234") + 6. Prefer MORE SPECIFIC subcategories over general parent categories when available + + CATEGORY HIERARCHY NOTES: + - Use "Restaurants" for sit-down restaurants, "Fast Food" for quick service chains + - Use "Coffee Shops" for coffee places, "Food & Drink" only when type is unclear + - Use "Shopping" for general retail, big-box stores, and online marketplaces + - Use "Groceries" for dedicated grocery stores ONLY + - For income: use "Salary" for payroll/employer deposits, "Income" for generic income sources + + Output JSON format only (no markdown, no explanation): + {"categorizations": [{"transaction_id": "...", "category_name": "..."}]} + INSTRUCTIONS + end + + # Detailed instructions for larger models like GPT-4 + def detailed_instructions <<~INSTRUCTIONS.strip_heredoc You are an assistant to a consumer personal finance app. You will be provided a list of the user's transactions and a list of the user's categories. Your job is to auto-categorize @@ -87,19 +160,68 @@ class Provider::Openai::AutoCategorizer end def auto_categorize_openai_generic + if json_mode == JSON_MODE_AUTO + auto_categorize_with_auto_mode + else + auto_categorize_with_mode(json_mode) + end + rescue Faraday::BadRequestError => e + # If strict mode fails (HTTP 400), fall back to none mode + # This handles providers that don't support json_schema response format + if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO + Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}") + auto_categorize_with_mode(JSON_MODE_NONE) + else + raise + end + end + + # Auto mode: try strict first, fall back to none if too many nulls or missing results + # + # This uses pure heuristics to detect when strict JSON mode is breaking the model's + # ability to reason. Models that can't reason well in strict mode often: + # 1. Return null for everything, OR + # 2. 
Simply omit transactions they can't categorize (returning fewer results than input) + # + # The heuristic is simple: if >50% of results are null or missing, the model likely + # needs the freedom to reason in its output (which strict mode prevents). + def auto_categorize_with_auto_mode + result = auto_categorize_with_mode(JSON_MODE_STRICT) + + null_count = result.count { |r| r.category_name.nil? || r.category_name == "null" } + missing_count = transactions.size - result.size + failed_count = null_count + missing_count + failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0 + + if failed_ratio > AUTO_MODE_NULL_THRESHOLD + Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode") + auto_categorize_with_mode(JSON_MODE_NONE) + else + result + end + end + + def auto_categorize_with_mode(mode) span = langfuse_trace&.span(name: "auto_categorize_api_call", input: { model: model.presence || Provider::Openai::DEFAULT_MODEL, transactions: transactions, - user_categories: user_categories + user_categories: user_categories, + json_mode: mode }) - response = client.chat(parameters: { + # Build parameters with configurable JSON response format + params = { model: model.presence || Provider::Openai::DEFAULT_MODEL, messages: [ { role: "system", content: instructions }, - { role: "user", content: developer_message } - ], - response_format: { + { role: "user", content: developer_message_for_generic } + ] + } + + # Add response format based on json_mode setting + case mode + when JSON_MODE_STRICT + params[:response_format] = { type: "json_schema", json_schema: { name: "auto_categorize_personal_finance_transactions", @@ -107,9 +229,14 @@ class Provider::Openai::AutoCategorizer schema: json_schema } } - }) + when JSON_MODE_OBJECT + params[:response_format] = { type: "json_object" } + # JSON_MODE_NONE: no response_format constraint + end - 
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")}") + response = client.chat(parameters: params) + + Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})") categorizations = extract_categorizations_generic(response) result = build_response(categorizations) @@ -120,7 +247,8 @@ class Provider::Openai::AutoCategorizer operation: "auto_categorize", metadata: { transaction_count: transactions.size, - category_count: user_categories.size + category_count: user_categories.size, + json_mode: mode } ) @@ -143,9 +271,72 @@ class Provider::Openai::AutoCategorizer end def normalize_category_name(category_name) - return nil if category_name == "null" + # Convert to string to handle non-string LLM outputs (numbers, booleans, etc.) + normalized = category_name.to_s.strip + return nil if normalized.empty? || normalized == "null" || normalized.downcase == "null" - category_name + # Try exact match first + exact_match = user_categories.find { |c| c[:name] == normalized } + return exact_match[:name] if exact_match + + # Try case-insensitive match + case_insensitive_match = user_categories.find { |c| c[:name].to_s.downcase == normalized.downcase } + return case_insensitive_match[:name] if case_insensitive_match + + # Try partial/fuzzy match (for common variations) + fuzzy_match = find_fuzzy_category_match(normalized) + return fuzzy_match if fuzzy_match + + # Return normalized string if no match found (will be treated as uncategorized) + normalized + end + + # Find a fuzzy match for category names with common variations + def find_fuzzy_category_match(category_name) + # Ensure string input for string operations + input_str = category_name.to_s + normalized_input = input_str.downcase.gsub(/[^a-z0-9]/, "") + + user_categories.each do |cat| + cat_name_str = cat[:name].to_s + normalized_cat = cat_name_str.downcase.gsub(/[^a-z0-9]/, "") + + # Check if one contains 
the other + return cat[:name] if normalized_input.include?(normalized_cat) || normalized_cat.include?(normalized_input) + + # Check common abbreviations/variations + return cat[:name] if fuzzy_name_match?(input_str, cat_name_str) + end + + nil + end + + # Handle common naming variations + def fuzzy_name_match?(input, category) + variations = { + "gas" => [ "gas & fuel", "gas and fuel", "fuel", "gasoline" ], + "restaurants" => [ "restaurant", "dining", "food" ], + "groceries" => [ "grocery", "supermarket", "food store" ], + "streaming" => [ "streaming services", "streaming service" ], + "rideshare" => [ "ride share", "ride-share", "uber", "lyft" ], + "coffee" => [ "coffee shops", "coffee shop", "cafe" ], + "fast food" => [ "fastfood", "quick service" ], + "gym" => [ "gym & fitness", "fitness", "gym and fitness" ], + "flights" => [ "flight", "airline", "airlines", "airfare" ], + "hotels" => [ "hotel", "lodging", "accommodation" ] + } + + # Ensure string inputs for string operations + input_lower = input.to_s.downcase + category_lower = category.to_s.downcase + + variations.each do |_key, synonyms| + if synonyms.include?(input_lower) && synonyms.include?(category_lower) + return true + end + end + + false end def extract_categorizations_native(response) @@ -162,9 +353,107 @@ class Provider::Openai::AutoCategorizer def extract_categorizations_generic(response) raw = response.dig("choices", 0, "message", "content") - JSON.parse(raw).dig("categorizations") - rescue JSON::ParserError => e - raise Provider::Openai::Error, "Invalid JSON in generic categorization: #{e.message}" + parsed = parse_json_flexibly(raw) + + # Handle different response formats from various LLMs + categorizations = parsed.dig("categorizations") || + parsed.dig("results") || + (parsed.is_a?(Array) ? parsed : nil) + + raise Provider::Openai::Error, "Could not find categorizations in response" if categorizations.nil? 
+ + # Normalize field names (some LLMs use different naming) + categorizations.map do |cat| + { + "transaction_id" => cat["transaction_id"] || cat["id"] || cat["txn_id"], + "category_name" => cat["category_name"] || cat["category"] || cat["name"] + } + end + end + + # Flexible JSON parsing that handles common LLM output issues + def parse_json_flexibly(raw) + return {} if raw.blank? + + # Strip thinking model tags if present (e.g., ...) + # The actual JSON output comes after the thinking block + cleaned = strip_thinking_tags(raw) + + # Try direct parse first + JSON.parse(cleaned) + rescue JSON::ParserError + # Try multiple extraction strategies in order of preference + + # Strategy 1: Closed markdown code blocks (```json...```) + if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m + matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten + matches.reverse_each do |match| + begin + return JSON.parse(match) + rescue JSON::ParserError + next + end + end + end + + # Strategy 2: Unclosed markdown code blocks (thinking models often forget to close) + # Pattern: ```json followed by JSON that goes to end of string + if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m + begin + return JSON.parse($1) + rescue JSON::ParserError + # Continue to next strategy + end + end + + # Strategy 3: Find JSON object with "categorizations" key + if cleaned =~ /(\{"categorizations"\s*:\s*\[[\s\S]*\]\s*\})/m + matches = cleaned.scan(/(\{"categorizations"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten + matches.reverse_each do |match| + begin + return JSON.parse(match) + rescue JSON::ParserError + next + end + end + # Try greedy match if non-greedy failed + begin + return JSON.parse($1) + rescue JSON::ParserError + # Continue to next strategy + end + end + + # Strategy 4: Find any JSON object (last resort) + if cleaned =~ /(\{[\s\S]*\})/m + begin + return JSON.parse($1) + rescue JSON::ParserError + # Fall through to error + end + end + + raise Provider::Openai::Error, "Could not parse JSON 
from response: #{raw.truncate(200)}" + end + + # Strip thinking model tags (...) from response + # Some models like Qwen-thinking output reasoning in these tags before the actual response + def strip_thinking_tags(raw) + # Remove ... blocks but keep content after them + # If no closing tag, the model may have been cut off - try to extract JSON from inside + if raw.include?("") + # Check if there's content after the thinking block + if raw =~ /<\/think>\s*([\s\S]*)/m + after_thinking = $1.strip + return after_thinking if after_thinking.present? + end + # If no content after or no closing tag, look inside the thinking block + # The JSON might be the last thing in the thinking block + if raw =~ /([\s\S]*)/m + return $1 + end + end + raw end def json_schema @@ -213,4 +502,39 @@ class Provider::Openai::AutoCategorizer ``` MESSAGE end + + # Concise developer message optimized for smaller/local LLMs + # Uses pattern-based guidance instead of exhaustive examples + def developer_message_for_generic + <<~MESSAGE.strip_heredoc + AVAILABLE CATEGORIES: #{user_categories.map { |c| c[:name] }.join(", ")} + + TRANSACTIONS TO CATEGORIZE: + #{format_transactions_simply} + + CATEGORIZATION GUIDELINES: + - Prefer specific subcategories over general parent categories when confident + - Food delivery services should be categorized based on the underlying merchant type + - Square payments (SQ *) should be inferred from the merchant name after the prefix + - Warehouse/club stores should be categorized based on their primary purpose + - Return "null" for generic transactions (e.g., POS terminals, wire transfers, checks, ATM withdrawals) + + IMPORTANT: + - Use EXACT category names from the list above + - Return "null" (as a string) if you cannot confidently match a category + - Match expense transactions only to expense categories + - Match income transactions only to income categories + - Do NOT include any explanation or reasoning - only output JSON + + Respond with ONLY this JSON (no 
markdown code blocks, no other text): + {"categorizations": [{"transaction_id": "...", "category_name": "..."}]} + MESSAGE + end + + # Format transactions in a simpler, more readable way for smaller LLMs + def format_transactions_simply + transactions.map do |t| + "- ID: #{t[:id]}, Amount: #{t[:amount]}, Type: #{t[:classification]}, Description: \"#{t[:description]}\"" + end.join("\n") + end end diff --git a/app/models/provider/openai/auto_merchant_detector.rb b/app/models/provider/openai/auto_merchant_detector.rb index f745487ad..3de15709d 100644 --- a/app/models/provider/openai/auto_merchant_detector.rb +++ b/app/models/provider/openai/auto_merchant_detector.rb @@ -1,9 +1,22 @@ class Provider::Openai::AutoMerchantDetector include Provider::Openai::Concerns::UsageRecorder - attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family + # JSON response format modes for custom providers + # - "strict": Use strict JSON schema (requires full OpenAI API compatibility) + # - "json_object": Use json_object response format (broader compatibility) + # - "none": No response format constraint (maximum compatibility with local LLMs) + # - "auto": Try strict first, fall back to none if poor results + JSON_MODE_STRICT = "strict" + JSON_MODE_OBJECT = "json_object" + JSON_MODE_NONE = "none" + JSON_MODE_AUTO = "auto" - def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil) + # Threshold for auto mode: if more than this percentage returns null, retry with none mode + AUTO_MODE_NULL_THRESHOLD = 0.5 + + attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family, :json_mode + + def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil) @client = client @model = model @transactions = transactions @@ -11,6 +24,32 @@ class 
Provider::Openai::AutoMerchantDetector @custom_provider = custom_provider @langfuse_trace = langfuse_trace @family = family + @json_mode = json_mode || default_json_mode + end + + VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze + + # Determine default JSON mode based on configuration hierarchy: + # 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override + # 2. Setting.openai_json_mode - user-configured in app settings + # 3. Default: auto mode (recommended for all providers) + # + # Mode descriptions: + # - "auto": Tries strict first, falls back to none if >50% fail (recommended default) + # - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose tags + # - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output + # - "json_object": Middle ground, broader compatibility than strict + def default_json_mode + # 1. Check environment variable first (allows runtime override for testing) + env_mode = ENV["LLM_JSON_MODE"] + return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode) + + # 2. Check app settings (user-configured) + setting_mode = Setting.openai_json_mode + return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode) + + # 3. Default: auto mode for all providers (tries strict first, falls back to none if needed) + JSON_MODE_AUTO end def auto_detect_merchants @@ -22,6 +61,32 @@ class Provider::Openai::AutoMerchantDetector end def instructions + if custom_provider + simple_instructions + else + detailed_instructions + end + end + + # Simplified instructions for smaller/local LLMs + def simple_instructions + <<~INSTRUCTIONS.strip_heredoc + Detect business names and websites from transaction descriptions. Return JSON only. + + Rules: + 1. Match transaction_id exactly from input + 2. Return business_name and business_url for known businesses + 3. 
Return "null" for both if uncertain or generic (e.g. "Paycheck", "Local diner") + 4. Don't include "www." in URLs (use "amazon.com" not "www.amazon.com") + 5. Favor "null" over guessing - only return values if 80%+ confident + + Example output format: + {"merchants": [{"transaction_id": "txn_001", "business_name": "Amazon", "business_url": "amazon.com"}]} + INSTRUCTIONS + end + + # Detailed instructions for larger models like GPT-4 + def detailed_instructions <<~INSTRUCTIONS.strip_heredoc You are an assistant to a consumer personal finance app. @@ -108,19 +173,64 @@ class Provider::Openai::AutoMerchantDetector end def auto_detect_merchants_openai_generic + if json_mode == JSON_MODE_AUTO + auto_detect_merchants_with_auto_mode + else + auto_detect_merchants_with_mode(json_mode) + end + rescue Faraday::BadRequestError => e + # If strict mode fails (HTTP 400), fall back to none mode + # This handles providers that don't support json_schema response format + if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO + Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}") + auto_detect_merchants_with_mode(JSON_MODE_NONE) + else + raise + end + end + + # Auto mode: try strict first, fall back to none if too many nulls or missing results + def auto_detect_merchants_with_auto_mode + result = auto_detect_merchants_with_mode(JSON_MODE_STRICT) + + # Check if too many nulls OR missing results were returned + # Models that can't reason in strict mode often: + # 1. Return null for everything, OR + # 2. Simply omit transactions they can't detect (returning fewer results than input) + null_count = result.count { |r| r.business_name.nil? || r.business_name == "null" } + missing_count = transactions.size - result.size + failed_count = null_count + missing_count + failed_ratio = transactions.size > 0 ? 
failed_count.to_f / transactions.size : 0.0 + + if failed_ratio > AUTO_MODE_NULL_THRESHOLD + Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode") + auto_detect_merchants_with_mode(JSON_MODE_NONE) + else + result + end + end + + def auto_detect_merchants_with_mode(mode) span = langfuse_trace&.span(name: "auto_detect_merchants_api_call", input: { model: model.presence || Provider::Openai::DEFAULT_MODEL, transactions: transactions, - user_merchants: user_merchants + user_merchants: user_merchants, + json_mode: mode }) - response = client.chat(parameters: { + # Build parameters with configurable JSON response format + params = { model: model.presence || Provider::Openai::DEFAULT_MODEL, messages: [ { role: "system", content: instructions }, - { role: "user", content: developer_message } - ], - response_format: { + { role: "user", content: developer_message_for_generic } + ] + } + + # Add response format based on json_mode setting + case mode + when JSON_MODE_STRICT + params[:response_format] = { type: "json_schema", json_schema: { name: "auto_detect_personal_finance_merchants", @@ -128,9 +238,14 @@ class Provider::Openai::AutoMerchantDetector schema: json_schema } } - }) + when JSON_MODE_OBJECT + params[:response_format] = { type: "json_object" } + # JSON_MODE_NONE: no response_format constraint + end - Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")}") + response = client.chat(parameters: params) + + Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})") merchants = extract_merchants_generic(response) result = build_response(merchants) @@ -141,7 +256,8 @@ class Provider::Openai::AutoMerchantDetector operation: "auto_detect_merchants", metadata: { transaction_count: transactions.size, - merchant_count: user_merchants.size + merchant_count: 
user_merchants.size, + json_mode: mode } ) @@ -154,24 +270,40 @@ class Provider::Openai::AutoMerchantDetector AutoDetectedMerchant = Provider::LlmConcept::AutoDetectedMerchant - def build_response(categorizations) - categorizations.map do |categorization| + def build_response(merchants) + merchants.map do |merchant| AutoDetectedMerchant.new( - transaction_id: categorization.dig("transaction_id"), - business_name: normalize_ai_value(categorization.dig("business_name")), - business_url: normalize_ai_value(categorization.dig("business_url")), + transaction_id: merchant.dig("transaction_id"), + business_name: normalize_merchant_value(merchant.dig("business_name")), + business_url: normalize_merchant_value(merchant.dig("business_url")), ) end end - def normalize_ai_value(ai_value) - return nil if ai_value == "null" + def normalize_merchant_value(value) + return nil if value.nil? || value == "null" || value.to_s.downcase == "null" - ai_value + # Try to match against user merchants for name normalization + if user_merchants.present? + # Try exact match first + exact_match = user_merchants.find { |m| m[:name] == value } + return exact_match[:name] if exact_match + + # Try case-insensitive match + case_match = user_merchants.find { |m| m[:name].to_s.downcase == value.to_s.downcase } + return case_match[:name] if case_match + end + + value end def extract_merchants_native(response) - raw = response.dig("output", 0, "content", 0, "text") + # Find the message output (not reasoning output) + message_output = response["output"]&.find { |o| o["type"] == "message" } + raw = message_output&.dig("content", 0, "text") + + raise Provider::Openai::Error, "No message content found in response" if raw.nil? 
+ JSON.parse(raw).dig("merchants") rescue JSON::ParserError => e raise Provider::Openai::Error, "Invalid JSON in native merchant detection: #{e.message}" @@ -179,9 +311,100 @@ class Provider::Openai::AutoMerchantDetector def extract_merchants_generic(response) raw = response.dig("choices", 0, "message", "content") - JSON.parse(raw).dig("merchants") - rescue JSON::ParserError => e - raise Provider::Openai::Error, "Invalid JSON in generic merchant detection: #{e.message}" + parsed = parse_json_flexibly(raw) + + # Handle different response formats from various LLMs + merchants = parsed.dig("merchants") || + parsed.dig("results") || + (parsed.is_a?(Array) ? parsed : nil) + + raise Provider::Openai::Error, "Could not find merchants in response" if merchants.nil? + + # Normalize field names (some LLMs use different naming) + merchants.map do |m| + { + "transaction_id" => m["transaction_id"] || m["id"] || m["txn_id"], + "business_name" => m["business_name"] || m["name"] || m["merchant_name"] || m["merchant"], + "business_url" => m["business_url"] || m["url"] || m["website"] + } + end + end + + # Flexible JSON parsing that handles common LLM output issues + def parse_json_flexibly(raw) + return {} if raw.blank? + + # Strip thinking model tags if present (e.g., ...) 
+ cleaned = strip_thinking_tags(raw) + + # Try direct parse first + JSON.parse(cleaned) + rescue JSON::ParserError + # Try multiple extraction strategies in order of preference + + # Strategy 1: Closed markdown code blocks (```json...```) + if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m + matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten + matches.reverse_each do |match| + begin + return JSON.parse(match) + rescue JSON::ParserError + next + end + end + end + + # Strategy 2: Unclosed markdown code blocks (thinking models often forget to close) + if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m + begin + return JSON.parse($1) + rescue JSON::ParserError + # Continue to next strategy + end + end + + # Strategy 3: Find JSON object with "merchants" key + if cleaned =~ /(\{"merchants"\s*:\s*\[[\s\S]*\]\s*\})/m + matches = cleaned.scan(/(\{"merchants"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten + matches.reverse_each do |match| + begin + return JSON.parse(match) + rescue JSON::ParserError + next + end + end + # Try greedy match if non-greedy failed + begin + return JSON.parse($1) + rescue JSON::ParserError + # Continue to next strategy + end + end + + # Strategy 4: Find any JSON object (last resort) + if cleaned =~ /(\{[\s\S]*\})/m + begin + return JSON.parse($1) + rescue JSON::ParserError + # Fall through to error + end + end + + raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}" + end + + # Strip thinking model tags (<think>...</think>) from response + def strip_thinking_tags(raw) + if raw.include?("<think>") + if raw =~ /<\/think>\s*([\s\S]*)/m + after_thinking = $1.strip + return after_thinking if after_thinking.present? + end + if raw =~ /<think>([\s\S]*)/m + return $1 + end + end + raw end def json_schema @@ -235,4 +458,40 @@ class Provider::Openai::AutoMerchantDetector Return "null" if you are not 80%+ confident in your answer.
MESSAGE end + + # Enhanced developer message with few-shot examples for smaller/local LLMs + def developer_message_for_generic + merchant_names = user_merchants.present? ? user_merchants.map { |m| m[:name] }.join(", ") : "(none provided)" + + <<~MESSAGE.strip_heredoc + USER'S KNOWN MERCHANTS: #{merchant_names} + + TRANSACTIONS TO ANALYZE: + #{format_transactions_simply} + + EXAMPLES of correct merchant detection: + - "AMAZON.COM*1A2B3C" → business_name: "Amazon", business_url: "amazon.com" + - "STARBUCKS STORE #9876" → business_name: "Starbucks", business_url: "starbucks.com" + - "NETFLIX.COM" → business_name: "Netflix", business_url: "netflix.com" + - "UBER *TRIP" → business_name: "Uber", business_url: "uber.com" + - "ACH WITHDRAWAL" → business_name: "null", business_url: "null" (generic) + - "LOCAL DINER" → business_name: "null", business_url: "null" (generic/unknown) + - "POS DEBIT 12345" → business_name: "null", business_url: "null" (generic) + + IMPORTANT: + - Return "null" (as a string) for BOTH name and URL if you cannot confidently identify the business + - Don't include "www." 
in URLs + - Generic descriptions like "Paycheck", "Transfer", "ATM" should return "null" + + Respond with ONLY this JSON format (no other text): + {"merchants": [{"transaction_id": "...", "business_name": "...", "business_url": "..."}]} + MESSAGE + end + + # Format transactions in a simpler, more readable way for smaller LLMs + def format_transactions_simply + transactions.map do |t| + "- ID: #{t[:id]}, Description: \"#{t[:name] || t[:description]}\"" + end.join("\n") + end end diff --git a/app/models/setting.rb b/app/models/setting.rb index 94a2bdfc1..1b5706171 100644 --- a/app/models/setting.rb +++ b/app/models/setting.rb @@ -9,6 +9,7 @@ class Setting < RailsSettings::Base field :openai_access_token, type: :string, default: ENV["OPENAI_ACCESS_TOKEN"] field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"] field :openai_model, type: :string, default: ENV["OPENAI_MODEL"] + field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"] field :brand_fetch_client_id, type: :string, default: ENV["BRAND_FETCH_CLIENT_ID"] # Provider selection diff --git a/app/views/settings/hostings/_openai_settings.html.erb b/app/views/settings/hostings/_openai_settings.html.erb index 59bfba8b1..8d6ce7e77 100644 --- a/app/views/settings/hostings/_openai_settings.html.erb +++ b/app/views/settings/hostings/_openai_settings.html.erb @@ -47,5 +47,20 @@ inputmode: "text", disabled: ENV["OPENAI_MODEL"].present?, data: { "auto-submit-form-target": "auto" } %> + + <%= form.select :openai_json_mode, + options_for_select( + [ + [t(".json_mode_auto"), ""], + [t(".json_mode_strict"), "strict"], + [t(".json_mode_none"), "none"], + [t(".json_mode_json_object"), "json_object"] + ], + Setting.openai_json_mode + ), + { label: t(".json_mode_label") }, + { disabled: ENV["LLM_JSON_MODE"].present?, + data: { "auto-submit-form-target": "auto" } } %> +

<%= t(".json_mode_help") %>

<% end %> diff --git a/config/locales/views/settings/hostings/en.yml b/config/locales/views/settings/hostings/en.yml index 1e7c83059..81d73148c 100644 --- a/config/locales/views/settings/hostings/en.yml +++ b/config/locales/views/settings/hostings/en.yml @@ -48,6 +48,12 @@ en: uri_base_placeholder: "https://api.openai.com/v1 (default)" model_label: Model (Optional) model_placeholder: "gpt-4.1 (default)" + json_mode_label: JSON Mode + json_mode_auto: Auto (recommended) + json_mode_strict: Strict (best for thinking models) + json_mode_none: None (best for standard models) + json_mode_json_object: JSON Object + json_mode_help: "Strict mode works best with thinking models (qwen-thinking, deepseek-reasoner). None mode works best with standard models (llama, mistral, gpt-oss)." title: OpenAI yahoo_finance_settings: title: Yahoo Finance diff --git a/db/eval_data/categorization_golden_v1.yml b/db/eval_data/categorization_golden_v1.yml new file mode 100644 index 000000000..586c87b9d --- /dev/null +++ b/db/eval_data/categorization_golden_v1.yml @@ -0,0 +1,1344 @@ +--- +name: categorization_golden_v1 +description: Golden dataset for transaction categorization evaluation +eval_type: categorization +version: "1.1" +metadata: + created_at: "2024-12-01" + updated_at: "2025-12-03" + source: manual_curation + notes: | + Difficulty levels: + - easy: Unambiguous merchant names, single clear category + - medium: Requires domain knowledge but has clear answer + - hard: Genuinely ambiguous, multiple reasonable interpretations + - edge_case: Should return null (generic/cryptic descriptions) + +context: + categories: + - id: "income" + name: "Income" + classification: "income" + is_subcategory: false + - id: "salary" + name: "Salary" + classification: "income" + is_subcategory: true + parent_id: "income" + - id: "food_and_drink" + name: "Food & Drink" + classification: "expense" + is_subcategory: false + - id: "restaurants" + name: "Restaurants" + classification: "expense" + 
is_subcategory: true + parent_id: "food_and_drink" + - id: "groceries" + name: "Groceries" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "coffee_shops" + name: "Coffee Shops" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "shopping" + name: "Shopping" + classification: "expense" + is_subcategory: false + - id: "clothing" + name: "Clothing" + classification: "expense" + is_subcategory: true + parent_id: "shopping" + - id: "electronics" + name: "Electronics" + classification: "expense" + is_subcategory: true + parent_id: "shopping" + - id: "transportation" + name: "Transportation" + classification: "expense" + is_subcategory: false + - id: "gas" + name: "Gas & Fuel" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "rideshare" + name: "Rideshare" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "public_transit" + name: "Public Transit" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "entertainment" + name: "Entertainment" + classification: "expense" + is_subcategory: false + - id: "streaming" + name: "Streaming Services" + classification: "expense" + is_subcategory: true + parent_id: "entertainment" + - id: "utilities" + name: "Utilities" + classification: "expense" + is_subcategory: false + - id: "housing" + name: "Housing" + classification: "expense" + is_subcategory: false + - id: "rent" + name: "Rent" + classification: "expense" + is_subcategory: true + parent_id: "housing" + - id: "health" + name: "Health & Wellness" + classification: "expense" + is_subcategory: false + - id: "pharmacy" + name: "Pharmacy" + classification: "expense" + is_subcategory: true + parent_id: "health" + - id: "gym" + name: "Gym & Fitness" + classification: "expense" + is_subcategory: true + parent_id: "health" + - id: "travel" + name: "Travel" + classification: "expense" + 
is_subcategory: false + - id: "flights" + name: "Flights" + classification: "expense" + is_subcategory: true + parent_id: "travel" + - id: "hotels" + name: "Hotels" + classification: "expense" + is_subcategory: true + parent_id: "travel" + - id: "subscriptions" + name: "Subscriptions" + classification: "expense" + is_subcategory: false + - id: "personal_care" + name: "Personal Care" + classification: "expense" + is_subcategory: false + - id: "gifts" + name: "Gifts & Donations" + classification: "expense" + is_subcategory: false + +samples: + # ============================================================================= + # EASY SAMPLES - Unambiguous merchant names with single clear category + # ============================================================================= + + # Food & Drink - Clear chain names + - id: cat_easy_001 + difficulty: easy + tags: [food_and_drink, clear_merchant] + input: + id: txn_001 + amount: 12.99 + classification: expense + description: "MCDONALD'S #12345 SPRINGFIELD IL" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_easy_002 + difficulty: easy + tags: [food_and_drink, clear_merchant] + input: + id: txn_002 + amount: 8.50 + classification: expense + description: "BURGER KING #456 NEW YORK NY" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_easy_021 + difficulty: easy + tags: [food_and_drink, clear_merchant] + input: + id: txn_061 + amount: 9.99 + classification: expense + description: "TACO BELL #789" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_easy_033 + difficulty: easy + tags: [food_and_drink, clear_merchant] + input: + id: txn_093 + amount: 14.99 + classification: expense + description: "CHIPOTLE MEXICAN GRILL" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_easy_034 + difficulty: easy + tags: [food_and_drink, 
clear_merchant] + input: + id: txn_094 + amount: 8.99 + classification: expense + description: "SUBWAY #12345" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + # Coffee Shops - Clear coffee chain names + - id: cat_easy_003 + difficulty: easy + tags: [coffee_shops, clear_merchant] + input: + id: txn_003 + amount: 5.75 + classification: expense + description: "STARBUCKS STORE #9876" + expected: + category_name: "Coffee Shops" + + - id: cat_easy_023 + difficulty: easy + tags: [coffee_shops, clear_merchant] + input: + id: txn_063 + amount: 4.25 + classification: expense + description: "DUNKIN #12345" + expected: + category_name: "Coffee Shops" + + - id: cat_easy_035 + difficulty: easy + tags: [coffee_shops, clear_merchant] + input: + id: txn_095 + amount: 6.50 + classification: expense + description: "PEETS COFFEE #456" + expected: + category_name: "Coffee Shops" + + # Groceries - Dedicated grocery stores + - id: cat_easy_004 + difficulty: easy + tags: [groceries, clear_merchant] + input: + id: txn_004 + amount: 156.32 + classification: expense + description: "WHOLE FOODS MKT #10234" + expected: + category_name: "Groceries" + + - id: cat_easy_005 + difficulty: easy + tags: [groceries, clear_merchant] + input: + id: txn_005 + amount: 87.45 + classification: expense + description: "TRADER JOE'S #567 LOS ANGELES" + expected: + category_name: "Groceries" + + - id: cat_easy_025 + difficulty: easy + tags: [groceries, clear_merchant] + input: + id: txn_065 + amount: 98.34 + classification: expense + description: "PUBLIX SUPER MARKET" + expected: + category_name: "Groceries" + + - id: cat_easy_036 + difficulty: easy + tags: [groceries, clear_merchant] + input: + id: txn_101 + amount: 67.89 + classification: expense + description: "KROGER #789 GROCERY" + expected: + category_name: "Groceries" + + # Gas & Fuel - Clear gas station names + - id: cat_easy_006 + difficulty: easy + tags: [gas, clear_merchant] + input: + id: txn_006 + amount: 
45.00 + classification: expense + description: "SHELL OIL 573849234" + expected: + category_name: "Gas & Fuel" + + - id: cat_easy_007 + difficulty: easy + tags: [gas, clear_merchant] + input: + id: txn_007 + amount: 52.30 + classification: expense + description: "CHEVRON STATION #1234" + expected: + category_name: "Gas & Fuel" + + - id: cat_easy_026 + difficulty: easy + tags: [gas, clear_merchant] + input: + id: txn_076 + amount: 48.50 + classification: expense + description: "EXXONMOBIL 12345" + expected: + category_name: "Gas & Fuel" + + - id: cat_easy_024 + difficulty: easy + tags: [gas, clear_merchant] + input: + id: txn_064 + amount: 45.67 + classification: expense + description: "KROGER FUEL CENTER #456" + expected: + category_name: "Gas & Fuel" + + # Rideshare - Clear service names + - id: cat_easy_008 + difficulty: easy + tags: [rideshare, clear_merchant] + input: + id: txn_008 + amount: 23.50 + classification: expense + description: "UBER *TRIP HELP.UBER.COM" + expected: + category_name: "Rideshare" + + - id: cat_easy_009 + difficulty: easy + tags: [rideshare, clear_merchant] + input: + id: txn_009 + amount: 18.75 + classification: expense + description: "LYFT *RIDE SAT 7PM" + expected: + category_name: "Rideshare" + + # Streaming Services - Clear streaming platforms + - id: cat_easy_010 + difficulty: easy + tags: [streaming, clear_merchant] + input: + id: txn_010 + amount: 15.99 + classification: expense + description: "NETFLIX.COM" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_easy_011 + difficulty: easy + tags: [streaming, clear_merchant] + input: + id: txn_011 + amount: 10.99 + classification: expense + description: "SPOTIFY USA" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + # Electronics - Clear electronics retailers + - id: cat_easy_012 + difficulty: easy + tags: [electronics, clear_merchant] + input: + id: txn_012 + amount: 299.99 + 
classification: expense + description: "BEST BUY 00000456" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + # Clothing - Clear clothing stores + - id: cat_easy_013 + difficulty: easy + tags: [clothing, clear_merchant] + input: + id: txn_013 + amount: 89.99 + classification: expense + description: "ZARA USA INC" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + - id: cat_easy_014 + difficulty: easy + tags: [clothing, clear_merchant] + input: + id: txn_014 + amount: 65.00 + classification: expense + description: "H&M HENNES MAURITZ" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + # Pharmacy - Clear pharmacy names + - id: cat_easy_015 + difficulty: easy + tags: [pharmacy, clear_merchant] + input: + id: txn_015 + amount: 24.99 + classification: expense + description: "CVS/PHARMACY #4567" + expected: + category_name: "Pharmacy" + + - id: cat_easy_016 + difficulty: easy + tags: [pharmacy, clear_merchant] + input: + id: txn_016 + amount: 35.50 + classification: expense + description: "WALGREENS #12345" + expected: + category_name: "Pharmacy" + acceptable_alternatives: ["Health & Wellness"] + + # Gym & Fitness - Clear gym names + - id: cat_easy_017 + difficulty: easy + tags: [gym, clear_merchant] + input: + id: txn_017 + amount: 39.99 + classification: expense + description: "PLANET FITNESS MONTHLY" + expected: + category_name: "Gym & Fitness" + acceptable_alternatives: ["Health & Wellness"] + + # Flights - Clear airline names + - id: cat_easy_018 + difficulty: easy + tags: [flights, clear_merchant] + input: + id: txn_018 + amount: 345.00 + classification: expense + description: "UNITED AIRLINES 0162345678" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + - id: cat_easy_030 + difficulty: easy + tags: [flights, clear_merchant] + input: + id: txn_080 + amount: 456.00 + classification: expense + description: "DELTA AIR LINES" + expected: 
+ category_name: "Flights" + acceptable_alternatives: ["Travel"] + + # Hotels - Clear hotel names + - id: cat_easy_019 + difficulty: easy + tags: [hotels, clear_merchant] + input: + id: txn_019 + amount: 189.00 + classification: expense + description: "MARRIOTT HOTELS NYC" + expected: + category_name: "Hotels" + + - id: cat_easy_028 + difficulty: easy + tags: [hotels, clear_merchant] + input: + id: txn_078 + amount: 245.00 + classification: expense + description: "HILTON HOTELS" + expected: + category_name: "Hotels" + + # Income - Clear payroll + - id: cat_easy_020 + difficulty: easy + tags: [income, salary, clear_merchant] + input: + id: txn_020 + amount: 3500.00 + classification: income + description: "ACME CORP PAYROLL" + expected: + category_name: "Salary" + + - id: cat_easy_031 + difficulty: easy + tags: [income, salary, clear_merchant] + input: + id: txn_086 + amount: 2800.00 + classification: income + description: "DIRECT DEPOSIT - PAYROLL" + expected: + category_name: "Salary" + + - id: cat_easy_032 + difficulty: easy + tags: [income, salary, clear_merchant] + input: + id: txn_087 + amount: 1500.00 + classification: income + description: "EMPLOYER DIRECT DEP" + expected: + category_name: "Salary" + + # ============================================================================= + # MEDIUM SAMPLES - Requires domain knowledge but has clear answer + # ============================================================================= + + # Restaurants - Sit-down restaurant chains + - id: cat_medium_001 + difficulty: medium + tags: [restaurants, chain] + input: + id: txn_021 + amount: 67.50 + classification: expense + description: "OLIVE GARDEN #456" + expected: + category_name: "Restaurants" + + - id: cat_medium_002 + difficulty: medium + tags: [restaurants, chain] + input: + id: txn_022 + amount: 85.00 + classification: expense + description: "CHEESECAKE FACTORY" + expected: + category_name: "Restaurants" + + - id: cat_medium_021 + difficulty: medium + tags: 
[restaurants, upscale] + input: + id: txn_066 + amount: 123.45 + classification: expense + description: "RUTH'S CHRIS STEAK" + expected: + category_name: "Restaurants" + + - id: cat_medium_022 + difficulty: medium + tags: [restaurants, chain] + input: + id: txn_067 + amount: 89.00 + classification: expense + description: "P.F. CHANGS #234" + expected: + category_name: "Restaurants" + + # Groceries - Warehouse stores (in-person) + - id: cat_medium_003 + difficulty: medium + tags: [groceries, warehouse] + input: + id: txn_023 + amount: 234.56 + classification: expense + description: "COSTCO WHSE #1234" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + - id: cat_medium_004 + difficulty: medium + tags: [groceries, warehouse] + input: + id: txn_024 + amount: 178.90 + classification: expense + description: "SAM'S CLUB #8765" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + # Utilities - Power companies + - id: cat_medium_005 + difficulty: medium + tags: [utilities, power] + input: + id: txn_025 + amount: 125.00 + classification: expense + description: "CON EDISON PAYMENT" + expected: + category_name: "Utilities" + + - id: cat_medium_006 + difficulty: medium + tags: [utilities, power] + input: + id: txn_026 + amount: 89.00 + classification: expense + description: "PACIFIC GAS ELEC CO" + expected: + category_name: "Utilities" + + - id: cat_medium_026 + difficulty: medium + tags: [utilities, internet] + input: + id: txn_081 + amount: 145.00 + classification: expense + description: "XFINITY INTERNET" + expected: + category_name: "Utilities" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_medium_027 + difficulty: medium + tags: [utilities, phone] + input: + id: txn_082 + amount: 89.00 + classification: expense + description: "AT&T WIRELESS" + expected: + category_name: "Utilities" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_medium_028 + difficulty: medium + tags: [utilities, 
phone] + input: + id: txn_083 + amount: 112.00 + classification: expense + description: "VERIZON WIRELESS" + expected: + category_name: "Utilities" + + # Public Transit + - id: cat_medium_007 + difficulty: medium + tags: [public_transit] + input: + id: txn_027 + amount: 127.00 + classification: expense + description: "MTA *METROCARD" + expected: + category_name: "Public Transit" + acceptable_alternatives: ["Transportation"] + + - id: cat_medium_008 + difficulty: medium + tags: [public_transit] + input: + id: txn_028 + amount: 2.75 + classification: expense + description: "WMATA SMARTRIP" + expected: + category_name: "Public Transit" + acceptable_alternatives: ["Transportation"] + + # Housing - Rent payments + - id: cat_medium_009 + difficulty: medium + tags: [rent, housing] + input: + id: txn_029 + amount: 2100.00 + classification: expense + description: "AVALON APARTMENTS RENT" + expected: + category_name: "Rent" + acceptable_alternatives: ["Housing"] + + # Subscriptions - Non-streaming + - id: cat_medium_010 + difficulty: medium + tags: [subscriptions] + input: + id: txn_030 + amount: 9.99 + classification: expense + description: "APPLE.COM/BILL" + expected: + category_name: "Subscriptions" + + - id: cat_medium_011 + difficulty: medium + tags: [subscriptions] + input: + id: txn_031 + amount: 2.99 + classification: expense + description: "GOOGLE *STORAGE" + expected: + category_name: "Subscriptions" + + # Personal Care + - id: cat_medium_012 + difficulty: medium + tags: [personal_care] + input: + id: txn_032 + amount: 45.00 + classification: expense + description: "SUPERCUTS #1234" + expected: + category_name: "Personal Care" + + - id: cat_medium_013 + difficulty: medium + tags: [personal_care] + input: + id: txn_033 + amount: 85.00 + classification: expense + description: "ULTA BEAUTY #567" + expected: + category_name: "Personal Care" + acceptable_alternatives: ["Shopping"] + + # Gifts & Donations + - id: cat_medium_014 + difficulty: medium + tags: [gifts, 
donation] + input: + id: txn_034 + amount: 50.00 + classification: expense + description: "RED CROSS DONATION" + expected: + category_name: "Gifts & Donations" + + - id: cat_medium_015 + difficulty: medium + tags: [gifts, donation] + input: + id: txn_035 + amount: 100.00 + classification: expense + description: "UNICEF USA" + expected: + category_name: "Gifts & Donations" + + # Entertainment + - id: cat_medium_016 + difficulty: medium + tags: [entertainment, movies] + input: + id: txn_036 + amount: 45.00 + classification: expense + description: "AMC THEATRES #1234" + expected: + category_name: "Entertainment" + + - id: cat_medium_017 + difficulty: medium + tags: [entertainment, tickets] + input: + id: txn_037 + amount: 89.00 + classification: expense + description: "TICKETMASTER *EVENT" + expected: + category_name: "Entertainment" + + - id: cat_medium_033 + difficulty: medium + tags: [entertainment, tickets] + input: + id: txn_096 + amount: 150.00 + classification: expense + description: "STUBHUB INC" + expected: + category_name: "Entertainment" + + - id: cat_medium_034 + difficulty: medium + tags: [entertainment, tickets] + input: + id: txn_097 + amount: 75.00 + classification: expense + description: "VIVID SEATS" + expected: + category_name: "Entertainment" + + # Travel - Car rental + - id: cat_medium_018 + difficulty: medium + tags: [travel, car_rental] + input: + id: txn_038 + amount: 156.00 + classification: expense + description: "HERTZ RENT-A-CAR" + expected: + category_name: "Travel" + acceptable_alternatives: ["Transportation"] + + # Travel - Lodging + - id: cat_medium_019 + difficulty: medium + tags: [hotels, lodging] + input: + id: txn_039 + amount: 234.00 + classification: expense + description: "AIRBNB *HMQT5J6QQJ" + expected: + category_name: "Hotels" + acceptable_alternatives: ["Travel"] + + # Streaming Services + - id: cat_medium_023 + difficulty: medium + tags: [streaming] + input: + id: txn_068 + amount: 17.99 + classification: expense + 
description: "HULU LLC" + expected: + category_name: "Streaming Services" + + - id: cat_medium_024 + difficulty: medium + tags: [streaming] + input: + id: txn_069 + amount: 13.99 + classification: expense + description: "DISNEY PLUS" + expected: + category_name: "Streaming Services" + + # Electronics - Apple Store + - id: cat_medium_025 + difficulty: medium + tags: [electronics] + input: + id: txn_070 + amount: 1299.00 + classification: expense + description: "APPLE STORE #R123" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + # Gym & Fitness + - id: cat_medium_029 + difficulty: medium + tags: [gym] + input: + id: txn_084 + amount: 29.99 + classification: expense + description: "LA FITNESS CLUB" + expected: + category_name: "Gym & Fitness" + + - id: cat_medium_030 + difficulty: medium + tags: [gym] + input: + id: txn_085 + amount: 169.00 + classification: expense + description: "ORANGETHEORY FITNESS" + expected: + category_name: "Gym & Fitness" + + # Income - P2P transfers + - id: cat_medium_020 + difficulty: medium + tags: [income, transfer] + input: + id: txn_040 + amount: 500.00 + classification: income + description: "VENMO CASHOUT" + expected: + category_name: "Income" + + - id: cat_medium_031 + difficulty: medium + tags: [income, transfer] + input: + id: txn_088 + amount: 250.00 + classification: income + description: "ZELLE FROM JOHN S" + expected: + category_name: "Income" + + - id: cat_medium_032 + difficulty: medium + tags: [income, transfer] + input: + id: txn_089 + amount: 100.00 + classification: income + description: "CASH APP*CASH OUT" + expected: + category_name: "Income" + + # ============================================================================= + # HARD SAMPLES - Genuinely ambiguous, multiple reasonable interpretations + # ============================================================================= + + # Big-box stores - Could be shopping or groceries + - id: cat_hard_001 + difficulty: hard + tags: 
[ambiguous, multi_purpose_retailer] + input: + id: txn_041 + amount: 156.78 + classification: expense + description: "TARGET #1234" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + - id: cat_hard_002 + difficulty: hard + tags: [ambiguous, multi_purpose_retailer] + input: + id: txn_042 + amount: 234.56 + classification: expense + description: "WALMART SUPERCENTER" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + # Online marketplaces - Unknown purchase type + - id: cat_hard_003 + difficulty: hard + tags: [ambiguous, online_marketplace] + input: + id: txn_043 + amount: 89.99 + classification: expense + description: "AMAZON.COM*1A2B3C4D" + expected: + category_name: "Shopping" + + # Square payments - Vague merchant names + - id: cat_hard_004 + difficulty: hard + tags: [ambiguous, square_payment] + input: + id: txn_044 + amount: 45.00 + classification: expense + description: "SQ *DOWNTOWN CAFE" + expected: + category_name: "Coffee Shops" + acceptable_alternatives: ["Restaurants"] + + # PayPal - Unknown recipient + - id: cat_hard_005 + difficulty: hard + tags: [ambiguous, payment_processor] + input: + id: txn_045 + amount: 78.00 + classification: expense + description: "PAYPAL *JOHNSMITH" + expected: + category_name: null + + # Premium gym - High price point + - id: cat_hard_006 + difficulty: hard + tags: [ambiguous, premium_gym] + input: + id: txn_046 + amount: 250.00 + classification: expense + description: "EQUINOX MEMBERSHIP" + expected: + category_name: "Gym & Fitness" + + # Streaming vs Subscription + - id: cat_hard_007 + difficulty: hard + tags: [ambiguous, streaming_subscription] + input: + id: txn_047 + amount: 15.99 + classification: expense + description: "HBO MAX" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + # Convenience store - Food vs groceries + - id: cat_hard_008 + difficulty: hard + tags: [ambiguous, convenience_store] + input: 
+ id: txn_048 + amount: 12.50 + classification: expense + description: "7-ELEVEN #34567" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Food & Drink"] + + # Pharmacy/drugstore - Could sell many things + - id: cat_hard_009 + difficulty: hard + tags: [ambiguous, drugstore] + input: + id: txn_049 + amount: 67.89 + classification: expense + description: "RITE AID #1234" + expected: + category_name: "Pharmacy" + acceptable_alternatives: ["Groceries", "Health & Wellness"] + + # Fast-casual - Restaurant or fast food? + - id: cat_hard_010 + difficulty: hard + tags: [ambiguous, fast_casual] + input: + id: txn_050 + amount: 34.50 + classification: expense + description: "PANERA BREAD #567" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + # Delivery services - Category depends on underlying merchant + - id: cat_hard_011 + difficulty: hard + tags: [ambiguous, delivery_service] + input: + id: txn_071 + amount: 45.00 + classification: expense + description: "DOORDASH*CHIPOTLE" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_hard_012 + difficulty: hard + tags: [ambiguous, delivery_service] + input: + id: txn_072 + amount: 67.00 + classification: expense + description: "GRUBHUB*THAI KITCHEN" + expected: + category_name: "Restaurants" + + - id: cat_hard_013 + difficulty: hard + tags: [ambiguous, delivery_service] + input: + id: txn_073 + amount: 234.00 + classification: expense + description: "INSTACART*SAFEWAY" + expected: + category_name: "Groceries" + + - id: cat_hard_014 + difficulty: hard + tags: [ambiguous, delivery_service] + input: + id: txn_074 + amount: 89.00 + classification: expense + description: "UBEREATS *UBER EATS" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + # Amazon Prime - Subscription vs shopping + - id: cat_hard_015 + difficulty: hard + tags: [ambiguous, amazon] + input: + id: txn_075 + amount: 14.99 
+ classification: expense + description: "AMAZON PRIME*1A2B3C" + expected: + category_name: "Subscriptions" + + # Costco online - Shopping vs groceries + - id: cat_hard_016 + difficulty: hard + tags: [ambiguous, warehouse_online] + input: + id: txn_090 + amount: 234.00 + classification: expense + description: "COSTCO.COM" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + # Online marketplaces - Handmade/vintage + - id: cat_hard_017 + difficulty: hard + tags: [ambiguous, online_marketplace] + input: + id: txn_098 + amount: 45.00 + classification: expense + description: "ETSY.COM" + expected: + category_name: "Shopping" + + # Home goods - Shopping subcategory unclear + - id: cat_hard_018 + difficulty: hard + tags: [ambiguous, home_goods] + input: + id: txn_099 + amount: 289.00 + classification: expense + description: "WAYFAIR*PURCHASE" + expected: + category_name: "Shopping" + + - id: cat_hard_019 + difficulty: hard + tags: [ambiguous, home_goods] + input: + id: txn_100 + amount: 423.00 + classification: expense + description: "IKEA US EAST LLC" + expected: + category_name: "Shopping" + + # ============================================================================= + # EDGE CASES - Should return null (generic/cryptic/ambiguous descriptions) + # ============================================================================= + + # Generic POS transactions + - id: cat_edge_001 + difficulty: edge_case + tags: [should_be_null, generic_pos] + input: + id: txn_051 + amount: 15.00 + classification: expense + description: "POS DEBIT 12345" + expected: + category_name: null + + - id: cat_edge_003 + difficulty: edge_case + tags: [should_be_null, generic_pos] + input: + id: txn_053 + amount: 50.00 + classification: expense + description: "DEBIT CARD PURCHASE" + expected: + category_name: null + + # ACH/Wire transfers - Could be anything + - id: cat_edge_002 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_052 + 
amount: 100.00 + classification: expense + description: "ACH WITHDRAWAL" + expected: + category_name: null + + - id: cat_edge_004 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_054 + amount: 500.00 + classification: expense + description: "ONLINE TRANSFER TO CHK 1234" + expected: + category_name: null + + - id: cat_edge_008 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_058 + amount: 1500.00 + classification: expense + description: "WIRE TRANSFER OUT" + expected: + category_name: null + + # ATM - Cash withdrawal, unknown purpose + - id: cat_edge_005 + difficulty: edge_case + tags: [should_be_null, atm] + input: + id: txn_055 + amount: 200.00 + classification: expense + description: "ATM WITHDRAWAL 12345" + expected: + category_name: null + + # Unknown/generic business names + - id: cat_edge_006 + difficulty: edge_case + tags: [should_be_null, unknown_merchant] + input: + id: txn_056 + amount: 75.00 + classification: expense + description: "MISC SERVICES LLC" + expected: + category_name: null + + # Reference numbers only + - id: cat_edge_007 + difficulty: edge_case + tags: [should_be_null, reference_only] + input: + id: txn_057 + amount: 234.56 + classification: expense + description: "REF #789456123" + expected: + category_name: null + + # Checks - Unknown payee + - id: cat_edge_009 + difficulty: edge_case + tags: [should_be_null, check] + input: + id: txn_059 + amount: 350.00 + classification: expense + description: "CHECK #1234" + expected: + category_name: null + + # Bank fees - Not a purchase category + - id: cat_edge_010 + difficulty: edge_case + tags: [should_be_null, fee] + input: + id: txn_060 + amount: 35.00 + classification: expense + description: "SERVICE CHARGE" + expected: + category_name: null + + # Pending/void transactions + - id: cat_edge_011 + difficulty: edge_case + tags: [should_be_null, pending] + input: + id: txn_091 + amount: 1.00 + classification: expense + description: "PENDING 
AUTHORIZATION" + expected: + category_name: null + + - id: cat_edge_012 + difficulty: edge_case + tags: [should_be_null, void] + input: + id: txn_092 + amount: 0.00 + classification: expense + description: "VOID TRANSACTION" + expected: + category_name: null + + # Cryptic abbreviations + - id: cat_edge_013 + difficulty: edge_case + tags: [should_be_null, cryptic] + input: + id: txn_102 + amount: 45.67 + classification: expense + description: "TXN*89234*AUTH" + expected: + category_name: null + + - id: cat_edge_014 + difficulty: edge_case + tags: [should_be_null, cryptic] + input: + id: txn_103 + amount: 123.45 + classification: expense + description: "PURCHASE 847392" + expected: + category_name: null diff --git a/db/eval_data/categorization_golden_v1_light.yml b/db/eval_data/categorization_golden_v1_light.yml new file mode 100644 index 000000000..fb384b2d0 --- /dev/null +++ b/db/eval_data/categorization_golden_v1_light.yml @@ -0,0 +1,769 @@ +--- +name: categorization_golden_v1_light +description: Lightweight golden dataset for quick transaction categorization evaluation +eval_type: categorization +version: "1.0" +metadata: + created_at: "2025-12-04" + updated_at: "2025-12-04" + source: manual_curation + notes: | + A compact 50-sample dataset designed for quick evaluation runs. 
+ Includes a balanced mix across: + - All difficulty levels (easy, medium, hard, edge_case) + - All major category types + - Both US and European merchants + - Representative edge cases + + Difficulty distribution: + - easy: 20 samples + - medium: 15 samples + - hard: 10 samples + - edge_case: 5 samples + +context: + categories: + - id: "income" + name: "Income" + classification: "income" + is_subcategory: false + - id: "salary" + name: "Salary" + classification: "income" + is_subcategory: true + parent_id: "income" + - id: "food_and_drink" + name: "Food & Drink" + classification: "expense" + is_subcategory: false + - id: "restaurants" + name: "Restaurants" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "fast_food" + name: "Fast Food" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "groceries" + name: "Groceries" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "coffee_shops" + name: "Coffee Shops" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "shopping" + name: "Shopping" + classification: "expense" + is_subcategory: false + - id: "clothing" + name: "Clothing" + classification: "expense" + is_subcategory: true + parent_id: "shopping" + - id: "electronics" + name: "Electronics" + classification: "expense" + is_subcategory: true + parent_id: "shopping" + - id: "transportation" + name: "Transportation" + classification: "expense" + is_subcategory: false + - id: "gas" + name: "Gas & Fuel" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "rideshare" + name: "Rideshare" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "public_transit" + name: "Public Transit" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "entertainment" + name: "Entertainment" + classification: "expense" + 
is_subcategory: false + - id: "streaming" + name: "Streaming Services" + classification: "expense" + is_subcategory: true + parent_id: "entertainment" + - id: "utilities" + name: "Utilities" + classification: "expense" + is_subcategory: false + - id: "housing" + name: "Housing" + classification: "expense" + is_subcategory: false + - id: "rent" + name: "Rent" + classification: "expense" + is_subcategory: true + parent_id: "housing" + - id: "health" + name: "Health & Wellness" + classification: "expense" + is_subcategory: false + - id: "pharmacy" + name: "Pharmacy" + classification: "expense" + is_subcategory: true + parent_id: "health" + - id: "gym" + name: "Gym & Fitness" + classification: "expense" + is_subcategory: true + parent_id: "health" + - id: "travel" + name: "Travel" + classification: "expense" + is_subcategory: false + - id: "flights" + name: "Flights" + classification: "expense" + is_subcategory: true + parent_id: "travel" + - id: "hotels" + name: "Hotels" + classification: "expense" + is_subcategory: true + parent_id: "travel" + - id: "subscriptions" + name: "Subscriptions" + classification: "expense" + is_subcategory: false + - id: "personal_care" + name: "Personal Care" + classification: "expense" + is_subcategory: false + - id: "gifts" + name: "Gifts & Donations" + classification: "expense" + is_subcategory: false + +samples: + # ============================================================================= + # EASY SAMPLES (20 samples) - Clear, unambiguous merchants + # ============================================================================= + + # Fast Food + - id: cat_light_easy_001 + difficulty: easy + tags: [fast_food, us] + input: + id: txn_light_001 + amount: 12.99 + classification: expense + description: "MCDONALD'S #12345" + expected: + category_name: "Fast Food" + + - id: cat_light_easy_002 + difficulty: easy + tags: [fast_food, us] + input: + id: txn_light_002 + amount: 14.50 + classification: expense + description: "CHIPOTLE MEXICAN 
GRILL" + expected: + category_name: "Fast Food" + + # Coffee Shops + - id: cat_light_easy_003 + difficulty: easy + tags: [coffee_shops, us] + input: + id: txn_light_003 + amount: 5.75 + classification: expense + description: "STARBUCKS STORE #9876" + expected: + category_name: "Coffee Shops" + + - id: cat_light_easy_004 + difficulty: easy + tags: [coffee_shops, europe, uk] + input: + id: txn_light_004 + amount: 4.50 + classification: expense + description: "COSTA COFFEE LTD" + expected: + category_name: "Coffee Shops" + + # Groceries + - id: cat_light_easy_005 + difficulty: easy + tags: [groceries, us] + input: + id: txn_light_005 + amount: 156.32 + classification: expense + description: "WHOLE FOODS MKT #10234" + expected: + category_name: "Groceries" + + - id: cat_light_easy_006 + difficulty: easy + tags: [groceries, europe, uk] + input: + id: txn_light_006 + amount: 87.50 + classification: expense + description: "TESCO STORES LTD" + expected: + category_name: "Groceries" + + - id: cat_light_easy_007 + difficulty: easy + tags: [groceries, europe, germany] + input: + id: txn_light_007 + amount: 78.90 + classification: expense + description: "LIDL DIENSTLEISTUNG" + expected: + category_name: "Groceries" + + # Gas & Fuel + - id: cat_light_easy_008 + difficulty: easy + tags: [gas, us] + input: + id: txn_light_008 + amount: 45.00 + classification: expense + description: "SHELL OIL 573849234" + expected: + category_name: "Gas & Fuel" + + - id: cat_light_easy_009 + difficulty: easy + tags: [gas, europe, uk] + input: + id: txn_light_009 + amount: 75.00 + classification: expense + description: "BP OIL UK LTD" + expected: + category_name: "Gas & Fuel" + + # Rideshare + - id: cat_light_easy_010 + difficulty: easy + tags: [rideshare, us] + input: + id: txn_light_010 + amount: 23.50 + classification: expense + description: "UBER *TRIP HELP.UBER.COM" + expected: + category_name: "Rideshare" + + # Streaming + - id: cat_light_easy_011 + difficulty: easy + tags: [streaming, us] + 
input: + id: txn_light_011 + amount: 15.99 + classification: expense + description: "NETFLIX.COM" + expected: + category_name: "Streaming Services" + + - id: cat_light_easy_012 + difficulty: easy + tags: [streaming, us] + input: + id: txn_light_012 + amount: 10.99 + classification: expense + description: "SPOTIFY USA" + expected: + category_name: "Streaming Services" + + # Electronics + - id: cat_light_easy_013 + difficulty: easy + tags: [electronics, us] + input: + id: txn_light_013 + amount: 299.99 + classification: expense + description: "BEST BUY 00000456" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + # Clothing + - id: cat_light_easy_014 + difficulty: easy + tags: [clothing, europe, spain] + input: + id: txn_light_014 + amount: 79.99 + classification: expense + description: "ZARA ESPANA SA" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + # Pharmacy + - id: cat_light_easy_015 + difficulty: easy + tags: [pharmacy, us] + input: + id: txn_light_015 + amount: 24.99 + classification: expense + description: "CVS/PHARMACY #4567" + expected: + category_name: "Pharmacy" + + # Flights + - id: cat_light_easy_016 + difficulty: easy + tags: [flights, us] + input: + id: txn_light_016 + amount: 345.00 + classification: expense + description: "UNITED AIRLINES 0162345678" + expected: + category_name: "Flights" + + - id: cat_light_easy_017 + difficulty: easy + tags: [flights, europe, ireland] + input: + id: txn_light_017 + amount: 89.99 + classification: expense + description: "RYANAIR DAC" + expected: + category_name: "Flights" + + # Hotels + - id: cat_light_easy_018 + difficulty: easy + tags: [hotels, us] + input: + id: txn_light_018 + amount: 189.00 + classification: expense + description: "MARRIOTT HOTELS NYC" + expected: + category_name: "Hotels" + + # Gym + - id: cat_light_easy_019 + difficulty: easy + tags: [gym, us] + input: + id: txn_light_019 + amount: 39.99 + classification: expense + 
description: "PLANET FITNESS MONTHLY" + expected: + category_name: "Gym & Fitness" + + # Income + - id: cat_light_easy_020 + difficulty: easy + tags: [income, salary, us] + input: + id: txn_light_020 + amount: 3500.00 + classification: income + description: "ACME CORP PAYROLL" + expected: + category_name: "Salary" + + # ============================================================================= + # MEDIUM SAMPLES (15 samples) - Requires domain knowledge + # ============================================================================= + + # Restaurants + - id: cat_light_med_001 + difficulty: medium + tags: [restaurants, us] + input: + id: txn_light_med_001 + amount: 67.50 + classification: expense + description: "OLIVE GARDEN #456" + expected: + category_name: "Restaurants" + + - id: cat_light_med_002 + difficulty: medium + tags: [restaurants, europe, uk] + input: + id: txn_light_med_002 + amount: 78.50 + classification: expense + description: "WAGAMAMA LTD LONDON" + expected: + category_name: "Restaurants" + + # Warehouse stores + - id: cat_light_med_003 + difficulty: medium + tags: [groceries, us, warehouse] + input: + id: txn_light_med_003 + amount: 234.56 + classification: expense + description: "COSTCO WHSE #1234" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + # Utilities + - id: cat_light_med_004 + difficulty: medium + tags: [utilities, us] + input: + id: txn_light_med_004 + amount: 125.00 + classification: expense + description: "CON EDISON PAYMENT" + expected: + category_name: "Utilities" + + - id: cat_light_med_005 + difficulty: medium + tags: [utilities, europe, uk] + input: + id: txn_light_med_005 + amount: 156.00 + classification: expense + description: "BRITISH GAS SERVICES" + expected: + category_name: "Utilities" + + - id: cat_light_med_006 + difficulty: medium + tags: [utilities, us] + input: + id: txn_light_med_006 + amount: 89.00 + classification: expense + description: "AT&T WIRELESS" + expected: + 
category_name: "Utilities" + + # Public Transit + - id: cat_light_med_007 + difficulty: medium + tags: [public_transit, us] + input: + id: txn_light_med_007 + amount: 127.00 + classification: expense + description: "MTA *METROCARD" + expected: + category_name: "Public Transit" + + - id: cat_light_med_008 + difficulty: medium + tags: [public_transit, europe, uk] + input: + id: txn_light_med_008 + amount: 156.50 + classification: expense + description: "TFL TRAVEL LONDON" + expected: + category_name: "Public Transit" + + # Housing + - id: cat_light_med_009 + difficulty: medium + tags: [rent, us] + input: + id: txn_light_med_009 + amount: 2100.00 + classification: expense + description: "AVALON APARTMENTS RENT" + expected: + category_name: "Rent" + acceptable_alternatives: ["Housing"] + + # Subscriptions + - id: cat_light_med_010 + difficulty: medium + tags: [subscriptions, us] + input: + id: txn_light_med_010 + amount: 9.99 + classification: expense + description: "APPLE.COM/BILL" + expected: + category_name: "Subscriptions" + + # Gifts & Donations + - id: cat_light_med_011 + difficulty: medium + tags: [gifts, us, donation] + input: + id: txn_light_med_011 + amount: 50.00 + classification: expense + description: "RED CROSS DONATION" + expected: + category_name: "Gifts & Donations" + + # Entertainment + - id: cat_light_med_012 + difficulty: medium + tags: [entertainment, us] + input: + id: txn_light_med_012 + amount: 89.00 + classification: expense + description: "TICKETMASTER *EVENT" + expected: + category_name: "Entertainment" + + # Travel + - id: cat_light_med_013 + difficulty: medium + tags: [hotels, us] + input: + id: txn_light_med_013 + amount: 234.00 + classification: expense + description: "AIRBNB *HMQT5J6QQJ" + expected: + category_name: "Hotels" + acceptable_alternatives: ["Travel"] + + # Personal Care + - id: cat_light_med_014 + difficulty: medium + tags: [personal_care, us] + input: + id: txn_light_med_014 + amount: 45.00 + classification: expense + 
description: "SUPERCUTS #1234" + expected: + category_name: "Personal Care" + + # Income + - id: cat_light_med_015 + difficulty: medium + tags: [income, us] + input: + id: txn_light_med_015 + amount: 500.00 + classification: income + description: "VENMO CASHOUT" + expected: + category_name: "Income" + + # ============================================================================= + # HARD SAMPLES (10 samples) - Ambiguous, multiple interpretations + # ============================================================================= + + # Big-box stores + - id: cat_light_hard_001 + difficulty: hard + tags: [ambiguous, us, multi_purpose_retailer] + input: + id: txn_light_hard_001 + amount: 156.78 + classification: expense + description: "TARGET #1234" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + - id: cat_light_hard_002 + difficulty: hard + tags: [ambiguous, europe, uk, multi_purpose_retailer] + input: + id: txn_light_hard_002 + amount: 156.00 + classification: expense + description: "MARKS & SPENCER PLC" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries", "Clothing"] + + # Online marketplaces + - id: cat_light_hard_003 + difficulty: hard + tags: [ambiguous, us, online_marketplace] + input: + id: txn_light_hard_003 + amount: 89.99 + classification: expense + description: "AMAZON.COM*1A2B3C4D" + expected: + category_name: "Shopping" + + # Payment processors (should be null) + - id: cat_light_hard_004 + difficulty: hard + tags: [ambiguous, us, payment_processor] + input: + id: txn_light_hard_004 + amount: 78.00 + classification: expense + description: "PAYPAL *JOHNSMITH" + expected: + category_name: null + + # Fast-casual + - id: cat_light_hard_005 + difficulty: hard + tags: [ambiguous, us, fast_casual] + input: + id: txn_light_hard_005 + amount: 34.50 + classification: expense + description: "PANERA BREAD #567" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Fast Food"] + + # 
Delivery services + - id: cat_light_hard_006 + difficulty: hard + tags: [ambiguous, us, delivery_service] + input: + id: txn_light_hard_006 + amount: 45.00 + classification: expense + description: "DOORDASH*CHIPOTLE" + expected: + category_name: "Fast Food" + acceptable_alternatives: ["Restaurants"] + + - id: cat_light_hard_007 + difficulty: hard + tags: [ambiguous, europe, uk, delivery_service] + input: + id: txn_light_hard_007 + amount: 32.50 + classification: expense + description: "DELIVEROO UK LTD" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Fast Food"] + + # Amazon Prime + - id: cat_light_hard_008 + difficulty: hard + tags: [ambiguous, us, amazon] + input: + id: txn_light_hard_008 + amount: 14.99 + classification: expense + description: "AMAZON PRIME*1A2B3C" + expected: + category_name: "Subscriptions" + + # Convenience store + - id: cat_light_hard_009 + difficulty: hard + tags: [ambiguous, us, convenience_store] + input: + id: txn_light_hard_009 + amount: 12.50 + classification: expense + description: "7-ELEVEN #34567" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Fast Food"] + + # Streaming vs Subscription + - id: cat_light_hard_010 + difficulty: hard + tags: [ambiguous, us, streaming_subscription] + input: + id: txn_light_hard_010 + amount: 15.99 + classification: expense + description: "HBO MAX" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + # ============================================================================= + # EDGE CASES (5 samples) - Should return null + # ============================================================================= + + # Generic POS + - id: cat_light_edge_001 + difficulty: edge_case + tags: [should_be_null, generic_pos] + input: + id: txn_light_edge_001 + amount: 15.00 + classification: expense + description: "POS DEBIT 12345" + expected: + category_name: null + + # ACH transfer + - id: cat_light_edge_002 + difficulty: 
edge_case + tags: [should_be_null, transfer] + input: + id: txn_light_edge_002 + amount: 100.00 + classification: expense + description: "ACH WITHDRAWAL" + expected: + category_name: null + + # ATM + - id: cat_light_edge_003 + difficulty: edge_case + tags: [should_be_null, atm] + input: + id: txn_light_edge_003 + amount: 200.00 + classification: expense + description: "ATM WITHDRAWAL 12345" + expected: + category_name: null + + # Check + - id: cat_light_edge_004 + difficulty: edge_case + tags: [should_be_null, check] + input: + id: txn_light_edge_004 + amount: 350.00 + classification: expense + description: "CHECK #1234" + expected: + category_name: null + + # Cryptic + - id: cat_light_edge_005 + difficulty: edge_case + tags: [should_be_null, cryptic] + input: + id: txn_light_edge_005 + amount: 45.67 + classification: expense + description: "TXN*89234*AUTH" + expected: + category_name: null diff --git a/db/eval_data/categorization_golden_v2.yml b/db/eval_data/categorization_golden_v2.yml new file mode 100644 index 000000000..71ca1d89e --- /dev/null +++ b/db/eval_data/categorization_golden_v2.yml @@ -0,0 +1,2559 @@ +--- +name: categorization_golden_v2 +description: Golden dataset for transaction categorization evaluation with US and European merchants +eval_type: categorization +version: "2.0" +metadata: + created_at: "2025-12-04" + updated_at: "2025-12-04" + source: manual_curation + notes: | + Difficulty levels: + - easy: Unambiguous merchant names, single clear category + - medium: Requires domain knowledge but has clear answer + - hard: Genuinely ambiguous, multiple reasonable interpretations + - edge_case: Should return null (generic/cryptic descriptions) + + This v2 dataset includes: + - 200 total samples (150 base + 50 challenging) + - US merchants (original) + - European merchants (UK, Germany, France, Spain, Italy, Netherlands, etc.) 
+ - Mix of international and regional brands + - Challenging samples: local businesses, abbreviations, cryptic formats + - More ambiguous cases requiring nuanced reasoning + +context: + categories: + - id: "income" + name: "Income" + classification: "income" + is_subcategory: false + - id: "salary" + name: "Salary" + classification: "income" + is_subcategory: true + parent_id: "income" + - id: "food_and_drink" + name: "Food & Drink" + classification: "expense" + is_subcategory: false + - id: "restaurants" + name: "Restaurants" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "groceries" + name: "Groceries" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "coffee_shops" + name: "Coffee Shops" + classification: "expense" + is_subcategory: true + parent_id: "food_and_drink" + - id: "shopping" + name: "Shopping" + classification: "expense" + is_subcategory: false + - id: "clothing" + name: "Clothing" + classification: "expense" + is_subcategory: true + parent_id: "shopping" + - id: "electronics" + name: "Electronics" + classification: "expense" + is_subcategory: true + parent_id: "shopping" + - id: "transportation" + name: "Transportation" + classification: "expense" + is_subcategory: false + - id: "gas" + name: "Gas & Fuel" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "rideshare" + name: "Rideshare" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "public_transit" + name: "Public Transit" + classification: "expense" + is_subcategory: true + parent_id: "transportation" + - id: "entertainment" + name: "Entertainment" + classification: "expense" + is_subcategory: false + - id: "streaming" + name: "Streaming Services" + classification: "expense" + is_subcategory: true + parent_id: "entertainment" + - id: "utilities" + name: "Utilities" + classification: "expense" + is_subcategory: false + - id: "housing" + 
name: "Housing" + classification: "expense" + is_subcategory: false + - id: "rent" + name: "Rent" + classification: "expense" + is_subcategory: true + parent_id: "housing" + - id: "health" + name: "Health & Wellness" + classification: "expense" + is_subcategory: false + - id: "pharmacy" + name: "Pharmacy" + classification: "expense" + is_subcategory: true + parent_id: "health" + - id: "gym" + name: "Gym & Fitness" + classification: "expense" + is_subcategory: true + parent_id: "health" + - id: "travel" + name: "Travel" + classification: "expense" + is_subcategory: false + - id: "flights" + name: "Flights" + classification: "expense" + is_subcategory: true + parent_id: "travel" + - id: "hotels" + name: "Hotels" + classification: "expense" + is_subcategory: true + parent_id: "travel" + - id: "subscriptions" + name: "Subscriptions" + classification: "expense" + is_subcategory: false + - id: "personal_care" + name: "Personal Care" + classification: "expense" + is_subcategory: false + - id: "gifts" + name: "Gifts & Donations" + classification: "expense" + is_subcategory: false + +samples: + # ============================================================================= + # EASY SAMPLES - US Merchants (40 samples) + # ============================================================================= + + # Food & Drink - US + - id: cat_v2_easy_001 + difficulty: easy + tags: [food_and_drink, us, clear_merchant] + input: + id: txn_v2_001 + amount: 12.99 + classification: expense + description: "MCDONALD'S #12345 SPRINGFIELD IL" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_easy_002 + difficulty: easy + tags: [food_and_drink, us, clear_merchant] + input: + id: txn_v2_002 + amount: 8.50 + classification: expense + description: "BURGER KING #456 NEW YORK NY" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_easy_003 + difficulty: easy + tags: [food_and_drink, us, 
clear_merchant] + input: + id: txn_v2_003 + amount: 9.99 + classification: expense + description: "TACO BELL #789" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_easy_004 + difficulty: easy + tags: [food_and_drink, us, clear_merchant] + input: + id: txn_v2_004 + amount: 14.99 + classification: expense + description: "CHIPOTLE MEXICAN GRILL" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_easy_005 + difficulty: easy + tags: [food_and_drink, us, clear_merchant] + input: + id: txn_v2_005 + amount: 8.99 + classification: expense + description: "WENDY'S #5678" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + # Coffee Shops - US + - id: cat_v2_easy_006 + difficulty: easy + tags: [coffee_shops, us, clear_merchant] + input: + id: txn_v2_006 + amount: 5.75 + classification: expense + description: "STARBUCKS STORE #9876" + expected: + category_name: "Coffee Shops" + + - id: cat_v2_easy_007 + difficulty: easy + tags: [coffee_shops, us, clear_merchant] + input: + id: txn_v2_007 + amount: 4.25 + classification: expense + description: "DUNKIN #12345" + expected: + category_name: "Coffee Shops" + + - id: cat_v2_easy_008 + difficulty: easy + tags: [coffee_shops, us, clear_merchant] + input: + id: txn_v2_008 + amount: 6.50 + classification: expense + description: "PEETS COFFEE #456" + expected: + category_name: "Coffee Shops" + + # Groceries - US + - id: cat_v2_easy_009 + difficulty: easy + tags: [groceries, us, clear_merchant] + input: + id: txn_v2_009 + amount: 156.32 + classification: expense + description: "WHOLE FOODS MKT #10234" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_010 + difficulty: easy + tags: [groceries, us, clear_merchant] + input: + id: txn_v2_010 + amount: 87.45 + classification: expense + description: "TRADER JOE'S #567 LOS ANGELES" + expected: + category_name: "Groceries" + + - id: 
cat_v2_easy_011 + difficulty: easy + tags: [groceries, us, clear_merchant] + input: + id: txn_v2_011 + amount: 98.34 + classification: expense + description: "PUBLIX SUPER MARKET" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_012 + difficulty: easy + tags: [groceries, us, clear_merchant] + input: + id: txn_v2_012 + amount: 67.89 + classification: expense + description: "KROGER #789 GROCERY" + expected: + category_name: "Groceries" + + # Gas & Fuel - US + - id: cat_v2_easy_013 + difficulty: easy + tags: [gas, us, clear_merchant] + input: + id: txn_v2_013 + amount: 45.00 + classification: expense + description: "SHELL OIL 573849234" + expected: + category_name: "Gas & Fuel" + + - id: cat_v2_easy_014 + difficulty: easy + tags: [gas, us, clear_merchant] + input: + id: txn_v2_014 + amount: 52.30 + classification: expense + description: "CHEVRON STATION #1234" + expected: + category_name: "Gas & Fuel" + + - id: cat_v2_easy_015 + difficulty: easy + tags: [gas, us, clear_merchant] + input: + id: txn_v2_015 + amount: 48.50 + classification: expense + description: "EXXONMOBIL 12345" + expected: + category_name: "Gas & Fuel" + + # Rideshare - US + - id: cat_v2_easy_016 + difficulty: easy + tags: [rideshare, us, clear_merchant] + input: + id: txn_v2_016 + amount: 23.50 + classification: expense + description: "UBER *TRIP HELP.UBER.COM" + expected: + category_name: "Rideshare" + + - id: cat_v2_easy_017 + difficulty: easy + tags: [rideshare, us, clear_merchant] + input: + id: txn_v2_017 + amount: 18.75 + classification: expense + description: "LYFT *RIDE SAT 7PM" + expected: + category_name: "Rideshare" + + # Streaming - US + - id: cat_v2_easy_018 + difficulty: easy + tags: [streaming, us, clear_merchant] + input: + id: txn_v2_018 + amount: 15.99 + classification: expense + description: "NETFLIX.COM" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_v2_easy_019 + difficulty: easy + tags: [streaming, us, 
clear_merchant] + input: + id: txn_v2_019 + amount: 10.99 + classification: expense + description: "SPOTIFY USA" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + # Electronics - US + - id: cat_v2_easy_020 + difficulty: easy + tags: [electronics, us, clear_merchant] + input: + id: txn_v2_020 + amount: 299.99 + classification: expense + description: "BEST BUY 00000456" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + # Clothing - US + - id: cat_v2_easy_021 + difficulty: easy + tags: [clothing, us, clear_merchant] + input: + id: txn_v2_021 + amount: 89.99 + classification: expense + description: "GAP STORE #1234" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_easy_022 + difficulty: easy + tags: [clothing, us, clear_merchant] + input: + id: txn_v2_022 + amount: 65.00 + classification: expense + description: "OLD NAVY #567" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + # Pharmacy - US + - id: cat_v2_easy_023 + difficulty: easy + tags: [pharmacy, us, clear_merchant] + input: + id: txn_v2_023 + amount: 24.99 + classification: expense + description: "CVS/PHARMACY #4567" + expected: + category_name: "Pharmacy" + + - id: cat_v2_easy_024 + difficulty: easy + tags: [pharmacy, us, clear_merchant] + input: + id: txn_v2_024 + amount: 35.50 + classification: expense + description: "WALGREENS #12345" + expected: + category_name: "Pharmacy" + acceptable_alternatives: ["Health & Wellness"] + + # Gym - US + - id: cat_v2_easy_025 + difficulty: easy + tags: [gym, us, clear_merchant] + input: + id: txn_v2_025 + amount: 39.99 + classification: expense + description: "PLANET FITNESS MONTHLY" + expected: + category_name: "Gym & Fitness" + + # Flights - US + - id: cat_v2_easy_026 + difficulty: easy + tags: [flights, us, clear_merchant] + input: + id: txn_v2_026 + amount: 345.00 + classification: expense + description: 
"UNITED AIRLINES 0162345678" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + - id: cat_v2_easy_027 + difficulty: easy + tags: [flights, us, clear_merchant] + input: + id: txn_v2_027 + amount: 456.00 + classification: expense + description: "DELTA AIR LINES" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + # Hotels - US + - id: cat_v2_easy_028 + difficulty: easy + tags: [hotels, us, clear_merchant] + input: + id: txn_v2_028 + amount: 189.00 + classification: expense + description: "MARRIOTT HOTELS NYC" + expected: + category_name: "Hotels" + + - id: cat_v2_easy_029 + difficulty: easy + tags: [hotels, us, clear_merchant] + input: + id: txn_v2_029 + amount: 245.00 + classification: expense + description: "HILTON HOTELS" + expected: + category_name: "Hotels" + + # Income - US + - id: cat_v2_easy_030 + difficulty: easy + tags: [income, salary, us, clear_merchant] + input: + id: txn_v2_030 + amount: 3500.00 + classification: income + description: "ACME CORP PAYROLL" + expected: + category_name: "Salary" + + - id: cat_v2_easy_031 + difficulty: easy + tags: [income, salary, us, clear_merchant] + input: + id: txn_v2_031 + amount: 2800.00 + classification: income + description: "DIRECT DEPOSIT - PAYROLL" + expected: + category_name: "Salary" + + # ============================================================================= + # EASY SAMPLES - European Merchants (20 samples) + # ============================================================================= + + # Food & Drink - Europe + - id: cat_v2_easy_eu_001 + difficulty: easy + tags: [food_and_drink, europe, uk, clear_merchant] + input: + id: txn_v2_eu_001 + amount: 8.99 + classification: expense + description: "NANDO'S LONDON" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_easy_eu_002 + difficulty: easy + tags: [food_and_drink, europe, uk, clear_merchant] + input: + id: txn_v2_eu_002 + amount: 6.50 + 
classification: expense + description: "GREGGS PLC" + expected: + category_name: "Food & Drink" + + - id: cat_v2_easy_eu_003 + difficulty: easy + tags: [food_and_drink, europe, germany, clear_merchant] + input: + id: txn_v2_eu_003 + amount: 7.80 + classification: expense + description: "NORDSEE GMBH BERLIN" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Food & Drink"] + + # Coffee Shops - Europe + - id: cat_v2_easy_eu_004 + difficulty: easy + tags: [coffee_shops, europe, uk, clear_merchant] + input: + id: txn_v2_eu_004 + amount: 4.50 + classification: expense + description: "COSTA COFFEE LTD" + expected: + category_name: "Coffee Shops" + + - id: cat_v2_easy_eu_005 + difficulty: easy + tags: [coffee_shops, europe, uk, clear_merchant] + input: + id: txn_v2_eu_005 + amount: 3.80 + classification: expense + description: "CAFFE NERO GROUP" + expected: + category_name: "Coffee Shops" + + - id: cat_v2_easy_eu_006 + difficulty: easy + tags: [coffee_shops, europe, netherlands, clear_merchant] + input: + id: txn_v2_eu_006 + amount: 4.20 + classification: expense + description: "STARBUCKS AMSTERDAM" + expected: + category_name: "Coffee Shops" + + # Groceries - Europe + - id: cat_v2_easy_eu_007 + difficulty: easy + tags: [groceries, europe, uk, clear_merchant] + input: + id: txn_v2_eu_007 + amount: 87.50 + classification: expense + description: "TESCO STORES LTD" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_eu_008 + difficulty: easy + tags: [groceries, europe, uk, clear_merchant] + input: + id: txn_v2_eu_008 + amount: 65.30 + classification: expense + description: "SAINSBURY'S SUPERMARKET" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_eu_009 + difficulty: easy + tags: [groceries, europe, germany, clear_merchant] + input: + id: txn_v2_eu_009 + amount: 78.90 + classification: expense + description: "LIDL DIENSTLEISTUNG" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_eu_010 + difficulty: easy + tags: 
[groceries, europe, germany, clear_merchant] + input: + id: txn_v2_eu_010 + amount: 92.40 + classification: expense + description: "ALDI SUED GMBH" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_eu_011 + difficulty: easy + tags: [groceries, europe, france, clear_merchant] + input: + id: txn_v2_eu_011 + amount: 123.50 + classification: expense + description: "CARREFOUR MARKET PARIS" + expected: + category_name: "Groceries" + + - id: cat_v2_easy_eu_012 + difficulty: easy + tags: [groceries, europe, netherlands, clear_merchant] + input: + id: txn_v2_eu_012 + amount: 67.80 + classification: expense + description: "ALBERT HEIJN BV" + expected: + category_name: "Groceries" + + # Gas & Fuel - Europe + - id: cat_v2_easy_eu_013 + difficulty: easy + tags: [gas, europe, uk, clear_merchant] + input: + id: txn_v2_eu_013 + amount: 75.00 + classification: expense + description: "BP OIL UK LTD" + expected: + category_name: "Gas & Fuel" + + - id: cat_v2_easy_eu_014 + difficulty: easy + tags: [gas, europe, france, clear_merchant] + input: + id: txn_v2_eu_014 + amount: 68.50 + classification: expense + description: "TOTAL ENERGIES PARIS" + expected: + category_name: "Gas & Fuel" + + # Flights - Europe + - id: cat_v2_easy_eu_015 + difficulty: easy + tags: [flights, europe, uk, clear_merchant] + input: + id: txn_v2_eu_015 + amount: 189.00 + classification: expense + description: "BRITISH AIRWAYS PLC" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + - id: cat_v2_easy_eu_016 + difficulty: easy + tags: [flights, europe, ireland, clear_merchant] + input: + id: txn_v2_eu_016 + amount: 89.99 + classification: expense + description: "RYANAIR DAC" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + - id: cat_v2_easy_eu_017 + difficulty: easy + tags: [flights, europe, germany, clear_merchant] + input: + id: txn_v2_eu_017 + amount: 245.00 + classification: expense + description: "LUFTHANSA AG FRANKFURT" + expected: + 
category_name: "Flights" + acceptable_alternatives: ["Travel"] + + - id: cat_v2_easy_eu_018 + difficulty: easy + tags: [flights, europe, france, clear_merchant] + input: + id: txn_v2_eu_018 + amount: 198.00 + classification: expense + description: "AIR FRANCE KLM" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + # Clothing - Europe + - id: cat_v2_easy_eu_019 + difficulty: easy + tags: [clothing, europe, spain, clear_merchant] + input: + id: txn_v2_eu_019 + amount: 79.99 + classification: expense + description: "ZARA ESPANA SA" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_easy_eu_020 + difficulty: easy + tags: [clothing, europe, sweden, clear_merchant] + input: + id: txn_v2_eu_020 + amount: 45.00 + classification: expense + description: "H&M HENNES MAURITZ AB" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + # ============================================================================= + # MEDIUM SAMPLES - US Merchants (25 samples) + # ============================================================================= + + # Restaurants - US + - id: cat_v2_med_001 + difficulty: medium + tags: [restaurants, us, chain] + input: + id: txn_v2_med_001 + amount: 67.50 + classification: expense + description: "OLIVE GARDEN #456" + expected: + category_name: "Restaurants" + + - id: cat_v2_med_002 + difficulty: medium + tags: [restaurants, us, chain] + input: + id: txn_v2_med_002 + amount: 85.00 + classification: expense + description: "CHEESECAKE FACTORY" + expected: + category_name: "Restaurants" + + - id: cat_v2_med_003 + difficulty: medium + tags: [restaurants, us, upscale] + input: + id: txn_v2_med_003 + amount: 123.45 + classification: expense + description: "RUTH'S CHRIS STEAK" + expected: + category_name: "Restaurants" + + # Groceries - Warehouse + - id: cat_v2_med_004 + difficulty: medium + tags: [groceries, us, warehouse] + input: + id: txn_v2_med_004 + 
amount: 234.56 + classification: expense + description: "COSTCO WHSE #1234" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_med_005 + difficulty: medium + tags: [groceries, us, warehouse] + input: + id: txn_v2_med_005 + amount: 178.90 + classification: expense + description: "SAM'S CLUB #8765" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + # Utilities - US + - id: cat_v2_med_006 + difficulty: medium + tags: [utilities, us, power] + input: + id: txn_v2_med_006 + amount: 125.00 + classification: expense + description: "CON EDISON PAYMENT" + expected: + category_name: "Utilities" + + - id: cat_v2_med_007 + difficulty: medium + tags: [utilities, us, power] + input: + id: txn_v2_med_007 + amount: 89.00 + classification: expense + description: "PACIFIC GAS ELEC CO" + expected: + category_name: "Utilities" + + - id: cat_v2_med_008 + difficulty: medium + tags: [utilities, us, internet] + input: + id: txn_v2_med_008 + amount: 145.00 + classification: expense + description: "XFINITY INTERNET" + expected: + category_name: "Utilities" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_v2_med_009 + difficulty: medium + tags: [utilities, us, phone] + input: + id: txn_v2_med_009 + amount: 89.00 + classification: expense + description: "AT&T WIRELESS" + expected: + category_name: "Utilities" + acceptable_alternatives: ["Subscriptions"] + + # Public Transit - US + - id: cat_v2_med_010 + difficulty: medium + tags: [public_transit, us] + input: + id: txn_v2_med_010 + amount: 127.00 + classification: expense + description: "MTA *METROCARD" + expected: + category_name: "Public Transit" + acceptable_alternatives: ["Transportation"] + + - id: cat_v2_med_011 + difficulty: medium + tags: [public_transit, us] + input: + id: txn_v2_med_011 + amount: 2.75 + classification: expense + description: "WMATA SMARTRIP" + expected: + category_name: "Public Transit" + acceptable_alternatives: 
["Transportation"] + + # Housing - US + - id: cat_v2_med_012 + difficulty: medium + tags: [rent, us, housing] + input: + id: txn_v2_med_012 + amount: 2100.00 + classification: expense + description: "AVALON APARTMENTS RENT" + expected: + category_name: "Rent" + acceptable_alternatives: ["Housing"] + + # Subscriptions - US + - id: cat_v2_med_013 + difficulty: medium + tags: [subscriptions, us] + input: + id: txn_v2_med_013 + amount: 9.99 + classification: expense + description: "APPLE.COM/BILL" + expected: + category_name: "Subscriptions" + + - id: cat_v2_med_014 + difficulty: medium + tags: [subscriptions, us] + input: + id: txn_v2_med_014 + amount: 2.99 + classification: expense + description: "GOOGLE *STORAGE" + expected: + category_name: "Subscriptions" + + # Personal Care - US + - id: cat_v2_med_015 + difficulty: medium + tags: [personal_care, us] + input: + id: txn_v2_med_015 + amount: 45.00 + classification: expense + description: "SUPERCUTS #1234" + expected: + category_name: "Personal Care" + + - id: cat_v2_med_016 + difficulty: medium + tags: [personal_care, us] + input: + id: txn_v2_med_016 + amount: 85.00 + classification: expense + description: "ULTA BEAUTY #567" + expected: + category_name: "Personal Care" + acceptable_alternatives: ["Shopping"] + + # Gifts & Donations - US + - id: cat_v2_med_017 + difficulty: medium + tags: [gifts, us, donation] + input: + id: txn_v2_med_017 + amount: 50.00 + classification: expense + description: "RED CROSS DONATION" + expected: + category_name: "Gifts & Donations" + + - id: cat_v2_med_018 + difficulty: medium + tags: [gifts, us, donation] + input: + id: txn_v2_med_018 + amount: 100.00 + classification: expense + description: "UNICEF USA" + expected: + category_name: "Gifts & Donations" + + # Entertainment - US + - id: cat_v2_med_019 + difficulty: medium + tags: [entertainment, us, movies] + input: + id: txn_v2_med_019 + amount: 45.00 + classification: expense + description: "AMC THEATRES #1234" + expected: + 
category_name: "Entertainment" + + - id: cat_v2_med_020 + difficulty: medium + tags: [entertainment, us, tickets] + input: + id: txn_v2_med_020 + amount: 89.00 + classification: expense + description: "TICKETMASTER *EVENT" + expected: + category_name: "Entertainment" + + # Travel - US + - id: cat_v2_med_021 + difficulty: medium + tags: [travel, us, car_rental] + input: + id: txn_v2_med_021 + amount: 156.00 + classification: expense + description: "HERTZ RENT-A-CAR" + expected: + category_name: "Travel" + acceptable_alternatives: ["Transportation"] + + - id: cat_v2_med_022 + difficulty: medium + tags: [hotels, us, lodging] + input: + id: txn_v2_med_022 + amount: 234.00 + classification: expense + description: "AIRBNB *HMQT5J6QQJ" + expected: + category_name: "Hotels" + acceptable_alternatives: ["Travel"] + + # Streaming - US + - id: cat_v2_med_023 + difficulty: medium + tags: [streaming, us] + input: + id: txn_v2_med_023 + amount: 17.99 + classification: expense + description: "HULU LLC" + expected: + category_name: "Streaming Services" + + - id: cat_v2_med_024 + difficulty: medium + tags: [streaming, us] + input: + id: txn_v2_med_024 + amount: 13.99 + classification: expense + description: "DISNEY PLUS" + expected: + category_name: "Streaming Services" + + # Income - US + - id: cat_v2_med_025 + difficulty: medium + tags: [income, us, transfer] + input: + id: txn_v2_med_025 + amount: 500.00 + classification: income + description: "VENMO CASHOUT" + expected: + category_name: "Income" + + # ============================================================================= + # MEDIUM SAMPLES - European Merchants (15 samples) + # ============================================================================= + + # Restaurants - Europe + - id: cat_v2_med_eu_001 + difficulty: medium + tags: [restaurants, europe, uk] + input: + id: txn_v2_med_eu_001 + amount: 78.50 + classification: expense + description: "WAGAMAMA LTD LONDON" + expected: + category_name: "Restaurants" + + - id: 
cat_v2_med_eu_002 + difficulty: medium + tags: [restaurants, europe, italy] + input: + id: txn_v2_med_eu_002 + amount: 95.00 + classification: expense + description: "RISTORANTE MILANO SRL" + expected: + category_name: "Restaurants" + + - id: cat_v2_med_eu_003 + difficulty: medium + tags: [restaurants, europe, spain] + input: + id: txn_v2_med_eu_003 + amount: 67.00 + classification: expense + description: "TELEPIZZA SAU MADRID" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + # Utilities - Europe + - id: cat_v2_med_eu_004 + difficulty: medium + tags: [utilities, europe, uk] + input: + id: txn_v2_med_eu_004 + amount: 156.00 + classification: expense + description: "BRITISH GAS SERVICES" + expected: + category_name: "Utilities" + + - id: cat_v2_med_eu_005 + difficulty: medium + tags: [utilities, europe, germany] + input: + id: txn_v2_med_eu_005 + amount: 89.00 + classification: expense + description: "VODAFONE GMBH" + expected: + category_name: "Utilities" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_v2_med_eu_006 + difficulty: medium + tags: [utilities, europe, france] + input: + id: txn_v2_med_eu_006 + amount: 112.00 + classification: expense + description: "EDF ENERGIE FRANCE" + expected: + category_name: "Utilities" + + # Public Transit - Europe + - id: cat_v2_med_eu_007 + difficulty: medium + tags: [public_transit, europe, uk] + input: + id: txn_v2_med_eu_007 + amount: 156.50 + classification: expense + description: "TFL TRAVEL LONDON" + expected: + category_name: "Public Transit" + acceptable_alternatives: ["Transportation"] + + - id: cat_v2_med_eu_008 + difficulty: medium + tags: [public_transit, europe, germany] + input: + id: txn_v2_med_eu_008 + amount: 89.00 + classification: expense + description: "DEUTSCHE BAHN AG" + expected: + category_name: "Public Transit" + acceptable_alternatives: ["Transportation", "Travel"] + + - id: cat_v2_med_eu_009 + difficulty: medium + tags: [public_transit, europe, 
france] + input: + id: txn_v2_med_eu_009 + amount: 75.00 + classification: expense + description: "SNCF VOYAGES" + expected: + category_name: "Public Transit" + acceptable_alternatives: ["Transportation", "Travel"] + + # Entertainment - Europe + - id: cat_v2_med_eu_010 + difficulty: medium + tags: [entertainment, europe, uk] + input: + id: txn_v2_med_eu_010 + amount: 24.00 + classification: expense + description: "ODEON CINEMAS LTD" + expected: + category_name: "Entertainment" + + - id: cat_v2_med_eu_011 + difficulty: medium + tags: [entertainment, europe, uk] + input: + id: txn_v2_med_eu_011 + amount: 145.00 + classification: expense + description: "TICKETMASTER UK LTD" + expected: + category_name: "Entertainment" + + # Gym - Europe + - id: cat_v2_med_eu_012 + difficulty: medium + tags: [gym, europe, uk] + input: + id: txn_v2_med_eu_012 + amount: 35.00 + classification: expense + description: "PUREGYM LTD" + expected: + category_name: "Gym & Fitness" + acceptable_alternatives: ["Health & Wellness"] + + - id: cat_v2_med_eu_013 + difficulty: medium + tags: [gym, europe, germany] + input: + id: txn_v2_med_eu_013 + amount: 29.99 + classification: expense + description: "MCFIT GMBH BERLIN" + expected: + category_name: "Gym & Fitness" + acceptable_alternatives: ["Health & Wellness"] + + # Income - Europe + - id: cat_v2_med_eu_014 + difficulty: medium + tags: [income, salary, europe, uk] + input: + id: txn_v2_med_eu_014 + amount: 2850.00 + classification: income + description: "ACME LTD SALARY" + expected: + category_name: "Salary" + + - id: cat_v2_med_eu_015 + difficulty: medium + tags: [income, salary, europe, germany] + input: + id: txn_v2_med_eu_015 + amount: 3200.00 + classification: income + description: "GEHALT FIRMA GMBH" + expected: + category_name: "Salary" + + # ============================================================================= + # HARD SAMPLES - US Merchants (15 samples) + # 
============================================================================= + + # Big-box stores + - id: cat_v2_hard_001 + difficulty: hard + tags: [ambiguous, us, multi_purpose_retailer] + input: + id: txn_v2_hard_001 + amount: 156.78 + classification: expense + description: "TARGET #1234" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + - id: cat_v2_hard_002 + difficulty: hard + tags: [ambiguous, us, multi_purpose_retailer] + input: + id: txn_v2_hard_002 + amount: 234.56 + classification: expense + description: "WALMART SUPERCENTER" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + # Online marketplaces + - id: cat_v2_hard_003 + difficulty: hard + tags: [ambiguous, us, online_marketplace] + input: + id: txn_v2_hard_003 + amount: 89.99 + classification: expense + description: "AMAZON.COM*1A2B3C4D" + expected: + category_name: "Shopping" + + # Square payments + - id: cat_v2_hard_004 + difficulty: hard + tags: [ambiguous, us, square_payment] + input: + id: txn_v2_hard_004 + amount: 45.00 + classification: expense + description: "SQ *DOWNTOWN CAFE" + expected: + category_name: "Coffee Shops" + acceptable_alternatives: ["Restaurants"] + + # PayPal + - id: cat_v2_hard_005 + difficulty: hard + tags: [ambiguous, us, payment_processor] + input: + id: txn_v2_hard_005 + amount: 78.00 + classification: expense + description: "PAYPAL *JOHNSMITH" + expected: + category_name: null + + # Fast-casual + - id: cat_v2_hard_006 + difficulty: hard + tags: [ambiguous, us, fast_casual] + input: + id: txn_v2_hard_006 + amount: 34.50 + classification: expense + description: "PANERA BREAD #567" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + # Delivery services + - id: cat_v2_hard_007 + difficulty: hard + tags: [ambiguous, us, delivery_service] + input: + id: txn_v2_hard_007 + amount: 45.00 + classification: expense + description: "DOORDASH*CHIPOTLE" + expected: + 
category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants", "Food & Drink"] + + - id: cat_v2_hard_008 + difficulty: hard + tags: [ambiguous, us, delivery_service] + input: + id: txn_v2_hard_008 + amount: 67.00 + classification: expense + description: "GRUBHUB*THAI KITCHEN" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_hard_009 + difficulty: hard + tags: [ambiguous, us, delivery_service] + input: + id: txn_v2_hard_009 + amount: 234.00 + classification: expense + description: "INSTACART*SAFEWAY" + expected: + category_name: "Groceries" + + # Amazon Prime + - id: cat_v2_hard_010 + difficulty: hard + tags: [ambiguous, us, amazon] + input: + id: txn_v2_hard_010 + amount: 14.99 + classification: expense + description: "AMAZON PRIME*1A2B3C" + expected: + category_name: "Subscriptions" + + # Convenience store + - id: cat_v2_hard_011 + difficulty: hard + tags: [ambiguous, us, convenience_store] + input: + id: txn_v2_hard_011 + amount: 12.50 + classification: expense + description: "7-ELEVEN #34567" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Food & Drink"] + + # Premium gym + - id: cat_v2_hard_012 + difficulty: hard + tags: [ambiguous, us, premium_gym] + input: + id: txn_v2_hard_012 + amount: 250.00 + classification: expense + description: "EQUINOX MEMBERSHIP" + expected: + category_name: "Gym & Fitness" + + # Streaming vs Subscription + - id: cat_v2_hard_013 + difficulty: hard + tags: [ambiguous, us, streaming_subscription] + input: + id: txn_v2_hard_013 + amount: 15.99 + classification: expense + description: "HBO MAX" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + # Etsy + - id: cat_v2_hard_014 + difficulty: hard + tags: [ambiguous, us, online_marketplace] + input: + id: txn_v2_hard_014 + amount: 45.00 + classification: expense + description: "ETSY.COM" + expected: + category_name: "Shopping" + + # IKEA + - id: 
cat_v2_hard_015 + difficulty: hard + tags: [ambiguous, us, home_goods] + input: + id: txn_v2_hard_015 + amount: 423.00 + classification: expense + description: "IKEA US EAST LLC" + expected: + category_name: "Shopping" + + # ============================================================================= + # HARD SAMPLES - European Merchants (10 samples) + # ============================================================================= + + # Multi-purpose retailers - Europe + - id: cat_v2_hard_eu_001 + difficulty: hard + tags: [ambiguous, europe, uk, multi_purpose_retailer] + input: + id: txn_v2_hard_eu_001 + amount: 156.00 + classification: expense + description: "MARKS & SPENCER PLC" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries", "Clothing"] + + - id: cat_v2_hard_eu_002 + difficulty: hard + tags: [ambiguous, europe, uk, multi_purpose_retailer] + input: + id: txn_v2_hard_eu_002 + amount: 89.50 + classification: expense + description: "ASDA STORES LTD" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_hard_eu_003 + difficulty: hard + tags: [ambiguous, europe, france, multi_purpose_retailer] + input: + id: txn_v2_hard_eu_003 + amount: 234.00 + classification: expense + description: "AUCHAN HYPERMARCHE" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + # Delivery - Europe + - id: cat_v2_hard_eu_004 + difficulty: hard + tags: [ambiguous, europe, uk, delivery_service] + input: + id: txn_v2_hard_eu_004 + amount: 32.50 + classification: expense + description: "DELIVEROO UK LTD" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_hard_eu_005 + difficulty: hard + tags: [ambiguous, europe, germany, delivery_service] + input: + id: txn_v2_hard_eu_005 + amount: 28.90 + classification: expense + description: "LIEFERANDO GMBH" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + 
# Online marketplaces - Europe + - id: cat_v2_hard_eu_006 + difficulty: hard + tags: [ambiguous, europe, online_marketplace] + input: + id: txn_v2_hard_eu_006 + amount: 67.00 + classification: expense + description: "AMAZON.CO.UK" + expected: + category_name: "Shopping" + + - id: cat_v2_hard_eu_007 + difficulty: hard + tags: [ambiguous, europe, germany, online_marketplace] + input: + id: txn_v2_hard_eu_007 + amount: 123.00 + classification: expense + description: "ZALANDO SE" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + # Payment processors - Europe + - id: cat_v2_hard_eu_008 + difficulty: hard + tags: [ambiguous, europe, payment_processor] + input: + id: txn_v2_hard_eu_008 + amount: 45.00 + classification: expense + description: "PAYPAL EUROPE" + expected: + category_name: null + + - id: cat_v2_hard_eu_009 + difficulty: hard + tags: [ambiguous, europe, uk, payment_processor] + input: + id: txn_v2_hard_eu_009 + amount: 89.00 + classification: expense + description: "KLARNA UK LTD" + expected: + category_name: null + + # Pharmacy/Drugstore - Europe + - id: cat_v2_hard_eu_010 + difficulty: hard + tags: [ambiguous, europe, uk, drugstore] + input: + id: txn_v2_hard_eu_010 + amount: 34.50 + classification: expense + description: "BOOTS UK LTD" + expected: + category_name: "Pharmacy" + acceptable_alternatives: ["Personal Care", "Health & Wellness"] + + # ============================================================================= + # EDGE CASES - Should return null (15 samples) + # ============================================================================= + + # Generic POS transactions + - id: cat_v2_edge_001 + difficulty: edge_case + tags: [should_be_null, generic_pos] + input: + id: txn_v2_edge_001 + amount: 15.00 + classification: expense + description: "POS DEBIT 12345" + expected: + category_name: null + + - id: cat_v2_edge_002 + difficulty: edge_case + tags: [should_be_null, generic_pos] + input: + id: txn_v2_edge_002 + 
amount: 50.00 + classification: expense + description: "DEBIT CARD PURCHASE" + expected: + category_name: null + + - id: cat_v2_edge_003 + difficulty: edge_case + tags: [should_be_null, generic_pos, europe] + input: + id: txn_v2_edge_003 + amount: 45.00 + classification: expense + description: "CARTE BANCAIRE" + expected: + category_name: null + + # ACH/Wire transfers + - id: cat_v2_edge_004 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_v2_edge_004 + amount: 100.00 + classification: expense + description: "ACH WITHDRAWAL" + expected: + category_name: null + + - id: cat_v2_edge_005 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_v2_edge_005 + amount: 500.00 + classification: expense + description: "ONLINE TRANSFER TO CHK 1234" + expected: + category_name: null + + - id: cat_v2_edge_006 + difficulty: edge_case + tags: [should_be_null, transfer, europe] + input: + id: txn_v2_edge_006 + amount: 250.00 + classification: expense + description: "SEPA TRANSFER" + expected: + category_name: null + + - id: cat_v2_edge_007 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_v2_edge_007 + amount: 1500.00 + classification: expense + description: "WIRE TRANSFER OUT" + expected: + category_name: null + + # ATM + - id: cat_v2_edge_008 + difficulty: edge_case + tags: [should_be_null, atm] + input: + id: txn_v2_edge_008 + amount: 200.00 + classification: expense + description: "ATM WITHDRAWAL 12345" + expected: + category_name: null + + - id: cat_v2_edge_009 + difficulty: edge_case + tags: [should_be_null, atm, europe] + input: + id: txn_v2_edge_009 + amount: 150.00 + classification: expense + description: "GELDAUTOMAT ABHEBUNG" + expected: + category_name: null + + # Unknown/generic business names + - id: cat_v2_edge_010 + difficulty: edge_case + tags: [should_be_null, unknown_merchant] + input: + id: txn_v2_edge_010 + amount: 75.00 + classification: expense + description: "MISC SERVICES LLC" + 
expected: + category_name: null + + # Reference numbers only + - id: cat_v2_edge_011 + difficulty: edge_case + tags: [should_be_null, reference_only] + input: + id: txn_v2_edge_011 + amount: 234.56 + classification: expense + description: "REF #789456123" + expected: + category_name: null + + # Checks + - id: cat_v2_edge_012 + difficulty: edge_case + tags: [should_be_null, check] + input: + id: txn_v2_edge_012 + amount: 350.00 + classification: expense + description: "CHECK #1234" + expected: + category_name: null + + # Bank fees + - id: cat_v2_edge_013 + difficulty: edge_case + tags: [should_be_null, fee] + input: + id: txn_v2_edge_013 + amount: 35.00 + classification: expense + description: "SERVICE CHARGE" + expected: + category_name: null + + # Cryptic abbreviations + - id: cat_v2_edge_014 + difficulty: edge_case + tags: [should_be_null, cryptic] + input: + id: txn_v2_edge_014 + amount: 45.67 + classification: expense + description: "TXN*89234*AUTH" + expected: + category_name: null + + - id: cat_v2_edge_015 + difficulty: edge_case + tags: [should_be_null, cryptic] + input: + id: txn_v2_edge_015 + amount: 123.45 + classification: expense + description: "PURCHASE 847392" + expected: + category_name: null + + # ============================================================================= + # ADDITIONAL SAMPLES - Mixed regions and categories (19 samples to reach 150) + # ============================================================================= + + # Additional US Easy samples + - id: cat_v2_add_001 + difficulty: easy + tags: [food_and_drink, us] + input: + id: txn_v2_add_001 + amount: 11.50 + classification: expense + description: "CHICK-FIL-A #1234" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_add_002 + difficulty: easy + tags: [food_and_drink, us] + input: + id: txn_v2_add_002 + amount: 7.99 + classification: expense + description: "POPEYES #5678" + expected: + category_name: "Food & Drink" + 
acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_add_003 + difficulty: easy + tags: [groceries, us] + input: + id: txn_v2_add_003 + amount: 134.50 + classification: expense + description: "SAFEWAY #1234" + expected: + category_name: "Groceries" + + - id: cat_v2_add_004 + difficulty: easy + tags: [gas, us] + input: + id: txn_v2_add_004 + amount: 55.00 + classification: expense + description: "COSTCO GAS #789" + expected: + category_name: "Gas & Fuel" + + # Additional European Easy samples + - id: cat_v2_add_005 + difficulty: easy + tags: [groceries, europe, spain] + input: + id: txn_v2_add_005 + amount: 89.00 + classification: expense + description: "MERCADONA SA" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_add_006 + difficulty: easy + tags: [groceries, europe, italy] + input: + id: txn_v2_add_006 + amount: 67.50 + classification: expense + description: "ESSELUNGA SPA MILANO" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_add_007 + difficulty: easy + tags: [flights, europe, spain] + input: + id: txn_v2_add_007 + amount: 156.00 + classification: expense + description: "VUELING AIRLINES SA" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + - id: cat_v2_add_008 + difficulty: easy + tags: [flights, europe, netherlands] + input: + id: txn_v2_add_008 + amount: 234.00 + classification: expense + description: "KLM ROYAL DUTCH" + expected: + category_name: "Flights" + acceptable_alternatives: ["Travel"] + + # Additional Medium samples + - id: cat_v2_add_009 + difficulty: medium + tags: [restaurants, us] + input: + id: txn_v2_add_009 + amount: 56.00 + classification: expense + description: "APPLEBEES #789" + expected: + category_name: "Restaurants" + + - id: cat_v2_add_010 + difficulty: medium + tags: [restaurants, us] + input: + id: txn_v2_add_010 + amount: 78.50 + classification: expense + description: "RED LOBSTER #456" + 
expected: + category_name: "Restaurants" + + - id: cat_v2_add_011 + difficulty: medium + tags: [subscriptions, us] + input: + id: txn_v2_add_011 + amount: 14.99 + classification: expense + description: "MICROSOFT *OFFICE365" + expected: + category_name: "Subscriptions" + + - id: cat_v2_add_012 + difficulty: medium + tags: [subscriptions, us] + input: + id: txn_v2_add_012 + amount: 11.99 + classification: expense + description: "ADOBE CREATIVE CLOUD" + expected: + category_name: "Subscriptions" + + - id: cat_v2_add_013 + difficulty: medium + tags: [personal_care, europe, uk] + input: + id: txn_v2_add_013 + amount: 35.00 + classification: expense + description: "SUPERDRUG STORES" + expected: + category_name: "Personal Care" + acceptable_alternatives: ["Pharmacy"] + + # Additional Hard samples + - id: cat_v2_add_014 + difficulty: hard + tags: [ambiguous, us, delivery_service] + input: + id: txn_v2_add_014 + amount: 156.00 + classification: expense + description: "INSTACART*COSTCO" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_add_015 + difficulty: hard + tags: [ambiguous, europe, spain, delivery_service] + input: + id: txn_v2_add_015 + amount: 45.00 + classification: expense + description: "GLOVO APP BARCELONA" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Groceries", "Food & Drink"] + + - id: cat_v2_add_016 + difficulty: hard + tags: [ambiguous, europe, poland, multi_purpose_retailer] + input: + id: txn_v2_add_016 + amount: 178.00 + classification: expense + description: "BIEDRONKA SP ZOO" + expected: + category_name: "Groceries" + + # Additional Edge cases + - id: cat_v2_add_017 + difficulty: edge_case + tags: [should_be_null, europe, generic] + input: + id: txn_v2_add_017 + amount: 89.00 + classification: expense + description: "VIREMENT SEPA" + expected: + category_name: null + + - id: cat_v2_add_018 + difficulty: edge_case + tags: [should_be_null, generic] + input: + id: txn_v2_add_018 
+ amount: 25.00 + classification: expense + description: "RECURRING PAYMENT" + expected: + category_name: null + + - id: cat_v2_add_019 + difficulty: edge_case + tags: [should_be_null, europe, uk, generic] + input: + id: txn_v2_add_019 + amount: 15.00 + classification: expense + description: "DIRECT DEBIT PAYMENT" + expected: + category_name: null + + # ============================================================================= + # CHALLENGING SAMPLES - Local businesses, abbreviations, ambiguous + # ============================================================================= + + # Local/Unknown businesses - Hard to categorize without context + - id: cat_v2_challenge_001 + difficulty: hard + tags: [local_business, ambiguous] + input: + id: txn_v2_ch_001 + amount: 45.00 + classification: expense + description: "MIKE'S PLACE" + expected: + category_name: null + + - id: cat_v2_challenge_002 + difficulty: hard + tags: [local_business, ambiguous] + input: + id: txn_v2_ch_002 + amount: 67.50 + classification: expense + description: "THE CORNER SPOT LLC" + expected: + category_name: null + + - id: cat_v2_challenge_003 + difficulty: hard + tags: [local_business, ambiguous] + input: + id: txn_v2_ch_003 + amount: 23.99 + classification: expense + description: "MAIN ST MARKET" + expected: + category_name: "Groceries" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_challenge_004 + difficulty: hard + tags: [local_business, ambiguous] + input: + id: txn_v2_ch_004 + amount: 89.00 + classification: expense + description: "DOWNTOWN GRILL & BAR" + expected: + category_name: "Restaurants" + + - id: cat_v2_challenge_005 + difficulty: hard + tags: [local_business, ambiguous] + input: + id: txn_v2_ch_005 + amount: 15.00 + classification: expense + description: "JAVA JOE'S" + expected: + category_name: "Coffee Shops" + acceptable_alternatives: ["Restaurants"] + + # Abbreviated/truncated merchant names + - id: cat_v2_challenge_006 + difficulty: hard + tags: [abbreviated, 
ambiguous] + input: + id: txn_v2_ch_006 + amount: 34.50 + classification: expense + description: "AMZN MKTP US*2K9X7Y" + expected: + category_name: "Shopping" + + - id: cat_v2_challenge_007 + difficulty: hard + tags: [abbreviated, ambiguous] + input: + id: txn_v2_ch_007 + amount: 12.99 + classification: expense + description: "WM SUPERCENTER #" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + - id: cat_v2_challenge_008 + difficulty: hard + tags: [abbreviated, ambiguous] + input: + id: txn_v2_ch_008 + amount: 8.50 + classification: expense + description: "SBUX 12345" + expected: + category_name: "Coffee Shops" + + - id: cat_v2_challenge_009 + difficulty: hard + tags: [abbreviated, ambiguous] + input: + id: txn_v2_ch_009 + amount: 156.00 + classification: expense + description: "TGT*" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + - id: cat_v2_challenge_010 + difficulty: hard + tags: [abbreviated, ambiguous] + input: + id: txn_v2_ch_010 + amount: 45.00 + classification: expense + description: "SQ *JOE SMITH" + expected: + category_name: null + + # Multiple category signals - genuinely ambiguous + - id: cat_v2_challenge_011 + difficulty: hard + tags: [multi_signal, ambiguous] + input: + id: txn_v2_ch_011 + amount: 234.00 + classification: expense + description: "AMAZON FRESH" + expected: + category_name: "Groceries" + + - id: cat_v2_challenge_012 + difficulty: hard + tags: [multi_signal, ambiguous] + input: + id: txn_v2_ch_012 + amount: 45.00 + classification: expense + description: "TARGET.COM" + expected: + category_name: "Shopping" + acceptable_alternatives: ["Groceries"] + + - id: cat_v2_challenge_013 + difficulty: hard + tags: [multi_signal, ambiguous] + input: + id: txn_v2_ch_013 + amount: 67.00 + classification: expense + description: "WALGREENS PHARMACY" + expected: + category_name: "Pharmacy" + acceptable_alternatives: ["Health & Wellness", "Groceries"] + + - id: cat_v2_challenge_014 + 
difficulty: hard + tags: [multi_signal, ambiguous] + input: + id: txn_v2_ch_014 + amount: 23.00 + classification: expense + description: "CVS STORE" + expected: + category_name: "Pharmacy" + acceptable_alternatives: ["Health & Wellness", "Groceries"] + + # Numeric/cryptic descriptions + - id: cat_v2_challenge_015 + difficulty: edge_case + tags: [cryptic, should_be_null] + input: + id: txn_v2_ch_015 + amount: 78.00 + classification: expense + description: "12345678901234" + expected: + category_name: null + + - id: cat_v2_challenge_016 + difficulty: edge_case + tags: [cryptic, should_be_null] + input: + id: txn_v2_ch_016 + amount: 150.00 + classification: expense + description: "PMT*AUTH*9876" + expected: + category_name: null + + - id: cat_v2_challenge_017 + difficulty: edge_case + tags: [cryptic, should_be_null] + input: + id: txn_v2_ch_017 + amount: 99.00 + classification: expense + description: "CHECKCARD 0423" + expected: + category_name: null + + - id: cat_v2_challenge_018 + difficulty: edge_case + tags: [cryptic, should_be_null] + input: + id: txn_v2_ch_018 + amount: 200.00 + classification: expense + description: "EXTERNAL WITHDRAWAL" + expected: + category_name: null + + # Similar names, different categories + - id: cat_v2_challenge_019 + difficulty: hard + tags: [similar_names, ambiguous] + input: + id: txn_v2_ch_019 + amount: 45.00 + classification: expense + description: "APPLE STORE R123" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_challenge_020 + difficulty: hard + tags: [similar_names, ambiguous] + input: + id: txn_v2_ch_020 + amount: 2.99 + classification: expense + description: "APPLE.COM BILL" + expected: + category_name: "Subscriptions" + + - id: cat_v2_challenge_021 + difficulty: hard + tags: [similar_names, ambiguous] + input: + id: txn_v2_ch_021 + amount: 0.99 + classification: expense + description: "GOOGLE PLAY" + expected: + category_name: "Subscriptions" + acceptable_alternatives: 
["Shopping"] + + - id: cat_v2_challenge_022 + difficulty: hard + tags: [similar_names, ambiguous] + input: + id: txn_v2_ch_022 + amount: 1299.00 + classification: expense + description: "GOOGLE STORE" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + # International formats + - id: cat_v2_challenge_023 + difficulty: hard + tags: [international, europe] + input: + id: txn_v2_ch_023 + amount: 45.00 + classification: expense + description: "REWE MARKT GMBH" + expected: + category_name: "Groceries" + + - id: cat_v2_challenge_024 + difficulty: hard + tags: [international, europe] + input: + id: txn_v2_ch_024 + amount: 89.00 + classification: expense + description: "MEDIAMARKT SATURN" + expected: + category_name: "Electronics" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_challenge_025 + difficulty: hard + tags: [international, europe] + input: + id: txn_v2_ch_025 + amount: 34.00 + classification: expense + description: "PRIMARK STORES LTD" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + - id: cat_v2_challenge_026 + difficulty: hard + tags: [international, asia] + input: + id: txn_v2_ch_026 + amount: 78.00 + classification: expense + description: "UNIQLO CO LTD" + expected: + category_name: "Clothing" + acceptable_alternatives: ["Shopping"] + + # Delivery with ambiguous underlying merchant + - id: cat_v2_challenge_027 + difficulty: hard + tags: [delivery, ambiguous] + input: + id: txn_v2_ch_027 + amount: 45.00 + classification: expense + description: "DOORDASH*" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_challenge_028 + difficulty: hard + tags: [delivery, ambiguous] + input: + id: txn_v2_ch_028 + amount: 89.00 + classification: expense + description: "UBER EATS" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_challenge_029 + difficulty: hard + tags: [delivery, ambiguous] + 
input: + id: txn_v2_ch_029 + amount: 123.00 + classification: expense + description: "INSTACART" + expected: + category_name: "Groceries" + + # Gym/Fitness edge cases + - id: cat_v2_challenge_030 + difficulty: hard + tags: [gym, ambiguous] + input: + id: txn_v2_ch_030 + amount: 15.00 + classification: expense + description: "CLASSPASS INC" + expected: + category_name: "Gym & Fitness" + acceptable_alternatives: ["Health & Wellness", "Subscriptions"] + + # Streaming vs Subscription edge cases + - id: cat_v2_challenge_031 + difficulty: hard + tags: [streaming, subscription, ambiguous] + input: + id: txn_v2_ch_031 + amount: 6.99 + classification: expense + description: "AMAZON VIDEO" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_v2_challenge_032 + difficulty: hard + tags: [streaming, subscription, ambiguous] + input: + id: txn_v2_ch_032 + amount: 9.99 + classification: expense + description: "YOUTUBE PREMIUM" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + - id: cat_v2_challenge_033 + difficulty: hard + tags: [streaming, subscription, ambiguous] + input: + id: txn_v2_ch_033 + amount: 14.99 + classification: expense + description: "APPLE TV+" + expected: + category_name: "Streaming Services" + acceptable_alternatives: ["Subscriptions"] + + # P2P and transfer ambiguity + - id: cat_v2_challenge_034 + difficulty: edge_case + tags: [p2p, should_be_null] + input: + id: txn_v2_ch_034 + amount: 50.00 + classification: expense + description: "VENMO *JOHN DOE" + expected: + category_name: null + + - id: cat_v2_challenge_035 + difficulty: edge_case + tags: [p2p, should_be_null] + input: + id: txn_v2_ch_035 + amount: 100.00 + classification: expense + description: "ZELLE PAYMENT TO" + expected: + category_name: null + + - id: cat_v2_challenge_036 + difficulty: medium + tags: [p2p, income] + input: + id: txn_v2_ch_036 + amount: 200.00 + classification: income + 
description: "VENMO *PAYMENT FROM" + expected: + category_name: "Income" + + # Food-related ambiguity + - id: cat_v2_challenge_037 + difficulty: hard + tags: [food, ambiguous] + input: + id: txn_v2_ch_037 + amount: 12.00 + classification: expense + description: "FIVE GUYS #1234" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_challenge_038 + difficulty: hard + tags: [food, ambiguous] + input: + id: txn_v2_ch_038 + amount: 45.00 + classification: expense + description: "SHAKE SHACK" + expected: + category_name: "Food & Drink" + acceptable_alternatives: ["Restaurants"] + + - id: cat_v2_challenge_039 + difficulty: hard + tags: [food, ambiguous] + input: + id: txn_v2_ch_039 + amount: 8.00 + classification: expense + description: "SWEETGREEN" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + - id: cat_v2_challenge_040 + difficulty: hard + tags: [food, ambiguous] + input: + id: txn_v2_ch_040 + amount: 23.00 + classification: expense + description: "CAVA GRILL" + expected: + category_name: "Restaurants" + acceptable_alternatives: ["Food & Drink"] + + # Hotel/Travel edge cases + - id: cat_v2_challenge_041 + difficulty: hard + tags: [travel, ambiguous] + input: + id: txn_v2_ch_041 + amount: 89.00 + classification: expense + description: "BOOKING.COM" + expected: + category_name: "Hotels" + acceptable_alternatives: ["Travel"] + + - id: cat_v2_challenge_042 + difficulty: hard + tags: [travel, ambiguous] + input: + id: txn_v2_ch_042 + amount: 156.00 + classification: expense + description: "EXPEDIA INC" + expected: + category_name: "Travel" + acceptable_alternatives: ["Hotels", "Flights"] + + - id: cat_v2_challenge_043 + difficulty: hard + tags: [travel, ambiguous] + input: + id: txn_v2_ch_043 + amount: 234.00 + classification: expense + description: "VRBO.COM" + expected: + category_name: "Hotels" + acceptable_alternatives: ["Travel"] + + # Gas station convenience purchases + - 
id: cat_v2_challenge_044 + difficulty: hard + tags: [gas, convenience, ambiguous] + input: + id: txn_v2_ch_044 + amount: 8.50 + classification: expense + description: "SHELL SERVICE STATION" + expected: + category_name: "Gas & Fuel" + acceptable_alternatives: ["Groceries"] + + - id: cat_v2_challenge_045 + difficulty: hard + tags: [gas, convenience, ambiguous] + input: + id: txn_v2_ch_045 + amount: 12.00 + classification: expense + description: "SPEEDWAY" + expected: + category_name: "Gas & Fuel" + acceptable_alternatives: ["Groceries"] + + # Income edge cases + - id: cat_v2_challenge_046 + difficulty: medium + tags: [income, ambiguous] + input: + id: txn_v2_ch_046 + amount: 1500.00 + classification: income + description: "ACH CREDIT" + expected: + category_name: "Income" + acceptable_alternatives: ["Salary"] + + - id: cat_v2_challenge_047 + difficulty: medium + tags: [income, ambiguous] + input: + id: txn_v2_ch_047 + amount: 500.00 + classification: income + description: "INTEREST PAYMENT" + expected: + category_name: "Income" + + - id: cat_v2_challenge_048 + difficulty: medium + tags: [income, ambiguous] + input: + id: txn_v2_ch_048 + amount: 234.00 + classification: income + description: "DIVIDEND" + expected: + category_name: "Income" + + # Cryptic European formats + - id: cat_v2_challenge_049 + difficulty: edge_case + tags: [europe, cryptic, should_be_null] + input: + id: txn_v2_ch_049 + amount: 45.00 + classification: expense + description: "LASTSCHRIFT" + expected: + category_name: null + + - id: cat_v2_challenge_050 + difficulty: edge_case + tags: [europe, cryptic, should_be_null] + input: + id: txn_v2_ch_050 + amount: 89.00 + classification: expense + description: "PRELEVEMENT" + expected: + category_name: null diff --git a/db/eval_data/chat_golden_v1.yml b/db/eval_data/chat_golden_v1.yml new file mode 100644 index 000000000..46afd1d14 --- /dev/null +++ b/db/eval_data/chat_golden_v1.yml @@ -0,0 +1,825 @@ +--- +name: chat_golden_v1 +description: Golden 
dataset for chat/assistant function calling evaluation +eval_type: chat +version: "1.0" +metadata: + created_at: "2024-12-01" + source: manual_curation + +samples: + # ===== EASY - Simple single function calls ===== + - id: chat_easy_001 + difficulty: easy + tags: [get_accounts, simple] + input: + prompt: "What accounts do I have?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_easy_002 + difficulty: easy + tags: [get_accounts, simple] + input: + prompt: "Show me my accounts" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_easy_003 + difficulty: easy + tags: [get_accounts, balance] + input: + prompt: "What's my account balance?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_easy_004 + difficulty: easy + tags: [get_transactions, simple] + input: + prompt: "Show me my recent transactions" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_easy_005 + difficulty: easy + tags: [get_transactions, simple] + input: + prompt: "What are my latest transactions?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_easy_006 + difficulty: easy + tags: [get_balance_sheet, simple] + input: + prompt: "What's my net worth?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_easy_007 + difficulty: easy + tags: [get_balance_sheet, simple] + input: + prompt: "Show me my assets and liabilities" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_easy_008 + difficulty: easy + tags: [get_income_statement, simple] + input: + prompt: "What were my expenses last month?" 
+ expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_easy_009 + difficulty: easy + tags: [get_income_statement, simple] + input: + prompt: "How much income did I make this month?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_easy_010 + difficulty: easy + tags: [get_accounts, simple] + input: + prompt: "How many accounts do I have?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_easy_011 + difficulty: easy + tags: [get_transactions, simple] + input: + prompt: "List my transactions" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_easy_012 + difficulty: easy + tags: [get_balance_sheet, simple] + input: + prompt: "How much do I owe?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_easy_013 + difficulty: easy + tags: [get_balance_sheet, simple] + input: + prompt: "What are my total assets?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_easy_014 + difficulty: easy + tags: [get_income_statement, simple] + input: + prompt: "Show my spending" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_easy_015 + difficulty: easy + tags: [get_income_statement, simple] + input: + prompt: "How much did I spend?" 
+ expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + # ===== MEDIUM - With filtering or specific parameters ===== + - id: chat_medium_001 + difficulty: medium + tags: [get_transactions, filtering] + input: + prompt: "Show me my restaurant spending" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_002 + difficulty: medium + tags: [get_transactions, filtering] + input: + prompt: "What did I spend on groceries?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_003 + difficulty: medium + tags: [get_transactions, filtering] + input: + prompt: "Show transactions over $100" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_004 + difficulty: medium + tags: [get_transactions, filtering] + input: + prompt: "What did I spend at Amazon?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_005 + difficulty: medium + tags: [get_transactions, date_range] + input: + prompt: "Show me last week's transactions" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_006 + difficulty: medium + tags: [get_income_statement, date_range] + input: + prompt: "What was my income in January?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_medium_007 + difficulty: medium + tags: [get_income_statement, comparison] + input: + prompt: "How much did I save last month?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_medium_008 + difficulty: medium + tags: [get_accounts, specific] + input: + prompt: "What's the balance in my checking account?" 
+ expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_medium_009 + difficulty: medium + tags: [get_accounts, specific] + input: + prompt: "How much do I have in savings?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_medium_010 + difficulty: medium + tags: [get_transactions, category] + input: + prompt: "Show me all my subscription payments" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_011 + difficulty: medium + tags: [get_transactions, search] + input: + prompt: "Find transactions from Uber" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_012 + difficulty: medium + tags: [get_income_statement, category] + input: + prompt: "How much do I spend on entertainment?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_medium_013 + difficulty: medium + tags: [get_balance_sheet, trend] + input: + prompt: "How has my net worth changed over time?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_medium_014 + difficulty: medium + tags: [get_transactions, amount] + input: + prompt: "What's my largest expense this month?" 
+ expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_015 + difficulty: medium + tags: [get_income_statement, breakdown] + input: + prompt: "Break down my expenses by category" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_medium_016 + difficulty: medium + tags: [get_transactions, recurring] + input: + prompt: "Show me my recurring payments" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_017 + difficulty: medium + tags: [get_accounts, credit] + input: + prompt: "What's my credit card balance?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_medium_018 + difficulty: medium + tags: [get_income_statement, specific] + input: + prompt: "How much did I spend on food last month?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_medium_019 + difficulty: medium + tags: [get_transactions, date] + input: + prompt: "Show transactions from December" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_020 + difficulty: medium + tags: [get_balance_sheet, liability] + input: + prompt: "What are my debts?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + # ===== HARD - Analysis, comparisons, insights ===== + - id: chat_hard_001 + difficulty: hard + tags: [analysis, spending_trend] + input: + prompt: "Am I spending more than I make?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_002 + difficulty: hard + tags: [comparison, month_over_month] + input: + prompt: "How does my spending this month compare to last month?" 
+ expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_003 + difficulty: hard + tags: [analysis, budget] + input: + prompt: "Where can I cut expenses?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_004 + difficulty: hard + tags: [analysis, savings] + input: + prompt: "What's my savings rate?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_005 + difficulty: hard + tags: [analysis, trend] + input: + prompt: "Are my expenses trending up or down?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_006 + difficulty: hard + tags: [analysis, category] + input: + prompt: "What category do I spend the most on?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_007 + difficulty: hard + tags: [analysis, unusual] + input: + prompt: "Are there any unusual transactions this month?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_hard_008 + difficulty: hard + tags: [analysis, debt] + input: + prompt: "How long will it take to pay off my credit card?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_hard_009 + difficulty: hard + tags: [analysis, financial_health] + input: + prompt: "What's my debt-to-income ratio?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_hard_010 + difficulty: hard + tags: [analysis, goals] + input: + prompt: "Can I afford to save $500 more per month?" 
+ expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_011 + difficulty: hard + tags: [comparison, year_over_year] + input: + prompt: "How does this year compare to last year?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_012 + difficulty: hard + tags: [analysis, pattern] + input: + prompt: "Do I have any spending patterns I should know about?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_hard_013 + difficulty: hard + tags: [advice, budget] + input: + prompt: "How should I allocate my income?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_014 + difficulty: hard + tags: [analysis, efficiency] + input: + prompt: "Am I overspending on subscriptions?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_hard_015 + difficulty: hard + tags: [forecast, projection] + input: + prompt: "At this rate, how much will I have saved by year end?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + # ===== EDGE CASES - Unclear intent, no function needed ===== + - id: chat_edge_001 + difficulty: edge_case + tags: [no_function, greeting] + input: + prompt: "Hello" + expected: + functions: [] + response_contains: [] + + - id: chat_edge_002 + difficulty: edge_case + tags: [no_function, thanks] + input: + prompt: "Thank you!" + expected: + functions: [] + response_contains: [] + + - id: chat_edge_003 + difficulty: edge_case + tags: [no_function, general] + input: + prompt: "What can you help me with?" + expected: + functions: [] + response_contains: [] + + - id: chat_edge_004 + difficulty: edge_case + tags: [no_function, advice] + input: + prompt: "Should I invest in stocks?" 
+ expected: + functions: [] + response_contains: [] + + - id: chat_edge_005 + difficulty: edge_case + tags: [no_function, external] + input: + prompt: "What's the weather like?" + expected: + functions: [] + response_contains: [] + + - id: chat_edge_006 + difficulty: edge_case + tags: [ambiguous] + input: + prompt: "Tell me about my money" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_edge_007 + difficulty: edge_case + tags: [ambiguous] + input: + prompt: "How am I doing financially?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_edge_008 + difficulty: edge_case + tags: [ambiguous] + input: + prompt: "Give me a summary" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_edge_009 + difficulty: edge_case + tags: [no_function, off_topic] + input: + prompt: "What's 2 + 2?" + expected: + functions: [] + response_contains: [] + + - id: chat_edge_010 + difficulty: edge_case + tags: [no_function, general] + input: + prompt: "Who are you?" + expected: + functions: [] + response_contains: [] + + # Additional samples + - id: chat_easy_016 + difficulty: easy + tags: [get_transactions] + input: + prompt: "Pull up my transactions" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_easy_017 + difficulty: easy + tags: [get_accounts] + input: + prompt: "Show all my bank accounts" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_easy_018 + difficulty: easy + tags: [get_balance_sheet] + input: + prompt: "What do I own?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_easy_019 + difficulty: easy + tags: [get_income_statement] + input: + prompt: "What's my income?" 
+ expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_easy_020 + difficulty: easy + tags: [get_transactions] + input: + prompt: "Recent purchases" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_021 + difficulty: medium + tags: [get_transactions, merchant] + input: + prompt: "How much have I spent at Starbucks?" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_022 + difficulty: medium + tags: [get_transactions, category] + input: + prompt: "Show transportation expenses" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_medium_023 + difficulty: medium + tags: [get_income_statement, period] + input: + prompt: "Quarterly expense report" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_medium_024 + difficulty: medium + tags: [get_accounts, type] + input: + prompt: "Show my investment accounts" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_medium_025 + difficulty: medium + tags: [get_transactions, amount] + input: + prompt: "Transactions under $50" + expected: + functions: + - name: "get_transactions" + params: {} + response_contains: [] + + - id: chat_hard_016 + difficulty: hard + tags: [analysis, discretionary] + input: + prompt: "How much discretionary spending do I have?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_017 + difficulty: hard + tags: [analysis, fixed_vs_variable] + input: + prompt: "What are my fixed vs variable expenses?" 
+ expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] + + - id: chat_hard_018 + difficulty: hard + tags: [analysis, emergency_fund] + input: + prompt: "Do I have enough for an emergency fund?" + expected: + functions: + - name: "get_balance_sheet" + params: {} + response_contains: [] + + - id: chat_hard_019 + difficulty: hard + tags: [analysis, liquidity] + input: + prompt: "How liquid are my assets?" + expected: + functions: + - name: "get_accounts" + params: {} + response_contains: [] + + - id: chat_hard_020 + difficulty: hard + tags: [comparison, benchmark] + input: + prompt: "Am I spending too much on housing?" + expected: + functions: + - name: "get_income_statement" + params: {} + response_contains: [] diff --git a/db/eval_data/merchant_detection_golden_v1.yml b/db/eval_data/merchant_detection_golden_v1.yml new file mode 100644 index 000000000..a25c35ca5 --- /dev/null +++ b/db/eval_data/merchant_detection_golden_v1.yml @@ -0,0 +1,1117 @@ +--- +name: merchant_detection_golden_v1 +description: Golden dataset for merchant name and URL detection evaluation +eval_type: merchant_detection +version: "1.0" +metadata: + created_at: "2024-12-01" + source: manual_curation + +context: + merchants: + - id: "mcdonalds" + name: "McDonald's" + - id: "starbucks" + name: "Starbucks" + - id: "amazon" + name: "Amazon" + - id: "netflix" + name: "Netflix" + - id: "uber" + name: "Uber" + - id: "spotify" + name: "Spotify" + - id: "target" + name: "Target" + - id: "costco" + name: "Costco" + - id: "apple" + name: "Apple" + - id: "google" + name: "Google" + +samples: + # ===== EASY - Clear global brands ===== + - id: merch_easy_001 + difficulty: easy + tags: [fast_food, global_brand] + input: + id: txn_001 + amount: 12.99 + classification: expense + description: "MCDONALD'S #12345 SPRINGFIELD IL" + expected: + business_name: "McDonald's" + business_url: "mcdonalds.com" + + - id: merch_easy_002 + difficulty: easy + tags: [coffee, global_brand] + 
input: + id: txn_002 + amount: 5.75 + classification: expense + description: "STARBUCKS STORE #9876" + expected: + business_name: "Starbucks" + business_url: "starbucks.com" + + - id: merch_easy_003 + difficulty: easy + tags: [ecommerce, global_brand] + input: + id: txn_003 + amount: 89.99 + classification: expense + description: "AMAZON.COM*1A2B3C4D AMZN.COM/BILL" + expected: + business_name: "Amazon" + business_url: "amazon.com" + + - id: merch_easy_004 + difficulty: easy + tags: [streaming, global_brand] + input: + id: txn_004 + amount: 15.99 + classification: expense + description: "NETFLIX.COM" + expected: + business_name: "Netflix" + business_url: "netflix.com" + + - id: merch_easy_005 + difficulty: easy + tags: [rideshare, global_brand] + input: + id: txn_005 + amount: 23.50 + classification: expense + description: "UBER *TRIP HELP.UBER.COM" + expected: + business_name: "Uber" + business_url: "uber.com" + + - id: merch_easy_006 + difficulty: easy + tags: [streaming, global_brand] + input: + id: txn_006 + amount: 14.99 + classification: expense + description: "SPOTIFY USA" + expected: + business_name: "Spotify" + business_url: "spotify.com" + + - id: merch_easy_007 + difficulty: easy + tags: [retail, global_brand] + input: + id: txn_007 + amount: 156.78 + classification: expense + description: "TARGET #1234" + expected: + business_name: "Target" + business_url: "target.com" + + - id: merch_easy_008 + difficulty: easy + tags: [retail, global_brand] + input: + id: txn_008 + amount: 234.56 + classification: expense + description: "COSTCO WHSE #5678" + expected: + business_name: "Costco" + business_url: "costco.com" + + - id: merch_easy_009 + difficulty: easy + tags: [tech, global_brand] + input: + id: txn_009 + amount: 9.99 + classification: expense + description: "APPLE.COM/BILL" + expected: + business_name: "Apple" + business_url: "apple.com" + + - id: merch_easy_010 + difficulty: easy + tags: [tech, global_brand] + input: + id: txn_010 + amount: 12.99 + 
classification: expense + description: "GOOGLE *STORAGE" + expected: + business_name: "Google" + business_url: "google.com" + + - id: merch_easy_011 + difficulty: easy + tags: [fast_food, global_brand] + input: + id: txn_011 + amount: 8.50 + classification: expense + description: "BURGER KING #456" + expected: + business_name: "Burger King" + business_url: "bk.com" + + - id: merch_easy_012 + difficulty: easy + tags: [fast_food, global_brand] + input: + id: txn_012 + amount: 9.99 + classification: expense + description: "TACO BELL #789" + expected: + business_name: "Taco Bell" + business_url: "tacobell.com" + + - id: merch_easy_013 + difficulty: easy + tags: [fast_food, global_brand] + input: + id: txn_013 + amount: 11.50 + classification: expense + description: "WENDYS #456" + expected: + business_name: "Wendy's" + business_url: "wendys.com" + + - id: merch_easy_014 + difficulty: easy + tags: [coffee, global_brand] + input: + id: txn_014 + amount: 4.25 + classification: expense + description: "DUNKIN #12345" + expected: + business_name: "Dunkin'" + business_url: "dunkindonuts.com" + + - id: merch_easy_015 + difficulty: easy + tags: [grocery, global_brand] + input: + id: txn_015 + amount: 156.32 + classification: expense + description: "WHOLE FOODS MKT #10234" + expected: + business_name: "Whole Foods Market" + business_url: "wholefoodsmarket.com" + + - id: merch_easy_016 + difficulty: easy + tags: [grocery, global_brand] + input: + id: txn_016 + amount: 87.45 + classification: expense + description: "TRADER JOE'S #567" + expected: + business_name: "Trader Joe's" + business_url: "traderjoes.com" + + - id: merch_easy_017 + difficulty: easy + tags: [gas, global_brand] + input: + id: txn_017 + amount: 45.00 + classification: expense + description: "SHELL OIL 573849234" + expected: + business_name: "Shell" + business_url: "shell.com" + + - id: merch_easy_018 + difficulty: easy + tags: [gas, global_brand] + input: + id: txn_018 + amount: 52.30 + classification: expense + 
description: "CHEVRON STATION #1234" + expected: + business_name: "Chevron" + business_url: "chevron.com" + + - id: merch_easy_019 + difficulty: easy + tags: [rideshare, global_brand] + input: + id: txn_019 + amount: 18.75 + classification: expense + description: "LYFT *RIDE SAT 7PM" + expected: + business_name: "Lyft" + business_url: "lyft.com" + + - id: merch_easy_020 + difficulty: easy + tags: [electronics, global_brand] + input: + id: txn_020 + amount: 299.99 + classification: expense + description: "BEST BUY 00000456" + expected: + business_name: "Best Buy" + business_url: "bestbuy.com" + + # ===== MEDIUM - Less obvious merchants ===== + - id: merch_medium_001 + difficulty: medium + tags: [restaurant, chain] + input: + id: txn_021 + amount: 67.50 + classification: expense + description: "OLIVE GARDEN #456" + expected: + business_name: "Olive Garden" + business_url: "olivegarden.com" + + - id: merch_medium_002 + difficulty: medium + tags: [restaurant, chain] + input: + id: txn_022 + amount: 45.00 + classification: expense + description: "CHEESECAKE FACTORY" + expected: + business_name: "The Cheesecake Factory" + business_url: "thecheesecakefactory.com" + + - id: merch_medium_003 + difficulty: medium + tags: [pharmacy, chain] + input: + id: txn_023 + amount: 24.99 + classification: expense + description: "CVS/PHARMACY #4567" + expected: + business_name: "CVS Pharmacy" + business_url: "cvs.com" + + - id: merch_medium_004 + difficulty: medium + tags: [pharmacy, chain] + input: + id: txn_024 + amount: 35.50 + classification: expense + description: "WALGREENS #12345" + expected: + business_name: "Walgreens" + business_url: "walgreens.com" + + - id: merch_medium_005 + difficulty: medium + tags: [fitness, chain] + input: + id: txn_025 + amount: 39.99 + classification: expense + description: "PLANET FITNESS MONTHLY" + expected: + business_name: "Planet Fitness" + business_url: "planetfitness.com" + + - id: merch_medium_006 + difficulty: medium + tags: [airline, brand] 
+ input: + id: txn_026 + amount: 345.00 + classification: expense + description: "UNITED AIRLINES 0162345678" + expected: + business_name: "United Airlines" + business_url: "united.com" + + - id: merch_medium_007 + difficulty: medium + tags: [hotel, brand] + input: + id: txn_027 + amount: 189.00 + classification: expense + description: "MARRIOTT HOTELS NYC" + expected: + business_name: "Marriott" + business_url: "marriott.com" + + - id: merch_medium_008 + difficulty: medium + tags: [retail, chain] + input: + id: txn_028 + amount: 234.56 + classification: expense + description: "WALMART SUPERCENTER #1234" + expected: + business_name: "Walmart" + business_url: "walmart.com" + + - id: merch_medium_009 + difficulty: medium + tags: [streaming] + input: + id: txn_029 + amount: 6.99 + classification: expense + description: "HULU LLC" + expected: + business_name: "Hulu" + business_url: "hulu.com" + + - id: merch_medium_010 + difficulty: medium + tags: [streaming] + input: + id: txn_030 + amount: 8.99 + classification: expense + description: "DISNEY PLUS" + expected: + business_name: "Disney+" + business_url: "disneyplus.com" + + - id: merch_medium_011 + difficulty: medium + tags: [clothing, brand] + input: + id: txn_031 + amount: 89.99 + classification: expense + description: "ZARA USA INC" + expected: + business_name: "Zara" + business_url: "zara.com" + + - id: merch_medium_012 + difficulty: medium + tags: [clothing, brand] + input: + id: txn_032 + amount: 65.00 + classification: expense + description: "H&M HENNES MAURITZ" + expected: + business_name: "H&M" + business_url: "hm.com" + + - id: merch_medium_013 + difficulty: medium + tags: [utility] + input: + id: txn_033 + amount: 145.00 + classification: expense + description: "XFINITY INTERNET" + expected: + business_name: "Xfinity" + business_url: "xfinity.com" + + - id: merch_medium_014 + difficulty: medium + tags: [telecom] + input: + id: txn_034 + amount: 89.00 + classification: expense + description: "AT&T WIRELESS" 
+ expected: + business_name: "AT&T" + business_url: "att.com" + + - id: merch_medium_015 + difficulty: medium + tags: [telecom] + input: + id: txn_035 + amount: 112.00 + classification: expense + description: "VERIZON WIRELESS" + expected: + business_name: "Verizon" + business_url: "verizon.com" + + - id: merch_medium_016 + difficulty: medium + tags: [ecommerce] + input: + id: txn_036 + amount: 45.00 + classification: expense + description: "ETSY.COM" + expected: + business_name: "Etsy" + business_url: "etsy.com" + + - id: merch_medium_017 + difficulty: medium + tags: [ecommerce] + input: + id: txn_037 + amount: 89.00 + classification: expense + description: "WAYFAIR*PURCHASE" + expected: + business_name: "Wayfair" + business_url: "wayfair.com" + + - id: merch_medium_018 + difficulty: medium + tags: [home, brand] + input: + id: txn_038 + amount: 123.00 + classification: expense + description: "IKEA US EAST LLC" + expected: + business_name: "IKEA" + business_url: "ikea.com" + + - id: merch_medium_019 + difficulty: medium + tags: [hotel] + input: + id: txn_039 + amount: 234.00 + classification: expense + description: "AIRBNB *HMQT5J6QQJ" + expected: + business_name: "Airbnb" + business_url: "airbnb.com" + + - id: merch_medium_020 + difficulty: medium + tags: [entertainment] + input: + id: txn_040 + amount: 45.00 + classification: expense + description: "AMC THEATRES #1234" + expected: + business_name: "AMC Theatres" + business_url: "amctheatres.com" + + # ===== HARD - Delivery services, subsidiaries ===== + - id: merch_hard_001 + difficulty: hard + tags: [delivery, aggregator] + input: + id: txn_041 + amount: 45.00 + classification: expense + description: "DOORDASH*CHIPOTLE" + expected: + business_name: "DoorDash" + business_url: "doordash.com" + + - id: merch_hard_002 + difficulty: hard + tags: [delivery, aggregator] + input: + id: txn_042 + amount: 67.00 + classification: expense + description: "GRUBHUB*THAI KITCHEN" + expected: + business_name: "Grubhub" + 
business_url: "grubhub.com" + + - id: merch_hard_003 + difficulty: hard + tags: [delivery, aggregator] + input: + id: txn_043 + amount: 89.00 + classification: expense + description: "UBEREATS *UBER EATS" + expected: + business_name: "Uber Eats" + business_url: "ubereats.com" + + - id: merch_hard_004 + difficulty: hard + tags: [delivery, grocery] + input: + id: txn_044 + amount: 234.00 + classification: expense + description: "INSTACART*SAFEWAY" + expected: + business_name: "Instacart" + business_url: "instacart.com" + + - id: merch_hard_005 + difficulty: hard + tags: [subscription, variant] + input: + id: txn_045 + amount: 156.00 + classification: expense + description: "AMAZON PRIME*1A2B3C" + expected: + business_name: "Amazon" + business_url: "amazon.com" + + - id: merch_hard_006 + difficulty: hard + tags: [payment_processor] + input: + id: txn_046 + amount: 45.00 + classification: expense + description: "SQ *DOWNTOWN CAFE" + expected: + business_name: null + business_url: null + + - id: merch_hard_007 + difficulty: hard + tags: [streaming, variant] + input: + id: txn_047 + amount: 19.99 + classification: expense + description: "HBO MAX" + expected: + business_name: "Max" + business_url: "max.com" + + - id: merch_hard_008 + difficulty: hard + tags: [tech, variant] + input: + id: txn_048 + amount: 1299.00 + classification: expense + description: "APPLE STORE #R123" + expected: + business_name: "Apple" + business_url: "apple.com" + + - id: merch_hard_009 + difficulty: hard + tags: [food, variant] + input: + id: txn_049 + amount: 34.50 + classification: expense + description: "PANERA BREAD #567" + expected: + business_name: "Panera Bread" + business_url: "panerabread.com" + + - id: merch_hard_010 + difficulty: hard + tags: [convenience] + input: + id: txn_050 + amount: 12.50 + classification: expense + description: "7-ELEVEN #34567" + expected: + business_name: "7-Eleven" + business_url: "7-eleven.com" + + # ===== EDGE CASES - Should return null ===== + - id: 
merch_edge_001 + difficulty: edge_case + tags: [should_be_null, generic] + input: + id: txn_051 + amount: 15.00 + classification: expense + description: "POS DEBIT 12345" + expected: + business_name: null + business_url: null + + - id: merch_edge_002 + difficulty: edge_case + tags: [should_be_null, generic] + input: + id: txn_052 + amount: 100.00 + classification: expense + description: "ACH WITHDRAWAL" + expected: + business_name: null + business_url: null + + - id: merch_edge_003 + difficulty: edge_case + tags: [should_be_null, payment] + input: + id: txn_053 + amount: 78.00 + classification: expense + description: "PAYPAL *JOHNSMITH" + expected: + business_name: null + business_url: null + + - id: merch_edge_004 + difficulty: edge_case + tags: [should_be_null, transfer] + input: + id: txn_054 + amount: 500.00 + classification: expense + description: "ONLINE TRANSFER TO CHK 1234" + expected: + business_name: null + business_url: null + + - id: merch_edge_005 + difficulty: edge_case + tags: [should_be_null, atm] + input: + id: txn_055 + amount: 200.00 + classification: expense + description: "ATM WITHDRAWAL 12345" + expected: + business_name: null + business_url: null + + - id: merch_edge_006 + difficulty: edge_case + tags: [should_be_null, generic] + input: + id: txn_056 + amount: 75.00 + classification: expense + description: "MISC SERVICES LLC" + expected: + business_name: null + business_url: null + + - id: merch_edge_007 + difficulty: edge_case + tags: [should_be_null, wire] + input: + id: txn_057 + amount: 1500.00 + classification: expense + description: "WIRE TRANSFER OUT" + expected: + business_name: null + business_url: null + + - id: merch_edge_008 + difficulty: edge_case + tags: [should_be_null, check] + input: + id: txn_058 + amount: 350.00 + classification: expense + description: "CHECK #1234" + expected: + business_name: null + business_url: null + + - id: merch_edge_009 + difficulty: edge_case + tags: [should_be_null, fee] + input: + id: txn_059 + 
amount: 35.00 + classification: expense + description: "SERVICE CHARGE" + expected: + business_name: null + business_url: null + + - id: merch_edge_010 + difficulty: edge_case + tags: [should_be_null, p2p] + input: + id: txn_060 + amount: 250.00 + classification: expense + description: "ZELLE TO JOHN DOE" + expected: + business_name: null + business_url: null + + # Additional samples + - id: merch_easy_021 + difficulty: easy + tags: [fast_food] + input: + id: txn_061 + amount: 14.99 + classification: expense + description: "CHIPOTLE ONLINE" + expected: + business_name: "Chipotle" + business_url: "chipotle.com" + + - id: merch_easy_022 + difficulty: easy + tags: [fast_food] + input: + id: txn_062 + amount: 8.99 + classification: expense + description: "SUBWAY #12345" + expected: + business_name: "Subway" + business_url: "subway.com" + + - id: merch_easy_023 + difficulty: easy + tags: [coffee] + input: + id: txn_063 + amount: 6.50 + classification: expense + description: "PEETS COFFEE #456" + expected: + business_name: "Peet's Coffee" + business_url: "peets.com" + + - id: merch_easy_024 + difficulty: easy + tags: [grocery] + input: + id: txn_064 + amount: 145.67 + classification: expense + description: "KROGER #456" + expected: + business_name: "Kroger" + business_url: "kroger.com" + + - id: merch_easy_025 + difficulty: easy + tags: [grocery] + input: + id: txn_065 + amount: 98.34 + classification: expense + description: "PUBLIX SUPER MARKET" + expected: + business_name: "Publix" + business_url: "publix.com" + + - id: merch_medium_021 + difficulty: medium + tags: [restaurant] + input: + id: txn_066 + amount: 123.45 + classification: expense + description: "RUTH'S CHRIS STEAK" + expected: + business_name: "Ruth's Chris Steak House" + business_url: "ruthschris.com" + + - id: merch_medium_022 + difficulty: medium + tags: [restaurant] + input: + id: txn_067 + amount: 89.00 + classification: expense + description: "P.F. CHANGS #234" + expected: + business_name: "P.F. 
Chang's" + business_url: "pfchangs.com" + + - id: merch_medium_023 + difficulty: medium + tags: [gas] + input: + id: txn_068 + amount: 48.50 + classification: expense + description: "EXXONMOBIL 12345" + expected: + business_name: "ExxonMobil" + business_url: "exxon.com" + + - id: merch_medium_024 + difficulty: medium + tags: [gas] + input: + id: txn_069 + amount: 55.00 + classification: expense + description: "BP#1234567" + expected: + business_name: "BP" + business_url: "bp.com" + + - id: merch_medium_025 + difficulty: medium + tags: [hotel] + input: + id: txn_070 + amount: 245.00 + classification: expense + description: "HILTON HOTELS" + expected: + business_name: "Hilton" + business_url: "hilton.com" + + - id: merch_hard_011 + difficulty: hard + tags: [payment_processor] + input: + id: txn_071 + amount: 34.00 + classification: expense + description: "VENMO *PIZZA PLACE" + expected: + business_name: null + business_url: null + + - id: merch_hard_012 + difficulty: hard + tags: [subscription] + input: + id: txn_072 + amount: 14.99 + classification: expense + description: "APPLE.COM/BILL ONE" + expected: + business_name: "Apple" + business_url: "apple.com" + + - id: merch_hard_013 + difficulty: hard + tags: [tech] + input: + id: txn_073 + amount: 99.99 + classification: expense + description: "MICROSOFT*OFFICE 365" + expected: + business_name: "Microsoft" + business_url: "microsoft.com" + + - id: merch_hard_014 + difficulty: hard + tags: [gaming] + input: + id: txn_074 + amount: 59.99 + classification: expense + description: "STEAMPOWERED.COM" + expected: + business_name: "Steam" + business_url: "steampowered.com" + + - id: merch_hard_015 + difficulty: hard + tags: [subscription] + input: + id: txn_075 + amount: 10.99 + classification: expense + description: "YOUTUBE PREMIUM" + expected: + business_name: "YouTube" + business_url: "youtube.com" + + - id: merch_easy_026 + difficulty: easy + tags: [airline] + input: + id: txn_076 + amount: 456.00 + classification: 
expense + description: "DELTA AIR LINES" + expected: + business_name: "Delta Air Lines" + business_url: "delta.com" + + - id: merch_easy_027 + difficulty: easy + tags: [airline] + input: + id: txn_077 + amount: 389.00 + classification: expense + description: "SOUTHWEST AIRLINES" + expected: + business_name: "Southwest Airlines" + business_url: "southwest.com" + + - id: merch_easy_028 + difficulty: easy + tags: [hotel] + input: + id: txn_078 + amount: 178.00 + classification: expense + description: "HYATT REGENCY" + expected: + business_name: "Hyatt" + business_url: "hyatt.com" + + - id: merch_medium_026 + difficulty: medium + tags: [fitness] + input: + id: txn_079 + amount: 29.99 + classification: expense + description: "LA FITNESS CLUB" + expected: + business_name: "LA Fitness" + business_url: "lafitness.com" + + - id: merch_medium_027 + difficulty: medium + tags: [fitness] + input: + id: txn_080 + amount: 49.99 + classification: expense + description: "ORANGETHEORY FITNESS" + expected: + business_name: "Orangetheory Fitness" + business_url: "orangetheory.com" + + - id: merch_edge_011 + difficulty: edge_case + tags: [should_be_null] + input: + id: txn_081 + amount: 1.00 + classification: expense + description: "PENDING AUTHORIZATION" + expected: + business_name: null + business_url: null + + - id: merch_edge_012 + difficulty: edge_case + tags: [should_be_null] + input: + id: txn_082 + amount: 0.00 + classification: expense + description: "VOID TRANSACTION" + expected: + business_name: null + business_url: null + + - id: merch_medium_028 + difficulty: medium + tags: [entertainment] + input: + id: txn_083 + amount: 89.00 + classification: expense + description: "TICKETMASTER *EVENT" + expected: + business_name: "Ticketmaster" + business_url: "ticketmaster.com" + + - id: merch_medium_029 + difficulty: medium + tags: [entertainment] + input: + id: txn_084 + amount: 150.00 + classification: expense + description: "STUBHUB INC" + expected: + business_name: "StubHub" + 
business_url: "stubhub.com" + + - id: merch_medium_030 + difficulty: medium + tags: [car_rental] + input: + id: txn_085 + amount: 56.00 + classification: expense + description: "HERTZ RENT-A-CAR" + expected: + business_name: "Hertz" + business_url: "hertz.com" + + - id: merch_easy_029 + difficulty: easy + tags: [grocery] + input: + id: txn_086 + amount: 178.90 + classification: expense + description: "SAM'S CLUB #8765" + expected: + business_name: "Sam's Club" + business_url: "samsclub.com" + + - id: merch_easy_030 + difficulty: easy + tags: [pharmacy] + input: + id: txn_087 + amount: 67.89 + classification: expense + description: "RITE AID #1234" + expected: + business_name: "Rite Aid" + business_url: "riteaid.com" + + - id: merch_hard_016 + difficulty: hard + tags: [aggregator] + input: + id: txn_088 + amount: 234.00 + classification: expense + description: "COSTCO.COM" + expected: + business_name: "Costco" + business_url: "costco.com" + + - id: merch_hard_017 + difficulty: hard + tags: [fitness] + input: + id: txn_089 + amount: 150.00 + classification: expense + description: "EQUINOX MEMBERSHIP" + expected: + business_name: "Equinox" + business_url: "equinox.com" + + - id: merch_hard_018 + difficulty: hard + tags: [subscription] + input: + id: txn_090 + amount: 4.99 + classification: expense + description: "PARAMOUNT+ ESSENTIAL" + expected: + business_name: "Paramount+" + business_url: "paramountplus.com" diff --git a/db/migrate/20251201084101_create_eval_tables.rb b/db/migrate/20251201084101_create_eval_tables.rb new file mode 100644 index 000000000..3949c7dad --- /dev/null +++ b/db/migrate/20251201084101_create_eval_tables.rb @@ -0,0 +1,81 @@ +class CreateEvalTables < ActiveRecord::Migration[7.2] + def change + # Eval Datasets - Golden dataset containers + create_table :eval_datasets, id: :uuid do |t| + t.string :name, null: false + t.string :description + t.string :eval_type, null: false + t.string :version, null: false, default: "1.0" + t.integer 
:sample_count, default: 0 + t.jsonb :metadata, default: {} + t.boolean :active, default: true + + t.timestamps + end + + add_index :eval_datasets, :name, unique: true + add_index :eval_datasets, [ :eval_type, :active ] + + # Eval Samples - Individual test cases + create_table :eval_samples, id: :uuid do |t| + t.references :eval_dataset, null: false, foreign_key: true, type: :uuid + t.jsonb :input_data, null: false + t.jsonb :expected_output, null: false + t.jsonb :context_data, default: {} + t.string :difficulty, default: "medium" + t.string :tags, array: true, default: [] + t.jsonb :metadata, default: {} + + t.timestamps + end + + add_index :eval_samples, [ :eval_dataset_id, :difficulty ] + add_index :eval_samples, :tags, using: :gin + + # Eval Runs - Evaluation execution records + create_table :eval_runs, id: :uuid do |t| + t.references :eval_dataset, null: false, foreign_key: true, type: :uuid + t.string :name + t.string :status, null: false, default: "pending" + t.string :provider, null: false + t.string :model, null: false + t.jsonb :provider_config, default: {} + t.jsonb :metrics, default: {} + t.integer :total_prompt_tokens, default: 0 + t.integer :total_completion_tokens, default: 0 + t.decimal :total_cost, precision: 10, scale: 6, default: 0.0 + t.datetime :started_at + t.datetime :completed_at + t.text :error_message + + t.timestamps + end + + add_index :eval_runs, [ :eval_dataset_id, :model ] + add_index :eval_runs, [ :provider, :model ] + add_index :eval_runs, :status + + # Eval Results - Individual sample results + create_table :eval_results, id: :uuid do |t| + t.references :eval_run, null: false, foreign_key: true, type: :uuid + t.references :eval_sample, null: false, foreign_key: true, type: :uuid + t.jsonb :actual_output, null: false + t.boolean :correct, null: false + t.boolean :exact_match, default: false + t.boolean :hierarchical_match, default: false + t.boolean :null_expected, default: false + t.boolean :null_returned, default: false + t.float 
:fuzzy_score + t.integer :latency_ms + t.integer :prompt_tokens + t.integer :completion_tokens + t.decimal :cost, precision: 10, scale: 6 + t.jsonb :metadata, default: {} + + t.timestamps + end + + add_index :eval_results, [ :eval_run_id, :correct ] + # eval_sample_id index is automatically created by t.references + end +end diff --git a/db/migrate/20251203133213_add_alternative_match_to_eval_results.rb b/db/migrate/20251203133213_add_alternative_match_to_eval_results.rb new file mode 100644 index 000000000..c1e882e50 --- /dev/null +++ b/db/migrate/20251203133213_add_alternative_match_to_eval_results.rb @@ -0,0 +1,5 @@ +class AddAlternativeMatchToEvalResults < ActiveRecord::Migration[7.2] + def change + add_column :eval_results, :alternative_match, :boolean, default: false + end +end diff --git a/db/schema.rb b/db/schema.rb index bec5fc441..ae5445cdf 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -307,6 +307,80 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do t.index ["import_id"], name: "index_entries_on_import_id" end + create_table "eval_datasets", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.string "name", null: false + t.string "description" + t.string "eval_type", null: false + t.string "version", default: "1.0", null: false + t.integer "sample_count", default: 0 + t.jsonb "metadata", default: {} + t.boolean "active", default: true + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["eval_type", "active"], name: "index_eval_datasets_on_eval_type_and_active" + t.index ["name"], name: "index_eval_datasets_on_name", unique: true + end + + create_table "eval_results", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.uuid "eval_run_id", null: false + t.uuid "eval_sample_id", null: false + t.jsonb "actual_output", null: false + t.boolean "correct", null: false + t.boolean "exact_match", default: false + t.boolean "hierarchical_match", default: false + 
t.boolean "null_expected", default: false + t.boolean "null_returned", default: false + t.float "fuzzy_score" + t.integer "latency_ms" + t.integer "prompt_tokens" + t.integer "completion_tokens" + t.decimal "cost", precision: 10, scale: 6 + t.jsonb "metadata", default: {} + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.boolean "alternative_match", default: false + t.index ["eval_run_id", "correct"], name: "index_eval_results_on_eval_run_id_and_correct" + t.index ["eval_run_id"], name: "index_eval_results_on_eval_run_id" + t.index ["eval_sample_id"], name: "index_eval_results_on_eval_sample_id" + end + + create_table "eval_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.uuid "eval_dataset_id", null: false + t.string "name" + t.string "status", default: "pending", null: false + t.string "provider", null: false + t.string "model", null: false + t.jsonb "provider_config", default: {} + t.jsonb "metrics", default: {} + t.integer "total_prompt_tokens", default: 0 + t.integer "total_completion_tokens", default: 0 + t.decimal "total_cost", precision: 10, scale: 6, default: "0.0" + t.datetime "started_at" + t.datetime "completed_at" + t.text "error_message" + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["eval_dataset_id", "model"], name: "index_eval_runs_on_eval_dataset_id_and_model" + t.index ["eval_dataset_id"], name: "index_eval_runs_on_eval_dataset_id" + t.index ["provider", "model"], name: "index_eval_runs_on_provider_and_model" + t.index ["status"], name: "index_eval_runs_on_status" + end + + create_table "eval_samples", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.uuid "eval_dataset_id", null: false + t.jsonb "input_data", null: false + t.jsonb "expected_output", null: false + t.jsonb "context_data", default: {} + t.string "difficulty", default: "medium" + t.string "tags", default: [], array: true + t.jsonb "metadata", default: 
{} + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["eval_dataset_id", "difficulty"], name: "index_eval_samples_on_eval_dataset_id_and_difficulty" + t.index ["eval_dataset_id"], name: "index_eval_samples_on_eval_dataset_id" + t.index ["tags"], name: "index_eval_samples_on_tags", using: :gin + end + create_table "exchange_rates", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| t.string "from_currency", null: false t.string "to_currency", null: false @@ -789,6 +863,21 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do t.index ["rule_id"], name: "index_rule_conditions_on_rule_id" end + create_table "rule_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.uuid "rule_id", null: false + t.string "execution_type", null: false + t.string "status", null: false + t.integer "transactions_processed", default: 0, null: false + t.integer "transactions_modified", default: 0, null: false + t.datetime "executed_at", null: false + t.text "error_message" + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["executed_at"], name: "index_rule_runs_on_executed_at" + t.index ["rule_id", "executed_at"], name: "index_rule_runs_on_rule_id_and_executed_at" + t.index ["rule_id"], name: "index_rule_runs_on_rule_id" + end + create_table "rules", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| t.uuid "family_id", null: false t.string "resource_type", null: false @@ -991,6 +1080,8 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do t.datetime "updated_at", null: false t.string "currency" t.jsonb "locked_attributes", default: {} + t.uuid "category_id" + t.index ["category_id"], name: "index_trades_on_category_id" t.index ["security_id"], name: "index_trades_on_security_id" end @@ -1095,6 +1186,10 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do add_foreign_key "enable_banking_items", "families" 
add_foreign_key "entries", "accounts", on_delete: :cascade add_foreign_key "entries", "imports" + add_foreign_key "eval_results", "eval_runs" + add_foreign_key "eval_results", "eval_samples" + add_foreign_key "eval_runs", "eval_datasets" + add_foreign_key "eval_samples", "eval_datasets" add_foreign_key "family_exports", "families" add_foreign_key "holdings", "account_providers" add_foreign_key "holdings", "accounts", on_delete: :cascade @@ -1136,6 +1231,7 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do add_foreign_key "taggings", "tags" add_foreign_key "tags", "families" add_foreign_key "tool_calls", "messages" + add_foreign_key "trades", "categories" add_foreign_key "trades", "securities" add_foreign_key "transactions", "categories", on_delete: :nullify add_foreign_key "transactions", "merchants" diff --git a/lib/tasks/evals.rake b/lib/tasks/evals.rake new file mode 100644 index 000000000..1987124f8 --- /dev/null +++ b/lib/tasks/evals.rake @@ -0,0 +1,739 @@ +namespace :evals do + # Prints every Eval::Dataset ordered by eval_type then name, grouped under its eval_type, with version, sample_count and active/inactive status. + desc "List all evaluation datasets" + task list_datasets: :environment do + datasets = Eval::Dataset.order(:eval_type, :name) + + if datasets.empty? + puts "No datasets found. Import a dataset with: rake evals:import_dataset[path/to/file.yml]" + # next leaves this task block only; unlike exit it does not end the rake process or set a failure status + next + end + + puts "=" * 80 + puts "Available Evaluation Datasets" + puts "=" * 80 + puts + + datasets.group_by(&:eval_type).each do |eval_type, type_datasets| + puts "#{eval_type.titleize}:" + puts "-" * 40 + + type_datasets.each do |dataset| + status = dataset.active ? "active" : "inactive" + puts " #{dataset.name} (v#{dataset.version}) - #{dataset.sample_count} samples [#{status}]" + puts " #{dataset.description}" if dataset.description.present? + end + puts + end + end + + # Imports a dataset through Eval::Dataset.import_from_yaml. The path is taken from the task argument, or from the FILE env var when no argument is given. + desc "Import dataset from YAML file" + task :import_dataset, [ :file_path ] => :environment do |_t, args| + file_path = args[:file_path] || ENV["FILE"] + + if file_path.blank?
+ puts "Usage: rake evals:import_dataset[path/to/file.yml]" + puts " or: FILE=path/to/file.yml rake evals:import_dataset" + exit 1 + end + + unless File.exist?(file_path) + puts "Error: File not found: #{file_path}" + exit 1 + end + + puts "Importing dataset from #{file_path}..." + + dataset = Eval::Dataset.import_from_yaml(file_path) + + puts "Successfully imported dataset:" + puts " Name: #{dataset.name}" + puts " Type: #{dataset.eval_type}" + puts " Version: #{dataset.version}" + puts " Samples: #{dataset.sample_count}" + + stats = dataset.statistics + puts " By difficulty: #{stats[:by_difficulty].map { |k, v| "#{k}=#{v}" }.join(', ')}" + end + + # Runs one dataset against one model. dataset_name and model come from the task arguments or the DATASET / MODEL env vars (model falls back to "gpt-4.1"); the provider is read from PROVIDER only and falls back to "openai". + desc "Run evaluation against a model" + task :run, [ :dataset_name, :model ] => :environment do |_t, args| + dataset_name = args[:dataset_name] || ENV["DATASET"] + model = args[:model] || ENV["MODEL"] || "gpt-4.1" + provider = ENV["PROVIDER"] || "openai" + + if dataset_name.blank? + puts "Usage: rake evals:run[dataset_name,model]" + puts " or: DATASET=name MODEL=gpt-4 rake evals:run" + exit 1 + end + + dataset = Eval::Dataset.find_by(name: dataset_name) + + if dataset.nil? + puts "Error: Dataset '#{dataset_name}' not found" + puts "Available datasets:" + Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" } + exit 1 + end + + run_name = "#{dataset_name}_#{model}_#{Time.current.strftime('%Y%m%d_%H%M%S')}" + + puts "=" * 80 + puts "Starting Evaluation Run" + puts "=" * 80 + puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)" + puts " Type: #{dataset.eval_type}" + puts " Model: #{model}" + puts " Provider: #{provider}" + puts " Run Name: #{run_name}" + puts + + # Persist the run with status "pending" first, then hand it to the runner class the dataset selects + eval_run = Eval::Run.create!( + dataset: dataset, + provider: provider, + model: model, + name: run_name, + status: "pending" + ) + + runner = dataset.runner_class.new(eval_run) + + puts "Running evaluation..."
+ start_time = Time.current + + begin + result = runner.run + duration = (Time.current - start_time).round(1) + + puts + puts "=" * 80 + puts "Evaluation Complete" + puts "=" * 80 + puts " Status: #{result.status}" + puts " Duration: #{duration}s" + puts " Run ID: #{result.id}" + puts + puts "Metrics:" + result.metrics.each do |key, value| + next if value.is_a?(Hash) # Skip nested metrics for summary + puts " #{key}: #{format_metric_value(value)}" + end + + # Show difficulty breakdown if available + if result.metrics["by_difficulty"].present? + puts + puts "By Difficulty:" + result.metrics["by_difficulty"].each do |difficulty, stats| + puts " #{difficulty}: #{stats['accuracy']}% accuracy (#{stats['correct']}/#{stats['count']})" + end + end + rescue => e + puts + puts "Evaluation FAILED: #{e.message}" + puts e.backtrace.first(5).join("\n") if ENV["DEBUG"] + exit 1 + end + end + + desc "Compare multiple models on a dataset" + task :compare, [ :dataset_name ] => :environment do |_t, args| + dataset_name = args[:dataset_name] || ENV["DATASET"] + models = (ENV["MODELS"] || "gpt-4.1,gpt-4o-mini").split(",").map(&:strip) + provider = ENV["PROVIDER"] || "openai" + + if dataset_name.blank? + puts "Usage: MODELS=model1,model2 rake evals:compare[dataset_name]" + exit 1 + end + + dataset = Eval::Dataset.find_by!(name: dataset_name) + + puts "=" * 80 + puts "Model Comparison" + puts "=" * 80 + puts " Dataset: #{dataset.name}" + puts " Models: #{models.join(', ')}" + puts + + runs = models.map do |model| + puts "Running evaluation for #{model}..." 
+ + eval_run = Eval::Run.create!( + dataset: dataset, + provider: provider, + model: model, + name: "compare_#{model}_#{Time.current.to_i}", + status: "pending" + ) + + runner = dataset.runner_class.new(eval_run) + runner.run + end + + puts + puts "=" * 80 + puts "Comparison Results" + puts "=" * 80 + puts + + reporter = Eval::Reporters::ComparisonReporter.new(runs) + puts reporter.to_table + + summary = reporter.summary + if summary.present? + puts + puts "Recommendations:" + puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)" + puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})" + puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)" + puts + puts " #{summary[:recommendation]}" + end + + # Export to CSV if requested + if ENV["CSV"].present? + csv_path = reporter.to_csv(ENV["CSV"]) + puts + puts "Exported to: #{csv_path}" + end + end + + desc "Generate report for specific runs" + task :report, [ :run_ids ] => :environment do |_t, args| + run_ids = (args[:run_ids] || ENV["RUN_IDS"])&.split(",") + + runs = if run_ids.present? + Eval::Run.where(id: run_ids) + else + Eval::Run.completed.order(created_at: :desc).limit(5) + end + + if runs.empty? + puts "No runs found." + exit 1 + end + + reporter = Eval::Reporters::ComparisonReporter.new(runs) + + puts reporter.to_table + + summary = reporter.summary + if summary.present? + puts + puts "Summary:" + puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)" + puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})" + puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)" + end + + if ENV["CSV"].present? + csv_path = reporter.to_csv(ENV["CSV"]) + puts + puts "Exported to: #{csv_path}" + end + end + + desc "Quick smoke test to verify provider configuration" + task smoke_test: :environment do + puts "Running smoke test..." 
+ + provider = Provider::Registry.get_provider(:openai) + + unless provider + puts "FAIL: OpenAI provider not configured" + puts "Set OPENAI_ACCESS_TOKEN environment variable or configure in settings" + exit 1 + end + + puts " Provider: #{provider.provider_name}" + puts " Model: #{provider.instance_variable_get(:@default_model)}" + + # Test with a single categorization sample + result = provider.auto_categorize( + transactions: [ + { id: "test", amount: 10, classification: "expense", description: "McDonalds" } + ], + user_categories: [ + { id: "1", name: "Food & Drink", classification: "expense" } + ] + ) + + if result.success? + category = result.data.first&.category_name + puts " Test result: #{category || 'null'}" + puts + puts "PASS: Provider is working correctly" + else + puts "FAIL: #{result.error.message}" + exit 1 + end + end + + desc "Run CI regression test" + task ci_regression: :environment do + dataset_name = ENV["EVAL_DATASET"] || "categorization_golden_v1" + model = ENV["EVAL_MODEL"] || "gpt-4.1-mini" + threshold = (ENV["EVAL_THRESHOLD"] || "80").to_f + + dataset = Eval::Dataset.find_by(name: dataset_name) + + unless dataset + puts "Dataset '#{dataset_name}' not found. Skipping CI regression test." 
+ exit 0 + end + + # Get baseline from last successful run + baseline_run = dataset.runs.completed.for_model(model).order(created_at: :desc).first + + # Run new evaluation + eval_run = Eval::Run.create!( + dataset: dataset, + provider: "openai", + model: model, + name: "ci_regression_#{Time.current.to_i}", + status: "pending" + ) + + runner = dataset.runner_class.new(eval_run) + result = runner.run + + current_accuracy = result.metrics["accuracy"] || 0 + + puts "CI Regression Test Results:" + puts " Model: #{model}" + puts " Current Accuracy: #{current_accuracy}%" + + if baseline_run + baseline_accuracy = baseline_run.metrics["accuracy"] || 0 + puts " Baseline Accuracy: #{baseline_accuracy}%" + + accuracy_diff = current_accuracy - baseline_accuracy + + if accuracy_diff < -5 + puts + puts "REGRESSION DETECTED!" + puts "Accuracy dropped by #{accuracy_diff.abs}% (threshold: 5%)" + exit 1 + end + + puts " Difference: #{accuracy_diff > 0 ? '+' : ''}#{accuracy_diff.round(2)}%" + end + + if current_accuracy < threshold + puts + puts "BELOW THRESHOLD!" + puts "Accuracy #{current_accuracy}% is below required #{threshold}%" + exit 1 + end + + puts + puts "CI Regression Test PASSED" + end + + desc "List recent evaluation runs" + task list_runs: :environment do + runs = Eval::Run.order(created_at: :desc).limit(20) + + if runs.empty? + puts "No runs found." + next + end + + puts "=" * 100 + puts "Recent Evaluation Runs" + puts "=" * 100 + + runs.each do |run| + status_icon = case run.status + when "completed" then "[OK]" + when "failed" then "[FAIL]" + when "running" then "[...]" + else "[?]" + end + + accuracy = run.metrics["accuracy"] ? 
"#{run.metrics['accuracy']}%" : "-" + + puts "#{status_icon} #{run.id[0..7]} | #{run.model.ljust(15)} | #{run.dataset.name.ljust(25)} | #{accuracy.rjust(8)} | #{run.created_at.strftime('%Y-%m-%d %H:%M')}" + end + end + + desc "Show details for a specific run" + task :show_run, [ :run_id ] => :environment do |_t, args| + run_id = args[:run_id] || ENV["RUN_ID"] + + if run_id.blank? + puts "Usage: rake evals:show_run[run_id]" + exit 1 + end + + run = Eval::Run.find_by(id: run_id) || Eval::Run.find_by("id::text LIKE ?", "#{run_id}%") + + unless run + puts "Run not found: #{run_id}" + exit 1 + end + + puts "=" * 80 + puts "Evaluation Run Details" + puts "=" * 80 + puts + puts "Run ID: #{run.id}" + puts "Name: #{run.name}" + puts "Dataset: #{run.dataset.name}" + puts "Model: #{run.model}" + puts "Provider: #{run.provider}" + puts "Status: #{run.status}" + puts "Created: #{run.created_at}" + puts "Duration: #{run.duration_seconds}s" if run.duration_seconds + + if run.error_message.present? + puts + puts "Error: #{run.error_message}" + end + + if run.metrics.present? + puts + puts "Metrics:" + run.metrics.each do |key, value| + if value.is_a?(Hash) + puts " #{key}:" + value.each { |k, v| puts " #{k}: #{v}" } + else + puts " #{key}: #{format_metric_value(value)}" + end + end + end + + # Show sample of incorrect results + incorrect = run.results.incorrect.limit(5) + if incorrect.any? 
+ puts + puts "Sample Incorrect Results (#{run.results.incorrect.count} total):" + incorrect.each do |result| + puts " Sample: #{result.sample_id[0..7]}" + puts " Expected: #{result.sample.expected_output}" + puts " Actual: #{result.actual_output}" + puts + end + end + end + + # ============================================================================= + # Langfuse Integration + # ============================================================================= + + namespace :langfuse do + desc "Check Langfuse configuration" + task check: :environment do + begin + client = Eval::Langfuse::Client.new + puts "✓ Langfuse credentials configured" + + # Try to list datasets to verify connection + response = client.list_datasets(limit: 1) + puts "✓ Successfully connected to Langfuse" + puts " Region: #{ENV['LANGFUSE_REGION'] || 'us (default)'}" + rescue Eval::Langfuse::Client::ConfigurationError => e + puts "✗ #{e.message}" + exit 1 + rescue Eval::Langfuse::Client::ApiError => e + puts "✗ Failed to connect to Langfuse: #{e.message}" + exit 1 + end + end + + desc "Upload dataset to Langfuse" + task :upload_dataset, [ :dataset_name ] => :environment do |_t, args| + dataset_name = args[:dataset_name] || ENV["DATASET"] + + if dataset_name.blank? + puts "Usage: rake evals:langfuse:upload_dataset[dataset_name]" + puts " or: DATASET=name rake evals:langfuse:upload_dataset" + exit 1 + end + + dataset = Eval::Dataset.find_by(name: dataset_name) + + if dataset.nil? 
+ puts "Error: Dataset '#{dataset_name}' not found" + puts "Available datasets:" + Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" } + exit 1 + end + + puts "=" * 80 + puts "Uploading Dataset to Langfuse" + puts "=" * 80 + puts " Dataset: #{dataset.name}" + puts " Type: #{dataset.eval_type}" + puts " Samples: #{dataset.sample_count}" + puts + + begin + exporter = Eval::Langfuse::DatasetExporter.new(dataset) + result = exporter.export + + puts + puts "✓ Successfully uploaded dataset to Langfuse" + puts " Langfuse dataset name: #{result[:dataset_name]}" + puts " Items exported: #{result[:items_exported]}" + puts + puts "View in Langfuse: https://cloud.langfuse.com/project/datasets" + rescue Eval::Langfuse::Client::ConfigurationError => e + puts "✗ #{e.message}" + exit 1 + rescue Eval::Langfuse::Client::ApiError => e + puts "✗ Langfuse API error: #{e.message}" + exit 1 + end + end + + desc "Run experiment in Langfuse" + task :run_experiment, [ :dataset_name, :model ] => :environment do |_t, args| + dataset_name = args[:dataset_name] || ENV["DATASET"] + model = args[:model] || ENV["MODEL"] || "gpt-4.1" + provider = ENV["PROVIDER"] || "openai" + run_name = ENV["RUN_NAME"] + + if dataset_name.blank? + puts "Usage: rake evals:langfuse:run_experiment[dataset_name,model]" + puts " or: DATASET=name MODEL=gpt-4.1 rake evals:langfuse:run_experiment" + puts + puts "Optional environment variables:" + puts " PROVIDER=openai (default)" + puts " RUN_NAME=custom_run_name" + exit 1 + end + + dataset = Eval::Dataset.find_by(name: dataset_name) + + if dataset.nil? 
+ puts "Error: Dataset '#{dataset_name}' not found" + puts "Available datasets:" + Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" } + exit 1 + end + + puts "=" * 80 + puts "Running Langfuse Experiment" + puts "=" * 80 + puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)" + puts " Type: #{dataset.eval_type}" + puts " Model: #{model}" + puts " Provider: #{provider}" + puts + + begin + runner = Eval::Langfuse::ExperimentRunner.new( + dataset, + model: model, + provider: provider + ) + + start_time = Time.current + result = runner.run(run_name: run_name) + duration = (Time.current - start_time).round(1) + + puts + puts "=" * 80 + puts "Experiment Complete" + puts "=" * 80 + puts " Run Name: #{result[:run_name]}" + puts " Duration: #{duration}s" + puts + puts "Results:" + puts " Accuracy: #{result[:metrics][:accuracy]}%" + puts " Correct: #{result[:metrics][:correct]}/#{result[:metrics][:total]}" + puts " Avg Latency: #{result[:metrics][:avg_latency_ms]}ms" + puts + puts "View in Langfuse:" + puts " Dataset: https://cloud.langfuse.com/project/datasets" + puts " Traces: https://cloud.langfuse.com/project/traces" + rescue Eval::Langfuse::Client::ConfigurationError => e + puts "✗ #{e.message}" + exit 1 + rescue Eval::Langfuse::Client::ApiError => e + puts "✗ Langfuse API error: #{e.message}" + exit 1 + rescue => e + puts "✗ Error: #{e.message}" + puts e.backtrace.first(5).join("\n") if ENV["DEBUG"] + exit 1 + end + end + + desc "List datasets in Langfuse" + task list_datasets: :environment do + begin + client = Eval::Langfuse::Client.new + response = client.list_datasets(limit: 100) + + datasets = response["data"] || [] + + if datasets.empty? + puts "No datasets found in Langfuse." 
+ puts "Upload a dataset with: rake evals:langfuse:upload_dataset[dataset_name]" + next + end + + puts "=" * 80 + puts "Langfuse Datasets" + puts "=" * 80 + puts + + datasets.each do |ds| + puts " #{ds['name']}" + puts " Description: #{ds['description']}" if ds["description"].present? + puts " Created: #{ds['createdAt']}" + puts " Metadata: #{ds['metadata']}" if ds["metadata"].present? + puts + end + rescue Eval::Langfuse::Client::ConfigurationError => e + puts "✗ #{e.message}" + exit 1 + rescue Eval::Langfuse::Client::ApiError => e + puts "✗ Langfuse API error: #{e.message}" + exit 1 + end + end + end + + desc "Export manually categorized transactions as golden data" + task :export_manual_categories, [ :family_id ] => :environment do |_t, args| + family_id = args[:family_id] || ENV["FAMILY_ID"] + output_path = ENV["OUTPUT"] || "db/eval_data/categorization_manual_export.yml" + limit = (ENV["LIMIT"] || 500).to_i + + if family_id.blank? + puts "Usage: rake evals:export_manual_categories[family_id]" + puts " or: FAMILY_ID=uuid rake evals:export_manual_categories" + puts + puts "Optional environment variables:" + puts " OUTPUT=path/to/output.yml (default: db/eval_data/categorization_manual_export.yml)" + puts " LIMIT=500 (default: 500)" + exit 1 + end + + family = Family.find_by(id: family_id) + + if family.nil? + puts "Error: Family '#{family_id}' not found" + exit 1 + end + + puts "=" * 80 + puts "Exporting Manually Categorized Transactions" + puts "=" * 80 + puts " Family: #{family.name}" + puts " Output: #{output_path}" + puts " Limit: #{limit}" + puts + + # Find transactions that have: + # 1. A category assigned + # 2. locked_attributes contains "category_id" (meaning user manually set it) + # 3. 
No DataEnrichment record for category_id (meaning it wasn't set by AI/rules/etc) + manually_categorized = Transaction + .joins(:entry) + .joins("INNER JOIN accounts ON accounts.id = entries.account_id") + .where(accounts: { family_id: family_id }) + .where.not(category_id: nil) + .where("transactions.locked_attributes ? 'category_id'") + .where.not( + id: DataEnrichment + .where(enrichable_type: "Transaction", attribute_name: "category_id") + .select(:enrichable_id) + ) + .includes(:category, entry: :account) + .limit(limit) + + count = manually_categorized.count + + if count == 0 + puts "No manually categorized transactions found." + puts + puts "Manually categorized transactions are those where:" + puts " - User set a category manually (locked_attributes contains 'category_id')" + puts " - Category was NOT set by AI, rules, or data enrichment sources" + exit 0 + end + + puts "Found #{count} manually categorized transactions" + puts + + # Build category context from family's categories + categories = family.categories.includes(:parent).map do |cat| + { + "id" => cat.id.to_s, + "name" => cat.name, + "classification" => cat.classification, + "is_subcategory" => cat.subcategory?, + "parent_id" => cat.parent_id&.to_s + }.compact + end + + # Build samples + samples = manually_categorized.map.with_index do |txn, idx| + entry = txn.entry + sample_id = "manual_#{idx + 1}" + + { + "id" => sample_id, + "difficulty" => "manual", + "tags" => [ txn.category.name.parameterize.underscore, "manual_export" ], + "input" => { + "id" => txn.id.to_s, + "amount" => entry.amount.to_f.abs, + "classification" => entry.classification, + "description" => entry.name + }, + "expected" => { + "category_name" => txn.category.name + } + } + end + + # Build output structure + output = { + "name" => "categorization_manual_export", + "description" => "Golden dataset exported from manually categorized user transactions", + "eval_type" => "categorization", + "version" => "1.0", + "metadata" => { + 
"created_at" => Time.current.strftime("%Y-%m-%d"), + "source" => "manual_export", + "family_id" => family_id, + "exported_count" => samples.size + }, + "context" => { + "categories" => categories + }, + "samples" => samples + } + + # Write to file + FileUtils.mkdir_p(File.dirname(output_path)) + File.write(output_path, output.to_yaml) + + puts "✓ Successfully exported #{samples.size} samples" + puts " Difficulty: manual" + puts + puts "Output written to: #{output_path}" + puts + puts "To import this dataset, run:" + puts " rake evals:import_dataset[#{output_path}]" + end + + private + + def format_metric_value(value) + case value + when Float + value.round(4) + when BigDecimal + value.to_f.round(4) + else + value + end + end +end diff --git a/test/models/eval/dataset_test.rb b/test/models/eval/dataset_test.rb new file mode 100644 index 000000000..32059b527 --- /dev/null +++ b/test/models/eval/dataset_test.rb @@ -0,0 +1,118 @@ +require "test_helper" + +class Eval::DatasetTest < ActiveSupport::TestCase + test "validates presence of name and eval_type" do + dataset = Eval::Dataset.new + + assert_not dataset.valid? + assert_includes dataset.errors[:name], "can't be blank" + assert_includes dataset.errors[:eval_type], "can't be blank" + end + + test "validates eval_type is one of allowed values" do + dataset = Eval::Dataset.new(name: "test", eval_type: "invalid") + + assert_not dataset.valid? + assert_includes dataset.errors[:eval_type], "is not included in the list" + + dataset.eval_type = "categorization" + dataset.valid? + assert_empty dataset.errors[:eval_type] + end + + test "validates name uniqueness" do + Eval::Dataset.create!(name: "unique_test", eval_type: "categorization") + + duplicate = Eval::Dataset.new(name: "unique_test", eval_type: "categorization") + assert_not duplicate.valid? 
+ assert_includes duplicate.errors[:name], "has already been taken" + end + + test "scopes filter by eval_type" do + cat_dataset = Eval::Dataset.create!(name: "cat_test", eval_type: "categorization") + merch_dataset = Eval::Dataset.create!(name: "merch_test", eval_type: "merchant_detection") + chat_dataset = Eval::Dataset.create!(name: "chat_test", eval_type: "chat") + + assert_includes Eval::Dataset.for_categorization, cat_dataset + assert_not_includes Eval::Dataset.for_categorization, merch_dataset + + assert_includes Eval::Dataset.for_merchant_detection, merch_dataset + assert_not_includes Eval::Dataset.for_merchant_detection, cat_dataset + + assert_includes Eval::Dataset.for_chat, chat_dataset + assert_not_includes Eval::Dataset.for_chat, cat_dataset + end + + test "import_from_yaml creates dataset with samples" do + yaml_content = <<~YAML + name: test_import + description: Test dataset + eval_type: categorization + version: "1.0" + context: + categories: + - id: "food" + name: "Food" + classification: "expense" + samples: + - id: sample_1 + difficulty: easy + tags: [test] + input: + id: txn_1 + amount: 10 + classification: expense + description: "Test transaction" + expected: + category_name: "Food" + YAML + + file_path = Rails.root.join("tmp", "test_import.yml") + File.write(file_path, yaml_content) + + dataset = Eval::Dataset.import_from_yaml(file_path) + + assert_equal "test_import", dataset.name + assert_equal "categorization", dataset.eval_type + assert_equal 1, dataset.samples.count + assert_equal "easy", dataset.samples.first.difficulty + assert_equal "Food", dataset.samples.first.expected_output["category_name"] + ensure + File.delete(file_path) if File.exist?(file_path) + end + + test "statistics returns sample breakdown" do + dataset = Eval::Dataset.create!(name: "stats_test", eval_type: "categorization") + + dataset.samples.create!( + input_data: { id: "1" }, + expected_output: { category_name: "Food" }, + difficulty: "easy", + tags: [ "food" ] + ) 
+ + dataset.samples.create!( + input_data: { id: "2" }, + expected_output: { category_name: "Travel" }, + difficulty: "medium", + tags: [ "travel" ] + ) + + stats = dataset.statistics + + assert_equal 2, stats[:total_samples] + assert_equal({ "easy" => 1, "medium" => 1 }, stats[:by_difficulty]) + assert_includes stats[:by_tags], "food" + assert_includes stats[:by_tags], "travel" + end + + test "runner_class returns correct class for each eval_type" do + cat_dataset = Eval::Dataset.new(eval_type: "categorization") + merch_dataset = Eval::Dataset.new(eval_type: "merchant_detection") + chat_dataset = Eval::Dataset.new(eval_type: "chat") + + assert_equal Eval::Runners::CategorizationRunner, cat_dataset.runner_class + assert_equal Eval::Runners::MerchantDetectionRunner, merch_dataset.runner_class + assert_equal Eval::Runners::ChatRunner, chat_dataset.runner_class + end +end diff --git a/test/models/eval/runners/categorization_runner_test.rb b/test/models/eval/runners/categorization_runner_test.rb new file mode 100644 index 000000000..c7aa03c46 --- /dev/null +++ b/test/models/eval/runners/categorization_runner_test.rb @@ -0,0 +1,212 @@ +require "test_helper" + +class Eval::Runners::CategorizationRunnerTest < ActiveSupport::TestCase + include ProviderTestHelper + + setup do + @categories = [ + { "id" => "food", "name" => "Food & Drink", "classification" => "expense" }, + { "id" => "fast_food", "name" => "Fast Food", "classification" => "expense", "parent_id" => "food" } + ] + end + + + test "run processes all samples and calculates metrics" do + dataset = Eval::Dataset.create!( + name: "test_cat_#{SecureRandom.hex(4)}", + eval_type: "categorization", + version: "1.0" + ) + + sample1 = dataset.samples.create!( + input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" }, + expected_output: { "category_name" => "Fast Food" }, + context_data: { "categories" => @categories }, + difficulty: "easy" + ) + + sample2 = 
dataset.samples.create!( + input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" }, + expected_output: { "category_name" => nil }, + context_data: { "categories" => @categories }, + difficulty: "edge_case" + ) + + eval_run = Eval::Run.create!( + dataset: dataset, + provider: "openai", + model: "gpt-4.1", + name: "test_run", + provider_config: { "access_token" => "test-token" }, + status: "pending" + ) + + mock_response = provider_success_response([ + Provider::LlmConcept::AutoCategorization.new(transaction_id: sample1.id, category_name: "Fast Food"), + Provider::LlmConcept::AutoCategorization.new(transaction_id: sample2.id, category_name: "null") + ]) + + Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response) + + runner = Eval::Runners::CategorizationRunner.new(eval_run) + result = runner.run + + assert_equal "completed", result.status + assert_equal 2, result.results.count + assert result.metrics["accuracy"].present? 
+ end + + test "records correct result when category matches" do + dataset = Eval::Dataset.create!( + name: "test_cat_match_#{SecureRandom.hex(4)}", + eval_type: "categorization", + version: "1.0" + ) + + sample = dataset.samples.create!( + input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" }, + expected_output: { "category_name" => "Fast Food" }, + context_data: { "categories" => @categories }, + difficulty: "easy" + ) + + eval_run = Eval::Run.create!( + dataset: dataset, + provider: "openai", + model: "gpt-4.1", + name: "test_run", + provider_config: { "access_token" => "test-token" }, + status: "pending" + ) + + mock_response = provider_success_response([ + Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Fast Food") + ]) + + Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response) + + runner = Eval::Runners::CategorizationRunner.new(eval_run) + runner.run + + result = eval_run.results.find_by(eval_sample_id: sample.id) + + assert result.correct + assert result.exact_match + assert_equal "Fast Food", result.actual_output["category_name"] + end + + test "records hierarchical match when parent category returned" do + dataset = Eval::Dataset.create!( + name: "test_cat_hier_#{SecureRandom.hex(4)}", + eval_type: "categorization", + version: "1.0" + ) + + sample = dataset.samples.create!( + input_data: { "id" => "txn_3", "amount" => 50, "classification" => "expense", "description" => "Olive Garden" }, + expected_output: { "category_name" => "Fast Food" }, + context_data: { "categories" => @categories }, + difficulty: "medium" + ) + + eval_run = Eval::Run.create!( + dataset: dataset, + provider: "openai", + model: "gpt-4.1", + name: "test_hierarchical", + provider_config: { "access_token" => "test-token" }, + status: "pending" + ) + + # Model returns parent category instead of subcategory + mock_response = provider_success_response([ + 
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Food & Drink") + ]) + + Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response) + + runner = Eval::Runners::CategorizationRunner.new(eval_run) + runner.run + + result = eval_run.results.find_by(eval_sample_id: sample.id) + + assert_not result.exact_match + assert result.hierarchical_match + end + + test "handles null correctly when expected" do + dataset = Eval::Dataset.create!( + name: "test_cat_null_#{SecureRandom.hex(4)}", + eval_type: "categorization", + version: "1.0" + ) + + sample = dataset.samples.create!( + input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" }, + expected_output: { "category_name" => nil }, + context_data: { "categories" => @categories }, + difficulty: "edge_case" + ) + + eval_run = Eval::Run.create!( + dataset: dataset, + provider: "openai", + model: "gpt-4.1", + name: "test_run", + provider_config: { "access_token" => "test-token" }, + status: "pending" + ) + + mock_response = provider_success_response([ + Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "null") + ]) + + Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response) + + runner = Eval::Runners::CategorizationRunner.new(eval_run) + runner.run + + result = eval_run.results.find_by(eval_sample_id: sample.id) + + assert result.correct + assert result.null_expected + assert result.null_returned + end + + test "records error results on provider error but completes run" do + dataset = Eval::Dataset.create!( + name: "test_cat_err_#{SecureRandom.hex(4)}", + eval_type: "categorization", + version: "1.0" + ) + + sample = dataset.samples.create!( + input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" }, + expected_output: { "category_name" => "Fast Food" }, + context_data: { "categories" => @categories }, + 
difficulty: "easy" + ) + + eval_run = Eval::Run.create!( + dataset: dataset, + provider: "openai", + model: "gpt-4.1", + name: "test_run", + provider_config: { "access_token" => "test-token" }, + status: "pending" + ) + + Provider::Openai.any_instance.stubs(:auto_categorize).raises(StandardError.new("API Error")) + + runner = Eval::Runners::CategorizationRunner.new(eval_run) + result = runner.run + + # Run completes but with error results + assert_equal "completed", result.status + assert_equal 1, result.results.count + + error_result = result.results.find_by(eval_sample_id: sample.id) + assert_not error_result.correct + assert_includes error_result.actual_output["error"], "API Error" + end +end