Small LLMs improvements (#400)

* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.
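A minimal sketch of the separation (the helper name is illustrative, not from this PR): the categorizer returns plain results with no eval parameters, and the eval harness alone compares null counts against expectations.

```ruby
# Hypothetical eval-side helper; the categorizer itself no longer takes
# an expected_null_count argument after this refactor.
def evaluate_null_expectation(results, expected_null_count)
  actual = results.count { |r| r.nil? || r == "null" }
  { expected: expected_null_count, actual: actual, pass: actual == expected_null_count }
end
```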

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)
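Those percentages follow directly from the raw hit counts (overall: 141/150 = 94.0%):

```ruby
# Per-difficulty accuracy from the counts reported above
counts = { easy: [55, 59], medium: [42, 45], hard: [26, 28], edge_case: [18, 18] }
accuracy = counts.transform_values { |hit, total| (hit * 100.0 / total).round(2) }
# easy: 93.22, medium: 93.33, hard: 92.86, edge_case: 100.0
```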

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
soky srm
2025-12-07 18:11:34 +01:00
committed by GitHub
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions


@@ -1,9 +1,22 @@
class Provider::Openai::AutoCategorizer
include Provider::Openai::Concerns::UsageRecorder
- attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family
+ # JSON response format modes for custom providers
+ # - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
+ # - "json_object": Use json_object response format (broader compatibility)
+ # - "none": No response format constraint (maximum compatibility with local LLMs)
+ # - "auto": Try strict first, fall back to none if poor results
+ JSON_MODE_STRICT = "strict"
+ JSON_MODE_OBJECT = "json_object"
+ JSON_MODE_NONE = "none"
+ JSON_MODE_AUTO = "auto"
+ # Threshold for auto mode: if more than this percentage returns null, retry with none mode
+ # This is a heuristic to detect when strict JSON mode is breaking the model's ability to reason
+ AUTO_MODE_NULL_THRESHOLD = 0.5
+ attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family, :json_mode
- def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil)
+ def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
@client = client
@model = model
@transactions = transactions
@@ -11,6 +24,32 @@ class Provider::Openai::AutoCategorizer
@custom_provider = custom_provider
@langfuse_trace = langfuse_trace
@family = family
@json_mode = json_mode || default_json_mode
end
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
# Determine default JSON mode based on configuration hierarchy:
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
# 2. Setting.openai_json_mode - user-configured in app settings
# 3. Default: auto mode (recommended for all providers)
#
# Mode descriptions:
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
# - "json_object": Middle ground, broader compatibility than strict
def default_json_mode
# 1. Check environment variable first (allows runtime override for testing)
env_mode = ENV["LLM_JSON_MODE"]
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
# 2. Check app settings (user-configured)
setting_mode = Setting.openai_json_mode
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
JSON_MODE_AUTO
end
def auto_categorize
@@ -22,6 +61,40 @@ class Provider::Openai::AutoCategorizer
end
def instructions
if custom_provider
simple_instructions
else
detailed_instructions
end
end
# Simplified instructions for smaller/local LLMs
def simple_instructions
<<~INSTRUCTIONS.strip_heredoc
Categorize transactions into the given categories. Return JSON only. Do not explain your reasoning.
CRITICAL RULES:
1. Match transaction_id exactly from input
2. Use EXACT category_name from the provided list, or "null" if unsure
3. Match expense transactions to expense categories only
4. Match income transactions to income categories only
5. Return "null" if the description is generic/ambiguous (e.g., "POS DEBIT", "ACH WITHDRAWAL", "CHECK #1234")
6. Prefer MORE SPECIFIC subcategories over general parent categories when available
CATEGORY HIERARCHY NOTES:
- Use "Restaurants" for sit-down restaurants, "Fast Food" for quick service chains
- Use "Coffee Shops" for coffee places, "Food & Drink" only when type is unclear
- Use "Shopping" for general retail, big-box stores, and online marketplaces
- Use "Groceries" for dedicated grocery stores ONLY
- For income: use "Salary" for payroll/employer deposits, "Income" for generic income sources
Output JSON format only (no markdown, no explanation):
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
INSTRUCTIONS
end
# Detailed instructions for larger models like GPT-4
def detailed_instructions
<<~INSTRUCTIONS.strip_heredoc
You are an assistant to a consumer personal finance app. You will be provided a list
of the user's transactions and a list of the user's categories. Your job is to auto-categorize
@@ -87,19 +160,68 @@ class Provider::Openai::AutoCategorizer
end
def auto_categorize_openai_generic
if json_mode == JSON_MODE_AUTO
auto_categorize_with_auto_mode
else
auto_categorize_with_mode(json_mode)
end
rescue Faraday::BadRequestError => e
# If strict mode fails (HTTP 400), fall back to none mode
# This handles providers that don't support json_schema response format
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
auto_categorize_with_mode(JSON_MODE_NONE)
else
raise
end
end
# Auto mode: try strict first, fall back to none if too many nulls or missing results
#
# This uses pure heuristics to detect when strict JSON mode is breaking the model's
# ability to reason. Models that can't reason well in strict mode often:
# 1. Return null for everything, OR
# 2. Simply omit transactions they can't categorize (returning fewer results than input)
#
# The heuristic is simple: if >50% of results are null or missing, the model likely
# needs the freedom to reason in its output (which strict mode prevents).
def auto_categorize_with_auto_mode
result = auto_categorize_with_mode(JSON_MODE_STRICT)
null_count = result.count { |r| r.category_name.nil? || r.category_name == "null" }
missing_count = transactions.size - result.size
failed_count = null_count + missing_count
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
auto_categorize_with_mode(JSON_MODE_NONE)
else
result
end
end
def auto_categorize_with_mode(mode)
span = langfuse_trace&.span(name: "auto_categorize_api_call", input: {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
transactions: transactions,
- user_categories: user_categories
+ user_categories: user_categories,
+ json_mode: mode
})
- response = client.chat(parameters: {
+ # Build parameters with configurable JSON response format
+ params = {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
messages: [
{ role: "system", content: instructions },
- { role: "user", content: developer_message }
- ],
- response_format: {
+ { role: "user", content: developer_message_for_generic }
+ ]
+ }
# Add response format based on json_mode setting
case mode
when JSON_MODE_STRICT
params[:response_format] = {
type: "json_schema",
json_schema: {
name: "auto_categorize_personal_finance_transactions",
@@ -107,9 +229,14 @@ class Provider::Openai::AutoCategorizer
schema: json_schema
}
}
- })
+ when JSON_MODE_OBJECT
+ params[:response_format] = { type: "json_object" }
+ # JSON_MODE_NONE: no response_format constraint
+ end
- Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")}")
+ response = client.chat(parameters: params)
+ Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
categorizations = extract_categorizations_generic(response)
result = build_response(categorizations)
@@ -120,7 +247,8 @@ class Provider::Openai::AutoCategorizer
operation: "auto_categorize",
metadata: {
transaction_count: transactions.size,
- category_count: user_categories.size
+ category_count: user_categories.size,
+ json_mode: mode
}
)
@@ -143,9 +271,72 @@ class Provider::Openai::AutoCategorizer
end
def normalize_category_name(category_name)
- return nil if category_name == "null"
+ # Convert to string to handle non-string LLM outputs (numbers, booleans, etc.)
+ normalized = category_name.to_s.strip
+ return nil if normalized.empty? || normalized == "null" || normalized.downcase == "null"
- category_name
+ # Try exact match first
+ exact_match = user_categories.find { |c| c[:name] == normalized }
+ return exact_match[:name] if exact_match
# Try case-insensitive match
case_insensitive_match = user_categories.find { |c| c[:name].to_s.downcase == normalized.downcase }
return case_insensitive_match[:name] if case_insensitive_match
# Try partial/fuzzy match (for common variations)
fuzzy_match = find_fuzzy_category_match(normalized)
return fuzzy_match if fuzzy_match
# Return normalized string if no match found (will be treated as uncategorized)
normalized
end
# Find a fuzzy match for category names with common variations
def find_fuzzy_category_match(category_name)
# Ensure string input for string operations
input_str = category_name.to_s
normalized_input = input_str.downcase.gsub(/[^a-z0-9]/, "")
user_categories.each do |cat|
cat_name_str = cat[:name].to_s
normalized_cat = cat_name_str.downcase.gsub(/[^a-z0-9]/, "")
# Check if one contains the other
return cat[:name] if normalized_input.include?(normalized_cat) || normalized_cat.include?(normalized_input)
# Check common abbreviations/variations
return cat[:name] if fuzzy_name_match?(input_str, cat_name_str)
end
nil
end
# Handle common naming variations
def fuzzy_name_match?(input, category)
variations = {
"gas" => [ "gas & fuel", "gas and fuel", "fuel", "gasoline" ],
"restaurants" => [ "restaurant", "dining", "food" ],
"groceries" => [ "grocery", "supermarket", "food store" ],
"streaming" => [ "streaming services", "streaming service" ],
"rideshare" => [ "ride share", "ride-share", "uber", "lyft" ],
"coffee" => [ "coffee shops", "coffee shop", "cafe" ],
"fast food" => [ "fastfood", "quick service" ],
"gym" => [ "gym & fitness", "fitness", "gym and fitness" ],
"flights" => [ "flight", "airline", "airlines", "airfare" ],
"hotels" => [ "hotel", "lodging", "accommodation" ]
}
# Ensure string inputs for string operations
input_lower = input.to_s.downcase
category_lower = category.to_s.downcase
variations.each do |_key, synonyms|
if synonyms.include?(input_lower) && synonyms.include?(category_lower)
return true
end
end
false
end
def extract_categorizations_native(response)
@@ -162,9 +353,107 @@ class Provider::Openai::AutoCategorizer
def extract_categorizations_generic(response)
raw = response.dig("choices", 0, "message", "content")
- JSON.parse(raw).dig("categorizations")
- rescue JSON::ParserError => e
- raise Provider::Openai::Error, "Invalid JSON in generic categorization: #{e.message}"
+ parsed = parse_json_flexibly(raw)
# Handle different response formats from various LLMs
# (Array#dig raises TypeError on a String key, so branch on the shape first)
categorizations =
  if parsed.is_a?(Array)
    parsed
  else
    parsed["categorizations"] || parsed["results"]
  end
raise Provider::Openai::Error, "Could not find categorizations in response" if categorizations.nil?
# Normalize field names (some LLMs use different naming)
categorizations.map do |cat|
{
"transaction_id" => cat["transaction_id"] || cat["id"] || cat["txn_id"],
"category_name" => cat["category_name"] || cat["category"] || cat["name"]
}
end
end
# Flexible JSON parsing that handles common LLM output issues
def parse_json_flexibly(raw)
return {} if raw.blank?
# Strip thinking model tags if present (e.g., <think>...</think>)
# The actual JSON output comes after the thinking block
cleaned = strip_thinking_tags(raw)
# Try direct parse first
JSON.parse(cleaned)
rescue JSON::ParserError
# Try multiple extraction strategies in order of preference
# Strategy 1: Closed markdown code blocks (```json...```)
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
matches.reverse_each do |match|
begin
return JSON.parse(match)
rescue JSON::ParserError
next
end
end
end
# Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
# Pattern: ```json followed by JSON that goes to end of string
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
begin
return JSON.parse($1)
rescue JSON::ParserError
# Continue to next strategy
end
end
# Strategy 3: Find JSON object with "categorizations" key
if cleaned =~ /(\{"categorizations"\s*:\s*\[[\s\S]*\]\s*\})/m
matches = cleaned.scan(/(\{"categorizations"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
matches.reverse_each do |match|
begin
return JSON.parse(match)
rescue JSON::ParserError
next
end
end
# Try greedy match if non-greedy failed
begin
return JSON.parse($1)
rescue JSON::ParserError
# Continue to next strategy
end
end
# Strategy 4: Find any JSON object (last resort)
if cleaned =~ /(\{[\s\S]*\})/m
begin
return JSON.parse($1)
rescue JSON::ParserError
# Fall through to error
end
end
raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
end
# Strip thinking model tags (<think>...</think>) from response
# Some models like Qwen-thinking output reasoning in these tags before the actual response
def strip_thinking_tags(raw)
# Remove <think>...</think> blocks but keep content after them
# If no closing tag, the model may have been cut off - try to extract JSON from inside
if raw.include?("<think>")
# Check if there's content after the thinking block
if raw =~ /<\/think>\s*([\s\S]*)/m
after_thinking = $1.strip
return after_thinking if after_thinking.present?
end
# If no content after </think> or no closing tag, look inside the thinking block
# The JSON might be the last thing in the thinking block
if raw =~ /<think>([\s\S]*)/m
return $1
end
end
raw
end
def json_schema
@@ -213,4 +502,39 @@ class Provider::Openai::AutoCategorizer
```
MESSAGE
end
# Concise developer message optimized for smaller/local LLMs
# Uses pattern-based guidance instead of exhaustive examples
def developer_message_for_generic
<<~MESSAGE.strip_heredoc
AVAILABLE CATEGORIES: #{user_categories.map { |c| c[:name] }.join(", ")}
TRANSACTIONS TO CATEGORIZE:
#{format_transactions_simply}
CATEGORIZATION GUIDELINES:
- Prefer specific subcategories over general parent categories when confident
- Food delivery services should be categorized based on the underlying merchant type
- Square payments (SQ *) should be inferred from the merchant name after the prefix
- Warehouse/club stores should be categorized based on their primary purpose
- Return "null" for generic transactions (e.g., POS terminals, wire transfers, checks, ATM withdrawals)
IMPORTANT:
- Use EXACT category names from the list above
- Return "null" (as a string) if you cannot confidently match a category
- Match expense transactions only to expense categories
- Match income transactions only to income categories
- Do NOT include any explanation or reasoning - only output JSON
Respond with ONLY this JSON (no markdown code blocks, no other text):
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
MESSAGE
end
# Format transactions in a simpler, more readable way for smaller LLMs
def format_transactions_simply
transactions.map do |t|
"- ID: #{t[:id]}, Amount: #{t[:amount]}, Type: #{t[:classification]}, Description: \"#{t[:description]}\""
end.join("\n")
end
end
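For reference, the thinking-tag handling and fallback extraction above can be exercised standalone. This is a simplified, dependency-free sketch (the real methods use Rails' `blank?`/`present?`, raise `Provider::Openai::Error`, and try the markdown-fence strategies omitted here):

```ruby
require "json"

# Simplified sketch of strip_thinking_tags + parse_json_flexibly above
def strip_thinking_tags(raw)
  return raw unless raw.include?("<think>")
  if raw =~ %r{</think>\s*([\s\S]*)}m
    after = $1.strip
    return after unless after.empty?
  end
  # No closing tag (model cut off): look inside the thinking block
  raw =~ /<think>([\s\S]*)/m ? $1 : raw
end

def parse_json_flexibly(raw)
  cleaned = strip_thinking_tags(raw)
  JSON.parse(cleaned)
rescue JSON::ParserError
  # Last-resort strategy: grab the outermost JSON object in the text
  raise "no JSON object found" unless cleaned =~ /(\{[\s\S]*\})/m
  JSON.parse($1)
end
```

With a thinking-model reply such as `<think>coffee, probably</think>{"categorizations": [...]}`, the tag block is stripped and the trailing JSON parses directly; with chatty prose around the object, the regex fallback still recovers it.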


@@ -1,9 +1,22 @@
class Provider::Openai::AutoMerchantDetector
include Provider::Openai::Concerns::UsageRecorder
- attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family
+ # JSON response format modes for custom providers
+ # - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
+ # - "json_object": Use json_object response format (broader compatibility)
+ # - "none": No response format constraint (maximum compatibility with local LLMs)
+ # - "auto": Try strict first, fall back to none if poor results
+ JSON_MODE_STRICT = "strict"
+ JSON_MODE_OBJECT = "json_object"
+ JSON_MODE_NONE = "none"
+ JSON_MODE_AUTO = "auto"
+ # Threshold for auto mode: if more than this percentage returns null, retry with none mode
+ AUTO_MODE_NULL_THRESHOLD = 0.5
+ attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family, :json_mode
- def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil)
+ def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
@client = client
@model = model
@transactions = transactions
@@ -11,6 +24,32 @@ class Provider::Openai::AutoMerchantDetector
@custom_provider = custom_provider
@langfuse_trace = langfuse_trace
@family = family
@json_mode = json_mode || default_json_mode
end
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
# Determine default JSON mode based on configuration hierarchy:
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
# 2. Setting.openai_json_mode - user-configured in app settings
# 3. Default: auto mode (recommended for all providers)
#
# Mode descriptions:
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
# - "json_object": Middle ground, broader compatibility than strict
def default_json_mode
# 1. Check environment variable first (allows runtime override for testing)
env_mode = ENV["LLM_JSON_MODE"]
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
# 2. Check app settings (user-configured)
setting_mode = Setting.openai_json_mode
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
JSON_MODE_AUTO
end
def auto_detect_merchants
@@ -22,6 +61,32 @@ class Provider::Openai::AutoMerchantDetector
end
def instructions
if custom_provider
simple_instructions
else
detailed_instructions
end
end
# Simplified instructions for smaller/local LLMs
def simple_instructions
<<~INSTRUCTIONS.strip_heredoc
Detect business names and websites from transaction descriptions. Return JSON only.
Rules:
1. Match transaction_id exactly from input
2. Return business_name and business_url for known businesses
3. Return "null" for both if uncertain or generic (e.g. "Paycheck", "Local diner")
4. Don't include "www." in URLs (use "amazon.com" not "www.amazon.com")
5. Favor "null" over guessing - only return values if 80%+ confident
Example output format:
{"merchants": [{"transaction_id": "txn_001", "business_name": "Amazon", "business_url": "amazon.com"}]}
INSTRUCTIONS
end
# Detailed instructions for larger models like GPT-4
def detailed_instructions
<<~INSTRUCTIONS.strip_heredoc
You are an assistant to a consumer personal finance app.
@@ -108,19 +173,64 @@ class Provider::Openai::AutoMerchantDetector
end
def auto_detect_merchants_openai_generic
if json_mode == JSON_MODE_AUTO
auto_detect_merchants_with_auto_mode
else
auto_detect_merchants_with_mode(json_mode)
end
rescue Faraday::BadRequestError => e
# If strict mode fails (HTTP 400), fall back to none mode
# This handles providers that don't support json_schema response format
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
auto_detect_merchants_with_mode(JSON_MODE_NONE)
else
raise
end
end
# Auto mode: try strict first, fall back to none if too many nulls or missing results
def auto_detect_merchants_with_auto_mode
result = auto_detect_merchants_with_mode(JSON_MODE_STRICT)
# Check if too many nulls OR missing results were returned
# Models that can't reason in strict mode often:
# 1. Return null for everything, OR
# 2. Simply omit transactions they can't detect (returning fewer results than input)
null_count = result.count { |r| r.business_name.nil? || r.business_name == "null" }
missing_count = transactions.size - result.size
failed_count = null_count + missing_count
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
auto_detect_merchants_with_mode(JSON_MODE_NONE)
else
result
end
end
def auto_detect_merchants_with_mode(mode)
span = langfuse_trace&.span(name: "auto_detect_merchants_api_call", input: {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
transactions: transactions,
- user_merchants: user_merchants
+ user_merchants: user_merchants,
+ json_mode: mode
})
- response = client.chat(parameters: {
+ # Build parameters with configurable JSON response format
+ params = {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
messages: [
{ role: "system", content: instructions },
- { role: "user", content: developer_message }
- ],
- response_format: {
+ { role: "user", content: developer_message_for_generic }
+ ]
+ }
# Add response format based on json_mode setting
case mode
when JSON_MODE_STRICT
params[:response_format] = {
type: "json_schema",
json_schema: {
name: "auto_detect_personal_finance_merchants",
@@ -128,9 +238,14 @@ class Provider::Openai::AutoMerchantDetector
schema: json_schema
}
}
- })
+ when JSON_MODE_OBJECT
+ params[:response_format] = { type: "json_object" }
+ # JSON_MODE_NONE: no response_format constraint
+ end
- Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")}")
+ response = client.chat(parameters: params)
+ Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
merchants = extract_merchants_generic(response)
result = build_response(merchants)
@@ -141,7 +256,8 @@ class Provider::Openai::AutoMerchantDetector
operation: "auto_detect_merchants",
metadata: {
transaction_count: transactions.size,
- merchant_count: user_merchants.size
+ merchant_count: user_merchants.size,
+ json_mode: mode
}
)
@@ -154,24 +270,40 @@ class Provider::Openai::AutoMerchantDetector
AutoDetectedMerchant = Provider::LlmConcept::AutoDetectedMerchant
- def build_response(categorizations)
- categorizations.map do |categorization|
+ def build_response(merchants)
+ merchants.map do |merchant|
AutoDetectedMerchant.new(
- transaction_id: categorization.dig("transaction_id"),
- business_name: normalize_ai_value(categorization.dig("business_name")),
- business_url: normalize_ai_value(categorization.dig("business_url")),
+ transaction_id: merchant.dig("transaction_id"),
+ business_name: normalize_merchant_value(merchant.dig("business_name")),
+ business_url: normalize_merchant_value(merchant.dig("business_url")),
)
end
end
- def normalize_ai_value(ai_value)
- return nil if ai_value == "null"
+ def normalize_merchant_value(value)
+ return nil if value.nil? || value == "null" || value.to_s.downcase == "null"
- ai_value
+ # Try to match against user merchants for name normalization
if user_merchants.present?
# Try exact match first
exact_match = user_merchants.find { |m| m[:name] == value }
return exact_match[:name] if exact_match
# Try case-insensitive match
case_match = user_merchants.find { |m| m[:name].to_s.downcase == value.to_s.downcase }
return case_match[:name] if case_match
end
value
end
def extract_merchants_native(response)
- raw = response.dig("output", 0, "content", 0, "text")
+ # Find the message output (not reasoning output)
+ message_output = response["output"]&.find { |o| o["type"] == "message" }
+ raw = message_output&.dig("content", 0, "text")
+ raise Provider::Openai::Error, "No message content found in response" if raw.nil?
JSON.parse(raw).dig("merchants")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in native merchant detection: #{e.message}"
@@ -179,9 +311,100 @@ class Provider::Openai::AutoMerchantDetector
def extract_merchants_generic(response)
raw = response.dig("choices", 0, "message", "content")
- JSON.parse(raw).dig("merchants")
- rescue JSON::ParserError => e
- raise Provider::Openai::Error, "Invalid JSON in generic merchant detection: #{e.message}"
+ parsed = parse_json_flexibly(raw)
# Handle different response formats from various LLMs
# (Array#dig raises TypeError on a String key, so branch on the shape first)
merchants =
  if parsed.is_a?(Array)
    parsed
  else
    parsed["merchants"] || parsed["results"]
  end
raise Provider::Openai::Error, "Could not find merchants in response" if merchants.nil?
# Normalize field names (some LLMs use different naming)
merchants.map do |m|
{
"transaction_id" => m["transaction_id"] || m["id"] || m["txn_id"],
"business_name" => m["business_name"] || m["name"] || m["merchant_name"] || m["merchant"],
"business_url" => m["business_url"] || m["url"] || m["website"]
}
end
end
# Flexible JSON parsing that handles common LLM output issues
def parse_json_flexibly(raw)
return {} if raw.blank?
# Strip thinking model tags if present (e.g., <think>...</think>)
cleaned = strip_thinking_tags(raw)
# Try direct parse first
JSON.parse(cleaned)
rescue JSON::ParserError
# Try multiple extraction strategies in order of preference
# Strategy 1: Closed markdown code blocks (```json...```)
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
matches.reverse_each do |match|
begin
return JSON.parse(match)
rescue JSON::ParserError
next
end
end
end
# Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
begin
return JSON.parse($1)
rescue JSON::ParserError
# Continue to next strategy
end
end
# Strategy 3: Find JSON object with "merchants" key
if cleaned =~ /(\{"merchants"\s*:\s*\[[\s\S]*\]\s*\})/m
matches = cleaned.scan(/(\{"merchants"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
matches.reverse_each do |match|
begin
return JSON.parse(match)
rescue JSON::ParserError
next
end
end
# Try greedy match if non-greedy failed
begin
return JSON.parse($1)
rescue JSON::ParserError
# Continue to next strategy
end
end
# Strategy 4: Find any JSON object (last resort)
if cleaned =~ /(\{[\s\S]*\})/m
begin
return JSON.parse($1)
rescue JSON::ParserError
# Fall through to error
end
end
raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
end
# Strip thinking model tags (<think>...</think>) from response
def strip_thinking_tags(raw)
if raw.include?("<think>")
if raw =~ /<\/think>\s*([\s\S]*)/m
after_thinking = $1.strip
return after_thinking if after_thinking.present?
end
if raw =~ /<think>([\s\S]*)/m
return $1
end
end
raw
end
def json_schema
@@ -235,4 +458,40 @@ class Provider::Openai::AutoMerchantDetector
Return "null" if you are not 80%+ confident in your answer.
MESSAGE
end
# Enhanced developer message with few-shot examples for smaller/local LLMs
def developer_message_for_generic
merchant_names = user_merchants.present? ? user_merchants.map { |m| m[:name] }.join(", ") : "(none provided)"
<<~MESSAGE.strip_heredoc
USER'S KNOWN MERCHANTS: #{merchant_names}
TRANSACTIONS TO ANALYZE:
#{format_transactions_simply}
EXAMPLES of correct merchant detection:
- "AMAZON.COM*1A2B3C" business_name: "Amazon", business_url: "amazon.com"
- "STARBUCKS STORE #9876" business_name: "Starbucks", business_url: "starbucks.com"
- "NETFLIX.COM" business_name: "Netflix", business_url: "netflix.com"
- "UBER *TRIP" business_name: "Uber", business_url: "uber.com"
- "ACH WITHDRAWAL" business_name: "null", business_url: "null" (generic)
- "LOCAL DINER" business_name: "null", business_url: "null" (generic/unknown)
- "POS DEBIT 12345" business_name: "null", business_url: "null" (generic)
IMPORTANT:
- Return "null" (as a string) for BOTH name and URL if you cannot confidently identify the business
- Don't include "www." in URLs
- Generic descriptions like "Paycheck", "Transfer", "ATM" should return "null"
Respond with ONLY this JSON format (no other text):
{"merchants": [{"transaction_id": "...", "business_name": "...", "business_url": "..."}]}
MESSAGE
end
# Format transactions in a simpler, more readable way for smaller LLMs
def format_transactions_simply
transactions.map do |t|
"- ID: #{t[:id]}, Description: \"#{t[:name] || t[:description]}\""
end.join("\n")
end
end
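The auto-mode retry heuristic shared by both classes reduces to a small predicate; a standalone sketch (the method name is illustrative, not from the PR):

```ruby
AUTO_MODE_NULL_THRESHOLD = 0.5

# Retry in "none" mode when more than half of the inputs came back null
# or were silently dropped from the strict-mode reply.
def retry_without_strict?(values, input_size, threshold: AUTO_MODE_NULL_THRESHOLD)
  return false if input_size.zero?
  nulls = values.count { |v| v.nil? || v == "null" }
  missing = input_size - values.size
  (nulls + missing).to_f / input_size > threshold
end
```

For example, 5 inputs that yield one real category, one `"null"`, one `nil`, and two omitted results give a failed ratio of 0.8, which trips the fallback.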