diff --git a/.env.example b/.env.example index f5e98e7a4..b11153b27 100644 --- a/.env.example +++ b/.env.example @@ -25,19 +25,34 @@ OPENAI_ACCESS_TOKEN= OPENAI_MODEL= OPENAI_URI_BASE= +# Optional: LLM token budget (applies to chat, auto-categorize, merchant detection, PDF processing). +# Lower these for small-context local models (Ollama, LM Studio, LocalAI). +# Defaults work for modern cloud OpenAI models without configuration. +# LLM_CONTEXT_WINDOW=2048 +# LLM_MAX_RESPONSE_TOKENS=512 +# LLM_MAX_HISTORY_TOKENS= +# LLM_SYSTEM_PROMPT_RESERVE=256 +# LLM_MAX_ITEMS_PER_CALL=25 + +# Optional: OpenAI-compatible capability flags +# OPENAI_REQUEST_TIMEOUT=60 # HTTP timeout in seconds; raise for slow local models +# OPENAI_SUPPORTS_PDF_PROCESSING=true # Set to false for endpoints without vision support +# OPENAI_SUPPORTS_RESPONSES_ENDPOINT= # Override Responses-API vs chat.completions routing +# LLM_JSON_MODE= # auto | strict | json_object | none + # Optional: External AI Assistant — delegates chat to a remote AI agent # instead of calling LLMs directly. The agent calls back to Sure's /mcp endpoint. # See docs/hosting/ai.md for full details. # ASSISTANT_TYPE=external # EXTERNAL_ASSISTANT_URL=https://your-agent-host/v1/chat/completions -# EXTERNAL_ASSISTANT_TOKEN=your-api-token +# EXTERNAL_ASSISTANT_TOKEN=your-api-token # pipelock:ignore # EXTERNAL_ASSISTANT_AGENT_ID=main # EXTERNAL_ASSISTANT_SESSION_KEY=agent:main:main # EXTERNAL_ASSISTANT_ALLOWED_EMAILS=user@example.com # Optional: MCP server endpoint — enables /mcp for external AI assistants. # Both values are required. MCP_USER_EMAIL must match an existing user's email. -# MCP_API_TOKEN=your-random-bearer-token +# MCP_API_TOKEN=your-random-bearer-token # pipelock:ignore # MCP_USER_EMAIL=user@example.com # Optional: Langfuse config @@ -82,7 +97,7 @@ EMAIL_SENDER= # Database Configuration DB_HOST=localhost # May need to be changed to `DB_HOST=db` if using devcontainer DB_PORT=5432 -POSTGRES_PASSWORD=postgres +POSTGRES_PASSWORD=postgres # pipelock:ignore POSTGRES_USER=postgres # Redis configuration @@ -94,7 +109,7 @@ REDIS_URL=redis://localhost:6379/1 # REDIS_SENTINEL_HOSTS=sentinel1:26379,sentinel2:26379,sentinel3:26379 # REDIS_SENTINEL_MASTER=mymaster # REDIS_SENTINEL_USERNAME=default -# REDIS_PASSWORD=your-redis-password +# REDIS_PASSWORD=your-redis-password # pipelock:ignore # App Domain # This is the domain that your Sure instance will be hosted at. It is used to generate links in emails and other places. diff --git a/.env.local.example b/.env.local.example index 75a8de778..1795cc8b9 100644 --- a/.env.local.example +++ b/.env.local.example @@ -28,8 +28,22 @@ TWELVE_DATA_API_KEY = OPENAI_ACCESS_TOKEN = OPENAI_URI_BASE = OPENAI_MODEL = -# OPENAI_REQUEST_TIMEOUT: Request timeout in seconds (default: 60) -# OPENAI_SUPPORTS_PDF_PROCESSING: Set to false for endpoints without vision support (default: true) + +# LLM token budget. Applies to ALL outbound LLM calls: chat history, +# auto-categorize, merchant detection, provider enhancer, PDF processing. +# Defaults to Ollama's historical 2048-token baseline so small local models +# work out of the box — raise explicitly for cloud or larger-context models. +# LLM_CONTEXT_WINDOW = 2048 # Total tokens the model will accept +# LLM_MAX_RESPONSE_TOKENS = 512 # Reserved for the model's reply +# LLM_MAX_HISTORY_TOKENS = # Derived if unset (context - response - system_reserve) +# LLM_SYSTEM_PROMPT_RESERVE = 256 # Tokens reserved for the system prompt +# LLM_MAX_ITEMS_PER_CALL = 25 # Upper bound on auto-categorize / merchant batches + +# OpenAI-compatible capability flags (custom/self-hosted providers) +# OPENAI_REQUEST_TIMEOUT = 60 # HTTP timeout in seconds; raise for slow local models +# OPENAI_SUPPORTS_PDF_PROCESSING = true # Set to false for endpoints without vision support +# OPENAI_SUPPORTS_RESPONSES_ENDPOINT = # true to force Responses API on custom providers +# LLM_JSON_MODE = # auto | strict | json_object | none # (example: LM Studio/Docker config) OpenAI-compatible API endpoint config # OPENAI_URI_BASE = http://host.docker.internal:1234/ diff --git a/app/controllers/settings/hostings_controller.rb b/app/controllers/settings/hostings_controller.rb index 59a85939a..063f2a6b0 100644 --- a/app/controllers/settings/hostings_controller.rb +++ b/app/controllers/settings/hostings_controller.rb @@ -1,6 +1,15 @@ class Settings::HostingsController < ApplicationController layout "settings" + # Minimum accepted value for each configurable LLM budget field. Mirrors the + # `min:` attribute on the form inputs in `_openai_settings.html.erb` so the + # controller rejects what the browser-side validator would reject. + LLM_BUDGET_MINIMUMS = { + llm_context_window: 256, + llm_max_response_tokens: 64, + llm_max_items_per_call: 1 + }.freeze + guard_feature unless: -> { self_hosted? } before_action :ensure_admin, only: [ :update, :clear_cache, :disconnect_external_assistant ] @@ -157,6 +166,21 @@ class Settings::HostingsController < ApplicationController Setting.openai_json_mode = hosting_params[:openai_json_mode].presence end + LLM_BUDGET_MINIMUMS.each do |key, minimum| + next unless hosting_params.key?(key) + raw = hosting_params[key].to_s.strip + if raw.blank? + Setting.public_send("#{key}=", nil) + next + end + parsed = Integer(raw, 10) rescue nil + if parsed.nil? || parsed < minimum + label = t("settings.hostings.openai_settings.#{key}_label") + raise Setting::ValidationError, t(".invalid_llm_budget", field: label, minimum: minimum) + end + Setting.public_send("#{key}=", parsed) + end + if hosting_params.key?(:external_assistant_url) Setting.external_assistant_url = hosting_params[:external_assistant_url] end @@ -199,7 +223,7 @@ class Settings::HostingsController < ApplicationController private def hosting_params return ActionController::Parameters.new unless params.key?(:setting) - params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :invite_only_default_family_id, :brand_fetch_client_id, :brand_fetch_high_res_logos, :twelve_data_api_key, :tiingo_api_key, :eodhd_api_key, :alpha_vantage_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :exchange_rate_provider, :securities_provider, :syncs_include_pending, :auto_sync_enabled, :auto_sync_time, :external_assistant_url, :external_assistant_token, :external_assistant_agent_id, securities_providers: []) + params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :invite_only_default_family_id, :brand_fetch_client_id, :brand_fetch_high_res_logos, :twelve_data_api_key, :tiingo_api_key, :eodhd_api_key, :alpha_vantage_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :llm_context_window, :llm_max_response_tokens, :llm_max_items_per_call, :exchange_rate_provider, :securities_provider, :syncs_include_pending, :auto_sync_enabled, :auto_sync_time, :external_assistant_url, :external_assistant_token, :external_assistant_agent_id, securities_providers: []) end def update_assistant_type diff --git a/app/models/assistant/builtin.rb b/app/models/assistant/builtin.rb index 1d615eb5a..14bc9c05d 100644 --- a/app/models/assistant/builtin.rb +++ b/app/models/assistant/builtin.rb @@ -63,6 +63,13 @@ class Assistant::Builtin < Assistant::Base responder.respond(previous_response_id: latest_response_id) rescue => e stop_thinking + # If we streamed any partial content before the error, the message was + # persisted with the default `complete` status. Demote it to `failed` so + # `Assistant::Responder#conversation_history` won't feed a broken turn + # back into future prompts. + if assistant_message&.persisted? + assistant_message.update_columns(status: "failed") + end chat.add_error(e) end diff --git a/app/models/assistant/history_trimmer.rb b/app/models/assistant/history_trimmer.rb new file mode 100644 index 000000000..02e64f8b9 --- /dev/null +++ b/app/models/assistant/history_trimmer.rb @@ -0,0 +1,53 @@ +class Assistant::HistoryTrimmer + def initialize(messages, max_tokens:) + @messages = messages || [] + @max_tokens = max_tokens.to_i + end + + def call + return [] if @messages.empty? || @max_tokens <= 0 + + kept = [] + tokens = 0 + + group_tool_pairs(@messages).reverse_each do |group| + group_tokens = Assistant::TokenEstimator.estimate(group) + break if tokens + group_tokens > @max_tokens + + kept.unshift(*group) + tokens += group_tokens + end + + kept + end + + private + + # Bundles each assistant message that has `tool_calls` with the + # consecutive `role: "tool"` results that follow it, so the trimmer + # never splits a call/result pair when dropping from the oldest end. + def group_tool_pairs(messages) + groups = [] + current_group = nil + + messages.each do |msg| + if assistant_with_tool_calls?(msg) + groups << current_group if current_group + current_group = [ msg ] + elsif msg[:role].to_s == "tool" && current_group + current_group << msg + else + groups << current_group if current_group + current_group = nil + groups << [ msg ] + end + end + + groups << current_group if current_group + groups + end + + def assistant_with_tool_calls?(msg) + msg[:role].to_s == "assistant" && msg[:tool_calls].is_a?(Array) && msg[:tool_calls].any? + end +end diff --git a/app/models/assistant/responder.rb b/app/models/assistant/responder.rb index f2b6d121a..480c69c22 100644 --- a/app/models/assistant/responder.rb +++ b/app/models/assistant/responder.rb @@ -79,6 +79,7 @@ class Assistant::Responder instructions: instructions, functions: function_tool_caller.function_definitions, function_results: function_results, + messages: conversation_history, streamer: streamer, previous_response_id: previous_response_id, session_id: chat_session_id, @@ -114,4 +115,46 @@ class Assistant::Responder def chat @chat ||= message.chat end + + def conversation_history + messages = [] + return messages unless chat&.messages + + chat.messages + .where(type: [ "UserMessage", "AssistantMessage" ], status: "complete") + .includes(:tool_calls) + .ordered + .each do |chat_message| + if chat_message.tool_calls.any? + messages << { + role: chat_message.role, + content: chat_message.content || "", + tool_calls: chat_message.tool_calls.map(&:to_tool_call) + } + + chat_message.tool_calls.map(&:to_result).each do |fn_result| + # Handle nil explicitly to avoid serializing to "null" + output = fn_result[:output] + content = if output.nil? + "" + elsif output.is_a?(String) + output + else + output.to_json + end + + messages << { + role: "tool", + tool_call_id: fn_result[:call_id], + name: fn_result[:name], + content: content + } + end + + elsif !chat_message.content.blank? + messages << { role: chat_message.role, content: chat_message.content || "" } + end + end + messages + end end diff --git a/app/models/assistant/token_estimator.rb b/app/models/assistant/token_estimator.rb new file mode 100644 index 000000000..4f5e3fd60 --- /dev/null +++ b/app/models/assistant/token_estimator.rb @@ -0,0 +1,19 @@ +module Assistant::TokenEstimator + CHARS_PER_TOKEN = 4 + SAFETY_FACTOR = 1.25 + + def self.estimate(value) + chars = char_length(value) + ((chars / CHARS_PER_TOKEN.to_f) * SAFETY_FACTOR).ceil + end + + def self.char_length(value) + case value + when nil then 0 + when String then value.length + when Array then value.sum { |v| char_length(v) } + when Hash then value.to_json.length + else value.to_s.length + end + end +end diff --git a/app/models/chat.rb b/app/models/chat.rb index d47dcccac..f97fca68b 100644 --- a/app/models/chat.rb +++ b/app/models/chat.rb @@ -26,9 +26,9 @@ class Chat < ApplicationRecord end # Returns the default AI model to use for chats - # Priority: ENV variable > Setting > OpenAI default + # Priority: AI Config > Setting def default_model - ENV["OPENAI_MODEL"].presence || Setting.openai_model.presence || Provider::Openai::DEFAULT_MODEL + Provider::Openai.effective_model.presence || Setting.openai_model end end diff --git a/app/models/provider/llm_concept.rb b/app/models/provider/llm_concept.rb index 46e6dcd84..52550111f 100644 --- a/app/models/provider/llm_concept.rb +++ b/app/models/provider/llm_concept.rb @@ -40,6 +40,7 @@ module Provider::LlmConcept instructions: nil, functions: [], function_results: [], + messages: nil, streamer: nil, previous_response_id: nil, session_id: nil, diff --git a/app/models/provider/openai.rb b/app/models/provider/openai.rb index 60c103a9e..052829cc1 100644 --- a/app/models/provider/openai.rb +++ b/app/models/provider/openai.rb @@ -4,31 +4,29 @@ class Provider::Openai < Provider # Subclass so errors caught in this provider are raised as Provider::Openai::Error Error = Class.new(Provider::Error) - # Supported OpenAI model prefixes (e.g., "gpt-4" matches "gpt-4", "gpt-4.1", "gpt-4-turbo", etc.) - DEFAULT_OPENAI_MODEL_PREFIXES = %w[gpt-4 gpt-5 o1 o3] - DEFAULT_MODEL = "gpt-4.1" - - # Models that support PDF/vision input (not all OpenAI models have vision capabilities) + DEFAULT_MODEL = "gpt-4.1".freeze + SUPPORTED_MODELS = %w[gpt-4 gpt-5 o1 o3].freeze VISION_CAPABLE_MODEL_PREFIXES = %w[gpt-4o gpt-4-turbo gpt-4.1 gpt-5 o1 o3].freeze - # Returns the effective model that would be used by the provider - # Uses the same logic as Provider::Registry and the initializer + # Returns the effective model that would be used by the provider. + # Priority: explicit ENV > Setting > DEFAULT_MODEL. def self.effective_model - configured_model = ENV.fetch("OPENAI_MODEL", Setting.openai_model) - configured_model.presence || DEFAULT_MODEL + ENV.fetch("OPENAI_MODEL") { Setting.openai_model }.presence || DEFAULT_MODEL end def initialize(access_token, uri_base: nil, model: nil) client_options = { access_token: access_token } - client_options[:uri_base] = uri_base if uri_base.present? + llm_uri_base = uri_base.presence + llm_model = model.presence + client_options[:uri_base] = llm_uri_base if llm_uri_base.present? client_options[:request_timeout] = ENV.fetch("OPENAI_REQUEST_TIMEOUT", 60).to_i @client = ::OpenAI::Client.new(**client_options) - @uri_base = uri_base - if custom_provider? && model.blank? + @uri_base = llm_uri_base + if custom_provider? && llm_model.blank? raise Error, "Model is required when using a custom OpenAI‑compatible provider" end - @default_model = model.presence || DEFAULT_MODEL + @default_model = llm_model.presence || self.class.effective_model end def supports_model?(model) @@ -36,7 +34,18 @@ class Provider::Openai < Provider return true if custom_provider? # Otherwise, check if model starts with any supported OpenAI prefix - DEFAULT_OPENAI_MODEL_PREFIXES.any? { |prefix| model.start_with?(prefix) } + SUPPORTED_MODELS.any? { |prefix| model.start_with?(prefix) } + end + + def supports_responses_endpoint? + return @supports_responses_endpoint if defined?(@supports_responses_endpoint) + + env_override = ENV["OPENAI_SUPPORTS_RESPONSES_ENDPOINT"] + if env_override.to_s.present? + return @supports_responses_endpoint = ActiveModel::Type::Boolean.new.cast(env_override) + end + + @supports_responses_endpoint = !custom_provider? end def provider_name @@ -47,7 +56,7 @@ class Provider::Openai < Provider if custom_provider? @default_model.present? ? "configured model: #{@default_model}" : "any model" else - "models starting with: #{DEFAULT_OPENAI_MODEL_PREFIXES.join(', ')}" + "models starting with: #{SUPPORTED_MODELS.join(", ")}" end end @@ -55,9 +64,43 @@ class Provider::Openai < Provider @uri_base.present? end + # Token-budget knobs. Precedence: ENV > Setting > default. Defaults match + # Ollama's historical 2048-token baseline so local small-context models work + # out of the box. Users on larger-context cloud models can raise via ENV or + # via the Self-Hosting settings page. + def context_window + positive_budget(ENV["LLM_CONTEXT_WINDOW"], Setting.llm_context_window, 2048) + end + + def max_response_tokens + positive_budget(ENV["LLM_MAX_RESPONSE_TOKENS"], Setting.llm_max_response_tokens, 512) + end + + def system_prompt_reserve + positive_budget(ENV["LLM_SYSTEM_PROMPT_RESERVE"], nil, 256) + end + + def max_history_tokens + explicit = ENV["LLM_MAX_HISTORY_TOKENS"].presence&.to_i + return explicit if explicit&.positive? + [ context_window - max_response_tokens - system_prompt_reserve, 256 ].max + end + + # Budget available for a one-shot (non-chat) request's full input, + # excluding reserved response tokens AND the system/instructions prompt. + # Drives the batch slicer for the auto_categorize / auto_detect_merchants / + # enhance_provider_merchants calls — each ships ~200–400 tokens of + # instructions + JSON schema that aren't counted in `fixed_tokens`. + def max_input_tokens + [ context_window - max_response_tokens - system_prompt_reserve, 256 ].max + end + + def max_items_per_call + positive_budget(ENV["LLM_MAX_ITEMS_PER_CALL"], Setting.llm_max_items_per_call, 25) + end + def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil) with_provider_response do - raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25 if user_categories.blank? family_id = family&.id || "unknown" Rails.logger.error("Cannot auto-categorize transactions for family #{family_id}: no categories available") @@ -71,16 +114,20 @@ class Provider::Openai < Provider input: { transactions: transactions, user_categories: user_categories } ) - result = AutoCategorizer.new( - client, - model: effective_model, - transactions: transactions, - user_categories: user_categories, - custom_provider: custom_provider?, - langfuse_trace: trace, - family: family, - json_mode: json_mode - ).auto_categorize + batches = slice_for_context(transactions, fixed: user_categories) + + result = batches.flat_map do |batch| + AutoCategorizer.new( + client, + model: effective_model, + transactions: batch, + user_categories: user_categories, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family, + json_mode: json_mode + ).auto_categorize + end upsert_langfuse_trace(trace: trace, output: result.map(&:to_h)) @@ -90,8 +137,6 @@ class Provider::Openai < Provider def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil) with_provider_response do - raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." if transactions.size > 25 - effective_model = model.presence || @default_model trace = create_langfuse_trace( @@ -99,16 +144,20 @@ class Provider::Openai < Provider input: { transactions: transactions, user_merchants: user_merchants } ) - result = AutoMerchantDetector.new( - client, - model: effective_model, - transactions: transactions, - user_merchants: user_merchants, - custom_provider: custom_provider?, - langfuse_trace: trace, - family: family, - json_mode: json_mode - ).auto_detect_merchants + batches = slice_for_context(transactions, fixed: user_merchants) + + result = batches.flat_map do |batch| + AutoMerchantDetector.new( + client, + model: effective_model, + transactions: batch, + user_merchants: user_merchants, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family, + json_mode: json_mode + ).auto_detect_merchants + end upsert_langfuse_trace(trace: trace, output: result.map(&:to_h)) @@ -118,8 +167,6 @@ class Provider::Openai < Provider def enhance_provider_merchants(merchants: [], model: "", family: nil, json_mode: nil) with_provider_response do - raise Error, "Too many merchants to enhance. Max is 25 per request." if merchants.size > 25 - effective_model = model.presence || @default_model trace = create_langfuse_trace( @@ -127,15 +174,19 @@ class Provider::Openai < Provider input: { merchants: merchants } ) - result = ProviderMerchantEnhancer.new( - client, - model: effective_model, - merchants: merchants, - custom_provider: custom_provider?, - langfuse_trace: trace, - family: family, - json_mode: json_mode - ).enhance_merchants + batches = slice_for_context(merchants) + + result = batches.flat_map do |batch| + ProviderMerchantEnhancer.new( + client, + model: effective_model, + merchants: batch, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family, + json_mode: json_mode + ).enhance_merchants + end upsert_langfuse_trace(trace: trace, output: result.map(&:to_h)) @@ -171,7 +222,8 @@ class Provider::Openai < Provider pdf_content: pdf_content, custom_provider: custom_provider?, langfuse_trace: trace, - family: family + family: family, + max_response_tokens: max_response_tokens ).process upsert_langfuse_trace(trace: trace, output: result.to_h) @@ -207,25 +259,17 @@ class Provider::Openai < Provider instructions: nil, functions: [], function_results: [], + messages: nil, streamer: nil, previous_response_id: nil, session_id: nil, user_identifier: nil, family: nil ) - if custom_provider? - generic_chat_response( - prompt: prompt, - model: model, - instructions: instructions, - functions: functions, - function_results: function_results, - streamer: streamer, - session_id: session_id, - user_identifier: user_identifier, - family: family - ) - else + if supports_responses_endpoint? + # Native path uses the Responses API which chains history via + # `previous_response_id`; it does NOT need (and must not receive) + # inline message history in the input payload. native_chat_response( prompt: prompt, model: model, @@ -238,12 +282,48 @@ class Provider::Openai < Provider user_identifier: user_identifier, family: family ) + else + generic_chat_response( + prompt: prompt, + model: model, + instructions: instructions, + functions: functions, + function_results: function_results, + messages: messages, + streamer: streamer, + session_id: session_id, + user_identifier: user_identifier, + family: family + ) end end private attr_reader :client + # Returns the first positive integer among env, setting, default. Treats + # zero or negative values as "unset" and falls through — a 0-token budget + # is never what the user meant. + def positive_budget(env_value, setting_value, default) + from_env = env_value.to_s.strip.to_i + return from_env if from_env.positive? + return setting_value.to_i if setting_value.to_i.positive? + default + end + + # Routes one-shot (non-chat) inputs through the BatchSlicer so large + # caller batches are split to fit the model's context window. `fixed` is + # the portion of the prompt that stays constant across every sub-batch + # (e.g. user_categories, user_merchants), used for fixed-tokens accounting. + def slice_for_context(items, fixed: nil) + BatchSlicer.call( + Array(items), + max_items: max_items_per_call, + max_tokens: max_input_tokens, + fixed_tokens: fixed ? Assistant::TokenEstimator.estimate(fixed) : 0 + ) + end + def native_chat_response( prompt:, model:, @@ -278,7 +358,7 @@ class Provider::Openai < Provider nil end - input_payload = chat_config.build_input(prompt) + input_payload = chat_config.build_input(prompt: prompt) begin raw_response = client.responses.create(parameters: { @@ -344,6 +424,7 @@ class Provider::Openai < Provider instructions: nil, functions: [], function_results: [], + messages: nil, streamer: nil, session_id: nil, user_identifier: nil, @@ -353,7 +434,8 @@ class Provider::Openai < Provider messages = build_generic_messages( prompt: prompt, instructions: instructions, - function_results: function_results + function_results: function_results, + messages: messages ) tools = build_generic_tools(functions) @@ -412,16 +494,24 @@ class Provider::Openai < Provider end end - def build_generic_messages(prompt:, instructions: nil, function_results: []) - messages = [] + def build_generic_messages(prompt:, instructions: nil, function_results: [], messages: nil) + payload = [] # Add system message if instructions present if instructions.present? - messages << { role: "system", content: instructions } + payload << { role: "system", content: instructions } end - # Add user prompt - messages << { role: "user", content: prompt } + # Add conversation history or user prompt. History is trimmed to fit the + # configured token budget so small-context local models (Ollama, LM Studio, + # LocalAI) don't silently truncate. tool_call/tool_result pairs are + # preserved atomically by HistoryTrimmer. + if messages.present? + trimmed = Assistant::HistoryTrimmer.new(messages, max_tokens: max_history_tokens).call + payload.concat(trimmed) + elsif prompt.present? + payload << { role: "user", content: prompt } + end # If there are function results, we need to add the assistant message that made the tool calls # followed by the tool messages with the results @@ -442,7 +532,7 @@ class Provider::Openai < Provider } end - messages << { + payload << { role: "assistant", content: "", # Some OpenAI-compatible APIs require string, not null tool_calls: tool_calls @@ -462,7 +552,7 @@ class Provider::Openai < Provider output.to_json end - messages << { + payload << { role: "tool", tool_call_id: fn_result[:call_id], name: fn_result[:name], @@ -471,7 +561,7 @@ class Provider::Openai < Provider end end - messages + payload end def build_generic_tools(functions) diff --git a/app/models/provider/openai/batch_slicer.rb b/app/models/provider/openai/batch_slicer.rb new file mode 100644 index 000000000..577ddba5d --- /dev/null +++ b/app/models/provider/openai/batch_slicer.rb @@ -0,0 +1,45 @@ +class Provider::Openai::BatchSlicer + class ContextOverflowError < StandardError; end + + # Splits `items` into sub-batches that respect both a hard item cap and a + # token-budget cap. Used by auto_categorize / auto_detect_merchants / + # enhance_provider_merchants so callers can pass larger batches and have the + # provider fan them out to fit small-context models. + def self.call(items, max_items:, max_tokens:, fixed_tokens: 0) + items = Array(items) + return [] if items.empty? + + available = max_tokens.to_i - fixed_tokens.to_i + if available <= 0 + raise ContextOverflowError, + "Fixed prompt tokens (#{fixed_tokens}) exceed context budget (#{max_tokens})" + end + + batches = [] + current = [] + current_tokens = 0 + + items.each do |item| + item_tokens = Assistant::TokenEstimator.estimate(item) + if item_tokens > available + raise ContextOverflowError, + "Single item requires ~#{item_tokens} tokens, which exceeds available budget (#{available})" + end + + would_exceed_items = current.size >= max_items.to_i + would_exceed_tokens = current_tokens + item_tokens > available + + if would_exceed_items || would_exceed_tokens + batches << current unless current.empty? + current = [] + current_tokens = 0 + end + + current << item + current_tokens += item_tokens + end + + batches << current unless current.empty? + batches + end +end diff --git a/app/models/provider/openai/chat_config.rb b/app/models/provider/openai/chat_config.rb index 5e98a66ce..65e683de4 100644 --- a/app/models/provider/openai/chat_config.rb +++ b/app/models/provider/openai/chat_config.rb @@ -16,7 +16,9 @@ class Provider::Openai::ChatConfig end end - def build_input(prompt) + def build_input(prompt: nil) + input_messages = prompt.present? ? [ { role: "user", content: prompt } ] : [] + results = function_results.map do |fn_result| # Handle nil explicitly to avoid serializing to "null" output = fn_result[:output] @@ -36,7 +38,7 @@ class Provider::Openai::ChatConfig end [ - { role: "user", content: prompt }, + *input_messages, *results ] end diff --git a/app/models/provider/openai/pdf_processor.rb b/app/models/provider/openai/pdf_processor.rb index f65510e87..1cb63e773 100644 --- a/app/models/provider/openai/pdf_processor.rb +++ b/app/models/provider/openai/pdf_processor.rb @@ -1,15 +1,16 @@ class Provider::Openai::PdfProcessor include Provider::Openai::Concerns::UsageRecorder - attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family + attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family, :max_response_tokens - def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil) + def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil, max_response_tokens:) @client = client @model = model @pdf_content = pdf_content @custom_provider = custom_provider @langfuse_trace = langfuse_trace @family = family + @max_response_tokens = max_response_tokens end def process @@ -175,7 +176,7 @@ class Provider::Openai::PdfProcessor { role: "system", content: instructions + "\n\nIMPORTANT: Respond with valid JSON only, no markdown or other formatting." }, { role: "user", content: content } ], - max_tokens: 4096 + max_tokens: max_response_tokens } response = client.chat(parameters: params) diff --git a/app/models/setting.rb b/app/models/setting.rb index f16a66d6a..c5aa08d7e 100644 --- a/app/models/setting.rb +++ b/app/models/setting.rb @@ -10,6 +10,14 @@ class Setting < RailsSettings::Base field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"] field :openai_model, type: :string, default: ENV["OPENAI_MODEL"] field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"] + + # LLM token budget (applies to every outbound LLM call: chat, auto-categorize, + # merchant detection, enhance-merchants, PDF processing). Defaults track + # Ollama's historical 2048-token baseline so local small-context models work + # out of the box. ENV overrides Setting at read time in Provider::Openai. + field :llm_context_window, type: :integer, default: ENV["LLM_CONTEXT_WINDOW"]&.to_i + field :llm_max_response_tokens, type: :integer, default: ENV["LLM_MAX_RESPONSE_TOKENS"]&.to_i + field :llm_max_items_per_call, type: :integer, default: ENV["LLM_MAX_ITEMS_PER_CALL"]&.to_i field :external_assistant_url, type: :string field :external_assistant_token, type: :string field :external_assistant_agent_id, type: :string diff --git a/app/views/settings/hostings/_openai_settings.html.erb b/app/views/settings/hostings/_openai_settings.html.erb index 6691d9556..b54dd7671 100644 --- a/app/views/settings/hostings/_openai_settings.html.erb +++ b/app/views/settings/hostings/_openai_settings.html.erb @@ -63,5 +63,37 @@ { disabled: ENV["LLM_JSON_MODE"].present?, data: { "auto-submit-form-target": "auto" } } %>

<%= t(".json_mode_help") %>

+ +
+

<%= t(".budget_heading") %>

+

<%= t(".budget_description") %>

+ + <%= form.number_field :llm_context_window, + label: t(".context_window_label"), + placeholder: "2048", + value: Setting.llm_context_window, + min: 256, + disabled: ENV["LLM_CONTEXT_WINDOW"].present?, + data: { "auto-submit-form-target": "auto" } %> +

<%= t(".context_window_help") %>

+ + <%= form.number_field :llm_max_response_tokens, + label: t(".max_response_tokens_label"), + placeholder: "512", + value: Setting.llm_max_response_tokens, + min: 64, + disabled: ENV["LLM_MAX_RESPONSE_TOKENS"].present?, + data: { "auto-submit-form-target": "auto" } %> +

<%= t(".max_response_tokens_help") %>

+ + <%= form.number_field :llm_max_items_per_call, + label: t(".max_items_per_call_label"), + placeholder: "25", + value: Setting.llm_max_items_per_call, + min: 1, + disabled: ENV["LLM_MAX_ITEMS_PER_CALL"].present?, + data: { "auto-submit-form-target": "auto" } %> +

<%= t(".max_items_per_call_help") %>

+
<% end %> diff --git a/compose.example.ai.yml b/compose.example.ai.yml index 1265bb23d..a1de3b6e8 100644 --- a/compose.example.ai.yml +++ b/compose.example.ai.yml @@ -187,7 +187,7 @@ services: - WEBUI_AUTH=False - WEBUI_NAME=AI - WEBUI_URL=http://localhost:8080 - - WEBUI_SECRET_KEY=t0p-s3cr3t + - WEBUI_SECRET_KEY=t0p-s3cr3t # pipelock:ignore - NO_PROXY=host.docker.internal extra_hosts: - host.docker.internal:host-gateway @@ -325,7 +325,7 @@ services: - POSTGRES_HOST=db - POSTGRES_DB=${POSTGRES_DB:-sure_production} - POSTGRES_USER=${POSTGRES_USER:-sure_user} - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password} # pipelock:ignore - SCHEDULE=@daily # Runs once a day at midnight - BACKUP_KEEP_DAYS=7 # Keeps the last 7 days of backups - BACKUP_KEEP_WEEKS=4 # Keeps 4 weekly backups diff --git a/compose.example.yml b/compose.example.yml index 487234038..108fd6b9a 100644 --- a/compose.example.yml +++ b/compose.example.yml @@ -130,7 +130,7 @@ services: - POSTGRES_HOST=db - POSTGRES_DB=${POSTGRES_DB:-sure_production} - POSTGRES_USER=${POSTGRES_USER:-sure_user} - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password} # pipelock:ignore - SCHEDULE=@daily # Runs once a day at midnight - BACKUP_KEEP_DAYS=7 # Keeps the last 7 days of backups - BACKUP_KEEP_WEEKS=4 # Keeps 4 weekly backups diff --git a/config/locales/views/settings/hostings/en.yml b/config/locales/views/settings/hostings/en.yml index 22d1f1d31..0c3f8f7ab 100644 --- a/config/locales/views/settings/hostings/en.yml +++ b/config/locales/views/settings/hostings/en.yml @@ -100,6 +100,14 @@ en: json_mode_none: None (best for standard models) json_mode_json_object: JSON Object json_mode_help: "Strict mode works best with thinking models (qwen-thinking, deepseek-reasoner). None mode works best with standard models (llama, mistral, gpt-oss)." + budget_heading: Token Budget + budget_description: Applies to every LLM call — chat history, auto-categorization, merchant detection, and PDF processing. Defaults are conservative for small-context local models. Raise for cloud models with larger context windows. + context_window_label: Context Window (Optional) + context_window_help: "Total tokens the model will accept. Default: 2048 — raise to 8192+ for cloud OpenAI or large-context local models." + max_response_tokens_label: Max Response Tokens (Optional) + max_response_tokens_help: "Tokens reserved for the model's reply. Default: 512. Lower to free up room for longer history." + max_items_per_call_label: Max Items Per Batch (Optional) + max_items_per_call_help: "Upper bound for auto-categorize / merchant detection batches. Default: 25. Larger batches are auto-sliced to fit the context window." title: OpenAI yahoo_finance_settings: title: Yahoo Finance @@ -161,6 +169,7 @@ en: success: Settings updated invalid_onboarding_state: Invalid onboarding state invalid_sync_time: Invalid sync time format. Please use HH:MM format (e.g., 02:30). + invalid_llm_budget: "%{field} must be a whole number ≥ %{minimum}." scheduler_sync_failed: Settings saved, but failed to update the sync schedule. Please try again or check the server logs. disconnect_external_assistant: external_assistant_disconnected: External assistant disconnected diff --git a/test/controllers/settings/hostings_controller_test.rb b/test/controllers/settings/hostings_controller_test.rb index c38585bc3..4ebe20f87 100644 --- a/test/controllers/settings/hostings_controller_test.rb +++ b/test/controllers/settings/hostings_controller_test.rb @@ -235,6 +235,61 @@ class Settings::HostingsControllerTest < ActionDispatch::IntegrationTest end end + test "accepts valid llm budget overrides and blanks clear them" do + with_self_hosting do + patch settings_hosting_url, params: { setting: { + llm_context_window: "4096", + llm_max_response_tokens: "1024", + llm_max_items_per_call: "40" + } } + + assert_redirected_to settings_hosting_url + assert_equal 4096, Setting.llm_context_window + assert_equal 1024, Setting.llm_max_response_tokens + assert_equal 40, Setting.llm_max_items_per_call + + patch settings_hosting_url, params: { setting: { + llm_context_window: "", + llm_max_response_tokens: "", + llm_max_items_per_call: "" + } } + + assert_nil Setting.llm_context_window + assert_nil Setting.llm_max_response_tokens + assert_nil Setting.llm_max_items_per_call + end + ensure + Setting.llm_context_window = nil + Setting.llm_max_response_tokens = nil + Setting.llm_max_items_per_call = nil + end + + test "rejects llm budget below field minimum" do + with_self_hosting do + patch settings_hosting_url, params: { setting: { llm_context_window: "0" } } + + assert_response :unprocessable_entity + assert_match(/must be a whole number/, flash[:alert]) + assert_nil Setting.llm_context_window + + patch settings_hosting_url, params: { setting: { llm_max_response_tokens: "-5" } } + + assert_response :unprocessable_entity + assert_match(/must be a whole number/, flash[:alert]) + assert_nil Setting.llm_max_response_tokens + + patch settings_hosting_url, params: { setting: { llm_max_items_per_call: "not-a-number" } } + + assert_response :unprocessable_entity + assert_match(/must be a whole number/, flash[:alert]) + assert_nil Setting.llm_max_items_per_call + end + ensure + Setting.llm_context_window = nil + Setting.llm_max_response_tokens = nil + Setting.llm_max_items_per_call = nil + end + test "can clear data only when admin" do with_self_hosting do sign_in users(:family_member) diff --git a/test/models/assistant/history_trimmer_test.rb b/test/models/assistant/history_trimmer_test.rb new file mode 100644 index 000000000..abd94324e --- /dev/null +++ b/test/models/assistant/history_trimmer_test.rb @@ -0,0 +1,93 @@ +require "test_helper" + +class Assistant::HistoryTrimmerTest < ActiveSupport::TestCase + test "returns empty array for empty input" do + assert_equal [], Assistant::HistoryTrimmer.new([], max_tokens: 1000).call + assert_equal [], Assistant::HistoryTrimmer.new(nil, max_tokens: 1000).call + end + + test "returns empty array when max_tokens is zero or negative" do + messages = [ { role: "user", content: "hi" } ] + assert_equal [], Assistant::HistoryTrimmer.new(messages, max_tokens: 0).call + assert_equal [], Assistant::HistoryTrimmer.new(messages, max_tokens: -10).call + end + + test "keeps full history when budget is generous" do + messages = [ + { role: "user", content: "a" * 40 }, + { role: "assistant", content: "b" * 40 }, + { role: "user", content: "c" * 40 } + ] + + result = Assistant::HistoryTrimmer.new(messages, max_tokens: 10_000).call + + assert_equal messages, result + end + + test "drops oldest messages when budget is tight" do + messages = [ + { role: "user", content: "a" * 100 }, + { role: "assistant", content: "b" * 100 }, + { role: "user", content: "c" * 100 } + ] + + # ~40 tokens per message; budget of 60 should keep only the last one + result = Assistant::HistoryTrimmer.new(messages, max_tokens: 60).call + + assert_equal 1, result.size + assert_equal "c" * 100, result.first[:content] + end + + test "keeps tool_call and tool_result as an atomic pair" do + messages = [ + { role: "user", content: "a" * 40 }, + { role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "f", arguments: "{}" } } ] }, + { role: "tool", tool_call_id: "c1", name: "f", content: "result" }, + { role: "assistant", content: "follow up" } + ] + + # Small budget: should keep the newest assistant follow-up. Pair is either + # kept together or dropped together — never split. + result = Assistant::HistoryTrimmer.new(messages, max_tokens: 20).call + + tool_call_idx = result.index { |m| m[:role] == "assistant" && m[:tool_calls] } + tool_result_idx = result.index { |m| m[:role] == "tool" } + + if tool_call_idx + assert_not_nil tool_result_idx, "tool_call without paired tool_result" + assert tool_result_idx > tool_call_idx, "tool_result must follow its tool_call" + else + assert_nil tool_result_idx, "tool_result without paired tool_call leaked through" + end + end + + test "never splits tool_call + tool_result pair when dropping" do + messages = [ + { role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "f", arguments: "{}" } } ] }, + { role: "tool", tool_call_id: "c1", name: "f", content: "x" * 200 }, + { role: "user", content: "y" * 20 } + ] + + # Budget big enough for the user turn, not for the pair + result = Assistant::HistoryTrimmer.new(messages, max_tokens: 15).call + + assert_equal 1, result.size + assert_equal "user", result.first[:role] + end + + test "handles multiple tool results following a single assistant tool_calls message" do + messages = [ + { role: "user", content: "hi" }, + { role: "assistant", content: "", tool_calls: [ + { id: "c1", type: "function", function: { name: "f1", arguments: "{}" } }, + { id: "c2", type: "function", function: { name: "f2", arguments: "{}" } } + ] }, + { role: "tool", tool_call_id: "c1", name: "f1", content: "r1" }, + { role: "tool", tool_call_id: "c2", name: "f2", content: "r2" } + ] + + result = Assistant::HistoryTrimmer.new(messages, max_tokens: 10_000).call + + assert_equal messages, result + end +end diff --git a/test/models/assistant/token_estimator_test.rb b/test/models/assistant/token_estimator_test.rb new file mode 100644 index 000000000..a22313c9f --- /dev/null +++ b/test/models/assistant/token_estimator_test.rb @@ -0,0 +1,40 @@ +require "test_helper" + +class Assistant::TokenEstimatorTest < ActiveSupport::TestCase + test "estimate is zero for nil or empty" do + assert_equal 0, Assistant::TokenEstimator.estimate(nil) + assert_equal 0, Assistant::TokenEstimator.estimate("") + assert_equal 0, Assistant::TokenEstimator.estimate([]) + end + + test "estimate applies chars/4 with safety factor for strings" do + # 100 chars / 4 = 25 tokens, × 1.25 safety factor, ceil = 32 + assert_equal 32, Assistant::TokenEstimator.estimate("a" * 100) + end + + test "estimate sums across arrays" do + # Use lengths that avoid ceil rounding drift: 80 chars → 20 tokens → 25.0 ceil → 25. + string_estimate = Assistant::TokenEstimator.estimate("a" * 80) + array_estimate = Assistant::TokenEstimator.estimate([ "a" * 80, "a" * 80 ]) + + assert_equal string_estimate * 2, array_estimate + end + + test "estimate serializes hashes via JSON" do + hash = { role: "assistant", content: "hi" } + # {"role":"assistant","content":"hi"} => 35 chars / 4 × 1.25 ceil = 11 + assert_equal 11, Assistant::TokenEstimator.estimate(hash) + end + + test "estimate handles nested structures" do + nested = [ { role: "user", content: "hello" }, { role: "assistant", content: "world" } ] + # Each hash gets JSON-serialized and summed + expected = nested.sum { |h| Assistant::TokenEstimator.estimate(h) } + assert_equal expected, Assistant::TokenEstimator.estimate(nested) + end + + test "estimate coerces unknown types via to_s" do + assert Assistant::TokenEstimator.estimate(12345) > 0 + assert Assistant::TokenEstimator.estimate(:symbol) > 0 + end +end diff --git a/test/models/assistant_test.rb b/test/models/assistant_test.rb index b396cf7ed..6efc2e782 100644 --- a/test/models/assistant_test.rb +++ b/test/models/assistant_test.rb @@ -14,6 +14,10 @@ class AssistantTest < ActiveSupport::TestCase @provider = mock @expected_session_id = @chat.id.to_s @expected_user_identifier = ::Digest::SHA256.hexdigest(@chat.user_id.to_s) + @expected_conversation_history = [ + { role: "user", content: "Can you help me understand my spending habits?" }, + { role: "user", content: "What is my net worth?" } + ] end test "errors get added to chat" do @@ -100,6 +104,7 @@ class AssistantTest < ActiveSupport::TestCase @provider.expects(:chat_response).with do |message, **options| assert_equal @expected_session_id, options[:session_id] assert_equal @expected_user_identifier, options[:user_identifier] + assert_equal @expected_conversation_history, options[:messages] text_chunks.each do |text_chunk| options[:streamer].call(text_chunk) end @@ -154,6 +159,7 @@ class AssistantTest < ActiveSupport::TestCase @provider.expects(:chat_response).with do |message, **options| assert_equal @expected_session_id, options[:session_id] assert_equal @expected_user_identifier, options[:user_identifier] + assert_equal @expected_conversation_history, options[:messages] call2_text_chunks.each do |text_chunk| options[:streamer].call(text_chunk) end @@ -165,6 +171,7 @@ class AssistantTest < ActiveSupport::TestCase @provider.expects(:chat_response).with do |message, **options| assert_equal @expected_session_id, options[:session_id] assert_equal @expected_user_identifier, options[:user_identifier] + assert_equal @expected_conversation_history, options[:messages] options[:streamer].call(call1_response_chunk) true end.returns(call1_response).once.in_sequence(sequence) @@ -418,6 +425,95 @@ class AssistantTest < ActiveSupport::TestCase assert_raises(Assistant::Error) { Assistant.for_chat(nil) } end + test "builtin demotes a partially-streamed assistant message to failed on error" do + @assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider) + + boom = StandardError.new("boom mid-stream") + + @provider.expects(:chat_response).with do |_prompt, **options| + # Simulate a partial text chunk landing before the error propagates. + options[:streamer].call(provider_text_chunk("partial tokens ")) + true + end.returns(provider_error_response(boom)) + + @assistant.respond_to(@message) + + partial = @chat.messages.where(type: "AssistantMessage").order(:created_at).last + assert partial.present?, "partial assistant message should be persisted" + assert_equal "failed", partial.status + assert_equal "partial tokens ", partial.content + end + + test "conversation_history excludes failed and pending messages" do + # Add a failed assistant turn; it must NOT leak into history. + AssistantMessage.create!( + chat: @chat, + content: "partial error response", + ai_model: "gpt-4.1", + status: "failed" + ) + + @assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider) + + captured_history = nil + @provider.expects(:chat_response).with do |_prompt, **options| + captured_history = options[:messages] + options[:streamer].call( + provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []) + ) + true + end.returns(provider_success_response( + provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []).data + )) + + @assistant.respond_to(@message) + + contents = captured_history.map { |m| m[:content] } + assert_not_includes contents, "partial error response" + end + + test "conversation_history serializes assistant tool_calls with paired tool result" do + assistant_msg = AssistantMessage.create!( + chat: @chat, + content: "Looking that up", + ai_model: "gpt-4.1", + status: "complete" + ) + + ToolCall::Function.create!( + message: assistant_msg, + provider_id: "call_abc", + provider_call_id: "call_abc", + function_name: "get_net_worth", + function_arguments: { foo: "bar" }, + function_result: { amount: 1000, currency: "USD" } + ) + + @assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider) + + captured_history = nil + @provider.expects(:chat_response).with do |_prompt, **options| + captured_history = options[:messages] + options[:streamer].call( + provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []) + ) + true + end.returns(provider_success_response( + provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []).data + )) + + @assistant.respond_to(@message) + + tool_call_entry = captured_history.find { |m| m[:role] == "assistant" && m[:tool_calls].present? } + tool_result_entry = captured_history.find { |m| m[:role] == "tool" } + + assert_not_nil tool_call_entry, "tool_call message missing from history" + assert_not_nil tool_result_entry, "tool_result message missing from history" + assert_equal "call_abc", tool_call_entry[:tool_calls].first[:id] + assert_equal "call_abc", tool_result_entry[:tool_call_id] + assert_equal "get_net_worth", tool_result_entry[:name] + end + private def mock_external_sse_response(sse_body) diff --git a/test/models/provider/openai/batch_slicer_test.rb b/test/models/provider/openai/batch_slicer_test.rb new file mode 100644 index 000000000..c56133991 --- /dev/null +++ b/test/models/provider/openai/batch_slicer_test.rb @@ -0,0 +1,74 @@ +require "test_helper" + +class Provider::Openai::BatchSlicerTest < ActiveSupport::TestCase + test "returns empty array for empty input" do + assert_equal [], Provider::Openai::BatchSlicer.call([], max_items: 25, max_tokens: 2000) + end + + test "single batch when within item cap and token budget" do + items = Array.new(10) { |i| { id: i, name: "Item#{i}" } } + + batches = Provider::Openai::BatchSlicer.call(items, max_items: 25, max_tokens: 10_000) + + assert_equal 1, batches.size + assert_equal items, batches.first + end + + test "splits on item cap" do + items = Array.new(30) { |i| { id: i } } + + batches = Provider::Openai::BatchSlicer.call(items, max_items: 10, max_tokens: 100_000) + + assert_equal 3, batches.size + assert_equal 10, batches.first.size + assert_equal 10, batches.last.size + assert_equal items, batches.flatten + end + + test "splits on token budget" do + big_item = { payload: "x" * 200 } # ~63 tokens each + items = Array.new(10) { big_item } + + # ~125-token budget fits 2 items per batch + batches = Provider::Openai::BatchSlicer.call(items, max_items: 100, max_tokens: 125) + + assert batches.size > 1 + assert_equal items, batches.flatten + batches.each do |batch| + tokens = Assistant::TokenEstimator.estimate(batch) + assert tokens <= 125, "Batch exceeded token budget: #{tokens}" + end + end + + test "raises ContextOverflowError when single item exceeds budget" do + huge = { payload: "x" * 10_000 } + + assert_raises Provider::Openai::BatchSlicer::ContextOverflowError do + Provider::Openai::BatchSlicer.call([ huge ], max_items: 25, max_tokens: 500) + end + end + + test "raises ContextOverflowError when fixed_tokens exceed budget" do + assert_raises Provider::Openai::BatchSlicer::ContextOverflowError do + Provider::Openai::BatchSlicer.call([ { a: 1 } ], max_items: 25, max_tokens: 500, fixed_tokens: 600) + end + end + + test "respects fixed_tokens when computing available budget" do + items = Array.new(5) { { payload: "x" * 80 } } # ~25 tokens each + + # Budget of 200 minus fixed 100 = 100 available; fits ~4 items per batch + batches = Provider::Openai::BatchSlicer.call(items, max_items: 25, max_tokens: 200, fixed_tokens: 100) + + assert batches.size >= 2 + assert_equal items, batches.flatten + end + + test "never produces empty batches" do + items = Array.new(7) { { id: SecureRandom.hex(4) } } + + batches = Provider::Openai::BatchSlicer.call(items, max_items: 2, max_tokens: 10_000) + + batches.each { |b| assert b.any?, "Empty batch produced" } + end +end diff --git a/test/models/provider/openai_test.rb b/test/models/provider/openai_test.rb index 879fe20e7..5d3e5e960 100644 --- a/test/models/provider/openai_test.rb +++ b/test/models/provider/openai_test.rb @@ -330,4 +330,130 @@ class Provider::OpenaiTest < ActiveSupport::TestCase @subject.send(:create_langfuse_trace, name: "openai.test", input: { foo: "bar" }) end + + test "SUPPORTED_MODELS and VISION_CAPABLE_MODEL_PREFIXES are Ruby constants, not YAML-derived" do + assert_kind_of Array, Provider::Openai::SUPPORTED_MODELS + assert Provider::Openai::SUPPORTED_MODELS.all? { |s| s.is_a?(String) } + assert Provider::Openai::SUPPORTED_MODELS.frozen? + + assert_kind_of Array, Provider::Openai::VISION_CAPABLE_MODEL_PREFIXES + assert Provider::Openai::VISION_CAPABLE_MODEL_PREFIXES.frozen? + assert_equal "gpt-4.1", Provider::Openai::DEFAULT_MODEL + end + + test "budget readers default to conservative values" do + with_env_overrides( + "LLM_CONTEXT_WINDOW" => nil, + "LLM_MAX_RESPONSE_TOKENS" => nil, + "LLM_SYSTEM_PROMPT_RESERVE" => nil, + "LLM_MAX_HISTORY_TOKENS" => nil, + "LLM_MAX_ITEMS_PER_CALL" => nil + ) do + subject = Provider::Openai.new("test-token") + assert_equal 2048, subject.context_window + assert_equal 512, subject.max_response_tokens + assert_equal 256, subject.system_prompt_reserve + assert_equal 2048 - 512 - 256, subject.max_history_tokens + assert_equal 2048 - 512 - 256, subject.max_input_tokens + assert_equal 25, subject.max_items_per_call + end + end + + test "budget readers respect explicit env overrides" do + with_env_overrides( + "LLM_CONTEXT_WINDOW" => "8192", + "LLM_MAX_RESPONSE_TOKENS" => "1024", + "LLM_SYSTEM_PROMPT_RESERVE" => "512", + "LLM_MAX_HISTORY_TOKENS" => "4096", + "LLM_MAX_ITEMS_PER_CALL" => "50" + ) do + subject = Provider::Openai.new("test-token") + assert_equal 8192, subject.context_window + assert_equal 1024, subject.max_response_tokens + assert_equal 512, subject.system_prompt_reserve + assert_equal 4096, subject.max_history_tokens # explicit overrides derived + assert_equal 8192 - 1024 - 512, subject.max_input_tokens + assert_equal 50, subject.max_items_per_call + end + end + + test "budget readers fall back to Setting when ENV unset" do + with_env_overrides( + "LLM_CONTEXT_WINDOW" => nil, + "LLM_MAX_RESPONSE_TOKENS" => nil, + "LLM_MAX_ITEMS_PER_CALL" => nil + ) do + Setting.llm_context_window = 8192 + Setting.llm_max_response_tokens = 1024 + Setting.llm_max_items_per_call = 40 + + subject = Provider::Openai.new("test-token") + assert_equal 8192, subject.context_window + assert_equal 1024, subject.max_response_tokens + assert_equal 40, subject.max_items_per_call + end + ensure + Setting.llm_context_window = nil + Setting.llm_max_response_tokens = nil + Setting.llm_max_items_per_call = nil + end + + test "budget readers: ENV beats Setting when both present" do + with_env_overrides("LLM_CONTEXT_WINDOW" => "16384") do + Setting.llm_context_window = 4096 + subject = Provider::Openai.new("test-token") + assert_equal 16384, subject.context_window + end + ensure + Setting.llm_context_window = nil + end + + test "budget readers: zero or negative values fall through to default" do + with_env_overrides( + "LLM_CONTEXT_WINDOW" => "0", + "LLM_MAX_RESPONSE_TOKENS" => nil, + "LLM_MAX_ITEMS_PER_CALL" => nil + ) do + Setting.llm_context_window = 0 + subject = Provider::Openai.new("test-token") + assert_equal 2048, subject.context_window + end + ensure + Setting.llm_context_window = nil + end + + test "auto_categorize fans out oversized batches into sequential sub-calls" do + with_env_overrides("LLM_MAX_ITEMS_PER_CALL" => "10") do + subject = Provider::Openai.new("test-token") + transactions = Array.new(25) { |i| { id: i.to_s, name: "txn#{i}", amount: 10, classification: "expense" } } + user_categories = [ { id: "cat1", name: "Groceries", is_subcategory: false, parent_id: nil, classification: "expense" } ] + + # Capture the batch size passed to each AutoCategorizer. `.new` is called + # once per sub-batch; we record each invocation's transactions count. + seen_sizes = [] + fake_instance = mock + fake_instance.stubs(:auto_categorize).returns([]) + Provider::Openai::AutoCategorizer.stubs(:new).with do |*_args, **kwargs| + seen_sizes << kwargs[:transactions].size + true + end.returns(fake_instance) + + response = subject.auto_categorize(transactions: transactions, user_categories: user_categories) + + assert response.success? + assert_equal [ 10, 10, 5 ], seen_sizes + end + end + + test "build_input no longer accepts inline messages history" do + config = Provider::Openai::ChatConfig.new(functions: [], function_results: []) + # Positive control: prompt works + result = config.build_input(prompt: "hi") + assert_equal [ { role: "user", content: "hi" } ], result + + # `messages:` kwarg is no longer part of the signature — calling with it must raise + assert_raises(ArgumentError) do + config.build_input(prompt: "hi", messages: [ { role: "user", content: "old" } ]) + end + end end