diff --git a/.env.example b/.env.example index f5e98e7a4..b11153b27 100644 --- a/.env.example +++ b/.env.example @@ -25,19 +25,34 @@ OPENAI_ACCESS_TOKEN= OPENAI_MODEL= OPENAI_URI_BASE= +# Optional: LLM token budget (applies to chat, auto-categorize, merchant detection, PDF processing). +# Lower these for small-context local models (Ollama, LM Studio, LocalAI). +# Defaults work for modern cloud OpenAI models without configuration. +# LLM_CONTEXT_WINDOW=2048 +# LLM_MAX_RESPONSE_TOKENS=512 +# LLM_MAX_HISTORY_TOKENS= +# LLM_SYSTEM_PROMPT_RESERVE=256 +# LLM_MAX_ITEMS_PER_CALL=25 + +# Optional: OpenAI-compatible capability flags +# OPENAI_REQUEST_TIMEOUT=60 # HTTP timeout in seconds; raise for slow local models +# OPENAI_SUPPORTS_PDF_PROCESSING=true # Set to false for endpoints without vision support +# OPENAI_SUPPORTS_RESPONSES_ENDPOINT= # Override Responses-API vs chat.completions routing +# LLM_JSON_MODE= # auto | strict | json_object | none + # Optional: External AI Assistant — delegates chat to a remote AI agent # instead of calling LLMs directly. The agent calls back to Sure's /mcp endpoint. # See docs/hosting/ai.md for full details. # ASSISTANT_TYPE=external # EXTERNAL_ASSISTANT_URL=https://your-agent-host/v1/chat/completions -# EXTERNAL_ASSISTANT_TOKEN=your-api-token +# EXTERNAL_ASSISTANT_TOKEN=your-api-token # pipelock:ignore # EXTERNAL_ASSISTANT_AGENT_ID=main # EXTERNAL_ASSISTANT_SESSION_KEY=agent:main:main # EXTERNAL_ASSISTANT_ALLOWED_EMAILS=user@example.com # Optional: MCP server endpoint — enables /mcp for external AI assistants. # Both values are required. MCP_USER_EMAIL must match an existing user's email. -# MCP_API_TOKEN=your-random-bearer-token +# MCP_API_TOKEN=your-random-bearer-token # pipelock:ignore # MCP_USER_EMAIL=user@example.com # Optional: Langfuse config @@ -82,7 +97,7 @@ EMAIL_SENDER= # Database Configuration DB_HOST=localhost # May need to be changed to `DB_HOST=db` if using devcontainer DB_PORT=5432 -POSTGRES_PASSWORD=postgres +POSTGRES_PASSWORD=postgres # pipelock:ignore POSTGRES_USER=postgres # Redis configuration @@ -94,7 +109,7 @@ REDIS_URL=redis://localhost:6379/1 # REDIS_SENTINEL_HOSTS=sentinel1:26379,sentinel2:26379,sentinel3:26379 # REDIS_SENTINEL_MASTER=mymaster # REDIS_SENTINEL_USERNAME=default -# REDIS_PASSWORD=your-redis-password +# REDIS_PASSWORD=your-redis-password # pipelock:ignore # App Domain # This is the domain that your Sure instance will be hosted at. It is used to generate links in emails and other places. diff --git a/.env.local.example b/.env.local.example index 75a8de778..1795cc8b9 100644 --- a/.env.local.example +++ b/.env.local.example @@ -28,8 +28,22 @@ TWELVE_DATA_API_KEY = OPENAI_ACCESS_TOKEN = OPENAI_URI_BASE = OPENAI_MODEL = -# OPENAI_REQUEST_TIMEOUT: Request timeout in seconds (default: 60) -# OPENAI_SUPPORTS_PDF_PROCESSING: Set to false for endpoints without vision support (default: true) + +# LLM token budget. Applies to ALL outbound LLM calls: chat history, +# auto-categorize, merchant detection, provider enhancer, PDF processing. +# Defaults to Ollama's historical 2048-token baseline so small local models +# work out of the box — raise explicitly for cloud or larger-context models. +# LLM_CONTEXT_WINDOW = 2048 # Total tokens the model will accept +# LLM_MAX_RESPONSE_TOKENS = 512 # Reserved for the model's reply +# LLM_MAX_HISTORY_TOKENS = # Derived if unset (context - response - system_reserve) +# LLM_SYSTEM_PROMPT_RESERVE = 256 # Tokens reserved for the system prompt +# LLM_MAX_ITEMS_PER_CALL = 25 # Upper bound on auto-categorize / merchant batches + +# OpenAI-compatible capability flags (custom/self-hosted providers) +# OPENAI_REQUEST_TIMEOUT = 60 # HTTP timeout in seconds; raise for slow local models +# OPENAI_SUPPORTS_PDF_PROCESSING = true # Set to false for endpoints without vision support +# OPENAI_SUPPORTS_RESPONSES_ENDPOINT = # true to force Responses API on custom providers +# LLM_JSON_MODE = # auto | strict | json_object | none # (example: LM Studio/Docker config) OpenAI-compatible API endpoint config # OPENAI_URI_BASE = http://host.docker.internal:1234/ diff --git a/app/controllers/settings/hostings_controller.rb b/app/controllers/settings/hostings_controller.rb index 59a85939a..063f2a6b0 100644 --- a/app/controllers/settings/hostings_controller.rb +++ b/app/controllers/settings/hostings_controller.rb @@ -1,6 +1,15 @@ class Settings::HostingsController < ApplicationController layout "settings" + # Minimum accepted value for each configurable LLM budget field. Mirrors the + # `min:` attribute on the form inputs in `_openai_settings.html.erb` so the + # controller rejects what the browser-side validator would reject. + LLM_BUDGET_MINIMUMS = { + llm_context_window: 256, + llm_max_response_tokens: 64, + llm_max_items_per_call: 1 + }.freeze + guard_feature unless: -> { self_hosted? } before_action :ensure_admin, only: [ :update, :clear_cache, :disconnect_external_assistant ] @@ -157,6 +166,21 @@ class Settings::HostingsController < ApplicationController Setting.openai_json_mode = hosting_params[:openai_json_mode].presence end + LLM_BUDGET_MINIMUMS.each do |key, minimum| + next unless hosting_params.key?(key) + raw = hosting_params[key].to_s.strip + if raw.blank? + Setting.public_send("#{key}=", nil) + next + end + parsed = Integer(raw, 10) rescue nil + if parsed.nil? || parsed < minimum + label = t("settings.hostings.openai_settings.#{key}_label") + raise Setting::ValidationError, t(".invalid_llm_budget", field: label, minimum: minimum) + end + Setting.public_send("#{key}=", parsed) + end + if hosting_params.key?(:external_assistant_url) Setting.external_assistant_url = hosting_params[:external_assistant_url] end @@ -199,7 +223,7 @@ class Settings::HostingsController < ApplicationController private def hosting_params return ActionController::Parameters.new unless params.key?(:setting) - params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :invite_only_default_family_id, :brand_fetch_client_id, :brand_fetch_high_res_logos, :twelve_data_api_key, :tiingo_api_key, :eodhd_api_key, :alpha_vantage_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :exchange_rate_provider, :securities_provider, :syncs_include_pending, :auto_sync_enabled, :auto_sync_time, :external_assistant_url, :external_assistant_token, :external_assistant_agent_id, securities_providers: []) + params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :invite_only_default_family_id, :brand_fetch_client_id, :brand_fetch_high_res_logos, :twelve_data_api_key, :tiingo_api_key, :eodhd_api_key, :alpha_vantage_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :llm_context_window, :llm_max_response_tokens, :llm_max_items_per_call, :exchange_rate_provider, :securities_provider, :syncs_include_pending, :auto_sync_enabled, :auto_sync_time, :external_assistant_url, :external_assistant_token, :external_assistant_agent_id, securities_providers: []) end def update_assistant_type diff --git a/app/models/assistant/builtin.rb b/app/models/assistant/builtin.rb index 1d615eb5a..14bc9c05d 100644 --- a/app/models/assistant/builtin.rb +++ b/app/models/assistant/builtin.rb @@ -63,6 +63,13 @@ class Assistant::Builtin < Assistant::Base responder.respond(previous_response_id: latest_response_id) rescue => e stop_thinking + # If we streamed any partial content before the error, the message was + # persisted with the default `complete` status. Demote it to `failed` so + # `Assistant::Responder#conversation_history` won't feed a broken turn + # back into future prompts. + if assistant_message&.persisted? + assistant_message.update_columns(status: "failed") + end chat.add_error(e) end diff --git a/app/models/assistant/history_trimmer.rb b/app/models/assistant/history_trimmer.rb new file mode 100644 index 000000000..02e64f8b9 --- /dev/null +++ b/app/models/assistant/history_trimmer.rb @@ -0,0 +1,53 @@ +class Assistant::HistoryTrimmer + def initialize(messages, max_tokens:) + @messages = messages || [] + @max_tokens = max_tokens.to_i + end + + def call + return [] if @messages.empty? || @max_tokens <= 0 + + kept = [] + tokens = 0 + + group_tool_pairs(@messages).reverse_each do |group| + group_tokens = Assistant::TokenEstimator.estimate(group) + break if tokens + group_tokens > @max_tokens + + kept.unshift(*group) + tokens += group_tokens + end + + kept + end + + private + + # Bundles each assistant message that has `tool_calls` with the + # consecutive `role: "tool"` results that follow it, so the trimmer + # never splits a call/result pair when dropping from the oldest end. + def group_tool_pairs(messages) + groups = [] + current_group = nil + + messages.each do |msg| + if assistant_with_tool_calls?(msg) + groups << current_group if current_group + current_group = [ msg ] + elsif msg[:role].to_s == "tool" && current_group + current_group << msg + else + groups << current_group if current_group + current_group = nil + groups << [ msg ] + end + end + + groups << current_group if current_group + groups + end + + def assistant_with_tool_calls?(msg) + msg[:role].to_s == "assistant" && msg[:tool_calls].is_a?(Array) && msg[:tool_calls].any? + end +end diff --git a/app/models/assistant/responder.rb b/app/models/assistant/responder.rb index f2b6d121a..480c69c22 100644 --- a/app/models/assistant/responder.rb +++ b/app/models/assistant/responder.rb @@ -79,6 +79,7 @@ class Assistant::Responder instructions: instructions, functions: function_tool_caller.function_definitions, function_results: function_results, + messages: conversation_history, streamer: streamer, previous_response_id: previous_response_id, session_id: chat_session_id, @@ -114,4 +115,46 @@ class Assistant::Responder def chat @chat ||= message.chat end + + def conversation_history + messages = [] + return messages unless chat&.messages + + chat.messages + .where(type: [ "UserMessage", "AssistantMessage" ], status: "complete") + .includes(:tool_calls) + .ordered + .each do |chat_message| + if chat_message.tool_calls.any? + messages << { + role: chat_message.role, + content: chat_message.content || "", + tool_calls: chat_message.tool_calls.map(&:to_tool_call) + } + + chat_message.tool_calls.map(&:to_result).each do |fn_result| + # Handle nil explicitly to avoid serializing to "null" + output = fn_result[:output] + content = if output.nil? + "" + elsif output.is_a?(String) + output + else + output.to_json + end + + messages << { + role: "tool", + tool_call_id: fn_result[:call_id], + name: fn_result[:name], + content: content + } + end + + elsif !chat_message.content.blank? + messages << { role: chat_message.role, content: chat_message.content || "" } + end + end + messages + end end diff --git a/app/models/assistant/token_estimator.rb b/app/models/assistant/token_estimator.rb new file mode 100644 index 000000000..4f5e3fd60 --- /dev/null +++ b/app/models/assistant/token_estimator.rb @@ -0,0 +1,19 @@ +module Assistant::TokenEstimator + CHARS_PER_TOKEN = 4 + SAFETY_FACTOR = 1.25 + + def self.estimate(value) + chars = char_length(value) + ((chars / CHARS_PER_TOKEN.to_f) * SAFETY_FACTOR).ceil + end + + def self.char_length(value) + case value + when nil then 0 + when String then value.length + when Array then value.sum { |v| char_length(v) } + when Hash then value.to_json.length + else value.to_s.length + end + end +end diff --git a/app/models/chat.rb b/app/models/chat.rb index d47dcccac..f97fca68b 100644 --- a/app/models/chat.rb +++ b/app/models/chat.rb @@ -26,9 +26,9 @@ class Chat < ApplicationRecord end # Returns the default AI model to use for chats - # Priority: ENV variable > Setting > OpenAI default + # Priority: AI Config > Setting def default_model - ENV["OPENAI_MODEL"].presence || Setting.openai_model.presence || Provider::Openai::DEFAULT_MODEL + Provider::Openai.effective_model.presence || Setting.openai_model end end diff --git a/app/models/provider/llm_concept.rb b/app/models/provider/llm_concept.rb index 46e6dcd84..52550111f 100644 --- a/app/models/provider/llm_concept.rb +++ b/app/models/provider/llm_concept.rb @@ -40,6 +40,7 @@ module Provider::LlmConcept instructions: nil, functions: [], function_results: [], + messages: nil, streamer: nil, previous_response_id: nil, session_id: nil, diff --git a/app/models/provider/openai.rb b/app/models/provider/openai.rb index 60c103a9e..052829cc1 100644 --- a/app/models/provider/openai.rb +++ b/app/models/provider/openai.rb @@ -4,31 +4,29 @@ class Provider::Openai < Provider # Subclass so errors caught in this provider are raised as Provider::Openai::Error Error = Class.new(Provider::Error) - # Supported OpenAI model prefixes (e.g., "gpt-4" matches "gpt-4", "gpt-4.1", "gpt-4-turbo", etc.) - DEFAULT_OPENAI_MODEL_PREFIXES = %w[gpt-4 gpt-5 o1 o3] - DEFAULT_MODEL = "gpt-4.1" - - # Models that support PDF/vision input (not all OpenAI models have vision capabilities) + DEFAULT_MODEL = "gpt-4.1".freeze + SUPPORTED_MODELS = %w[gpt-4 gpt-5 o1 o3].freeze VISION_CAPABLE_MODEL_PREFIXES = %w[gpt-4o gpt-4-turbo gpt-4.1 gpt-5 o1 o3].freeze - # Returns the effective model that would be used by the provider - # Uses the same logic as Provider::Registry and the initializer + # Returns the effective model that would be used by the provider. + # Priority: explicit ENV > Setting > DEFAULT_MODEL. def self.effective_model - configured_model = ENV.fetch("OPENAI_MODEL", Setting.openai_model) - configured_model.presence || DEFAULT_MODEL + ENV.fetch("OPENAI_MODEL") { Setting.openai_model }.presence || DEFAULT_MODEL end def initialize(access_token, uri_base: nil, model: nil) client_options = { access_token: access_token } - client_options[:uri_base] = uri_base if uri_base.present? + llm_uri_base = uri_base.presence + llm_model = model.presence + client_options[:uri_base] = llm_uri_base if llm_uri_base.present? client_options[:request_timeout] = ENV.fetch("OPENAI_REQUEST_TIMEOUT", 60).to_i @client = ::OpenAI::Client.new(**client_options) - @uri_base = uri_base - if custom_provider? && model.blank? + @uri_base = llm_uri_base + if custom_provider? && llm_model.blank? raise Error, "Model is required when using a custom OpenAI‑compatible provider" end - @default_model = model.presence || DEFAULT_MODEL + @default_model = llm_model.presence || self.class.effective_model end def supports_model?(model) @@ -36,7 +34,18 @@ class Provider::Openai < Provider return true if custom_provider? # Otherwise, check if model starts with any supported OpenAI prefix - DEFAULT_OPENAI_MODEL_PREFIXES.any? { |prefix| model.start_with?(prefix) } + SUPPORTED_MODELS.any? { |prefix| model.start_with?(prefix) } + end + + def supports_responses_endpoint? + return @supports_responses_endpoint if defined?(@supports_responses_endpoint) + + env_override = ENV["OPENAI_SUPPORTS_RESPONSES_ENDPOINT"] + if env_override.to_s.present? + return @supports_responses_endpoint = ActiveModel::Type::Boolean.new.cast(env_override) + end + + @supports_responses_endpoint = !custom_provider? end def provider_name @@ -47,7 +56,7 @@ class Provider::Openai < Provider if custom_provider? @default_model.present? ? "configured model: #{@default_model}" : "any model" else - "models starting with: #{DEFAULT_OPENAI_MODEL_PREFIXES.join(', ')}" + "models starting with: #{SUPPORTED_MODELS.join(", ")}" end end @@ -55,9 +64,43 @@ class Provider::Openai < Provider @uri_base.present? end + # Token-budget knobs. Precedence: ENV > Setting > default. Defaults match + # Ollama's historical 2048-token baseline so local small-context models work + # out of the box. Users on larger-context cloud models can raise via ENV or + # via the Self-Hosting settings page. + def context_window + positive_budget(ENV["LLM_CONTEXT_WINDOW"], Setting.llm_context_window, 2048) + end + + def max_response_tokens + positive_budget(ENV["LLM_MAX_RESPONSE_TOKENS"], Setting.llm_max_response_tokens, 512) + end + + def system_prompt_reserve + positive_budget(ENV["LLM_SYSTEM_PROMPT_RESERVE"], nil, 256) + end + + def max_history_tokens + explicit = ENV["LLM_MAX_HISTORY_TOKENS"].presence&.to_i + return explicit if explicit&.positive? + [ context_window - max_response_tokens - system_prompt_reserve, 256 ].max + end + + # Budget available for a one-shot (non-chat) request's full input, + # excluding reserved response tokens AND the system/instructions prompt. + # Drives the batch slicer for the auto_categorize / auto_detect_merchants / + # enhance_provider_merchants calls — each ships ~200–400 tokens of + # instructions + JSON schema that aren't counted in `fixed_tokens`. + def max_input_tokens + [ context_window - max_response_tokens - system_prompt_reserve, 256 ].max + end + + def max_items_per_call + positive_budget(ENV["LLM_MAX_ITEMS_PER_CALL"], Setting.llm_max_items_per_call, 25) + end + def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil) with_provider_response do - raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25 if user_categories.blank? family_id = family&.id || "unknown" Rails.logger.error("Cannot auto-categorize transactions for family #{family_id}: no categories available") @@ -71,16 +114,20 @@ class Provider::Openai < Provider input: { transactions: transactions, user_categories: user_categories } ) - result = AutoCategorizer.new( - client, - model: effective_model, - transactions: transactions, - user_categories: user_categories, - custom_provider: custom_provider?, - langfuse_trace: trace, - family: family, - json_mode: json_mode - ).auto_categorize + batches = slice_for_context(transactions, fixed: user_categories) + + result = batches.flat_map do |batch| + AutoCategorizer.new( + client, + model: effective_model, + transactions: batch, + user_categories: user_categories, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family, + json_mode: json_mode + ).auto_categorize + end upsert_langfuse_trace(trace: trace, output: result.map(&:to_h)) @@ -90,8 +137,6 @@ class Provider::Openai < Provider def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil) with_provider_response do - raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." if transactions.size > 25 - effective_model = model.presence || @default_model trace = create_langfuse_trace( @@ -99,16 +144,20 @@ class Provider::Openai < Provider input: { transactions: transactions, user_merchants: user_merchants } ) - result = AutoMerchantDetector.new( - client, - model: effective_model, - transactions: transactions, - user_merchants: user_merchants, - custom_provider: custom_provider?, - langfuse_trace: trace, - family: family, - json_mode: json_mode - ).auto_detect_merchants + batches = slice_for_context(transactions, fixed: user_merchants) + + result = batches.flat_map do |batch| + AutoMerchantDetector.new( + client, + model: effective_model, + transactions: batch, + user_merchants: user_merchants, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family, + json_mode: json_mode + ).auto_detect_merchants + end upsert_langfuse_trace(trace: trace, output: result.map(&:to_h)) @@ -118,8 +167,6 @@ class Provider::Openai < Provider def enhance_provider_merchants(merchants: [], model: "", family: nil, json_mode: nil) with_provider_response do - raise Error, "Too many merchants to enhance. Max is 25 per request." if merchants.size > 25 - effective_model = model.presence || @default_model trace = create_langfuse_trace( @@ -127,15 +174,19 @@ class Provider::Openai < Provider input: { merchants: merchants } ) - result = ProviderMerchantEnhancer.new( - client, - model: effective_model, - merchants: merchants, - custom_provider: custom_provider?, - langfuse_trace: trace, - family: family, - json_mode: json_mode - ).enhance_merchants + batches = slice_for_context(merchants) + + result = batches.flat_map do |batch| + ProviderMerchantEnhancer.new( + client, + model: effective_model, + merchants: batch, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family, + json_mode: json_mode + ).enhance_merchants + end upsert_langfuse_trace(trace: trace, output: result.map(&:to_h)) @@ -171,7 +222,8 @@ class Provider::Openai < Provider pdf_content: pdf_content, custom_provider: custom_provider?, langfuse_trace: trace, - family: family + family: family, + max_response_tokens: max_response_tokens ).process upsert_langfuse_trace(trace: trace, output: result.to_h) @@ -207,25 +259,17 @@ class Provider::Openai < Provider instructions: nil, functions: [], function_results: [], + messages: nil, streamer: nil, previous_response_id: nil, session_id: nil, user_identifier: nil, family: nil ) - if custom_provider? - generic_chat_response( - prompt: prompt, - model: model, - instructions: instructions, - functions: functions, - function_results: function_results, - streamer: streamer, - session_id: session_id, - user_identifier: user_identifier, - family: family - ) - else + if supports_responses_endpoint? + # Native path uses the Responses API which chains history via + # `previous_response_id`; it does NOT need (and must not receive) + # inline message history in the input payload. native_chat_response( prompt: prompt, model: model, @@ -238,12 +282,48 @@ class Provider::Openai < Provider user_identifier: user_identifier, family: family ) + else + generic_chat_response( + prompt: prompt, + model: model, + instructions: instructions, + functions: functions, + function_results: function_results, + messages: messages, + streamer: streamer, + session_id: session_id, + user_identifier: user_identifier, + family: family + ) end end private attr_reader :client + # Returns the first positive integer among env, setting, default. Treats + # zero or negative values as "unset" and falls through — a 0-token budget + # is never what the user meant. + def positive_budget(env_value, setting_value, default) + from_env = env_value.to_s.strip.to_i + return from_env if from_env.positive? + return setting_value.to_i if setting_value.to_i.positive? + default + end + + # Routes one-shot (non-chat) inputs through the BatchSlicer so large + # caller batches are split to fit the model's context window. `fixed` is + # the portion of the prompt that stays constant across every sub-batch + # (e.g. user_categories, user_merchants), used for fixed-tokens accounting. + def slice_for_context(items, fixed: nil) + BatchSlicer.call( + Array(items), + max_items: max_items_per_call, + max_tokens: max_input_tokens, + fixed_tokens: fixed ? Assistant::TokenEstimator.estimate(fixed) : 0 + ) + end + def native_chat_response( prompt:, model:, @@ -278,7 +358,7 @@ class Provider::Openai < Provider nil end - input_payload = chat_config.build_input(prompt) + input_payload = chat_config.build_input(prompt: prompt) begin raw_response = client.responses.create(parameters: { @@ -344,6 +424,7 @@ class Provider::Openai < Provider instructions: nil, functions: [], function_results: [], + messages: nil, streamer: nil, session_id: nil, user_identifier: nil, @@ -353,7 +434,8 @@ class Provider::Openai < Provider messages = build_generic_messages( prompt: prompt, instructions: instructions, - function_results: function_results + function_results: function_results, + messages: messages ) tools = build_generic_tools(functions) @@ -412,16 +494,24 @@ class Provider::Openai < Provider end end - def build_generic_messages(prompt:, instructions: nil, function_results: []) - messages = [] + def build_generic_messages(prompt:, instructions: nil, function_results: [], messages: nil) + payload = [] # Add system message if instructions present if instructions.present? - messages << { role: "system", content: instructions } + payload << { role: "system", content: instructions } end - # Add user prompt - messages << { role: "user", content: prompt } + # Add conversation history or user prompt. History is trimmed to fit the + # configured token budget so small-context local models (Ollama, LM Studio, + # LocalAI) don't silently truncate. tool_call/tool_result pairs are + # preserved atomically by HistoryTrimmer. + if messages.present? + trimmed = Assistant::HistoryTrimmer.new(messages, max_tokens: max_history_tokens).call + payload.concat(trimmed) + elsif prompt.present? + payload << { role: "user", content: prompt } + end # If there are function results, we need to add the assistant message that made the tool calls # followed by the tool messages with the results @@ -442,7 +532,7 @@ class Provider::Openai < Provider } end - messages << { + payload << { role: "assistant", content: "", # Some OpenAI-compatible APIs require string, not null tool_calls: tool_calls @@ -462,7 +552,7 @@ class Provider::Openai < Provider output.to_json end - messages << { + payload << { role: "tool", tool_call_id: fn_result[:call_id], name: fn_result[:name], @@ -471,7 +561,7 @@ class Provider::Openai < Provider end end - messages + payload end def build_generic_tools(functions) diff --git a/app/models/provider/openai/batch_slicer.rb b/app/models/provider/openai/batch_slicer.rb new file mode 100644 index 000000000..577ddba5d --- /dev/null +++ b/app/models/provider/openai/batch_slicer.rb @@ -0,0 +1,45 @@ +class Provider::Openai::BatchSlicer + class ContextOverflowError < StandardError; end + + # Splits `items` into sub-batches that respect both a hard item cap and a + # token-budget cap. Used by auto_categorize / auto_detect_merchants / + # enhance_provider_merchants so callers can pass larger batches and have the + # provider fan them out to fit small-context models. + def self.call(items, max_items:, max_tokens:, fixed_tokens: 0) + items = Array(items) + return [] if items.empty? + + available = max_tokens.to_i - fixed_tokens.to_i + if available <= 0 + raise ContextOverflowError, + "Fixed prompt tokens (#{fixed_tokens}) exceed context budget (#{max_tokens})" + end + + batches = [] + current = [] + current_tokens = 0 + + items.each do |item| + item_tokens = Assistant::TokenEstimator.estimate(item) + if item_tokens > available + raise ContextOverflowError, + "Single item requires ~#{item_tokens} tokens, which exceeds available budget (#{available})" + end + + would_exceed_items = current.size >= max_items.to_i + would_exceed_tokens = current_tokens + item_tokens > available + + if would_exceed_items || would_exceed_tokens + batches << current unless current.empty? + current = [] + current_tokens = 0 + end + + current << item + current_tokens += item_tokens + end + + batches << current unless current.empty? + batches + end +end diff --git a/app/models/provider/openai/chat_config.rb b/app/models/provider/openai/chat_config.rb index 5e98a66ce..65e683de4 100644 --- a/app/models/provider/openai/chat_config.rb +++ b/app/models/provider/openai/chat_config.rb @@ -16,7 +16,9 @@ class Provider::Openai::ChatConfig end end - def build_input(prompt) + def build_input(prompt: nil) + input_messages = prompt.present? ? [ { role: "user", content: prompt } ] : [] + results = function_results.map do |fn_result| # Handle nil explicitly to avoid serializing to "null" output = fn_result[:output] @@ -36,7 +38,7 @@ class Provider::Openai::ChatConfig end [ - { role: "user", content: prompt }, + *input_messages, *results ] end diff --git a/app/models/provider/openai/pdf_processor.rb b/app/models/provider/openai/pdf_processor.rb index f65510e87..1cb63e773 100644 --- a/app/models/provider/openai/pdf_processor.rb +++ b/app/models/provider/openai/pdf_processor.rb @@ -1,15 +1,16 @@ class Provider::Openai::PdfProcessor include Provider::Openai::Concerns::UsageRecorder - attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family + attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family, :max_response_tokens - def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil) + def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil, max_response_tokens:) @client = client @model = model @pdf_content = pdf_content @custom_provider = custom_provider @langfuse_trace = langfuse_trace @family = family + @max_response_tokens = max_response_tokens end def process @@ -175,7 +176,7 @@ class Provider::Openai::PdfProcessor { role: "system", content: instructions + "\n\nIMPORTANT: Respond with valid JSON only, no markdown or other formatting." }, { role: "user", content: content } ], - max_tokens: 4096 + max_tokens: max_response_tokens } response = client.chat(parameters: params) diff --git a/app/models/setting.rb b/app/models/setting.rb index f16a66d6a..c5aa08d7e 100644 --- a/app/models/setting.rb +++ b/app/models/setting.rb @@ -10,6 +10,14 @@ class Setting < RailsSettings::Base field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"] field :openai_model, type: :string, default: ENV["OPENAI_MODEL"] field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"] + + # LLM token budget (applies to every outbound LLM call: chat, auto-categorize, + # merchant detection, enhance-merchants, PDF processing). Defaults track + # Ollama's historical 2048-token baseline so local small-context models work + # out of the box. ENV overrides Setting at read time in Provider::Openai. + field :llm_context_window, type: :integer, default: ENV["LLM_CONTEXT_WINDOW"]&.to_i + field :llm_max_response_tokens, type: :integer, default: ENV["LLM_MAX_RESPONSE_TOKENS"]&.to_i + field :llm_max_items_per_call, type: :integer, default: ENV["LLM_MAX_ITEMS_PER_CALL"]&.to_i field :external_assistant_url, type: :string field :external_assistant_token, type: :string field :external_assistant_agent_id, type: :string diff --git a/app/views/settings/hostings/_openai_settings.html.erb b/app/views/settings/hostings/_openai_settings.html.erb index 6691d9556..b54dd7671 100644 --- a/app/views/settings/hostings/_openai_settings.html.erb +++ b/app/views/settings/hostings/_openai_settings.html.erb @@ -63,5 +63,37 @@ { disabled: ENV["LLM_JSON_MODE"].present?, data: { "auto-submit-form-target": "auto" } } %>
<%= t(".json_mode_help") %>
+ +<%= t(".budget_description") %>
+ + <%= form.number_field :llm_context_window, + label: t(".context_window_label"), + placeholder: "2048", + value: Setting.llm_context_window, + min: 256, + disabled: ENV["LLM_CONTEXT_WINDOW"].present?, + data: { "auto-submit-form-target": "auto" } %> +<%= t(".context_window_help") %>
+ + <%= form.number_field :llm_max_response_tokens, + label: t(".max_response_tokens_label"), + placeholder: "512", + value: Setting.llm_max_response_tokens, + min: 64, + disabled: ENV["LLM_MAX_RESPONSE_TOKENS"].present?, + data: { "auto-submit-form-target": "auto" } %> +<%= t(".max_response_tokens_help") %>
+ + <%= form.number_field :llm_max_items_per_call, + label: t(".max_items_per_call_label"), + placeholder: "25", + value: Setting.llm_max_items_per_call, + min: 1, + disabled: ENV["LLM_MAX_ITEMS_PER_CALL"].present?, + data: { "auto-submit-form-target": "auto" } %> +<%= t(".max_items_per_call_help") %>
+