From cfde4c70a12dd8b3285f4d37c4d455972b438aa4 Mon Sep 17 00:00:00 2001 From: Guillem Arias Date: Mon, 25 May 2026 20:34:33 +0200 Subject: [PATCH] fix(ai): guard PDF size + surface bank-statement truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - PdfProcessor and BankStatementExtractor raise upfront when pdf_content.bytesize exceeds MAX_PDF_BYTES (32 MB, matching Anthropic's hard limit). Previously a 100 MB PDF would be base64-encoded (~133 MB) and packed into the JSON body before the API rejected it — peak heap ~270 MB per Sidekiq worker. - BankStatementExtractor inspects response.stop_reason; when the model hit max_tokens it logs a warning and flags result[:truncated] so downstream callers know the transaction list may be incomplete. - ISO date pattern added to statement_period_start/end schema in PdfProcessor so the model can't return "March 2026" — Anthropic enforces the regex via the tool's input_schema. Tests cover the size guard (raises before any client.messages call), truncated-result flagging, and the warning log path. --- .../anthropic/bank_statement_extractor.rb | 24 +++++++++++- .../provider/anthropic/pdf_processor.rb | 13 ++++++- .../bank_statement_extractor_test.rb | 39 +++++++++++++++++++ .../provider/anthropic/pdf_processor_test.rb | 15 +++++++ 4 files changed, 88 insertions(+), 3 deletions(-) diff --git a/app/models/provider/anthropic/bank_statement_extractor.rb b/app/models/provider/anthropic/bank_statement_extractor.rb index 55e299197..68fe3c086 100644 --- a/app/models/provider/anthropic/bank_statement_extractor.rb +++ b/app/models/provider/anthropic/bank_statement_extractor.rb @@ -3,6 +3,9 @@ class Provider::Anthropic::BankStatementExtractor TOOL_NAME = "report_bank_statement".freeze + # Mirrors Provider::Anthropic::PdfProcessor::MAX_PDF_BYTES. + MAX_PDF_BYTES = 32 * 1024 * 1024 + attr_reader :client, :model, :pdf_content, :langfuse_trace, :family def initialize(client:, model:, pdf_content:, langfuse_trace: nil, family: nil) @@ -15,6 +18,10 @@ class Provider::Anthropic::BankStatementExtractor def extract raise Provider::Anthropic::Error, "PDF content is required" if pdf_content.blank? + if pdf_content.bytesize > MAX_PDF_BYTES + raise Provider::Anthropic::Error, + "PDF exceeds Anthropic's 32 MB limit (#{pdf_content.bytesize} bytes)" + end span = langfuse_trace&.span(name: "extract_bank_statement_api_call", input: { model: model, @@ -33,9 +40,19 @@ class Provider::Anthropic::BankStatementExtractor parsed = extract_tool_input(response) result = build_result(parsed) + truncated = stop_reason(response) == :max_tokens + if truncated + Rails.logger.warn( + "[BankStatementExtractor] response truncated by max_tokens — extracted #{result[:transactions].size} " \ + "transactions but more may be present in the statement. Raise ANTHROPIC_MAX_TOKENS or chunk the PDF." + ) + result[:truncated] = true + end + record_usage(model, response.usage, operation: "extract_bank_statement", metadata: { pdf_size: pdf_content.bytesize, - transaction_count: result[:transactions].size + transaction_count: result[:transactions].size, + truncated: truncated }) span&.end(output: { transaction_count: result[:transactions].size }, usage: usage_hash(response.usage)) @@ -124,6 +141,11 @@ class Provider::Anthropic::BankStatementExtractor INSTRUCTIONS end + def stop_reason(response) + raw = response.respond_to?(:stop_reason) ? response.stop_reason : nil + raw.to_s.to_sym if raw + end + def extract_tool_input(response) tool_use = Array(response.content).find { |block| block_type(block) == :tool_use } raise Provider::Anthropic::Error, "Model did not invoke #{TOOL_NAME}" unless tool_use diff --git a/app/models/provider/anthropic/pdf_processor.rb b/app/models/provider/anthropic/pdf_processor.rb index 9c9786106..8157df1c4 100644 --- a/app/models/provider/anthropic/pdf_processor.rb +++ b/app/models/provider/anthropic/pdf_processor.rb @@ -3,6 +3,11 @@ class Provider::Anthropic::PdfProcessor TOOL_NAME = "report_document_analysis".freeze + # Anthropic's native document block accepts PDFs up to 32 MB / 100 pages. + # We guard the size limit upstream to avoid base64-encoding a 100 MB blob + # in vain (peak heap ~270 MB before the API rejects it). + MAX_PDF_BYTES = 32 * 1024 * 1024 + attr_reader :client, :model, :pdf_content, :langfuse_trace, :family def initialize(client, model:, pdf_content:, langfuse_trace: nil, family: nil) @@ -15,6 +20,10 @@ class Provider::Anthropic::PdfProcessor def process raise Provider::Anthropic::Error, "PDF content is required" if pdf_content.blank? + if pdf_content.bytesize > MAX_PDF_BYTES + raise Provider::Anthropic::Error, + "PDF exceeds Anthropic's 32 MB limit (#{pdf_content.bytesize} bytes)" + end span = langfuse_trace&.span(name: "process_pdf_api_call", input: { model: model, @@ -87,8 +96,8 @@ class Provider::Anthropic::PdfProcessor type: "object", properties: { institution_name: { type: [ "string", "null" ] }, - statement_period_start: { type: [ "string", "null" ], description: "YYYY-MM-DD or null" }, - statement_period_end: { type: [ "string", "null" ], description: "YYYY-MM-DD or null" }, + statement_period_start: { type: [ "string", "null" ], pattern: "^\\d{4}-\\d{2}-\\d{2}$", description: "YYYY-MM-DD or null" }, + statement_period_end: { type: [ "string", "null" ], pattern: "^\\d{4}-\\d{2}-\\d{2}$", description: "YYYY-MM-DD or null" }, transaction_count: { type: [ "integer", "null" ] }, opening_balance: { type: [ "number", "null" ] }, closing_balance: { type: [ "number", "null" ] }, diff --git a/test/models/provider/anthropic/bank_statement_extractor_test.rb b/test/models/provider/anthropic/bank_statement_extractor_test.rb index bfb84eb11..e775012e0 100644 --- a/test/models/provider/anthropic/bank_statement_extractor_test.rb +++ b/test/models/provider/anthropic/bank_statement_extractor_test.rb @@ -77,6 +77,45 @@ class Provider::Anthropic::BankStatementExtractorTest < ActiveSupport::TestCase assert_match(/did not invoke report_bank_statement/i, err.message) end + test "raises before API call when pdf_content exceeds the 32 MB limit" do + oversized = "a".b * (Provider::Anthropic::BankStatementExtractor::MAX_PDF_BYTES + 1) + client = mock + client.expects(:messages).never + + err = assert_raises(Provider::Anthropic::Error) do + Provider::Anthropic::BankStatementExtractor.new( + client: client, + model: "claude-sonnet-4-6", + pdf_content: oversized + ).extract + end + assert_match(/exceeds Anthropic's 32 MB limit/i, err.message) + end + + test "flags result as truncated when stop_reason is max_tokens" do + fake_response = build_response( + content: [ + tool_use_block( + id: "toolu_1", + name: "report_bank_statement", + input: { "transactions" => [ { "date" => "2026-03-05", "description" => "Coffee", "amount" => -4.5 } ] } + ) + ] + ) + fake_response.stop_reason = :max_tokens + client = stub_client(fake_response) + + Rails.logger.expects(:warn).with(regexp_matches(/truncated by max_tokens/i)) + + result = Provider::Anthropic::BankStatementExtractor.new( + client: client, + model: "claude-sonnet-4-6", + pdf_content: @pdf_content + ).extract + + assert_equal true, result[:truncated] + end + private def stub_client(response) messages = mock diff --git a/test/models/provider/anthropic/pdf_processor_test.rb b/test/models/provider/anthropic/pdf_processor_test.rb index 6a1187b3b..6d8b4cce8 100644 --- a/test/models/provider/anthropic/pdf_processor_test.rb +++ b/test/models/provider/anthropic/pdf_processor_test.rb @@ -83,6 +83,21 @@ class Provider::Anthropic::PdfProcessorTest < ActiveSupport::TestCase assert_match(/PDF content is required/i, err.message) end + test "raises before any API call when pdf_content exceeds the 32 MB limit" do + oversized = "a".b * (Provider::Anthropic::PdfProcessor::MAX_PDF_BYTES + 1) + client = mock + client.expects(:messages).never + + err = assert_raises(Provider::Anthropic::Error) do + Provider::Anthropic::PdfProcessor.new( + client, + model: "claude-sonnet-4-6", + pdf_content: oversized + ).process + end + assert_match(/exceeds Anthropic's 32 MB limit/i, err.message) + end + private def stub_client(response) messages = mock