fix(ai): guard PDF size + surface bank-statement truncation

- PdfProcessor and BankStatementExtractor raise upfront when
  pdf_content.bytesize exceeds MAX_PDF_BYTES (32 MB, matching
  Anthropic's hard limit). Previously a 100 MB PDF would be
  base64-encoded (~133 MB) and packed into the JSON body before
  the API rejected it — peak heap ~270 MB per Sidekiq worker.
- BankStatementExtractor inspects response.stop_reason; when the
  model hit max_tokens it logs a warning and flags result[:truncated]
  so downstream callers know the transaction list may be incomplete.
- ISO date pattern added to statement_period_start/end schema in
  PdfProcessor so the model can't return "March 2026" — Anthropic
  enforces the regex via the tool's input_schema.

Tests cover the size guard (raises before any client.messages call),
truncated-result flagging, and the warning log path.
This commit is contained in:
Guillem Arias
2026-05-25 20:34:33 +02:00
parent 38e950fe23
commit cfde4c70a1
4 changed files with 88 additions and 3 deletions

View File

@@ -77,6 +77,45 @@ class Provider::Anthropic::BankStatementExtractorTest < ActiveSupport::TestCase
assert_match(/did not invoke report_bank_statement/i, err.message)
end
test "raises before API call when pdf_content exceeds the 32 MB limit" do
  # Binary payload exactly one byte past the extractor's size cap.
  byte_cap = Provider::Anthropic::BankStatementExtractor::MAX_PDF_BYTES
  too_large_pdf = "a".b * (byte_cap + 1)

  # `never` makes any call to #messages on this client fail the test.
  api_client = mock
  api_client.expects(:messages).never

  # Construction stays inside the block so the error is captured whether the
  # guard fires in the constructor or in #extract.
  raised = assert_raises(Provider::Anthropic::Error) do
    extractor = Provider::Anthropic::BankStatementExtractor.new(
      client: api_client,
      model: "claude-sonnet-4-6",
      pdf_content: too_large_pdf
    )
    extractor.extract
  end

  assert_match(/exceeds Anthropic's 32 MB limit/i, raised.message)
end
test "flags result as truncated when stop_reason is max_tokens" do
  # Fake response that carries the expected tool call but reports that it
  # stopped on the token cap. build_response / tool_use_block are helpers
  # defined elsewhere in this test file.
  single_transaction = { "date" => "2026-03-05", "description" => "Coffee", "amount" => -4.5 }
  tool_call = tool_use_block(
    id: "toolu_1",
    name: "report_bank_statement",
    input: { "transactions" => [ single_transaction ] }
  )
  truncated_response = build_response(content: [ tool_call ])
  truncated_response.stop_reason = :max_tokens
  api_client = stub_client(truncated_response)

  # Registered before extraction so the warning emitted during #extract is matched.
  Rails.logger.expects(:warn).with(regexp_matches(/truncated by max_tokens/i))

  extractor = Provider::Anthropic::BankStatementExtractor.new(
    client: api_client,
    model: "claude-sonnet-4-6",
    pdf_content: @pdf_content
  )
  outcome = extractor.extract

  # Strict comparison: the flag must be exactly `true`, not merely truthy.
  assert_equal true, outcome[:truncated]
end
private
def stub_client(response)
messages = mock

View File

@@ -83,6 +83,21 @@ class Provider::Anthropic::PdfProcessorTest < ActiveSupport::TestCase
assert_match(/PDF content is required/i, err.message)
end
test "raises before any API call when pdf_content exceeds the 32 MB limit" do
  # Binary payload exactly one byte past the processor's size cap.
  byte_cap = Provider::Anthropic::PdfProcessor::MAX_PDF_BYTES
  too_large_pdf = "a".b * (byte_cap + 1)

  # `never` makes any call to #messages on this client fail the test.
  api_client = mock
  api_client.expects(:messages).never

  # Construction stays inside the block so the error is captured whether the
  # guard fires in the constructor or in #process.
  raised = assert_raises(Provider::Anthropic::Error) do
    processor = Provider::Anthropic::PdfProcessor.new(
      api_client,
      model: "claude-sonnet-4-6",
      pdf_content: too_large_pdf
    )
    processor.process
  end

  assert_match(/exceeds Anthropic's 32 MB limit/i, raised.message)
end
private
def stub_client(response)
messages = mock