mirror of
https://github.com/we-promise/sure.git
synced 2026-05-29 15:34:58 +00:00
fix(ai): guard PDF size + surface bank-statement truncation
- PdfProcessor and BankStatementExtractor raise upfront when pdf_content.bytesize exceeds MAX_PDF_BYTES (32 MB, matching Anthropic's hard limit). Previously a 100 MB PDF would be base64-encoded (~133 MB) and packed into the JSON body before the API rejected it — peak heap ~270 MB per Sidekiq worker.
- BankStatementExtractor inspects response.stop_reason; when the model hits max_tokens it logs a warning and flags result[:truncated] so downstream callers know the transaction list may be incomplete.
- An ISO date pattern is added to the statement_period_start/end schema in PdfProcessor so the model can't return "March 2026" — Anthropic enforces the regex via the tool's input_schema.

Tests cover the size guard (raises before any client.messages call), truncated-result flagging, and the warning log path.
This commit is contained in:
@@ -3,6 +3,9 @@ class Provider::Anthropic::BankStatementExtractor
|
||||
|
||||
TOOL_NAME = "report_bank_statement".freeze
|
||||
|
||||
# Mirrors Provider::Anthropic::PdfProcessor::MAX_PDF_BYTES.
|
||||
MAX_PDF_BYTES = 32 * 1024 * 1024
|
||||
|
||||
attr_reader :client, :model, :pdf_content, :langfuse_trace, :family
|
||||
|
||||
def initialize(client:, model:, pdf_content:, langfuse_trace: nil, family: nil)
|
||||
@@ -15,6 +18,10 @@ class Provider::Anthropic::BankStatementExtractor
|
||||
|
||||
def extract
|
||||
raise Provider::Anthropic::Error, "PDF content is required" if pdf_content.blank?
|
||||
if pdf_content.bytesize > MAX_PDF_BYTES
|
||||
raise Provider::Anthropic::Error,
|
||||
"PDF exceeds Anthropic's 32 MB limit (#{pdf_content.bytesize} bytes)"
|
||||
end
|
||||
|
||||
span = langfuse_trace&.span(name: "extract_bank_statement_api_call", input: {
|
||||
model: model,
|
||||
@@ -33,9 +40,19 @@ class Provider::Anthropic::BankStatementExtractor
|
||||
parsed = extract_tool_input(response)
|
||||
result = build_result(parsed)
|
||||
|
||||
truncated = stop_reason(response) == :max_tokens
|
||||
if truncated
|
||||
Rails.logger.warn(
|
||||
"[BankStatementExtractor] response truncated by max_tokens — extracted #{result[:transactions].size} " \
|
||||
"transactions but more may be present in the statement. Raise ANTHROPIC_MAX_TOKENS or chunk the PDF."
|
||||
)
|
||||
result[:truncated] = true
|
||||
end
|
||||
|
||||
record_usage(model, response.usage, operation: "extract_bank_statement", metadata: {
|
||||
pdf_size: pdf_content.bytesize,
|
||||
transaction_count: result[:transactions].size
|
||||
transaction_count: result[:transactions].size,
|
||||
truncated: truncated
|
||||
})
|
||||
|
||||
span&.end(output: { transaction_count: result[:transactions].size }, usage: usage_hash(response.usage))
|
||||
@@ -124,6 +141,11 @@ class Provider::Anthropic::BankStatementExtractor
|
||||
INSTRUCTIONS
|
||||
end
|
||||
|
||||
# Normalizes the stop reason reported by +response+ to a Symbol.
#
# Converting through String lets callers compare against a Symbol
# (e.g. +:max_tokens+) whether the response exposes the value as a
# String or as a Symbol.
#
# @param response [#stop_reason, Object] the API response; objects that do
#   not respond to +stop_reason+ are tolerated
# @return [Symbol, nil] the stop reason, or nil when the response does not
#   expose one or the value is nil/false
def stop_reason(response)
  return nil unless response.respond_to?(:stop_reason)

  raw = response.stop_reason
  raw ? raw.to_s.to_sym : nil
end
|
||||
|
||||
def extract_tool_input(response)
|
||||
tool_use = Array(response.content).find { |block| block_type(block) == :tool_use }
|
||||
raise Provider::Anthropic::Error, "Model did not invoke #{TOOL_NAME}" unless tool_use
|
||||
|
||||
@@ -3,6 +3,11 @@ class Provider::Anthropic::PdfProcessor
|
||||
|
||||
TOOL_NAME = "report_document_analysis".freeze
|
||||
|
||||
# Anthropic's native document block accepts PDFs up to 32 MB / 100 pages.
|
||||
# We guard the size limit upstream to avoid base64-encoding a 100 MB blob
|
||||
# in vain (peak heap ~270 MB before the API rejects it).
|
||||
MAX_PDF_BYTES = 32 * 1024 * 1024
|
||||
|
||||
attr_reader :client, :model, :pdf_content, :langfuse_trace, :family
|
||||
|
||||
def initialize(client, model:, pdf_content:, langfuse_trace: nil, family: nil)
|
||||
@@ -15,6 +20,10 @@ class Provider::Anthropic::PdfProcessor
|
||||
|
||||
def process
|
||||
raise Provider::Anthropic::Error, "PDF content is required" if pdf_content.blank?
|
||||
if pdf_content.bytesize > MAX_PDF_BYTES
|
||||
raise Provider::Anthropic::Error,
|
||||
"PDF exceeds Anthropic's 32 MB limit (#{pdf_content.bytesize} bytes)"
|
||||
end
|
||||
|
||||
span = langfuse_trace&.span(name: "process_pdf_api_call", input: {
|
||||
model: model,
|
||||
@@ -87,8 +96,8 @@ class Provider::Anthropic::PdfProcessor
|
||||
type: "object",
|
||||
properties: {
|
||||
institution_name: { type: [ "string", "null" ] },
|
||||
statement_period_start: { type: [ "string", "null" ], description: "YYYY-MM-DD or null" },
|
||||
statement_period_end: { type: [ "string", "null" ], description: "YYYY-MM-DD or null" },
|
||||
statement_period_start: { type: [ "string", "null" ], pattern: "^\\d{4}-\\d{2}-\\d{2}$", description: "YYYY-MM-DD or null" },
|
||||
statement_period_end: { type: [ "string", "null" ], pattern: "^\\d{4}-\\d{2}-\\d{2}$", description: "YYYY-MM-DD or null" },
|
||||
transaction_count: { type: [ "integer", "null" ] },
|
||||
opening_balance: { type: [ "number", "null" ] },
|
||||
closing_balance: { type: [ "number", "null" ] },
|
||||
|
||||
@@ -77,6 +77,45 @@ class Provider::Anthropic::BankStatementExtractorTest < ActiveSupport::TestCase
|
||||
assert_match(/did not invoke report_bank_statement/i, err.message)
|
||||
end
|
||||
|
||||
# The size guard has to fire before any request is made: the client mock
# rejects every call to #messages, so reaching the API would fail the test.
test "raises before API call when pdf_content exceeds the 32 MB limit" do
  # One byte over the limit, built as a binary string via String#b.
  oversized = "a".b * (Provider::Anthropic::BankStatementExtractor::MAX_PDF_BYTES + 1)
  client = mock
  client.expects(:messages).never

  err = assert_raises(Provider::Anthropic::Error) do
    Provider::Anthropic::BankStatementExtractor.new(
      client: client,
      model: "claude-sonnet-4-6",
      pdf_content: oversized
    ).extract
  end
  assert_match(/exceeds Anthropic's 32 MB limit/i, err.message)
end
|
||||
|
||||
# A response that stopped on max_tokens must still return the parsed
# transactions, but flagged as truncated and accompanied by a warning log.
test "flags result as truncated when stop_reason is max_tokens" do
  fake_response = build_response(
    content: [
      tool_use_block(
        id: "toolu_1",
        name: "report_bank_statement",
        input: { "transactions" => [ { "date" => "2026-03-05", "description" => "Coffee", "amount" => -4.5 } ] }
      )
    ]
  )
  # NOTE(review): relies on build_response returning an object with a
  # stop_reason writer — confirm against the helper's definition.
  fake_response.stop_reason = :max_tokens
  client = stub_client(fake_response)

  # The expectation is registered before #extract runs so the warning
  # emitted during extraction is captured.
  Rails.logger.expects(:warn).with(regexp_matches(/truncated by max_tokens/i))

  result = Provider::Anthropic::BankStatementExtractor.new(
    client: client,
    model: "claude-sonnet-4-6",
    pdf_content: @pdf_content
  ).extract

  assert_equal true, result[:truncated]
end
|
||||
|
||||
private
|
||||
def stub_client(response)
|
||||
messages = mock
|
||||
|
||||
@@ -83,6 +83,21 @@ class Provider::Anthropic::PdfProcessorTest < ActiveSupport::TestCase
|
||||
assert_match(/PDF content is required/i, err.message)
|
||||
end
|
||||
|
||||
# The size guard has to fire before any request is made: the client mock
# rejects every call to #messages, so reaching the API would fail the test.
test "raises before any API call when pdf_content exceeds the 32 MB limit" do
  # One byte over the limit, built as a binary string via String#b.
  oversized = "a".b * (Provider::Anthropic::PdfProcessor::MAX_PDF_BYTES + 1)
  client = mock
  client.expects(:messages).never

  err = assert_raises(Provider::Anthropic::Error) do
    # PdfProcessor takes the client positionally.
    Provider::Anthropic::PdfProcessor.new(
      client,
      model: "claude-sonnet-4-6",
      pdf_content: oversized
    ).process
  end
  assert_match(/exceeds Anthropic's 32 MB limit/i, err.message)
end
|
||||
|
||||
private
|
||||
def stub_client(response)
|
||||
messages = mock
|
||||
|
||||
Reference in New Issue
Block a user