mirror of
https://github.com/we-promise/sure.git
synced 2026-05-29 15:34:58 +00:00
- PdfProcessor and BankStatementExtractor raise upfront when pdf_content.bytesize exceeds MAX_PDF_BYTES (32 MB, matching Anthropic's hard limit). Previously a 100 MB PDF would be base64-encoded (~133 MB) and packed into the JSON body before the API rejected it — peak heap ~270 MB per Sidekiq worker. - BankStatementExtractor inspects response.stop_reason; when the model hit max_tokens it logs a warning and flags result[:truncated] so downstream callers know the transaction list may be incomplete. - ISO date pattern added to statement_period_start/end schema in PdfProcessor so the model can't return "March 2026" — Anthropic enforces the regex via the tool's input_schema. Tests cover the size guard (raises before any client.messages call), truncated-result flagging, and the warning log path.
126 lines
3.9 KiB
Ruby
126 lines
3.9 KiB
Ruby
require "test_helper"
|
|
|
|
class Provider::Anthropic::PdfProcessorTest < ActiveSupport::TestCase
|
|
setup do
|
|
@pdf_content = "%PDF-1.4 fake bytes".b
|
|
end
|
|
|
|
test "sends PDF as native document content block and parses tool response" do
|
|
fake_response = build_response(content: [
|
|
tool_use_block(
|
|
id: "toolu_1",
|
|
name: "report_document_analysis",
|
|
input: {
|
|
"document_type" => "bank_statement",
|
|
"summary" => "Bank of Example, Mar 2026 statement.",
|
|
"extracted_data" => {
|
|
"institution_name" => "Bank of Example",
|
|
"statement_period_start" => "2026-03-01",
|
|
"statement_period_end" => "2026-03-31",
|
|
"transaction_count" => 42,
|
|
"opening_balance" => 1000.0,
|
|
"closing_balance" => 1500.0,
|
|
"currency" => "USD",
|
|
"account_holder" => "Account Holder"
|
|
}
|
|
}
|
|
)
|
|
])
|
|
captured = nil
|
|
client = stub_client(fake_response) { |params| captured = params }
|
|
|
|
result = Provider::Anthropic::PdfProcessor.new(
|
|
client,
|
|
model: "claude-sonnet-4-6",
|
|
pdf_content: @pdf_content
|
|
).process
|
|
|
|
document_block = captured[:messages].first[:content].first
|
|
assert_equal "document", document_block[:type]
|
|
assert_equal "application/pdf", document_block[:source][:media_type]
|
|
assert_equal "base64", document_block[:source][:type]
|
|
assert_equal Base64.strict_encode64(@pdf_content), document_block[:source][:data]
|
|
|
|
assert_equal "report_document_analysis", captured[:tool_choice][:name]
|
|
assert captured[:tool_choice][:disable_parallel_tool_use]
|
|
|
|
assert_equal "bank_statement", result.document_type
|
|
assert_equal "Bank of Example, Mar 2026 statement.", result.summary
|
|
assert_equal 42, result.extracted_data["transaction_count"]
|
|
end
|
|
|
|
test "normalizes unknown document_type to other" do
|
|
fake_response = build_response(content: [
|
|
tool_use_block(
|
|
id: "toolu_2",
|
|
name: "report_document_analysis",
|
|
input: {
|
|
"document_type" => "alien_invasion_form",
|
|
"summary" => "Unknown.",
|
|
"extracted_data" => {}
|
|
}
|
|
)
|
|
])
|
|
client = stub_client(fake_response)
|
|
|
|
result = Provider::Anthropic::PdfProcessor.new(
|
|
client,
|
|
model: "claude-sonnet-4-6",
|
|
pdf_content: @pdf_content
|
|
).process
|
|
|
|
assert_equal "other", result.document_type
|
|
end
|
|
|
|
test "raises when pdf_content is blank" do
|
|
err = assert_raises(Provider::Anthropic::Error) do
|
|
Provider::Anthropic::PdfProcessor.new(
|
|
mock,
|
|
model: "claude-sonnet-4-6",
|
|
pdf_content: ""
|
|
).process
|
|
end
|
|
assert_match(/PDF content is required/i, err.message)
|
|
end
|
|
|
|
test "raises before any API call when pdf_content exceeds the 32 MB limit" do
|
|
oversized = "a".b * (Provider::Anthropic::PdfProcessor::MAX_PDF_BYTES + 1)
|
|
client = mock
|
|
client.expects(:messages).never
|
|
|
|
err = assert_raises(Provider::Anthropic::Error) do
|
|
Provider::Anthropic::PdfProcessor.new(
|
|
client,
|
|
model: "claude-sonnet-4-6",
|
|
pdf_content: oversized
|
|
).process
|
|
end
|
|
assert_match(/exceeds Anthropic's 32 MB limit/i, err.message)
|
|
end
|
|
|
|
private
|
|
def stub_client(response)
|
|
messages = mock
|
|
messages.expects(:create).with do |params|
|
|
yield(params) if block_given?
|
|
true
|
|
end.returns(response)
|
|
client = mock
|
|
client.stubs(:messages).returns(messages)
|
|
client
|
|
end
|
|
|
|
def build_response(content:, usage: { input_tokens: 800, output_tokens: 200 })
|
|
OpenStruct.new(
|
|
id: "msg_test",
|
|
model: "claude-sonnet-4-6",
|
|
content: content,
|
|
usage: OpenStruct.new(input_tokens: usage[:input_tokens], output_tokens: usage[:output_tokens])
|
|
)
|
|
end
|
|
|
|
def tool_use_block(id:, name:, input:)
|
|
OpenStruct.new(type: :tool_use, id: id, name: name, input: input)
|
|
end
|
|
end
|