Files
sure/app/models/provider/anthropic/pdf_processor.rb
Guillem Arias cfde4c70a1 fix(ai): guard PDF size + surface bank-statement truncation
- PdfProcessor and BankStatementExtractor raise upfront when
  pdf_content.bytesize exceeds MAX_PDF_BYTES (32 MB, matching
  Anthropic's hard limit). Previously a 100 MB PDF would be
  base64-encoded (~133 MB) and packed into the JSON body before
  the API rejected it — peak heap ~270 MB per Sidekiq worker.
- BankStatementExtractor inspects response.stop_reason; when the
  model hit max_tokens it logs a warning and flags result[:truncated]
  so downstream callers know the transaction list may be incomplete.
- ISO date pattern added to statement_period_start/end schema in
  PdfProcessor so the model can't return "March 2026" — Anthropic
  enforces the regex via the tool's input_schema.

Tests cover the size guard (raises before any client.messages call),
truncated-result flagging, and the warning log path.
2026-05-29 14:51:09 +02:00

180 lines
6.2 KiB
Ruby

# Classifies and summarizes a PDF by sending it to the Anthropic messages API
# as a base64 document block and forcing a single structured tool call
# (see #process).
class Provider::Anthropic::PdfProcessor
include Provider::Anthropic::Concerns::UsageRecorder
# Name of the tool the model is forced to call to return its analysis.
TOOL_NAME = "report_document_analysis".freeze
# Anthropic's native document block accepts PDFs up to 32 MB / 100 pages.
# We guard the size limit upstream to avoid base64-encoding a 100 MB blob
# in vain (peak heap ~270 MB before the API rejects it).
# NOTE(review): the guard measures the raw bytes, while the payload is sent
# base64-encoded (roughly 4/3 larger) — confirm whether Anthropic's limit
# applies to the raw file or to the whole request body.
MAX_PDF_BYTES = 32 * 1024 * 1024
# Collaborators and inputs captured by #initialize.
attr_reader :client, :model, :pdf_content, :langfuse_trace, :family
# Stores the collaborators needed to analyze one PDF.
#
# @param client the Anthropic API client; must expose `messages.create`
# @param model the model identifier passed through to the API
# @param pdf_content the raw PDF bytes to analyze
# @param langfuse_trace optional trace used to open a span around the API call
# @param family optional; only stored here and exposed through its reader
def initialize(client, model:, pdf_content:, langfuse_trace: nil, family: nil)
  @client, @model, @pdf_content = client, model, pdf_content
  @langfuse_trace, @family = langfuse_trace, family
end
# Sends the PDF to the Anthropic messages API and returns the structured
# analysis produced by #build_result.
#
# The model is forced to answer through the TOOL_NAME tool. Usage is recorded
# after a successful call, and the optional Langfuse span is closed with the
# result or with the error message.
#
# Raises Provider::Anthropic::Error when the PDF is blank or larger than
# MAX_PDF_BYTES. Any error raised in this method is reported through
# record_usage_error and then re-raised unchanged.
def process
  raise Provider::Anthropic::Error, "PDF content is required" if pdf_content.blank?

  # Reject oversized files before paying for the base64 encoding.
  if pdf_content.bytesize > MAX_PDF_BYTES
    raise Provider::Anthropic::Error, "PDF exceeds Anthropic's 32 MB limit (#{pdf_content.bytesize} bytes)"
  end

  span = langfuse_trace&.span(
    name: "process_pdf_api_call",
    input: { model: model, pdf_size: pdf_content&.bytesize }
  )

  request = {
    model: model,
    max_tokens: max_tokens,
    system_: instructions,
    messages: [ { role: "user", content: user_content } ],
    tools: [ output_tool ],
    tool_choice: { type: "tool", name: TOOL_NAME, disable_parallel_tool_use: true }
  }
  response = client.messages.create(**request)

  analysis = build_result(extract_tool_input(response))

  record_usage(model, response.usage, operation: "process_pdf", metadata: { pdf_size: pdf_content.bytesize })
  span&.end(output: analysis.to_h, usage: usage_hash(response.usage))

  analysis
rescue StandardError => error
  # `span` is still nil when one of the guards above raised.
  span&.end(output: { error: error.message }, level: "ERROR")
  record_usage_error(model, operation: "process_pdf", error: error, metadata: { pdf_size: pdf_content&.bytesize })
  raise
end
private
# Short alias for the shared result type instantiated by #build_result.
# NOTE(review): `private` only changes method visibility, so this constant is
# still reachable from outside the class; use `private_constant` if it is
# meant to be internal.
PdfProcessingResult = Provider::LlmConcept::PdfProcessingResult
# Output-token budget for the API call.
#
# Reads ANTHROPIC_MAX_TOKENS and falls back to 4096 when the variable is
# unset, blank, non-numeric, or not a positive number. `String#to_i` turns a
# blank or garbage value into 0, which would otherwise be sent to the API as
# an invalid `max_tokens`.
#
# @return [Integer] a positive token limit
def max_tokens
  default_limit = 4096
  configured = ENV.fetch("ANTHROPIC_MAX_TOKENS", default_limit).to_i
  configured.positive? ? configured : default_limit
end
# Builds the content of the user message: the PDF as a base64 document block
# followed by a short text instruction.
#
# @return [Array<Hash>] two content blocks, document first
def user_content
  pdf_block = {
    type: "document",
    source: {
      type: "base64",
      media_type: "application/pdf",
      data: Base64.strict_encode64(pdf_content)
    }
  }
  prompt_block = {
    type: "text",
    text: "Analyze the attached document and return the result via the report_document_analysis tool."
  }

  [ pdf_block, prompt_block ]
end
# Definition of the tool the model must call to report its analysis.
#
# The input schema requires `document_type`, `summary` and `extracted_data`.
# Every field inside `extracted_data` is optional and nullable, and the two
# statement-period fields carry a YYYY-MM-DD pattern.
#
# @return [Hash] tool definition with :name, :description and :input_schema
def output_tool
  # Each call returns a fresh hash so no schema fragment is shared.
  nullable = ->(json_type) { { type: [ json_type, "null" ] } }
  iso_date = lambda do
    nullable.call("string").merge(pattern: "^\\d{4}-\\d{2}-\\d{2}$", description: "YYYY-MM-DD or null")
  end

  extracted_data_schema = {
    type: "object",
    properties: {
      institution_name: nullable.call("string"),
      statement_period_start: iso_date.call,
      statement_period_end: iso_date.call,
      transaction_count: nullable.call("integer"),
      opening_balance: nullable.call("number"),
      closing_balance: nullable.call("number"),
      currency: nullable.call("string"),
      account_holder: nullable.call("string")
    },
    required: [],
    additionalProperties: false
  }

  analysis_schema = {
    type: "object",
    properties: {
      document_type: {
        type: "string",
        enum: Import::DOCUMENT_TYPES,
        description: "Classification of the document."
      },
      summary: {
        type: "string",
        description: "Concise human-readable summary of the document."
      },
      extracted_data: extracted_data_schema
    },
    required: [ "document_type", "summary", "extracted_data" ],
    additionalProperties: false
  }

  {
    name: TOOL_NAME,
    description: "Return the structured analysis of the attached document.",
    input_schema: analysis_schema
  }
end
# System prompt describing the classification options and extraction rules.
#
# The squiggly heredoc (`<<~`) already strips the common leading indentation,
# so no additional `strip_heredoc` call is needed.
#
# @return [String] the prompt text, ending with a newline
def instructions
  <<~INSTRUCTIONS
    You analyze financial documents. For the attached PDF, classify the document type,
    summarize it, and extract key metadata. Return the result via the report_document_analysis tool.
    Classification options:
    - bank_statement: bank account statements (incl. mobile money / digital wallets)
    - credit_card_statement: credit card statements
    - investment_statement: brokerage / investment statements
    - financial_document: tax forms, receipts, invoices, financial reports
    - contract: legal agreements, loans, terms of service
    - other: anything else
    Rules:
    - Be factual; only report what is clearly visible
    - If a field is unclear/redacted, return null for it
    - Do not invent figures or names you cannot read
    - For statements with many transactions, return the count rather than enumerating them
  INSTRUCTIONS
end
# Pulls the tool-call arguments out of the API response.
#
# Finds the first content block whose type is :tool_use and returns its
# input, parsing it as JSON when it arrives as a String.
#
# Raises Provider::Anthropic::Error when the response has no tool_use block.
def extract_tool_input(response)
  tool_block = Array(response.content).detect { |candidate| block_type(candidate) == :tool_use }
  raise Provider::Anthropic::Error, "Model did not invoke #{TOOL_NAME}" unless tool_block

  payload = block_input(tool_block)
  payload.is_a?(String) ? JSON.parse(payload) : payload
end
# Converts the parsed tool input into a PdfProcessingResult.
#
# Each field is looked up by String key first and Symbol key second. The
# document type goes through #normalize_document_type, and a missing
# extracted_data falls back to an empty Hash.
def build_result(parsed)
  read = ->(field) { parsed[field.to_s] || parsed[field] }

  PdfProcessingResult.new(
    summary: read.call(:summary),
    document_type: normalize_document_type(read.call(:document_type)),
    extracted_data: read.call(:extracted_data) || {}
  )
end
# Maps the model's document type onto one of Import::DOCUMENT_TYPES.
#
# The value is stripped, downcased, and has runs of whitespace replaced with
# underscores. Blank or unrecognized values become "other".
def normalize_document_type(doc_type)
  fallback = "other"
  return fallback if doc_type.blank?

  candidate = doc_type.to_s.strip.downcase.gsub(/\s+/, "_")
  return candidate if Import::DOCUMENT_TYPES.include?(candidate)

  fallback
end
# Returns the block's type as a Symbol.
#
# Works for objects exposing a `type` reader and for Hash-like blocks keyed
# by :type or "type". A missing type yields the empty Symbol.
def block_type(block)
  kind =
    if block.respond_to?(:type)
      block.type
    else
      block[:type] || block["type"]
    end

  kind.to_s.to_sym
end
# Returns the block's tool input, from an `input` reader when the block has
# one, otherwise from the :input or "input" key of a Hash-like block.
def block_input(block)
  return block.input if block.respond_to?(:input)

  block[:input] || block["input"]
end
# Summarizes token usage as a String-keyed Hash for the trace span.
#
# Returns an empty Hash when no usage object is given. Token counts are
# coerced with `to_i`, so a nil count is reported as 0.
def usage_hash(raw_usage)
  return {} unless raw_usage

  prompt_tokens = raw_usage.input_tokens.to_i
  completion_tokens = raw_usage.output_tokens.to_i

  {
    "input_tokens" => prompt_tokens,
    "output_tokens" => completion_tokens,
    "total_tokens" => prompt_tokens + completion_tokens
  }
end
end