diff --git a/.env.local.example b/.env.local.example index e91e7cf0e..b9ddcabf3 100644 --- a/.env.local.example +++ b/.env.local.example @@ -28,6 +28,8 @@ TWELVE_DATA_API_KEY = OPENAI_ACCESS_TOKEN = OPENAI_URI_BASE = OPENAI_MODEL = +# OPENAI_REQUEST_TIMEOUT: Request timeout in seconds (default: 60) +# OPENAI_SUPPORTS_PDF_PROCESSING: Set to false for endpoints without vision support (default: true) # (example: LM Studio/Docker config) OpenAI-compatible API endpoint config # OPENAI_URI_BASE = http://host.docker.internal:1234/ diff --git a/Gemfile b/Gemfile index 64dd5099e..f179d6798 100644 --- a/Gemfile +++ b/Gemfile @@ -81,6 +81,7 @@ gem "rotp", "~> 6.3" gem "rqrcode", "~> 3.0" gem "activerecord-import" gem "rubyzip", "~> 2.3" +gem "pdf-reader", "~> 2.12" # OpenID Connect, OAuth & SAML authentication gem "omniauth", "~> 2.1" diff --git a/Gemfile.lock b/Gemfile.lock index dfc8ee3c9..852d372c1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,7 @@ GEM remote: https://rubygems.org/ specs: + Ascii85 (2.0.1) aasm (5.5.1) concurrent-ruby (~> 1.0) actioncable (7.2.2.2) @@ -79,6 +80,7 @@ GEM addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) aes_key_wrap (1.1.0) + afm (1.0.0) after_commit_everywhere (1.6.0) activerecord (>= 4.2) activesupport @@ -232,6 +234,7 @@ GEM globalid (1.2.1) activesupport (>= 6.1) hashdiff (1.2.0) + hashery (2.1.2) hashie (5.0.0) heapy (0.2.0) thor @@ -446,6 +449,12 @@ GEM parser (3.3.8.0) ast (~> 2.4.1) racc + pdf-reader (2.15.1) + Ascii85 (>= 1.0, < 3.0, != 2.0.0) + afm (>= 0.2.1, < 2) + hashery (~> 2.0) + ruby-rc4 + ttfunk pg (1.5.9) plaid (41.0.0) faraday (>= 1.0.1, < 3.0) @@ -629,6 +638,7 @@ GEM faraday (>= 1) faraday-multipart (>= 1) ruby-progressbar (1.13.0) + ruby-rc4 (0.1.5) ruby-saml (1.18.1) nokogiri (>= 1.13.10) rexml @@ -712,6 +722,8 @@ GEM unicode-display_width (>= 1.1.1, < 4) thor (1.4.0) timeout (0.4.3) + ttfunk (1.8.0) + bigdecimal (~> 3.1) turbo-rails (2.0.16) actionpack (>= 7.1.0) railties (>= 7.1.0) @@ -818,6 +830,7 @@ DEPENDENCIES omniauth_openid_connect ostruct pagy + pdf-reader (~> 2.12) pg (~> 1.5) plaid posthog-ruby diff --git a/app/controllers/api/v1/imports_controller.rb b/app/controllers/api/v1/imports_controller.rb index 2b6a5a5af..b3b048bba 100644 --- a/app/controllers/api/v1/imports_controller.rb +++ b/app/controllers/api/v1/imports_controller.rb @@ -67,7 +67,7 @@ class Api::V1::ImportsController < Api::V1::BaseController }, status: :unprocessable_entity end - unless Import::ALLOWED_MIME_TYPES.include?(file.content_type) + unless Import::ALLOWED_CSV_MIME_TYPES.include?(file.content_type) return render json: { error: "invalid_file_type", message: "Invalid file type. Please upload a CSV file." diff --git a/app/controllers/import/uploads_controller.rb b/app/controllers/import/uploads_controller.rb index e51b52787..a9a185d51 100644 --- a/app/controllers/import/uploads_controller.rb +++ b/app/controllers/import/uploads_controller.rb @@ -33,7 +33,7 @@ class Import::UploadsController < ApplicationController end def csv_str - @csv_str ||= upload_params[:csv_file]&.read || upload_params[:raw_file_str] + @csv_str ||= upload_params[:import_file]&.read || upload_params[:raw_file_str] end def csv_valid?(str) @@ -48,6 +48,6 @@ class Import::UploadsController < ApplicationController end def upload_params - params.require(:import).permit(:raw_file_str, :csv_file, :col_sep) + params.require(:import).permit(:raw_file_str, :import_file, :col_sep) end end diff --git a/app/controllers/imports_controller.rb b/app/controllers/imports_controller.rb index 227e94866..88a346838 100644 --- a/app/controllers/imports_controller.rb +++ b/app/controllers/imports_controller.rb @@ -25,6 +25,18 @@ class ImportsController < ApplicationController end def create + file = import_params[:import_file] + + # Handle PDF file uploads - process with AI + if file.present? && Import::ALLOWED_PDF_MIME_TYPES.include?(file.content_type) + unless valid_pdf_file?(file) + redirect_to new_import_path, alert: t("imports.create.invalid_pdf") + return + end + create_pdf_import(file) + return + end + type = params.dig(:import, :type).to_s type = "TransactionImport" unless Import::TYPES.include?(type) @@ -35,35 +47,35 @@ class ImportsController < ApplicationController date_format: Current.family.date_format, ) - if import_params[:csv_file].present? - file = import_params[:csv_file] - + if file.present? if file.size > Import::MAX_CSV_SIZE import.destroy - redirect_to new_import_path, alert: "File is too large. Maximum size is #{Import::MAX_CSV_SIZE / 1.megabyte}MB." + redirect_to new_import_path, alert: t("imports.create.file_too_large", max_size: Import::MAX_CSV_SIZE / 1.megabyte) return end - unless Import::ALLOWED_MIME_TYPES.include?(file.content_type) + unless Import::ALLOWED_CSV_MIME_TYPES.include?(file.content_type) import.destroy - redirect_to new_import_path, alert: "Invalid file type. Please upload a CSV file." + redirect_to new_import_path, alert: t("imports.create.invalid_file_type") return end # Stream reading is not fully applicable here as we store the raw string in the DB, # but we have validated size beforehand to prevent memory exhaustion from massive files. import.update!(raw_file_str: file.read) - redirect_to import_configuration_path(import), notice: "CSV uploaded successfully." + redirect_to import_configuration_path(import), notice: t("imports.create.csv_uploaded") else redirect_to import_upload_path(import) end end def show + return unless @import.requires_csv_workflow? + if !@import.uploaded? - redirect_to import_upload_path(@import), alert: "Please finalize your file upload." + redirect_to import_upload_path(@import), alert: t("imports.show.finalize_upload") elsif !@import.publishable? - redirect_to import_confirm_path(@import), alert: "Please finalize your mappings before proceeding." + redirect_to import_confirm_path(@import), alert: t("imports.show.finalize_mappings") end end @@ -93,6 +105,25 @@ class ImportsController < ApplicationController end def import_params - params.require(:import).permit(:csv_file) + params.require(:import).permit(:import_file) + end + + def create_pdf_import(file) + if file.size > Import::MAX_PDF_SIZE + redirect_to new_import_path, alert: t("imports.create.pdf_too_large", max_size: Import::MAX_PDF_SIZE / 1.megabyte) + return + end + + pdf_import = Current.family.imports.create!(type: "PdfImport") + pdf_import.pdf_file.attach(file) + pdf_import.process_with_ai_later + + redirect_to import_path(pdf_import), notice: t("imports.create.pdf_processing") + end + + def valid_pdf_file?(file) + header = file.read(5) + file.rewind + header&.start_with?("%PDF-") end end diff --git a/app/jobs/process_pdf_job.rb b/app/jobs/process_pdf_job.rb new file mode 100644 index 000000000..25c31f11f --- /dev/null +++ b/app/jobs/process_pdf_job.rb @@ -0,0 +1,54 @@ +class ProcessPdfJob < ApplicationJob + queue_as :medium_priority + + def perform(pdf_import) + return unless pdf_import.is_a?(PdfImport) + return unless pdf_import.pdf_uploaded? + return if pdf_import.status == "complete" + return if pdf_import.ai_processed? && (!pdf_import.bank_statement? || pdf_import.has_extracted_transactions?) + + pdf_import.update!(status: :importing) + + begin + pdf_import.process_with_ai + + # For bank statements, extract transactions + if pdf_import.bank_statement? + Rails.logger.info("ProcessPdfJob: Extracting transactions for bank statement import #{pdf_import.id}") + pdf_import.extract_transactions + Rails.logger.info("ProcessPdfJob: Extracted #{pdf_import.extracted_transactions.size} transactions") + end + + # Find the user who created this import (first admin or any user in the family) + user = pdf_import.family.users.find_by(role: :admin) || pdf_import.family.users.first + + if user + pdf_import.send_next_steps_email(user) + end + + pdf_import.update!(status: :complete) + rescue StandardError => e + sanitized_error = sanitize_error_message(e) + Rails.logger.error("PDF processing failed for import #{pdf_import.id}: #{e.class.name} - #{sanitized_error}") + begin + pdf_import.update!(status: :failed, error: sanitized_error) + rescue StandardError => update_error + Rails.logger.error("Failed to update import status: #{update_error.message}") + end + raise + end + end + + private + + def sanitize_error_message(error) + case error + when RuntimeError, ArgumentError + I18n.t("imports.pdf_import.processing_failed_with_message", + message: error.message.truncate(500)) + else + I18n.t("imports.pdf_import.processing_failed_generic", + error: error.class.name.demodulize) + end + end +end diff --git a/app/mailers/pdf_import_mailer.rb b/app/mailers/pdf_import_mailer.rb new file mode 100644 index 000000000..5f9f759d7 --- /dev/null +++ b/app/mailers/pdf_import_mailer.rb @@ -0,0 +1,12 @@ +class PdfImportMailer < ApplicationMailer + def next_steps + @user = params[:user] + @pdf_import = params[:pdf_import] + @import_url = import_url(@pdf_import) + + mail( + to: @user.email, + subject: t(".subject", product: product_name) + ) + end +end diff --git a/app/models/assistant/configurable.rb b/app/models/assistant/configurable.rb index a2898c30b..2aae1eb06 100644 --- a/app/models/assistant/configurable.rb +++ b/app/models/assistant/configurable.rb @@ -19,7 +19,8 @@ module Assistant::Configurable Assistant::Function::GetAccounts, Assistant::Function::GetHoldings, Assistant::Function::GetBalanceSheet, - Assistant::Function::GetIncomeStatement + Assistant::Function::GetIncomeStatement, + Assistant::Function::ImportBankStatement ] end diff --git a/app/models/assistant/function/import_bank_statement.rb b/app/models/assistant/function/import_bank_statement.rb new file mode 100644 index 000000000..b0cd02906 --- /dev/null +++ b/app/models/assistant/function/import_bank_statement.rb @@ -0,0 +1,188 @@ +require "csv" + +class Assistant::Function::ImportBankStatement < Assistant::Function + class << self + def name + "import_bank_statement" + end + + def description + <<~INSTRUCTIONS + Use this to import transactions from a bank statement PDF that has already been uploaded. + + This function will: + 1. Extract transaction data from the PDF using AI + 2. Create a transaction import with the extracted data + 3. Return the import ID and extracted transactions for review + + The PDF must have already been uploaded via the PDF import feature. + Only use this for PDFs that are identified as bank statements. + + Example: + + ``` + import_bank_statement({ + pdf_import_id: "abc123-def456", + account_id: "xyz789" + }) + ``` + + If account_id is not provided, you should ask the user which account to import to. + INSTRUCTIONS + end + end + + def strict_mode? + false + end + + def params_schema + build_schema( + required: [ "pdf_import_id" ], + properties: { + pdf_import_id: { + type: "string", + description: "The ID of the PDF import to extract transactions from" + }, + account_id: { + type: "string", + description: "The ID of the account to import transactions into. If not provided, will return available accounts." + } + } + ) + end + + def call(params = {}) + pdf_import = family.imports.find_by(id: params["pdf_import_id"], type: "PdfImport") + + unless pdf_import + return { + success: false, + error: "PDF import not found", + message: "Could not find a PDF import with ID: #{params["pdf_import_id"]}" + } + end + + unless pdf_import.document_type == "bank_statement" + return { + success: false, + error: "not_bank_statement", + message: "This PDF is not a bank statement. Document type: #{pdf_import.document_type}", + available_actions: [ "Use a different PDF that is a bank statement" ] + } + end + + # If no account specified, return available accounts + if params["account_id"].blank? + return { + success: false, + error: "account_required", + message: "Please specify which account to import transactions into", + available_accounts: family.accounts.visible.depository.map { |a| { id: a.id, name: a.name } } + } + end + + account = family.accounts.find_by(id: params["account_id"]) + unless account + return { + success: false, + error: "account_not_found", + message: "Account not found", + available_accounts: family.accounts.visible.depository.map { |a| { id: a.id, name: a.name } } + } + end + + # Extract transactions from the PDF using provider + provider = Provider::Registry.get_provider(:openai) + unless provider + return { + success: false, + error: "provider_not_configured", + message: "OpenAI provider is not configured" + } + end + + response = provider.extract_bank_statement( + pdf_content: pdf_import.pdf_file_content, + model: openai_model, + family: family + ) + + unless response.success? + error_message = response.error&.message || "Unknown extraction error" + return { + success: false, + error: "extraction_failed", + message: "Failed to extract transactions: #{error_message}" + } + end + + result = response.data + + if result[:transactions].blank? + return { + success: false, + error: "no_transactions_found", + message: "Could not extract any transactions from the bank statement" + } + end + + # Create a CSV from extracted transactions + csv_content = generate_csv(result[:transactions]) + + # Create a TransactionImport + import = family.imports.create!( + type: "TransactionImport", + account: account, + raw_file_str: csv_content, + date_col_label: "date", + amount_col_label: "amount", + name_col_label: "name", + category_col_label: "category", + notes_col_label: "notes", + date_format: "%Y-%m-%d", + signage_convention: "inflows_positive" + ) + + import.generate_rows_from_csv + + { + success: true, + import_id: import.id, + transaction_count: result[:transactions].size, + transactions_preview: result[:transactions].first(5), + statement_period: result[:period], + account_holder: result[:account_holder], + message: "Successfully extracted #{result[:transactions].size} transactions. Import created with ID: #{import.id}. Review and publish when ready." + } + rescue Provider::ProviderError, Faraday::Error, Timeout::Error, RuntimeError => e + Rails.logger.error("ImportBankStatement error: #{e.class.name} - #{e.message}") + Rails.logger.error(e.backtrace.first(10).join("\n")) + { + success: false, + error: "extraction_failed", + message: "Failed to extract transactions: #{e.message.truncate(200)}" + } + end + + private + + def generate_csv(transactions) + CSV.generate do |csv| + csv << %w[date amount name category notes] + transactions.each do |txn| + csv << [ + txn[:date], + txn[:amount], + txn[:name] || txn[:description], + txn[:category], + txn[:notes] + ] + end + end + end + + def openai_model + ENV["OPENAI_MODEL"].presence || Provider::Openai::DEFAULT_MODEL + end +end diff --git a/app/models/import.rb b/app/models/import.rb index 141a1ce05..203ed3a1a 100644 --- a/app/models/import.rb +++ b/app/models/import.rb @@ -3,9 +3,13 @@ class Import < ApplicationRecord MappingError = Class.new(StandardError) MAX_CSV_SIZE = 10.megabytes - ALLOWED_MIME_TYPES = %w[text/csv text/plain application/vnd.ms-excel application/csv].freeze + MAX_PDF_SIZE = 25.megabytes + ALLOWED_CSV_MIME_TYPES = %w[text/csv text/plain application/vnd.ms-excel application/csv].freeze + ALLOWED_PDF_MIME_TYPES = %w[application/pdf].freeze - TYPES = %w[TransactionImport TradeImport AccountImport MintImport CategoryImport RuleImport].freeze + DOCUMENT_TYPES = %w[bank_statement credit_card_statement investment_statement financial_document contract other].freeze + + TYPES = %w[TransactionImport TradeImport AccountImport MintImport CategoryImport RuleImport PdfImport].freeze SIGNAGE_CONVENTIONS = %w[inflows_positive inflows_negative] SEPARATORS = [ [ "Comma (,)", "," ], [ "Semicolon (;)", ";" ] ].freeze @@ -134,6 +138,14 @@ class Import < ApplicationRecord [] end + # Returns false for import types that don't need CSV column mapping (e.g., PdfImport). + # Override in subclasses that handle data extraction differently. + def requires_csv_workflow? + true + end + + # Subclasses that require CSV workflow must override this. + # Non-CSV imports (e.g., PdfImport) can return []. def column_keys raise NotImplementedError, "Subclass must implement column_keys" end diff --git a/app/models/pdf_import.rb b/app/models/pdf_import.rb new file mode 100644 index 000000000..8b25e8bfa --- /dev/null +++ b/app/models/pdf_import.rb @@ -0,0 +1,110 @@ +class PdfImport < Import + has_one_attached :pdf_file + + validates :document_type, inclusion: { in: DOCUMENT_TYPES }, allow_nil: true + + def pdf_uploaded? + pdf_file.attached? + end + + def ai_processed? + ai_summary.present? + end + + def process_with_ai_later + ProcessPdfJob.perform_later(self) + end + + def process_with_ai + provider = Provider::Registry.get_provider(:openai) + raise "AI provider not configured" unless provider + raise "AI provider does not support PDF processing" unless provider.supports_pdf_processing? + + response = provider.process_pdf( + pdf_content: pdf_file_content, + family: family + ) + + unless response.success? + error_message = response.error&.message || "Unknown PDF processing error" + raise error_message + end + + result = response.data + update!( + ai_summary: result.summary, + document_type: result.document_type + ) + + result + end + + def extract_transactions + return unless bank_statement? + + provider = Provider::Registry.get_provider(:openai) + raise "AI provider not configured" unless provider + + response = provider.extract_bank_statement( + pdf_content: pdf_file_content, + family: family + ) + + unless response.success? + error_message = response.error&.message || "Unknown extraction error" + raise error_message + end + + update!(extracted_data: response.data) + response.data + end + + def bank_statement? + document_type == "bank_statement" + end + + def has_extracted_transactions? + extracted_data.present? && extracted_data["transactions"].present? + end + + def extracted_transactions + extracted_data&.dig("transactions") || [] + end + + def send_next_steps_email(user) + PdfImportMailer.with( + user: user, + pdf_import: self + ).next_steps.deliver_later + end + + def uploaded? + pdf_uploaded? + end + + def configured? + ai_processed? + end + + def cleaned? + ai_processed? + end + + def publishable? + false + end + + def column_keys + [] + end + + def requires_csv_workflow? + false + end + + def pdf_file_content + return nil unless pdf_file.attached? + + pdf_file.download + end +end diff --git a/app/models/provider/llm_concept.rb b/app/models/provider/llm_concept.rb index dbd6f0458..5faf233dd 100644 --- a/app/models/provider/llm_concept.rb +++ b/app/models/provider/llm_concept.rb @@ -13,6 +13,16 @@ module Provider::LlmConcept raise NotImplementedError, "Subclasses must implement #auto_detect_merchants" end + PdfProcessingResult = Data.define(:summary, :document_type, :extracted_data) + + def supports_pdf_processing? + false + end + + def process_pdf(pdf_content:, family: nil) + raise NotImplementedError, "Provider does not support PDF processing" + end + ChatMessage = Data.define(:id, :output_text) ChatStreamChunk = Data.define(:type, :data, :usage) ChatResponse = Data.define(:id, :model, :messages, :function_requests) diff --git a/app/models/provider/openai.rb b/app/models/provider/openai.rb index 9ba1d23b0..08ac224f9 100644 --- a/app/models/provider/openai.rb +++ b/app/models/provider/openai.rb @@ -8,6 +8,9 @@ class Provider::Openai < Provider DEFAULT_OPENAI_MODEL_PREFIXES = %w[gpt-4 gpt-5 o1 o3] DEFAULT_MODEL = "gpt-4.1" + # Models that support PDF/vision input (not all OpenAI models have vision capabilities) + VISION_CAPABLE_MODEL_PREFIXES = %w[gpt-4o gpt-4-turbo gpt-4.1 gpt-5 o1 o3].freeze + # Returns the effective model that would be used by the provider # Uses the same logic as Provider::Registry and the initializer def self.effective_model @@ -18,6 +21,7 @@ class Provider::Openai < Provider def initialize(access_token, uri_base: nil, model: nil) client_options = { access_token: access_token } client_options[:uri_base] = uri_base if uri_base.present? + client_options[:request_timeout] = ENV.fetch("OPENAI_REQUEST_TIMEOUT", 60).to_i @client = ::OpenAI::Client.new(**client_options) @uri_base = uri_base @@ -112,6 +116,65 @@ class Provider::Openai < Provider end end + # Can be disabled via ENV for OpenAI-compatible endpoints that don't support vision + # Only vision-capable models (gpt-4o, gpt-4-turbo, gpt-4.1, etc.) support PDF input + def supports_pdf_processing? + return false unless ENV.fetch("OPENAI_SUPPORTS_PDF_PROCESSING", "true").to_s.downcase.in?(%w[true 1 yes]) + + # Custom providers manage their own model capabilities + return true if custom_provider? + + # Check if the configured model supports vision/PDF input + VISION_CAPABLE_MODEL_PREFIXES.any? { |prefix| @default_model.start_with?(prefix) } + end + + def process_pdf(pdf_content:, model: "", family: nil) + raise "Model does not support PDF/vision processing" unless supports_pdf_processing? + + with_provider_response do + effective_model = model.presence || @default_model + + trace = create_langfuse_trace( + name: "openai.process_pdf", + input: { pdf_size: pdf_content&.bytesize } + ) + + result = PdfProcessor.new( + client, + model: effective_model, + pdf_content: pdf_content, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family + ).process + + trace&.update(output: result.to_h) + + result + end + end + + def extract_bank_statement(pdf_content:, model: "", family: nil) + with_provider_response do + effective_model = model.presence || @default_model + + trace = create_langfuse_trace( + name: "openai.extract_bank_statement", + input: { pdf_size: pdf_content&.bytesize } + ) + + result = BankStatementExtractor.new( + client: client, + pdf_content: pdf_content, + model: effective_model + ).extract + + trace&.update(output: { transaction_count: result[:transactions].size }) + + result + end + end + def chat_response( prompt, model:, diff --git a/app/models/provider/openai/bank_statement_extractor.rb b/app/models/provider/openai/bank_statement_extractor.rb new file mode 100644 index 000000000..59456d80b --- /dev/null +++ b/app/models/provider/openai/bank_statement_extractor.rb @@ -0,0 +1,213 @@ +class Provider::Openai::BankStatementExtractor + MAX_CHARS_PER_CHUNK = 3000 + attr_reader :client, :pdf_content, :model + + def initialize(client:, pdf_content:, model:) + @client = client + @pdf_content = pdf_content + @model = model + end + + def extract + pages = extract_pages_from_pdf + raise Provider::Openai::Error, "Could not extract text from PDF" if pages.empty? + + chunks = build_chunks(pages) + Rails.logger.info("BankStatementExtractor: Processing #{chunks.size} chunk(s) from #{pages.size} page(s)") + + all_transactions = [] + metadata = {} + + chunks.each_with_index do |chunk, index| + Rails.logger.info("BankStatementExtractor: Processing chunk #{index + 1}/#{chunks.size}") + result = process_chunk(chunk, index == 0) + + # Tag transactions with chunk index for deduplication + tagged_transactions = (result[:transactions] || []).map { |t| t.merge(chunk_index: index) } + all_transactions.concat(tagged_transactions) + + if index == 0 + metadata = { + account_holder: result[:account_holder], + account_number: result[:account_number], + bank_name: result[:bank_name], + opening_balance: result[:opening_balance], + closing_balance: result[:closing_balance], + period: result[:period] + } + end + + if result[:closing_balance].present? + metadata[:closing_balance] = result[:closing_balance] + end + if result.dig(:period, :end_date).present? + metadata[:period] ||= {} + metadata[:period][:end_date] = result.dig(:period, :end_date) + end + end + + { + transactions: deduplicate_transactions(all_transactions), + period: metadata[:period] || {}, + account_holder: metadata[:account_holder], + account_number: metadata[:account_number], + bank_name: metadata[:bank_name], + opening_balance: metadata[:opening_balance], + closing_balance: metadata[:closing_balance] + } + end + + private + + def extract_pages_from_pdf + return [] if pdf_content.blank? + + reader = PDF::Reader.new(StringIO.new(pdf_content)) + reader.pages.map(&:text).reject(&:blank?) + rescue => e + Rails.logger.error("Failed to extract text from PDF: #{e.message}") + [] + end + + def build_chunks(pages) + chunks = [] + current_chunk = [] + current_size = 0 + + pages.each do |page_text| + if page_text.length > MAX_CHARS_PER_CHUNK + chunks << current_chunk.join("\n\n") if current_chunk.any? + current_chunk = [] + current_size = 0 + chunks << page_text + next + end + + if current_size + page_text.length > MAX_CHARS_PER_CHUNK && current_chunk.any? + chunks << current_chunk.join("\n\n") + current_chunk = [] + current_size = 0 + end + + current_chunk << page_text + current_size += page_text.length + end + + chunks << current_chunk.join("\n\n") if current_chunk.any? + chunks + end + + def process_chunk(text, is_first_chunk) + params = { + model: model, + messages: [ + { role: "system", content: is_first_chunk ? instructions_with_metadata : instructions_transactions_only }, + { role: "user", content: "Extract transactions:\n\n#{text}" } + ], + response_format: { type: "json_object" } + } + + response = client.chat(parameters: params) + content = response.dig("choices", 0, "message", "content") + + raise Provider::Openai::Error, "No response from AI" if content.blank? + + parsed = parse_json_response(content) + + { + transactions: normalize_transactions(parsed["transactions"] || []), + period: { + start_date: parsed.dig("statement_period", "start_date"), + end_date: parsed.dig("statement_period", "end_date") + }, + account_holder: parsed["account_holder"], + account_number: parsed["account_number"], + bank_name: parsed["bank_name"], + opening_balance: parsed["opening_balance"], + closing_balance: parsed["closing_balance"] + } + end + + def parse_json_response(content) + cleaned = content.gsub(%r{^```json\s*}i, "").gsub(/```\s*$/, "").strip + JSON.parse(cleaned) + rescue JSON::ParserError => e + Rails.logger.error("BankStatementExtractor JSON parse error: #{e.message} (content_length=#{content.to_s.bytesize})") + { "transactions" => [] } + end + + def deduplicate_transactions(transactions) + # Deduplicates transactions that appear in consecutive chunks (chunking artifacts). + # + # KNOWN LIMITATION: Legitimate duplicate transactions (same date, amount, merchant) + # that happen to appear in adjacent chunks will be incorrectly deduplicated. + # This is an acceptable trade-off since chunking artifacts are more common than + # true same-day duplicates at chunk boundaries. Transactions within the same + # chunk are always preserved regardless of similarity. + seen = Set.new + transactions.select do |t| + # Create key without chunk_index for deduplication + key = [ t[:date], t[:amount], t[:name], t[:chunk_index] ] + + # Check if we've seen this exact transaction in a different chunk + duplicate = seen.any? do |prev_key| + prev_key[0..2] == key[0..2] && (prev_key[3] - key[3]).abs <= 1 + end + + seen << key + !duplicate + end.map { |t| t.except(:chunk_index) } + end + + def normalize_transactions(transactions) + transactions.map do |txn| + { + date: parse_date(txn["date"]), + amount: parse_amount(txn["amount"]), + name: txn["description"] || txn["name"] || txn["merchant"], + category: infer_category(txn), + notes: txn["reference"] || txn["notes"] + } + end.compact + end + + def parse_date(date_str) + return nil if date_str.blank? + + Date.parse(date_str).strftime("%Y-%m-%d") + rescue ArgumentError + nil + end + + def parse_amount(amount) + return nil if amount.nil? + + if amount.is_a?(Numeric) + amount.to_f + else + amount.to_s.gsub(/[^0-9.\-]/, "").to_f + end + end + + def infer_category(txn) + txn["category"] || txn["type"] + end + + def instructions_with_metadata + <<~INSTRUCTIONS.strip + Extract bank statement data as JSON. Return: + {"bank_name":"...","account_holder":"...","account_number":"last 4 digits","statement_period":{"start_date":"YYYY-MM-DD","end_date":"YYYY-MM-DD"},"opening_balance":0.00,"closing_balance":0.00,"transactions":[{"date":"YYYY-MM-DD","description":"...","amount":-0.00}]} + + Rules: Negative amounts for debits/expenses, positive for credits/deposits. Dates as YYYY-MM-DD. Extract ALL transactions. JSON only, no markdown. + INSTRUCTIONS + end + + def instructions_transactions_only + <<~INSTRUCTIONS.strip + Extract transactions from bank statement text as JSON. Return: + {"transactions":[{"date":"YYYY-MM-DD","description":"...","amount":-0.00}]} + + Rules: Negative amounts for debits/expenses, positive for credits/deposits. Dates as YYYY-MM-DD. Extract ALL transactions. JSON only, no markdown. + INSTRUCTIONS + end +end diff --git a/app/models/provider/openai/pdf_processor.rb b/app/models/provider/openai/pdf_processor.rb new file mode 100644 index 000000000..b99caa77c --- /dev/null +++ b/app/models/provider/openai/pdf_processor.rb @@ -0,0 +1,265 @@ +class Provider::Openai::PdfProcessor + include Provider::Openai::Concerns::UsageRecorder + + attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family + + def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil) + @client = client + @model = model + @pdf_content = pdf_content + @custom_provider = custom_provider + @langfuse_trace = langfuse_trace + @family = family + end + + def process + span = langfuse_trace&.span(name: "process_pdf_api_call", input: { + model: model.presence || Provider::Openai::DEFAULT_MODEL, + pdf_size: pdf_content&.bytesize + }) + + # Try text extraction first (works with all models) + # Fall back to vision API with images if text extraction fails (for scanned PDFs) + response = begin + process_with_text_extraction + rescue Provider::Openai::Error => e + Rails.logger.warn("Text extraction failed: #{e.message}, trying vision API with images") + process_with_vision + end + + span&.end(output: response.to_h) + response + rescue => e + span&.end(output: { error: e.message }, level: "ERROR") + raise + end + + def instructions + <<~INSTRUCTIONS.strip + You are a financial document analysis assistant. Your job is to analyze uploaded PDF documents + and provide a structured summary of what the document contains. + + For each document, you must determine: + + 1. **Document Type**: Classify the document as one of the following: + - `bank_statement`: A bank account statement showing transactions, balances, and account activity + - `credit_card_statement`: A credit card statement showing charges, payments, and balances + - `investment_statement`: An investment/brokerage statement showing holdings, trades, or portfolio performance + - `financial_document`: General financial documents like tax forms, receipts, invoices, or financial reports + - `contract`: Legal agreements, loan documents, terms of service, or policy documents + - `other`: Any document that doesn't fit the above categories + + 2. **Summary**: Provide a concise summary of the document that includes: + - The issuing institution or company name (if identifiable) + - The date range or statement period (if applicable) + - Key financial figures (account balances, total transactions, etc.) + - The account holder's name (if visible, use "Account Holder" if redacted) + - Any notable items or important information + + 3. **Extracted Data**: If the document is a statement with transactions, extract key metadata: + - Number of transactions (if countable) + - Statement period (start and end dates) + - Opening and closing balances (if visible) + - Currency used + + IMPORTANT GUIDELINES: + - Be factual and precise - only report what you can clearly see in the document + - If information is unclear or redacted, note it as "not clearly visible" or "redacted" + - Do NOT make assumptions about data you cannot see + - For statements with many transactions, provide a count rather than listing each one + - Focus on providing actionable information that helps the user understand what they uploaded + - If the document is unreadable or the PDF is corrupted, indicate this clearly + + Respond with ONLY valid JSON in this exact format (no markdown code blocks, no other text): + { + "document_type": "bank_statement|credit_card_statement|investment_statement|financial_document|contract|other", + "summary": "A clear, concise summary of the document contents...", + "extracted_data": { + "institution_name": "Name of bank/company or null", + "statement_period_start": "YYYY-MM-DD or null", + "statement_period_end": "YYYY-MM-DD or null", + "transaction_count": number or null, + "opening_balance": number or null, + "closing_balance": number or null, + "currency": "USD/EUR/etc or null", + "account_holder": "Name or null" + } + } + INSTRUCTIONS + end + + private + + PdfProcessingResult = Provider::LlmConcept::PdfProcessingResult + + def process_with_text_extraction + effective_model = model.presence || Provider::Openai::DEFAULT_MODEL + + # Extract text from PDF using pdf-reader gem + pdf_text = extract_text_from_pdf + raise Provider::Openai::Error, "Could not extract text from PDF" if pdf_text.blank? + + # Truncate if too long (max ~100k chars to stay within token limits) + pdf_text = pdf_text.truncate(100_000) if pdf_text.length > 100_000 + + params = { + model: effective_model, + messages: [ + { role: "system", content: instructions }, + { + role: "user", + content: "Please analyze the following document text and provide a structured summary:\n\n#{pdf_text}" + } + ], + response_format: { type: "json_object" } + } + + response = client.chat(parameters: params) + + Rails.logger.info("Tokens used to process PDF: #{response.dig("usage", "total_tokens")}") + + record_usage( + effective_model, + response.dig("usage"), + operation: "process_pdf", + metadata: { pdf_size: pdf_content&.bytesize } + ) + + parse_response_generic(response) + end + + def extract_text_from_pdf + return nil if pdf_content.blank? + + reader = PDF::Reader.new(StringIO.new(pdf_content)) + text_parts = [] + + reader.pages.each_with_index do |page, index| + text_parts << "--- Page #{index + 1} ---" + text_parts << page.text + end + + text_parts.join("\n\n") + rescue => e + Rails.logger.error("Failed to extract text from PDF: #{e.message}") + nil + end + + def process_with_vision + effective_model = model.presence || Provider::Openai::DEFAULT_MODEL + + # Convert PDF to images using pdftoppm + images_base64 = convert_pdf_to_images + raise Provider::Openai::Error, "Could not convert PDF to images" if images_base64.blank? + + # Build message content with images (max 5 pages to avoid token limits) + content = [] + images_base64.first(5).each do |img_base64| + content << { + type: "image_url", + image_url: { + url: "data:image/png;base64,#{img_base64}", + detail: "low" + } + } + end + content << { + type: "text", + text: "Please analyze this PDF document (#{images_base64.size} pages total, showing first #{[ images_base64.size, 5 ].min}) and respond with valid JSON only." + } + + # Note: response_format is not compatible with vision, so we ask for JSON in the prompt + params = { + model: effective_model, + messages: [ + { role: "system", content: instructions + "\n\nIMPORTANT: Respond with valid JSON only, no markdown or other formatting." }, + { role: "user", content: content } + ], + max_tokens: 4096 + } + + response = client.chat(parameters: params) + + Rails.logger.info("Tokens used to process PDF via vision: #{response.dig("usage", "total_tokens")}") + + record_usage( + effective_model, + response.dig("usage"), + operation: "process_pdf_vision", + metadata: { pdf_size: pdf_content&.bytesize, pages: images_base64.size } + ) + + parse_response_generic(response) + end + + def convert_pdf_to_images + return [] if pdf_content.blank? + + Dir.mktmpdir do |tmpdir| + pdf_path = File.join(tmpdir, "input.pdf") + File.binwrite(pdf_path, pdf_content) + + # Convert PDF to PNG images using pdftoppm + output_prefix = File.join(tmpdir, "page") + system("pdftoppm", "-png", "-r", "150", pdf_path, output_prefix) + + # Read all generated images + image_files = Dir.glob(File.join(tmpdir, "page-*.png")).sort + image_files.map do |img_path| + Base64.strict_encode64(File.binread(img_path)) + end + end + rescue => e + Rails.logger.error("Failed to convert PDF to images: #{e.message}") + [] + end + + def parse_response_generic(response) + raw = response.dig("choices", 0, "message", "content") + parsed = parse_json_flexibly(raw) + + build_result(parsed) + end + + def build_result(parsed) + PdfProcessingResult.new( + summary: parsed["summary"], + document_type: normalize_document_type(parsed["document_type"]), + extracted_data: parsed["extracted_data"] || {} + ) + end + + def normalize_document_type(doc_type) + return "other" if doc_type.blank? + + normalized = doc_type.to_s.strip.downcase.gsub(/\s+/, "_") + Import::DOCUMENT_TYPES.include?(normalized) ? normalized : "other" + end + + def parse_json_flexibly(raw) + return {} if raw.blank? + + # Try direct parse first + JSON.parse(raw) + rescue JSON::ParserError + # Try to extract JSON from markdown code blocks + if raw =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m + begin + return JSON.parse($1) + rescue JSON::ParserError + # Continue to next strategy + end + end + + # Try to find any JSON object + if raw =~ /(\{[\s\S]*\})/m + begin + return JSON.parse($1) + rescue JSON::ParserError + # Fall through to error + end + end + + raise Provider::Openai::Error, "Could not parse JSON from PDF processing response: #{raw.truncate(200)}" + end +end diff --git a/app/views/import/uploads/show.html.erb b/app/views/import/uploads/show.html.erb index 7227ff352..338654578 100644 --- a/app/views/import/uploads/show.html.erb +++ b/app/views/import/uploads/show.html.erb @@ -44,7 +44,7 @@
- <%= form.file_field :csv_file, class: "hidden", "data-auto-submit-form-target": "auto", "data-file-upload-target": "input", "data-drag-and-drop-import-target": "input" %> + <%= form.file_field :import_file, class: "hidden", "data-auto-submit-form-target": "auto", "data-file-upload-target": "input", "data-drag-and-drop-import-target": "input" %> diff --git a/app/views/imports/_pdf_import.html.erb b/app/views/imports/_pdf_import.html.erb new file mode 100644 index 000000000..f2b1ea969 --- /dev/null +++ b/app/views/imports/_pdf_import.html.erb @@ -0,0 +1,84 @@ +<%# locals: (import:) %> + +<%= t("imports.pdf_import.processing_description") %>
+<%= t("imports.pdf_import.failed_description") %>
+ <% if import.error.present? %> +<%= import.error %>
+ <% end %> +<%= t("imports.pdf_import.complete_description") %>
++ <%= t("imports.document_types.#{import.document_type}") %> +
++ <%= import.ai_summary %> +
+<%= t("imports.pdf_import.email_sent_notice") %>
+<%= t("imports.pdf_import.unknown_state_description") %>
+<%= t(".intro", product: product_name) %>
+ +<%= @pdf_import.document_type.present? ? t("imports.document_types.#{@pdf_import.document_type}") : t("imports.document_types.other") %>
+ +<%= @pdf_import.ai_summary %>
+ +<% if @pdf_import.document_type.in?(%w[bank_statement credit_card_statement investment_statement]) %> +<%= t(".transactions_note") %>
+<% else %> +<%= t(".document_stored_note") %>
+<% end %> + +<%= t(".next_steps_intro") %>
+ +