diff --git a/.env.local.example b/.env.local.example index e91e7cf0e..b9ddcabf3 100644 --- a/.env.local.example +++ b/.env.local.example @@ -28,6 +28,8 @@ TWELVE_DATA_API_KEY = OPENAI_ACCESS_TOKEN = OPENAI_URI_BASE = OPENAI_MODEL = +# OPENAI_REQUEST_TIMEOUT: Request timeout in seconds (default: 60) +# OPENAI_SUPPORTS_PDF_PROCESSING: Set to false for endpoints without vision support (default: true) # (example: LM Studio/Docker config) OpenAI-compatible API endpoint config # OPENAI_URI_BASE = http://host.docker.internal:1234/ diff --git a/Gemfile b/Gemfile index 64dd5099e..f179d6798 100644 --- a/Gemfile +++ b/Gemfile @@ -81,6 +81,7 @@ gem "rotp", "~> 6.3" gem "rqrcode", "~> 3.0" gem "activerecord-import" gem "rubyzip", "~> 2.3" +gem "pdf-reader", "~> 2.12" # OpenID Connect, OAuth & SAML authentication gem "omniauth", "~> 2.1" diff --git a/Gemfile.lock b/Gemfile.lock index dfc8ee3c9..852d372c1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,7 @@ GEM remote: https://rubygems.org/ specs: + Ascii85 (2.0.1) aasm (5.5.1) concurrent-ruby (~> 1.0) actioncable (7.2.2.2) @@ -79,6 +80,7 @@ GEM addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) aes_key_wrap (1.1.0) + afm (1.0.0) after_commit_everywhere (1.6.0) activerecord (>= 4.2) activesupport @@ -232,6 +234,7 @@ GEM globalid (1.2.1) activesupport (>= 6.1) hashdiff (1.2.0) + hashery (2.1.2) hashie (5.0.0) heapy (0.2.0) thor @@ -446,6 +449,12 @@ GEM parser (3.3.8.0) ast (~> 2.4.1) racc + pdf-reader (2.15.1) + Ascii85 (>= 1.0, < 3.0, != 2.0.0) + afm (>= 0.2.1, < 2) + hashery (~> 2.0) + ruby-rc4 + ttfunk pg (1.5.9) plaid (41.0.0) faraday (>= 1.0.1, < 3.0) @@ -629,6 +638,7 @@ GEM faraday (>= 1) faraday-multipart (>= 1) ruby-progressbar (1.13.0) + ruby-rc4 (0.1.5) ruby-saml (1.18.1) nokogiri (>= 1.13.10) rexml @@ -712,6 +722,8 @@ GEM unicode-display_width (>= 1.1.1, < 4) thor (1.4.0) timeout (0.4.3) + ttfunk (1.8.0) + bigdecimal (~> 3.1) turbo-rails (2.0.16) actionpack (>= 7.1.0) railties (>= 7.1.0) @@ -818,6 +830,7 @@ 
DEPENDENCIES omniauth_openid_connect ostruct pagy + pdf-reader (~> 2.12) pg (~> 1.5) plaid posthog-ruby diff --git a/app/controllers/api/v1/imports_controller.rb b/app/controllers/api/v1/imports_controller.rb index 2b6a5a5af..b3b048bba 100644 --- a/app/controllers/api/v1/imports_controller.rb +++ b/app/controllers/api/v1/imports_controller.rb @@ -67,7 +67,7 @@ class Api::V1::ImportsController < Api::V1::BaseController }, status: :unprocessable_entity end - unless Import::ALLOWED_MIME_TYPES.include?(file.content_type) + unless Import::ALLOWED_CSV_MIME_TYPES.include?(file.content_type) return render json: { error: "invalid_file_type", message: "Invalid file type. Please upload a CSV file." diff --git a/app/controllers/import/uploads_controller.rb b/app/controllers/import/uploads_controller.rb index e51b52787..a9a185d51 100644 --- a/app/controllers/import/uploads_controller.rb +++ b/app/controllers/import/uploads_controller.rb @@ -33,7 +33,7 @@ class Import::UploadsController < ApplicationController end def csv_str - @csv_str ||= upload_params[:csv_file]&.read || upload_params[:raw_file_str] + @csv_str ||= upload_params[:import_file]&.read || upload_params[:raw_file_str] end def csv_valid?(str) @@ -48,6 +48,6 @@ class Import::UploadsController < ApplicationController end def upload_params - params.require(:import).permit(:raw_file_str, :csv_file, :col_sep) + params.require(:import).permit(:raw_file_str, :import_file, :col_sep) end end diff --git a/app/controllers/imports_controller.rb b/app/controllers/imports_controller.rb index 227e94866..88a346838 100644 --- a/app/controllers/imports_controller.rb +++ b/app/controllers/imports_controller.rb @@ -25,6 +25,18 @@ class ImportsController < ApplicationController end def create + file = import_params[:import_file] + + # Handle PDF file uploads - process with AI + if file.present? 
&& Import::ALLOWED_PDF_MIME_TYPES.include?(file.content_type) + unless valid_pdf_file?(file) + redirect_to new_import_path, alert: t("imports.create.invalid_pdf") + return + end + create_pdf_import(file) + return + end + type = params.dig(:import, :type).to_s type = "TransactionImport" unless Import::TYPES.include?(type) @@ -35,35 +47,35 @@ class ImportsController < ApplicationController date_format: Current.family.date_format, ) - if import_params[:csv_file].present? - file = import_params[:csv_file] - + if file.present? if file.size > Import::MAX_CSV_SIZE import.destroy - redirect_to new_import_path, alert: "File is too large. Maximum size is #{Import::MAX_CSV_SIZE / 1.megabyte}MB." + redirect_to new_import_path, alert: t("imports.create.file_too_large", max_size: Import::MAX_CSV_SIZE / 1.megabyte) return end - unless Import::ALLOWED_MIME_TYPES.include?(file.content_type) + unless Import::ALLOWED_CSV_MIME_TYPES.include?(file.content_type) import.destroy - redirect_to new_import_path, alert: "Invalid file type. Please upload a CSV file." + redirect_to new_import_path, alert: t("imports.create.invalid_file_type") return end # Stream reading is not fully applicable here as we store the raw string in the DB, # but we have validated size beforehand to prevent memory exhaustion from massive files. import.update!(raw_file_str: file.read) - redirect_to import_configuration_path(import), notice: "CSV uploaded successfully." + redirect_to import_configuration_path(import), notice: t("imports.create.csv_uploaded") else redirect_to import_upload_path(import) end end def show + return unless @import.requires_csv_workflow? + if !@import.uploaded? - redirect_to import_upload_path(@import), alert: "Please finalize your file upload." + redirect_to import_upload_path(@import), alert: t("imports.show.finalize_upload") elsif !@import.publishable? - redirect_to import_confirm_path(@import), alert: "Please finalize your mappings before proceeding." 
+ redirect_to import_confirm_path(@import), alert: t("imports.show.finalize_mappings") end end @@ -93,6 +105,25 @@ class ImportsController < ApplicationController end def import_params - params.require(:import).permit(:csv_file) + params.require(:import).permit(:import_file) + end + + def create_pdf_import(file) + if file.size > Import::MAX_PDF_SIZE + redirect_to new_import_path, alert: t("imports.create.pdf_too_large", max_size: Import::MAX_PDF_SIZE / 1.megabyte) + return + end + + pdf_import = Current.family.imports.create!(type: "PdfImport") + pdf_import.pdf_file.attach(file) + pdf_import.process_with_ai_later + + redirect_to import_path(pdf_import), notice: t("imports.create.pdf_processing") + end + + def valid_pdf_file?(file) + header = file.read(5) + file.rewind + header&.start_with?("%PDF-") end end diff --git a/app/jobs/process_pdf_job.rb b/app/jobs/process_pdf_job.rb new file mode 100644 index 000000000..25c31f11f --- /dev/null +++ b/app/jobs/process_pdf_job.rb @@ -0,0 +1,54 @@ +class ProcessPdfJob < ApplicationJob + queue_as :medium_priority + + def perform(pdf_import) + return unless pdf_import.is_a?(PdfImport) + return unless pdf_import.pdf_uploaded? + return if pdf_import.status == "complete" + return if pdf_import.ai_processed? && (!pdf_import.bank_statement? || pdf_import.has_extracted_transactions?) + + pdf_import.update!(status: :importing) + + begin + pdf_import.process_with_ai + + # For bank statements, extract transactions + if pdf_import.bank_statement? 
+ Rails.logger.info("ProcessPdfJob: Extracting transactions for bank statement import #{pdf_import.id}") + pdf_import.extract_transactions + Rails.logger.info("ProcessPdfJob: Extracted #{pdf_import.extracted_transactions.size} transactions") + end + + # Find the user who created this import (first admin or any user in the family) + user = pdf_import.family.users.find_by(role: :admin) || pdf_import.family.users.first + + if user + pdf_import.send_next_steps_email(user) + end + + pdf_import.update!(status: :complete) + rescue StandardError => e + sanitized_error = sanitize_error_message(e) + Rails.logger.error("PDF processing failed for import #{pdf_import.id}: #{e.class.name} - #{sanitized_error}") + begin + pdf_import.update!(status: :failed, error: sanitized_error) + rescue StandardError => update_error + Rails.logger.error("Failed to update import status: #{update_error.message}") + end + raise + end + end + + private + + def sanitize_error_message(error) + case error + when RuntimeError, ArgumentError + I18n.t("imports.pdf_import.processing_failed_with_message", + message: error.message.truncate(500)) + else + I18n.t("imports.pdf_import.processing_failed_generic", + error: error.class.name.demodulize) + end + end +end diff --git a/app/mailers/pdf_import_mailer.rb b/app/mailers/pdf_import_mailer.rb new file mode 100644 index 000000000..5f9f759d7 --- /dev/null +++ b/app/mailers/pdf_import_mailer.rb @@ -0,0 +1,12 @@ +class PdfImportMailer < ApplicationMailer + def next_steps + @user = params[:user] + @pdf_import = params[:pdf_import] + @import_url = import_url(@pdf_import) + + mail( + to: @user.email, + subject: t(".subject", product: product_name) + ) + end +end diff --git a/app/models/assistant/configurable.rb b/app/models/assistant/configurable.rb index a2898c30b..2aae1eb06 100644 --- a/app/models/assistant/configurable.rb +++ b/app/models/assistant/configurable.rb @@ -19,7 +19,8 @@ module Assistant::Configurable Assistant::Function::GetAccounts, 
Assistant::Function::GetHoldings, Assistant::Function::GetBalanceSheet, - Assistant::Function::GetIncomeStatement + Assistant::Function::GetIncomeStatement, + Assistant::Function::ImportBankStatement ] end diff --git a/app/models/assistant/function/import_bank_statement.rb b/app/models/assistant/function/import_bank_statement.rb new file mode 100644 index 000000000..b0cd02906 --- /dev/null +++ b/app/models/assistant/function/import_bank_statement.rb @@ -0,0 +1,188 @@ +require "csv" + +class Assistant::Function::ImportBankStatement < Assistant::Function + class << self + def name + "import_bank_statement" + end + + def description + <<~INSTRUCTIONS + Use this to import transactions from a bank statement PDF that has already been uploaded. + + This function will: + 1. Extract transaction data from the PDF using AI + 2. Create a transaction import with the extracted data + 3. Return the import ID and extracted transactions for review + + The PDF must have already been uploaded via the PDF import feature. + Only use this for PDFs that are identified as bank statements. + + Example: + + ``` + import_bank_statement({ + pdf_import_id: "abc123-def456", + account_id: "xyz789" + }) + ``` + + If account_id is not provided, you should ask the user which account to import to. + INSTRUCTIONS + end + end + + def strict_mode? + false + end + + def params_schema + build_schema( + required: [ "pdf_import_id" ], + properties: { + pdf_import_id: { + type: "string", + description: "The ID of the PDF import to extract transactions from" + }, + account_id: { + type: "string", + description: "The ID of the account to import transactions into. If not provided, will return available accounts." 
+ } + } + ) + end + + def call(params = {}) + pdf_import = family.imports.find_by(id: params["pdf_import_id"], type: "PdfImport") + + unless pdf_import + return { + success: false, + error: "PDF import not found", + message: "Could not find a PDF import with ID: #{params["pdf_import_id"]}" + } + end + + unless pdf_import.document_type == "bank_statement" + return { + success: false, + error: "not_bank_statement", + message: "This PDF is not a bank statement. Document type: #{pdf_import.document_type}", + available_actions: [ "Use a different PDF that is a bank statement" ] + } + end + + # If no account specified, return available accounts + if params["account_id"].blank? + return { + success: false, + error: "account_required", + message: "Please specify which account to import transactions into", + available_accounts: family.accounts.visible.depository.map { |a| { id: a.id, name: a.name } } + } + end + + account = family.accounts.find_by(id: params["account_id"]) + unless account + return { + success: false, + error: "account_not_found", + message: "Account not found", + available_accounts: family.accounts.visible.depository.map { |a| { id: a.id, name: a.name } } + } + end + + # Extract transactions from the PDF using provider + provider = Provider::Registry.get_provider(:openai) + unless provider + return { + success: false, + error: "provider_not_configured", + message: "OpenAI provider is not configured" + } + end + + response = provider.extract_bank_statement( + pdf_content: pdf_import.pdf_file_content, + model: openai_model, + family: family + ) + + unless response.success? + error_message = response.error&.message || "Unknown extraction error" + return { + success: false, + error: "extraction_failed", + message: "Failed to extract transactions: #{error_message}" + } + end + + result = response.data + + if result[:transactions].blank? 
+ return { + success: false, + error: "no_transactions_found", + message: "Could not extract any transactions from the bank statement" + } + end + + # Create a CSV from extracted transactions + csv_content = generate_csv(result[:transactions]) + + # Create a TransactionImport + import = family.imports.create!( + type: "TransactionImport", + account: account, + raw_file_str: csv_content, + date_col_label: "date", + amount_col_label: "amount", + name_col_label: "name", + category_col_label: "category", + notes_col_label: "notes", + date_format: "%Y-%m-%d", + signage_convention: "inflows_positive" + ) + + import.generate_rows_from_csv + + { + success: true, + import_id: import.id, + transaction_count: result[:transactions].size, + transactions_preview: result[:transactions].first(5), + statement_period: result[:period], + account_holder: result[:account_holder], + message: "Successfully extracted #{result[:transactions].size} transactions. Import created with ID: #{import.id}. Review and publish when ready." 
+ } + rescue Provider::ProviderError, Faraday::Error, Timeout::Error, RuntimeError => e + Rails.logger.error("ImportBankStatement error: #{e.class.name} - #{e.message}") + Rails.logger.error(e.backtrace.first(10).join("\n")) + { + success: false, + error: "extraction_failed", + message: "Failed to extract transactions: #{e.message.truncate(200)}" + } + end + + private + + def generate_csv(transactions) + CSV.generate do |csv| + csv << %w[date amount name category notes] + transactions.each do |txn| + csv << [ + txn[:date], + txn[:amount], + txn[:name] || txn[:description], + txn[:category], + txn[:notes] + ] + end + end + end + + def openai_model + ENV["OPENAI_MODEL"].presence || Provider::Openai::DEFAULT_MODEL + end +end diff --git a/app/models/import.rb b/app/models/import.rb index 141a1ce05..203ed3a1a 100644 --- a/app/models/import.rb +++ b/app/models/import.rb @@ -3,9 +3,13 @@ class Import < ApplicationRecord MappingError = Class.new(StandardError) MAX_CSV_SIZE = 10.megabytes - ALLOWED_MIME_TYPES = %w[text/csv text/plain application/vnd.ms-excel application/csv].freeze + MAX_PDF_SIZE = 25.megabytes + ALLOWED_CSV_MIME_TYPES = %w[text/csv text/plain application/vnd.ms-excel application/csv].freeze + ALLOWED_PDF_MIME_TYPES = %w[application/pdf].freeze - TYPES = %w[TransactionImport TradeImport AccountImport MintImport CategoryImport RuleImport].freeze + DOCUMENT_TYPES = %w[bank_statement credit_card_statement investment_statement financial_document contract other].freeze + + TYPES = %w[TransactionImport TradeImport AccountImport MintImport CategoryImport RuleImport PdfImport].freeze SIGNAGE_CONVENTIONS = %w[inflows_positive inflows_negative] SEPARATORS = [ [ "Comma (,)", "," ], [ "Semicolon (;)", ";" ] ].freeze @@ -134,6 +138,14 @@ class Import < ApplicationRecord [] end + # Returns false for import types that don't need CSV column mapping (e.g., PdfImport). + # Override in subclasses that handle data extraction differently. + def requires_csv_workflow? 
+ true + end + + # Subclasses that require CSV workflow must override this. + # Non-CSV imports (e.g., PdfImport) can return []. def column_keys raise NotImplementedError, "Subclass must implement column_keys" end diff --git a/app/models/pdf_import.rb b/app/models/pdf_import.rb new file mode 100644 index 000000000..8b25e8bfa --- /dev/null +++ b/app/models/pdf_import.rb @@ -0,0 +1,110 @@ +class PdfImport < Import + has_one_attached :pdf_file + + validates :document_type, inclusion: { in: DOCUMENT_TYPES }, allow_nil: true + + def pdf_uploaded? + pdf_file.attached? + end + + def ai_processed? + ai_summary.present? + end + + def process_with_ai_later + ProcessPdfJob.perform_later(self) + end + + def process_with_ai + provider = Provider::Registry.get_provider(:openai) + raise "AI provider not configured" unless provider + raise "AI provider does not support PDF processing" unless provider.supports_pdf_processing? + + response = provider.process_pdf( + pdf_content: pdf_file_content, + family: family + ) + + unless response.success? + error_message = response.error&.message || "Unknown PDF processing error" + raise error_message + end + + result = response.data + update!( + ai_summary: result.summary, + document_type: result.document_type + ) + + result + end + + def extract_transactions + return unless bank_statement? + + provider = Provider::Registry.get_provider(:openai) + raise "AI provider not configured" unless provider + + response = provider.extract_bank_statement( + pdf_content: pdf_file_content, + family: family + ) + + unless response.success? + error_message = response.error&.message || "Unknown extraction error" + raise error_message + end + + update!(extracted_data: response.data) + response.data + end + + def bank_statement? + document_type == "bank_statement" + end + + def has_extracted_transactions? + extracted_data.present? && extracted_data["transactions"].present? 
+ end + + def extracted_transactions + extracted_data&.dig("transactions") || [] + end + + def send_next_steps_email(user) + PdfImportMailer.with( + user: user, + pdf_import: self + ).next_steps.deliver_later + end + + def uploaded? + pdf_uploaded? + end + + def configured? + ai_processed? + end + + def cleaned? + ai_processed? + end + + def publishable? + false + end + + def column_keys + [] + end + + def requires_csv_workflow? + false + end + + def pdf_file_content + return nil unless pdf_file.attached? + + pdf_file.download + end +end diff --git a/app/models/provider/llm_concept.rb b/app/models/provider/llm_concept.rb index dbd6f0458..5faf233dd 100644 --- a/app/models/provider/llm_concept.rb +++ b/app/models/provider/llm_concept.rb @@ -13,6 +13,16 @@ module Provider::LlmConcept raise NotImplementedError, "Subclasses must implement #auto_detect_merchants" end + PdfProcessingResult = Data.define(:summary, :document_type, :extracted_data) + + def supports_pdf_processing? + false + end + + def process_pdf(pdf_content:, family: nil) + raise NotImplementedError, "Provider does not support PDF processing" + end + ChatMessage = Data.define(:id, :output_text) ChatStreamChunk = Data.define(:type, :data, :usage) ChatResponse = Data.define(:id, :model, :messages, :function_requests) diff --git a/app/models/provider/openai.rb b/app/models/provider/openai.rb index 9ba1d23b0..08ac224f9 100644 --- a/app/models/provider/openai.rb +++ b/app/models/provider/openai.rb @@ -8,6 +8,9 @@ class Provider::Openai < Provider DEFAULT_OPENAI_MODEL_PREFIXES = %w[gpt-4 gpt-5 o1 o3] DEFAULT_MODEL = "gpt-4.1" + # Models that support PDF/vision input (not all OpenAI models have vision capabilities) + VISION_CAPABLE_MODEL_PREFIXES = %w[gpt-4o gpt-4-turbo gpt-4.1 gpt-5 o1 o3].freeze + # Returns the effective model that would be used by the provider # Uses the same logic as Provider::Registry and the initializer def self.effective_model @@ -18,6 +21,7 @@ class Provider::Openai < Provider def 
initialize(access_token, uri_base: nil, model: nil) client_options = { access_token: access_token } client_options[:uri_base] = uri_base if uri_base.present? + client_options[:request_timeout] = ENV.fetch("OPENAI_REQUEST_TIMEOUT", 60).to_i @client = ::OpenAI::Client.new(**client_options) @uri_base = uri_base @@ -112,6 +116,65 @@ class Provider::Openai < Provider end end + # Can be disabled via ENV for OpenAI-compatible endpoints that don't support vision + # Only vision-capable models (gpt-4o, gpt-4-turbo, gpt-4.1, etc.) support PDF input + def supports_pdf_processing? + return false unless ENV.fetch("OPENAI_SUPPORTS_PDF_PROCESSING", "true").to_s.downcase.in?(%w[true 1 yes]) + + # Custom providers manage their own model capabilities + return true if custom_provider? + + # Check if the configured model supports vision/PDF input + VISION_CAPABLE_MODEL_PREFIXES.any? { |prefix| @default_model.start_with?(prefix) } + end + + def process_pdf(pdf_content:, model: "", family: nil) + raise "Model does not support PDF/vision processing" unless supports_pdf_processing? 
+ + with_provider_response do + effective_model = model.presence || @default_model + + trace = create_langfuse_trace( + name: "openai.process_pdf", + input: { pdf_size: pdf_content&.bytesize } + ) + + result = PdfProcessor.new( + client, + model: effective_model, + pdf_content: pdf_content, + custom_provider: custom_provider?, + langfuse_trace: trace, + family: family + ).process + + trace&.update(output: result.to_h) + + result + end + end + + def extract_bank_statement(pdf_content:, model: "", family: nil) + with_provider_response do + effective_model = model.presence || @default_model + + trace = create_langfuse_trace( + name: "openai.extract_bank_statement", + input: { pdf_size: pdf_content&.bytesize } + ) + + result = BankStatementExtractor.new( + client: client, + pdf_content: pdf_content, + model: effective_model + ).extract + + trace&.update(output: { transaction_count: result[:transactions].size }) + + result + end + end + def chat_response( prompt, model:, diff --git a/app/models/provider/openai/bank_statement_extractor.rb b/app/models/provider/openai/bank_statement_extractor.rb new file mode 100644 index 000000000..59456d80b --- /dev/null +++ b/app/models/provider/openai/bank_statement_extractor.rb @@ -0,0 +1,213 @@ +class Provider::Openai::BankStatementExtractor + MAX_CHARS_PER_CHUNK = 3000 + attr_reader :client, :pdf_content, :model + + def initialize(client:, pdf_content:, model:) + @client = client + @pdf_content = pdf_content + @model = model + end + + def extract + pages = extract_pages_from_pdf + raise Provider::Openai::Error, "Could not extract text from PDF" if pages.empty? 
+ + chunks = build_chunks(pages) + Rails.logger.info("BankStatementExtractor: Processing #{chunks.size} chunk(s) from #{pages.size} page(s)") + + all_transactions = [] + metadata = {} + + chunks.each_with_index do |chunk, index| + Rails.logger.info("BankStatementExtractor: Processing chunk #{index + 1}/#{chunks.size}") + result = process_chunk(chunk, index == 0) + + # Tag transactions with chunk index for deduplication + tagged_transactions = (result[:transactions] || []).map { |t| t.merge(chunk_index: index) } + all_transactions.concat(tagged_transactions) + + if index == 0 + metadata = { + account_holder: result[:account_holder], + account_number: result[:account_number], + bank_name: result[:bank_name], + opening_balance: result[:opening_balance], + closing_balance: result[:closing_balance], + period: result[:period] + } + end + + if result[:closing_balance].present? + metadata[:closing_balance] = result[:closing_balance] + end + if result.dig(:period, :end_date).present? + metadata[:period] ||= {} + metadata[:period][:end_date] = result.dig(:period, :end_date) + end + end + + { + transactions: deduplicate_transactions(all_transactions), + period: metadata[:period] || {}, + account_holder: metadata[:account_holder], + account_number: metadata[:account_number], + bank_name: metadata[:bank_name], + opening_balance: metadata[:opening_balance], + closing_balance: metadata[:closing_balance] + } + end + + private + + def extract_pages_from_pdf + return [] if pdf_content.blank? + + reader = PDF::Reader.new(StringIO.new(pdf_content)) + reader.pages.map(&:text).reject(&:blank?) + rescue => e + Rails.logger.error("Failed to extract text from PDF: #{e.message}") + [] + end + + def build_chunks(pages) + chunks = [] + current_chunk = [] + current_size = 0 + + pages.each do |page_text| + if page_text.length > MAX_CHARS_PER_CHUNK + chunks << current_chunk.join("\n\n") if current_chunk.any? 
+ current_chunk = [] + current_size = 0 + chunks << page_text + next + end + + if current_size + page_text.length > MAX_CHARS_PER_CHUNK && current_chunk.any? + chunks << current_chunk.join("\n\n") + current_chunk = [] + current_size = 0 + end + + current_chunk << page_text + current_size += page_text.length + end + + chunks << current_chunk.join("\n\n") if current_chunk.any? + chunks + end + + def process_chunk(text, is_first_chunk) + params = { + model: model, + messages: [ + { role: "system", content: is_first_chunk ? instructions_with_metadata : instructions_transactions_only }, + { role: "user", content: "Extract transactions:\n\n#{text}" } + ], + response_format: { type: "json_object" } + } + + response = client.chat(parameters: params) + content = response.dig("choices", 0, "message", "content") + + raise Provider::Openai::Error, "No response from AI" if content.blank? + + parsed = parse_json_response(content) + + { + transactions: normalize_transactions(parsed["transactions"] || []), + period: { + start_date: parsed.dig("statement_period", "start_date"), + end_date: parsed.dig("statement_period", "end_date") + }, + account_holder: parsed["account_holder"], + account_number: parsed["account_number"], + bank_name: parsed["bank_name"], + opening_balance: parsed["opening_balance"], + closing_balance: parsed["closing_balance"] + } + end + + def parse_json_response(content) + cleaned = content.gsub(%r{^```json\s*}i, "").gsub(/```\s*$/, "").strip + JSON.parse(cleaned) + rescue JSON::ParserError => e + Rails.logger.error("BankStatementExtractor JSON parse error: #{e.message} (content_length=#{content.to_s.bytesize})") + { "transactions" => [] } + end + + def deduplicate_transactions(transactions) + # Deduplicates transactions that appear in consecutive chunks (chunking artifacts). + # + # KNOWN LIMITATION: Legitimate duplicate transactions (same date, amount, merchant) + # that happen to appear in adjacent chunks will be incorrectly deduplicated. 
+ # This is an acceptable trade-off since chunking artifacts are more common than + # true same-day duplicates at chunk boundaries. Transactions within the same + # chunk are always preserved regardless of similarity. + seen = Set.new + transactions.select do |t| + # Key keeps chunk_index so only repeats from an adjacent chunk are dropped + key = [ t[:date], t[:amount], t[:name], t[:chunk_index] ] + + # Duplicate only if the same transaction was already seen in an adjacent chunk + duplicate = seen.any? do |prev_key| + prev_key[0..2] == key[0..2] && (prev_key[3] - key[3]).abs == 1 + end + + seen << key + !duplicate + end.map { |t| t.except(:chunk_index) } + end + + def normalize_transactions(transactions) + transactions.map do |txn| + { + date: parse_date(txn["date"]), + amount: parse_amount(txn["amount"]), + name: txn["description"] || txn["name"] || txn["merchant"], + category: infer_category(txn), + notes: txn["reference"] || txn["notes"] + } + end.compact + end + + def parse_date(date_str) + return nil if date_str.blank? + + Date.parse(date_str.to_s).strftime("%Y-%m-%d") + rescue ArgumentError + nil + end + + def parse_amount(amount) + return nil if amount.nil? + + if amount.is_a?(Numeric) + amount.to_f + else + amount.to_s.gsub(/[^0-9.\-]/, "").to_f + end + end + + def infer_category(txn) + txn["category"] || txn["type"] + end + + def instructions_with_metadata + <<~INSTRUCTIONS.strip + Extract bank statement data as JSON. Return: + {"bank_name":"...","account_holder":"...","account_number":"last 4 digits","statement_period":{"start_date":"YYYY-MM-DD","end_date":"YYYY-MM-DD"},"opening_balance":0.00,"closing_balance":0.00,"transactions":[{"date":"YYYY-MM-DD","description":"...","amount":-0.00}]} + + Rules: Negative amounts for debits/expenses, positive for credits/deposits. Dates as YYYY-MM-DD. Extract ALL transactions. JSON only, no markdown. + INSTRUCTIONS + end + + def instructions_transactions_only + <<~INSTRUCTIONS.strip + Extract transactions from bank statement text as JSON. 
Return: + {"transactions":[{"date":"YYYY-MM-DD","description":"...","amount":-0.00}]} + + Rules: Negative amounts for debits/expenses, positive for credits/deposits. Dates as YYYY-MM-DD. Extract ALL transactions. JSON only, no markdown. + INSTRUCTIONS + end +end diff --git a/app/models/provider/openai/pdf_processor.rb b/app/models/provider/openai/pdf_processor.rb new file mode 100644 index 000000000..b99caa77c --- /dev/null +++ b/app/models/provider/openai/pdf_processor.rb @@ -0,0 +1,265 @@ +class Provider::Openai::PdfProcessor + include Provider::Openai::Concerns::UsageRecorder + + attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family + + def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil) + @client = client + @model = model + @pdf_content = pdf_content + @custom_provider = custom_provider + @langfuse_trace = langfuse_trace + @family = family + end + + def process + span = langfuse_trace&.span(name: "process_pdf_api_call", input: { + model: model.presence || Provider::Openai::DEFAULT_MODEL, + pdf_size: pdf_content&.bytesize + }) + + # Try text extraction first (works with all models) + # Fall back to vision API with images if text extraction fails (for scanned PDFs) + response = begin + process_with_text_extraction + rescue Provider::Openai::Error => e + Rails.logger.warn("Text extraction failed: #{e.message}, trying vision API with images") + process_with_vision + end + + span&.end(output: response.to_h) + response + rescue => e + span&.end(output: { error: e.message }, level: "ERROR") + raise + end + + def instructions + <<~INSTRUCTIONS.strip + You are a financial document analysis assistant. Your job is to analyze uploaded PDF documents + and provide a structured summary of what the document contains. + + For each document, you must determine: + + 1. 
**Document Type**: Classify the document as one of the following: + - `bank_statement`: A bank account statement showing transactions, balances, and account activity + - `credit_card_statement`: A credit card statement showing charges, payments, and balances + - `investment_statement`: An investment/brokerage statement showing holdings, trades, or portfolio performance + - `financial_document`: General financial documents like tax forms, receipts, invoices, or financial reports + - `contract`: Legal agreements, loan documents, terms of service, or policy documents + - `other`: Any document that doesn't fit the above categories + + 2. **Summary**: Provide a concise summary of the document that includes: + - The issuing institution or company name (if identifiable) + - The date range or statement period (if applicable) + - Key financial figures (account balances, total transactions, etc.) + - The account holder's name (if visible, use "Account Holder" if redacted) + - Any notable items or important information + + 3. 
**Extracted Data**: If the document is a statement with transactions, extract key metadata: + - Number of transactions (if countable) + - Statement period (start and end dates) + - Opening and closing balances (if visible) + - Currency used + + IMPORTANT GUIDELINES: + - Be factual and precise - only report what you can clearly see in the document + - If information is unclear or redacted, note it as "not clearly visible" or "redacted" + - Do NOT make assumptions about data you cannot see + - For statements with many transactions, provide a count rather than listing each one + - Focus on providing actionable information that helps the user understand what they uploaded + - If the document is unreadable or the PDF is corrupted, indicate this clearly + + Respond with ONLY valid JSON in this exact format (no markdown code blocks, no other text): + { + "document_type": "bank_statement|credit_card_statement|investment_statement|financial_document|contract|other", + "summary": "A clear, concise summary of the document contents...", + "extracted_data": { + "institution_name": "Name of bank/company or null", + "statement_period_start": "YYYY-MM-DD or null", + "statement_period_end": "YYYY-MM-DD or null", + "transaction_count": number or null, + "opening_balance": number or null, + "closing_balance": number or null, + "currency": "USD/EUR/etc or null", + "account_holder": "Name or null" + } + } + INSTRUCTIONS
+  end
+
+  private
+
+  PdfProcessingResult = Provider::LlmConcept::PdfProcessingResult
+
+  # Maximum number of characters of extracted text sent to the model (token-limit guard).
+  MAX_TEXT_CHARS = 100_000
+
+  # Maximum number of rendered page images attached to a single vision request.
+  MAX_VISION_PAGES = 5
+
+  # Summarizes the PDF by extracting its text with pdf-reader and sending it to the
+  # chat endpoint in JSON mode.
+  #
+  # Returns a PdfProcessingResult.
+  # Raises Provider::Openai::Error when no text could be extracted or the reply is not usable JSON.
+  def process_with_text_extraction
+    effective_model = model.presence || Provider::Openai::DEFAULT_MODEL
+
+    pdf_text = extract_text_from_pdf
+    raise Provider::Openai::Error, "Could not extract text from PDF" if pdf_text.blank?
+
+    # String#truncate leaves shorter strings untouched, so no explicit length check is needed
+    pdf_text = pdf_text.truncate(MAX_TEXT_CHARS)
+
+    params = {
+      model: effective_model,
+      messages: [
+        { role: "system", content: instructions },
+        {
+          role: "user",
+          content: "Please analyze the following document text and provide a structured summary:\n\n#{pdf_text}"
+        }
+      ],
+      response_format: { type: "json_object" }
+    }
+
+    response = client.chat(parameters: params)
+
+    Rails.logger.info("Tokens used to process PDF: #{response.dig("usage", "total_tokens")}")
+
+    record_usage(
+      effective_model,
+      response["usage"],
+      operation: "process_pdf",
+      metadata: { pdf_size: pdf_content&.bytesize }
+    )
+
+    parse_response_generic(response)
+  end
+
+  # Extracts the text of every page, each preceded by a "--- Page N ---" marker.
+  #
+  # Returns the joined text, or nil when there is no PDF content or extraction fails
+  # (the failure is logged and the caller turns the blank result into an error).
+  def extract_text_from_pdf
+    return nil if pdf_content.blank?
+
+    reader = PDF::Reader.new(StringIO.new(pdf_content))
+
+    text_parts = reader.pages.each_with_index.flat_map do |page, index|
+      [ "--- Page #{index + 1} ---", page.text ]
+    end
+
+    text_parts.join("\n\n")
+  rescue => e
+    Rails.logger.error("Failed to extract text from PDF: #{e.message}")
+    nil
+  end
+
+  # Summarizes the PDF by rendering its pages to PNG and sending the first
+  # MAX_VISION_PAGES of them to the chat endpoint as image inputs.
+  #
+  # Returns a PdfProcessingResult.
+  # Raises Provider::Openai::Error when no page image could be produced or the reply is not usable JSON.
+  def process_with_vision
+    effective_model = model.presence || Provider::Openai::DEFAULT_MODEL
+
+    images_base64 = convert_pdf_to_images
+    raise Provider::Openai::Error, "Could not convert PDF to images" if images_base64.blank?
+
+    # Only the leading pages are attached, to avoid token limits
+    shown_images = images_base64.first(MAX_VISION_PAGES)
+
+    content = shown_images.map do |img_base64|
+      {
+        type: "image_url",
+        image_url: {
+          url: "data:image/png;base64,#{img_base64}",
+          detail: "low"
+        }
+      }
+    end
+    content << {
+      type: "text",
+      text: "Please analyze this PDF document (#{images_base64.size} pages total, showing first #{shown_images.size}) and respond with valid JSON only."
+    }
+
+    # Note: response_format is not compatible with vision, so we ask for JSON in the prompt
+    params = {
+      model: effective_model,
+      messages: [
+        { role: "system", content: instructions + "\n\nIMPORTANT: Respond with valid JSON only, no markdown or other formatting." },
+        { role: "user", content: content }
+      ],
+      max_tokens: 4096
+    }
+
+    response = client.chat(parameters: params)
+
+    Rails.logger.info("Tokens used to process PDF via vision: #{response.dig("usage", "total_tokens")}")
+
+    record_usage(
+      effective_model,
+      response["usage"],
+      operation: "process_pdf_vision",
+      metadata: { pdf_size: pdf_content&.bytesize, pages: images_base64.size }
+    )
+
+    parse_response_generic(response)
+  end
+
+  # Renders the PDF to PNG files with the external `pdftoppm` tool inside a temporary directory.
+  #
+  # Returns an array of Base64-encoded PNGs in file-name order, or [] when there is
+  # no PDF content, nothing was rendered, or an exception occurred (logged).
+  def convert_pdf_to_images
+    return [] if pdf_content.blank?
+
+    Dir.mktmpdir do |tmpdir|
+      pdf_path = File.join(tmpdir, "input.pdf")
+      File.binwrite(pdf_path, pdf_content)
+
+      output_prefix = File.join(tmpdir, "page")
+      converted = system("pdftoppm", "-png", "-r", "150", pdf_path, output_prefix)
+
+      # Kernel#system does not raise: it returns nil when the command could not be run and
+      # false on a non-zero exit. Log the cause; any pages that were rendered are still used.
+      unless converted
+        reason = converted.nil? ? "command could not be executed" : "exit status #{$?&.exitstatus}"
+        Rails.logger.error("pdftoppm failed while converting PDF to images: #{reason}")
+      end
+
+      Dir.glob(File.join(tmpdir, "page-*.png")).sort.map do |img_path|
+        Base64.strict_encode64(File.binread(img_path))
+      end
+    end
+  rescue => e
+    Rails.logger.error("Failed to convert PDF to images: #{e.message}")
+    []
+  end
+
+  # Pulls the assistant message out of a chat response and converts it into a result object.
+  def parse_response_generic(response)
+    raw = response.dig("choices", 0, "message", "content")
+
+    build_result(parse_json_flexibly(raw))
+  end
+
+  # Builds a PdfProcessingResult from the parsed model reply.
+  #
+  # Raises Provider::Openai::Error when the reply parsed to something other than a JSON
+  # object (e.g. an array or a bare string), which would otherwise fail on key lookup.
+  def build_result(parsed)
+    unless parsed.is_a?(Hash)
+      raise Provider::Openai::Error, "Unexpected JSON structure in PDF processing response: expected an object, got #{parsed.class}"
+    end
+
+    PdfProcessingResult.new(
+      summary: parsed["summary"],
+      document_type: normalize_document_type(parsed["document_type"]),
+      extracted_data: parsed["extracted_data"] || {}
+    )
+  end
+
+  # Maps the model-provided type onto Import::DOCUMENT_TYPES, falling back to "other"
+  # for blank or unrecognized values.
+  def normalize_document_type(doc_type)
+    return "other" if doc_type.blank?
+
+    normalized = doc_type.to_s.strip.downcase.gsub(/\s+/, "_")
+    Import::DOCUMENT_TYPES.include?(normalized) ? normalized : "other"
+  end
+
+  # Parses the model reply as JSON, tolerating markdown fences and surrounding prose.
+  #
+  # Strategies, in order: the raw string, the first fenced code block containing an
+  # object, then the widest {...} span in the text.
+  #
+  # Returns the parsed value, or {} for a blank reply.
+  # Raises Provider::Openai::Error when no strategy yields valid JSON.
+  def parse_json_flexibly(raw)
+    return {} if raw.blank?
+
+    begin
+      return JSON.parse(raw)
+    rescue JSON::ParserError
+      # Fall through to the extraction strategies below
+    end
+
+    fallback_patterns = [
+      /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m, # JSON inside a markdown code block
+      /(\{[\s\S]*\})/m                        # any JSON object embedded in text
+    ]
+
+    fallback_patterns.each do |pattern|
+      candidate = raw[pattern, 1]
+      next if candidate.nil?
+
+      return JSON.parse(candidate)
+    rescue JSON::ParserError
+      next
+    end
+
+    raise Provider::Openai::Error, "Could not parse JSON from PDF processing response: #{raw.truncate(200)}"
+  end
+end
diff --git a/app/views/import/uploads/show.html.erb b/app/views/import/uploads/show.html.erb index 7227ff352..338654578 100644 --- a/app/views/import/uploads/show.html.erb +++ b/app/views/import/uploads/show.html.erb @@ -44,7 +44,7 @@

- <%= form.file_field :csv_file, class: "hidden", "data-auto-submit-form-target": "auto", "data-file-upload-target": "input", "data-drag-and-drop-import-target": "input" %> + <%= form.file_field :import_file, class: "hidden", "data-auto-submit-form-target": "auto", "data-file-upload-target": "input", "data-drag-and-drop-import-target": "input" %> diff --git a/app/views/imports/_pdf_import.html.erb b/app/views/imports/_pdf_import.html.erb new file mode 100644 index 000000000..f2b1ea969 --- /dev/null +++ b/app/views/imports/_pdf_import.html.erb @@ -0,0 +1,84 @@ +<%# locals: (import:) %> + +
+
+ <% if import.importing? || import.pending? %> +
+ <%= icon "loader", class: "animate-pulse" %> +
+ +
+

<%= t("imports.pdf_import.processing_title") %>

+

<%= t("imports.pdf_import.processing_description") %>

+
+ +
+ <%= render DS::Link.new(text: t("imports.pdf_import.check_status"), href: import_path(import), variant: "primary", full_width: true) %> + <%= render DS::Link.new(text: t("imports.pdf_import.back_to_dashboard"), href: root_path, variant: "secondary", full_width: true) %> +
+ + <% elsif import.failed? %> +
+ <%= icon "x", class: "text-destructive" %> +
+ +
+

<%= t("imports.pdf_import.failed_title") %>

+

<%= t("imports.pdf_import.failed_description") %>

+ <% if import.error.present? %> +

<%= import.error %>

+ <% end %> +
+ +
+ <%= render DS::Link.new(text: t("imports.pdf_import.try_again"), href: new_import_path, variant: "primary", full_width: true) %> + <%= button_to t("imports.pdf_import.delete_import"), import_path(import), method: :delete, class: "btn btn--secondary w-full" %> +
+ + <% elsif import.complete? && import.ai_processed? %> +
+ <%= icon "check", class: "text-success" %> +
+ +
+

<%= t("imports.pdf_import.complete_title") %>

+

<%= t("imports.pdf_import.complete_description") %>

+
+ +
+
+

<%= t("imports.pdf_import.document_type_label") %>

+

+ <%= t("imports.document_types.#{import.document_type}") %> +

+
+ +
+

<%= t("imports.pdf_import.summary_label") %>

+

+ <%= import.ai_summary %> +

+
+
+ +
+

<%= t("imports.pdf_import.email_sent_notice") %>

+
+ +
+ <%= render DS::Link.new(text: t("imports.pdf_import.back_to_imports"), href: imports_path, variant: "primary", full_width: true) %> + <%= button_to t("imports.pdf_import.delete_import"), import_path(import), method: :delete, class: "btn btn--secondary w-full" %> +
+ + <% else %> +
+

<%= t("imports.pdf_import.unknown_state_title") %>

+

<%= t("imports.pdf_import.unknown_state_description") %>

+
+ +
+ <%= render DS::Link.new(text: t("imports.pdf_import.back_to_imports"), href: imports_path, variant: "primary", full_width: true) %> +
+ <% end %> +
+
diff --git a/app/views/imports/new.html.erb b/app/views/imports/new.html.erb index d6910c0ff..91fc9f5ac 100644 --- a/app/views/imports/new.html.erb +++ b/app/views/imports/new.html.erb @@ -140,6 +140,35 @@ <%= render "shared/ruler" %> <% end %> + + <% if (params[:type].nil? || params[:type] == "PdfImport") && Provider::Registry.get_provider(:openai)&.supports_pdf_processing? %> +
  • + <%= styled_form_with url: imports_path, scope: :import, multipart: true, class: "w-full" do |form| %> + <%= form.hidden_field :type, value: "PdfImport" %> + + <% end %> + + <%= render "shared/ruler" %> +
  • + <% end %> <% end %> diff --git a/app/views/imports/show.html.erb b/app/views/imports/show.html.erb index ffa552975..d4863585b 100644 --- a/app/views/imports/show.html.erb +++ b/app/views/imports/show.html.erb @@ -2,9 +2,11 @@ <%= render "imports/nav", import: @import %> <% end %> -<%= content_for :previous_path, import_confirm_path(@import) %> +<%= content_for :previous_path, @import.is_a?(PdfImport) ? imports_path : import_confirm_path(@import) %> -<% if @import.importing? %> +<% if @import.is_a?(PdfImport) %> + <%= render "imports/pdf_import", import: @import %> +<% elsif @import.importing? %> <%= render "imports/importing", import: @import %> <% elsif @import.complete? %> <%= render "imports/success", import: @import %> diff --git a/app/views/pdf_import_mailer/next_steps.html.erb b/app/views/pdf_import_mailer/next_steps.html.erb new file mode 100644 index 000000000..595cbcb59 --- /dev/null +++ b/app/views/pdf_import_mailer/next_steps.html.erb @@ -0,0 +1,30 @@ +

    <%= t(".greeting", name: @user.display_name) %>

    + +

    <%= t(".intro", product: product_name) %>

    + +

    <%= t(".document_type_label") %>

    +

    <%= @pdf_import.document_type.present? ? t("imports.document_types.#{@pdf_import.document_type}") : t("imports.document_types.other") %>

    + +

    <%= t(".summary_label") %>

    +

    <%= @pdf_import.ai_summary %>

    + +<% if @pdf_import.document_type.in?(%w[bank_statement credit_card_statement investment_statement]) %> +

    <%= t(".transactions_note") %>

    +<% else %> +

    <%= t(".document_stored_note") %>

    +<% end %> + +

    <%= t(".next_steps_label") %>

    +

    <%= t(".next_steps_intro") %>

    + + + +<%= link_to t(".view_import_button"), @import_url, class: "button" %> + + diff --git a/app/views/pdf_import_mailer/next_steps.text.erb b/app/views/pdf_import_mailer/next_steps.text.erb new file mode 100644 index 000000000..add337d78 --- /dev/null +++ b/app/views/pdf_import_mailer/next_steps.text.erb @@ -0,0 +1,28 @@ +<%= t(".greeting", name: @user.display_name) %> + +<%= t(".intro", product: product_name) %> + +<%= t(".document_type_label") %> +<%= @pdf_import.document_type ? t("imports.document_types.#{@pdf_import.document_type}") : t("imports.document_types.unknown") %> + +<%= t(".summary_label") %> +<%= @pdf_import.ai_summary %> + +<% if @pdf_import.document_type.in?(%w[bank_statement credit_card_statement investment_statement]) %> +<%= t(".transactions_note") %> +<% else %> +<%= t(".document_stored_note") %> +<% end %> + +<%= t(".next_steps_label") %> +<%= t(".next_steps_intro") %> + +<% if @pdf_import.document_type.in?(%w[bank_statement credit_card_statement investment_statement]) %> +- <%= t(".option_extract_transactions") %> +<% end %> +- <%= t(".option_keep_reference") %> +- <%= t(".option_delete") %> + +<%= t(".view_import_button") %>: <%= @import_url %> + +<%= t(".footer_note") %> diff --git a/app/views/transactions/_list.html.erb b/app/views/transactions/_list.html.erb index 1b0589e2a..b3f761065 100644 --- a/app/views/transactions/_list.html.erb +++ b/app/views/transactions/_list.html.erb @@ -7,7 +7,7 @@ <%= form_with url: imports_path, method: :post, class: "hidden", data: { drag_and_drop_import_target: "form" } do |f| %> <%= f.hidden_field "import[type]", value: "TransactionImport" %> - <%= f.file_field "import[csv_file]", class: "hidden", data: { drag_and_drop_import_target: "input" }, accept: ".csv" %> + <%= f.file_field "import[import_file]", class: "hidden", data: { drag_and_drop_import_target: "input" }, accept: ".csv" %> <% end %> <%= render "imports/drag_drop_overlay", title: t(".drag_drop_title"), subtitle: t(".drag_drop_subtitle") 
%> diff --git a/config/locales/mailers/pdf_import_mailer/en.yml b/config/locales/mailers/pdf_import_mailer/en.yml new file mode 100644 index 000000000..1399d306b --- /dev/null +++ b/config/locales/mailers/pdf_import_mailer/en.yml @@ -0,0 +1,5 @@ +--- +en: + pdf_import_mailer: + next_steps: + subject: "Your PDF document has been analyzed - %{product}" diff --git a/config/locales/views/imports/en.yml b/config/locales/views/imports/en.yml index cd8fb8bd6..a40f3cbf0 100644 --- a/config/locales/views/imports/en.yml +++ b/config/locales/views/imports/en.yml @@ -102,12 +102,51 @@ en: import_portfolio: Import investments import_rules: Import rules import_transactions: Import transactions + import_pdf: Import PDF document + import_pdf_description: AI-powered document analysis resume: Resume %{type} sources: Sources title: New CSV Import + create: + file_too_large: File is too large. Maximum size is %{max_size}MB. + invalid_file_type: Invalid file type. Please upload a CSV file. + csv_uploaded: CSV uploaded successfully. + pdf_too_large: PDF file is too large. Maximum size is %{max_size}MB. + pdf_processing: Your PDF is being processed. You will receive an email when analysis is complete. + invalid_pdf: The uploaded file is not a valid PDF. + show: + finalize_upload: Please finalize your file upload. + finalize_mappings: Please finalize your mappings before proceeding. ready: description: Here's a summary of the new items that will be added to your account once you publish this import. 
title: Confirm your import data errors: custom_column_requires_inflow: "Custom column imports require an inflow column to be selected" + document_types: + bank_statement: Bank Statement + credit_card_statement: Credit Card Statement + investment_statement: Investment Statement + financial_document: Financial Document + contract: Contract + other: Other Document + unknown: Unknown Document + pdf_import: + processing_title: Processing your PDF + processing_description: We're analyzing your document using AI. This may take a moment. You'll receive an email when the analysis is complete. + check_status: Check status + back_to_dashboard: Back to dashboard + failed_title: Processing failed + failed_description: We were unable to process your PDF document. Please try again or contact support. + try_again: Try again + delete_import: Delete import + complete_title: Document analyzed + complete_description: We've analyzed your PDF and here's what we found. + document_type_label: Document Type + summary_label: Summary + email_sent_notice: An email has been sent to you with next steps. + back_to_imports: Back to imports + unknown_state_title: Unknown state + unknown_state_description: This import is in an unexpected state. Please return to imports. + processing_failed_with_message: "%{message}" + processing_failed_generic: "Processing failed: %{error}" diff --git a/config/locales/views/pdf_import_mailer/en.yml b/config/locales/views/pdf_import_mailer/en.yml new file mode 100644 index 000000000..5298e9e32 --- /dev/null +++ b/config/locales/views/pdf_import_mailer/en.yml @@ -0,0 +1,17 @@ +--- +en: + pdf_import_mailer: + next_steps: + greeting: "Hi %{name}," + intro: "We've finished analyzing the PDF document you uploaded to %{product}." + document_type_label: Document Type + summary_label: AI Summary + transactions_note: This document appears to contain transactions. You can extract and review them now. + document_stored_note: This document has been stored for your reference. 
It can be used to provide context in future AI conversations. + next_steps_label: What's Next? + next_steps_intro: "You have several options:" + option_extract_transactions: Extract transactions from this statement + option_keep_reference: Keep this document for reference in future AI conversations + option_delete: Delete this import if you no longer need it + view_import_button: View Import Details + footer_note: This is an automated message. Please do not reply directly to this email. diff --git a/db/migrate/20260116100000_add_pdf_import_support.rb b/db/migrate/20260116100000_add_pdf_import_support.rb new file mode 100644 index 000000000..f9d561ee9 --- /dev/null +++ b/db/migrate/20260116100000_add_pdf_import_support.rb @@ -0,0 +1,6 @@ +class AddPdfImportSupport < ActiveRecord::Migration[7.2] + def change + add_column :imports, :ai_summary, :text + add_column :imports, :document_type, :string + end +end diff --git a/db/migrate/20260129200129_add_extracted_data_to_imports.rb b/db/migrate/20260129200129_add_extracted_data_to_imports.rb new file mode 100644 index 000000000..aafea804f --- /dev/null +++ b/db/migrate/20260129200129_add_extracted_data_to_imports.rb @@ -0,0 +1,5 @@ +class AddExtractedDataToImports < ActiveRecord::Migration[7.2] + def change + add_column :imports, :extracted_data, :jsonb + end +end diff --git a/db/schema.rb b/db/schema.rb index 9b7bcb296..3277c7385 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema[7.2].define(version: 2026_01_24_180211) do +ActiveRecord::Schema[7.2].define(version: 2026_01_29_200129) do # These are extensions that must be enabled in order to support this database enable_extension "pgcrypto" enable_extension "plpgsql" @@ -660,6 +660,9 @@ ActiveRecord::Schema[7.2].define(version: 2026_01_24_180211) do t.integer "rows_to_skip", default: 0, null: false t.integer "rows_count", default: 0, null: false t.string "amount_type_identifier_value" + t.text "ai_summary" + t.string "document_type" + t.jsonb "extracted_data" t.index ["family_id"], name: "index_imports_on_family_id" end diff --git a/docs/hosting/ai.md b/docs/hosting/ai.md index 23ab1c12f..1106361a1 100644 --- a/docs/hosting/ai.md +++ b/docs/hosting/ai.md @@ -91,6 +91,9 @@ Sure supports any OpenAI-compatible API endpoint. Here are tested providers: ```bash OPENAI_ACCESS_TOKEN=sk-proj-... # No other configuration needed + +# Optional: Request timeout in seconds (default: 60) +# OPENAI_REQUEST_TIMEOUT=60 ``` **Recommended models:** diff --git a/test/controllers/import/uploads_controller_test.rb b/test/controllers/import/uploads_controller_test.rb index 647815d45..6aa3be8b6 100644 --- a/test/controllers/import/uploads_controller_test.rb +++ b/test/controllers/import/uploads_controller_test.rb @@ -26,7 +26,7 @@ class Import::UploadsControllerTest < ActionDispatch::IntegrationTest test "uploads valid csv by file" do patch import_upload_url(@import), params: { import: { - csv_file: file_fixture_upload("imports/valid.csv"), + import_file: file_fixture_upload("imports/valid.csv"), col_sep: "," } } @@ -38,7 +38,7 @@ class Import::UploadsControllerTest < ActionDispatch::IntegrationTest test "invalid csv cannot be uploaded" do patch import_upload_url(@import), params: { import: { - csv_file: file_fixture_upload("imports/invalid.csv"), + import_file: file_fixture_upload("imports/invalid.csv"), col_sep: "," } } diff --git a/test/fixtures/files/imports/sample_bank_statement.pdf 
b/test/fixtures/files/imports/sample_bank_statement.pdf new file mode 100644 index 000000000..377c27b4e Binary files /dev/null and b/test/fixtures/files/imports/sample_bank_statement.pdf differ diff --git a/test/fixtures/imports.yml b/test/fixtures/imports.yml index b01725327..3e9e185a9 100644 --- a/test/fixtures/imports.yml +++ b/test/fixtures/imports.yml @@ -12,3 +12,15 @@ account: family: dylan_family type: AccountImport status: pending + +pdf: + family: dylan_family + type: PdfImport + status: pending + +pdf_processed: + family: dylan_family + type: PdfImport + status: complete + ai_summary: "This is a bank statement from Chase Bank for the period January 1-31, 2024. It shows 15 transactions with an opening balance of $5,000 and closing balance of $4,500." + document_type: bank_statement diff --git a/test/jobs/process_pdf_job_test.rb b/test/jobs/process_pdf_job_test.rb new file mode 100644 index 000000000..c6374de23 --- /dev/null +++ b/test/jobs/process_pdf_job_test.rb @@ -0,0 +1,35 @@ +require "test_helper" + +class ProcessPdfJobTest < ActiveJob::TestCase + include ActionMailer::TestHelper + + setup do + @import = imports(:pdf) + @family = @import.family + end + + test "skips non-PdfImport imports" do + transaction_import = imports(:transaction) + + ProcessPdfJob.perform_now(transaction_import) + + assert_equal "pending", transaction_import.reload.status + end + + test "skips if PDF not uploaded" do + assert_not @import.pdf_uploaded? 
+ + ProcessPdfJob.perform_now(@import) + + assert_equal "pending", @import.reload.status + end + + test "skips if already processed" do + processed_import = imports(:pdf_processed) + + ProcessPdfJob.perform_now(processed_import) + + # Should not change status since already complete + assert_equal "complete", processed_import.reload.status + end +end diff --git a/test/mailers/pdf_import_mailer_test.rb b/test/mailers/pdf_import_mailer_test.rb new file mode 100644 index 000000000..d5d118b27 --- /dev/null +++ b/test/mailers/pdf_import_mailer_test.rb @@ -0,0 +1,21 @@ +require "test_helper" + +class PdfImportMailerTest < ActionMailer::TestCase + setup do + @user = users(:family_admin) + @pdf_import = imports(:pdf_processed) + end + + test "next_steps email is sent to user" do + mail = PdfImportMailer.with(user: @user, pdf_import: @pdf_import).next_steps + + assert_equal [ @user.email ], mail.to + assert_includes mail.subject, "analyzed" + end + + test "next_steps email contains document summary" do + mail = PdfImportMailer.with(user: @user, pdf_import: @pdf_import).next_steps + + assert_match @pdf_import.ai_summary, mail.body.encoded + end +end diff --git a/test/models/pdf_import_test.rb b/test/models/pdf_import_test.rb new file mode 100644 index 000000000..3138c884b --- /dev/null +++ b/test/models/pdf_import_test.rb @@ -0,0 +1,69 @@ +require "test_helper" + +class PdfImportTest < ActiveSupport::TestCase + include ActiveJob::TestHelper + + setup do + @import = imports(:pdf) + @processed_import = imports(:pdf_processed) + end + + test "pdf_uploaded? returns false when no file attached" do + assert_not @import.pdf_uploaded? + end + + test "ai_processed? returns false when no summary present" do + assert_not @import.ai_processed? + end + + test "ai_processed? returns true when summary present" do + assert @processed_import.ai_processed? + end + + test "uploaded? delegates to pdf_uploaded?" do + assert_not @import.uploaded? + end + + test "configured? 
returns true when AI processed" do + assert @processed_import.configured? + assert_not @import.configured? + end + + test "cleaned? returns true when AI processed" do + assert @processed_import.cleaned? + assert_not @import.cleaned? + end + + test "publishable? always returns false for PDF imports" do + assert_not @import.publishable? + assert_not @processed_import.publishable? + end + + test "column_keys returns empty array" do + assert_equal [], @import.column_keys + end + + test "required_column_keys returns empty array" do + assert_equal [], @import.required_column_keys + end + + test "document_type validates against allowed types" do + @import.document_type = "bank_statement" + assert @import.valid? + + @import.document_type = "invalid_type" + assert_not @import.valid? + assert @import.errors[:document_type].present? + end + + test "document_type allows nil" do + @import.document_type = nil + assert @import.valid? + end + + test "process_with_ai_later enqueues ProcessPdfJob" do + assert_enqueued_with job: ProcessPdfJob, args: [ @import ] do + @import.process_with_ai_later + end + end +end diff --git a/test/system/drag_and_drop_import_test.rb b/test/system/drag_and_drop_import_test.rb index 6a2b94e9b..6a44a3d5b 100644 --- a/test/system/drag_and_drop_import_test.rb +++ b/test/system/drag_and_drop_import_test.rb @@ -20,12 +20,12 @@ class DragAndDropImportTest < ApplicationSystemTestCase execute_script(" var form = document.querySelector('form[action=\"#{imports_path}\"]'); form.classList.remove('hidden'); - var input = document.querySelector('input[name=\"import[csv_file]\"]'); + var input = document.querySelector('input[name=\"import[import_file]\"]'); input.classList.remove('hidden'); input.style.display = 'block'; ") - attach_file "import[csv_file]", file_path + attach_file "import[import_file]", file_path # Submit the form manually since we bypassed the 'drop' event listener which triggers submit 
find("form[action='#{imports_path}']").evaluate_script("this.requestSubmit()")