Index all PDF imports into vector store with type metadata

This commit is contained in:
Juan José Mata
2026-02-11 15:53:45 +01:00
parent 4e4ca916a1
commit e7c609dc91
4 changed files with 106 additions and 2 deletions

View File

@@ -11,6 +11,7 @@ class ProcessPdfJob < ApplicationJob
begin
pdf_import.process_with_ai
upload_to_vector_store(pdf_import)
# For bank statements, extract transactions and generate import rows
if pdf_import.bank_statement?
@@ -58,4 +59,19 @@ class ProcessPdfJob < ApplicationJob
error: error.class.name.demodulize)
end
end
# Sends the import's attached PDF to its family's document store, tagging the
# upload with the import's document type under the "type" metadata key.
#
# A falsy result from Family#upload_document is treated as a failed upload and
# is only logged as a warning; nothing is raised from here for that case.
#
# @param pdf_import [#pdf_file, #pdf_file_content, #family, #document_type, #id]
# @return [nil] when the upload produced a document (otherwise whatever the
#   logger call returns)
def upload_to_vector_store(pdf_import)
  attachment_name = pdf_import.pdf_file.filename.to_s
  pdf_bytes = pdf_import.pdf_file_content
  owner = pdf_import.family

  uploaded = owner.upload_document(
    file_content: pdf_bytes,
    filename: attachment_name,
    metadata: { "type" => pdf_import.document_type }
  )

  unless uploaded
    Rails.logger.warn("ProcessPdfJob: Vector store upload failed for import #{pdf_import.id}")
  end
end
end

View File

@@ -37,7 +37,7 @@ module Family::VectorSearchable
response.success? ? response.data : []
end
def upload_document(file_content:, filename:)
def upload_document(file_content:, filename:, metadata: {})
adapter = vector_store_adapter
return nil unless adapter
@@ -57,7 +57,8 @@ module Family::VectorSearchable
content_type: Marcel::MimeType.for(name: filename),
file_size: file_content.bytesize,
provider_file_id: response.data[:file_id],
status: "ready"
status: "ready",
metadata: metadata || {}
)
end

View File

@@ -32,4 +32,62 @@ class ProcessPdfJobTest < ActiveJob::TestCase
# Should not change status since already complete
assert_equal "complete", processed_import.reload.status
end
# Verifies that a PDF classified as a non-bank document is uploaded to the
# vector store with its document type as metadata, that no transaction
# extraction is attempted, and that the import ends up "complete".
test "uploads non-bank PDF to vector store with classified type metadata" do
  pdf_content = attach_pdf!(@import)

  # Mocha has no block-implementation feature: a block handed to `stubs` is
  # silently dropped, so the classification would never be written. Install a
  # real singleton method instead so the update happens when the job calls
  # process_with_ai.
  import = @import
  import.define_singleton_method(:process_with_ai) do |*|
    import.update!(ai_summary: "A tax return", document_type: "financial_document")
  end

  @import.stubs(:send_next_steps_email)
  @import.expects(:extract_transactions).never

  @family.expects(:upload_document).with do |file_content:, filename:, metadata:|
    assert_equal pdf_content, file_content
    assert_equal "sample_bank_statement.pdf", filename
    assert_equal({ "type" => "financial_document" }, metadata)
    true
  end.returns(family_documents(:tax_return))

  ProcessPdfJob.perform_now(@import)

  assert_equal "complete", @import.reload.status
end
# Verifies that a PDF classified as a bank statement is uploaded to the vector
# store with "bank_statement" metadata, that the extraction / row generation /
# mapping sync steps each run once, and that the import ends up "pending".
test "uploads bank statement PDF to vector store with classified type metadata" do
  pdf_content = attach_pdf!(@import)

  # Mocha silently drops blocks passed to `stubs`/`expects`, so side effects
  # written that way never run. Both methods that must mutate the record are
  # installed as real singleton methods instead.
  import = @import
  import.define_singleton_method(:process_with_ai) do |*|
    import.update!(ai_summary: "A bank statement", document_type: "bank_statement")
  end

  # Counted by hand to keep the original "called exactly once" expectation.
  row_generation_calls = 0
  import.define_singleton_method(:generate_rows_from_extracted_data) do |*|
    row_generation_calls += 1
    import.update_column(:rows_count, 1)
  end

  @import.expects(:extract_transactions).once
  @import.expects(:sync_mappings).once
  @import.stubs(:send_next_steps_email)

  @family.expects(:upload_document).with do |file_content:, filename:, metadata:|
    assert_equal pdf_content, file_content
    assert_equal "sample_bank_statement.pdf", filename
    assert_equal({ "type" => "bank_statement" }, metadata)
    true
  end.returns(family_documents(:tax_return))

  ProcessPdfJob.perform_now(@import)

  assert_equal 1, row_generation_calls
  assert_equal "pending", @import.reload.status
end
private
# Reads the sample bank statement fixture, attaches it to +import+ as its
# pdf_file (content type application/pdf), and hands back the raw bytes so
# callers can compare them against what gets uploaded.
#
# @param import [#pdf_file] record whose pdf_file accepts #attach
# @return [String] binary contents of the attached fixture
def attach_pdf!(import)
  file_fixture("imports/sample_bank_statement.pdf").binread.tap do |bytes|
    import.pdf_file.attach(
      io: StringIO.new(bytes),
      filename: "sample_bank_statement.pdf",
      content_type: "application/pdf"
    )
  end
end
end

View File

@@ -43,4 +43,33 @@ class FamilyTest < ActiveSupport::TestCase
assert_includes family.available_merchants, new_merchant
end
# Checks that Family#upload_document persists the metadata hash it is given on
# the returned document, and that the family records the id of the store the
# adapter created.
test "upload_document stores provided metadata on family document" do
  # Start from a family with no vector store id; the last assertion checks it
  # ends up holding the id returned by the adapter's create_store.
  family = families(:dylan_family)
  family.update!(vector_store_id: nil)

  store_created = VectorStore::Response.new(success?: true, data: { id: "vs_test123" }, error: nil)
  file_uploaded = VectorStore::Response.new(success?: true, data: { file_id: "file-xyz" }, error: nil)

  adapter = mock("vector_store_adapter")
  adapter.expects(:create_store)
         .with(name: "Family #{family.id} Documents")
         .returns(store_created)
  adapter.expects(:upload_file)
         .with(store_id: "vs_test123", file_content: "hello", filename: "notes.txt")
         .returns(file_uploaded)
  VectorStore::Registry.stubs(:adapter).returns(adapter)

  document = family.upload_document(
    file_content: "hello",
    filename: "notes.txt",
    metadata: { "type" => "financial_document" }
  )

  assert_not_nil document
  assert_equal({ "type" => "financial_document" }, document.metadata)
  assert_equal "vs_test123", family.reload.vector_store_id
end
end