mirror of
https://github.com/we-promise/sure.git
synced 2026-04-09 15:24:48 +00:00
* Add conditional migration for vector_store_chunks table Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector. Enables the vector extension, adds store_id/file_id indexes, and uses vector(1024) column type for embeddings. * Add VectorStore::Embeddable concern for text extraction and embedding Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is), paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars. * Implement VectorStore::Pgvector adapter with raw SQL Replaces the stub with a full implementation using ActiveRecord::Base.connection with parameterized binds. Supports create_store, delete_store, upload_file (extract+chunk+embed+insert), remove_file, and cosine-similarity search via the <=> operator. * Add registry test for pgvector adapter selection * Configure pgvector in compose.example.ai.yml Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER, EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include nomic-embed-text in Ollama's pre-loaded models. * Update pgvector docs from scaffolded to ready Document env vars, embedding model setup, pgvector Docker image requirement, and Ollama pull instructions. * Address PR review feedback - Migration: remove env guard, use pgvector_available? check so it runs on plain Postgres (CI) but creates the table on pgvector-capable servers. Add NOT NULL constraints on content/embedding/metadata, unique index on (store_id, file_id, chunk_index). - Pgvector adapter: wrap chunk inserts in a DB transaction to prevent partial file writes. Override supported_extensions to match formats that extract_text can actually parse. - Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE to avoid overflowing embedding model token limits. 
* Bump schema version to include vector_store_chunks migration CI uses db:schema:load which checks the version — without this bump, the migration is detected as pending and tests fail to start. * Update 20260316120000_create_vector_store_chunks.rb --------- Co-authored-by: sokiee <sokysrm@gmail.com>
142 lines
5.2 KiB
Ruby
142 lines
5.2 KiB
Ruby
require "test_helper"
|
|
|
|
# Unit tests for the pgvector-backed VectorStore adapter.
#
# The database connection is replaced with a Mocha double throughout, so these
# tests pin down the SQL the adapter issues and the shape of the Response
# objects it returns without ever touching a real Postgres server. The
# text-extraction / chunking / embedding pipeline is likewise stubbed per test.
class VectorStore::PgvectorTest < ActiveSupport::TestCase
  # Loose UUID shape check: 36 chars of hex digits and dashes.
  UUID_PATTERN = /\A[0-9a-f-]{36}\z/

  setup do
    @adapter = VectorStore::Pgvector.new
  end

  test "create_store returns a UUID" do
    result = @adapter.create_store(name: "Test Store")

    assert result.success?
    assert_match(UUID_PATTERN, result.data[:id])
  end

  test "delete_store executes delete query" do
    conn = stub_connection
    conn.expects(:exec_delete)
        .with("DELETE FROM vector_store_chunks WHERE store_id = $1", "VectorStore::Pgvector DeleteStore", anything)
        .returns(0)

    result = @adapter.delete_store(store_id: "store-123")

    assert result.success?
  end

  test "upload_file extracts text, chunks, embeds, and inserts" do
    # Stub the whole pipeline: extraction -> chunking -> embedding.
    @adapter.expects(:extract_text).with("Hello world", "test.txt").returns("Hello world")
    @adapter.expects(:chunk_text).with("Hello world").returns([ "Hello world" ])
    @adapter.expects(:embed_batch).with([ "Hello world" ]).returns([ [ 0.1, 0.2, 0.3 ] ])

    conn = stub_connection
    conn.expects(:transaction).yields
    conn.expects(:exec_insert).once

    result = @adapter.upload_file(store_id: "store-123", file_content: "Hello world", filename: "test.txt")

    assert result.success?
    assert_match(UUID_PATTERN, result.data[:file_id])
  end

  test "upload_file fails when text extraction returns nil" do
    @adapter.expects(:extract_text).returns(nil)

    result = @adapter.upload_file(store_id: "store-123", file_content: "\x00binary", filename: "photo.png")

    assert_not result.success?
    assert_match(/Could not extract text/, result.error.message)
  end

  test "upload_file fails when no chunks produced" do
    @adapter.expects(:extract_text).returns("some text")
    @adapter.expects(:chunk_text).returns([])

    result = @adapter.upload_file(store_id: "store-123", file_content: "some text", filename: "empty.txt")

    assert_not result.success?
    assert_match(/No chunks produced/, result.error.message)
  end

  test "upload_file inserts multiple chunks in a transaction" do
    @adapter.expects(:extract_text).returns("chunk1\n\nchunk2")
    @adapter.expects(:chunk_text).returns([ "chunk1", "chunk2" ])
    @adapter.expects(:embed_batch).returns([ [ 0.1 ], [ 0.2 ] ])

    conn = stub_connection
    conn.expects(:transaction).yields
    # One insert per chunk, all inside the transaction above.
    conn.expects(:exec_insert).twice

    result = @adapter.upload_file(store_id: "store-123", file_content: "chunk1\n\nchunk2", filename: "doc.txt")

    assert result.success?
  end

  test "remove_file executes delete with store_id and file_id" do
    conn = stub_connection
    conn.expects(:exec_delete)
        .with("DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2", "VectorStore::Pgvector RemoveFile", anything)
        .returns(1)

    result = @adapter.remove_file(store_id: "store-123", file_id: "file-456")

    assert result.success?
  end

  test "search embeds query and returns scored results" do
    @adapter.expects(:embed).with("income").returns([ 0.1, 0.2, 0.3 ])

    rows = [
      { "content" => "Total income: $85,000", "filename" => "tax_return.pdf", "file_id" => "file-xyz", "score" => 0.95 }
    ]
    conn = stub_connection
    conn.expects(:exec_query).returns(rows)

    result = @adapter.search(store_id: "store-123", query: "income", max_results: 5)

    assert result.success?
    assert_equal 1, result.data.size

    hit = result.data.first
    assert_equal "Total income: $85,000", hit[:content]
    assert_equal "tax_return.pdf", hit[:filename]
    assert_equal 0.95, hit[:score]
    assert_equal "file-xyz", hit[:file_id]
  end

  test "search returns empty array when no results" do
    @adapter.expects(:embed).returns([ 0.1 ])

    conn = stub_connection
    conn.expects(:exec_query).returns([])

    result = @adapter.search(store_id: "store-123", query: "nothing")

    assert result.success?
    assert_empty result.data
  end

  test "wraps errors in failure response" do
    @adapter.expects(:extract_text).raises(StandardError, "unexpected error")

    result = @adapter.upload_file(store_id: "store-123", file_content: "data", filename: "test.txt")

    assert_not result.success?
    assert_equal "unexpected error", result.error.message
  end

  test "supported_extensions matches extractable formats only" do
    extensions = @adapter.supported_extensions

    [ ".pdf", ".txt", ".csv" ].each { |ext| assert_includes extensions, ext }
    [ ".png", ".zip", ".docx" ].each { |ext| assert_not_includes extensions, ext }
  end

  private
    # Builds a Mocha connection double and wires it into the adapter so any
    # SQL the adapter runs goes through the mock instead of a real database.
    def stub_connection
      mock("connection").tap { |conn| @adapter.stubs(:connection).returns(conn) }
    end
end
|