feat(vector-store): Implement pgvector adapter for self-hosted RAG (#1211)

* Add conditional migration for vector_store_chunks table

Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector.
Enables the vector extension, adds store_id/file_id indexes, and uses
vector(1024) column type for embeddings.

* Add VectorStore::Embeddable concern for text extraction and embedding

Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is),
paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch
via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via
EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars.

* Implement VectorStore::Pgvector adapter with raw SQL

Replaces the stub with a full implementation using
ActiveRecord::Base.connection with parameterized binds. Supports
create_store, delete_store, upload_file (extract+chunk+embed+insert),
remove_file, and cosine-similarity search via the <=> operator.

* Add registry test for pgvector adapter selection

* Configure pgvector in compose.example.ai.yml

Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER,
EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include
nomic-embed-text in Ollama's pre-loaded models.

* Update pgvector docs from scaffolded to ready

Document env vars, embedding model setup, pgvector Docker image
requirement, and Ollama pull instructions.

* Address PR review feedback

- Migration: remove the env guard and use a pgvector_available? check instead, so
  the migration runs without error on plain Postgres (CI) and only creates the
  table on pgvector-capable servers.
  Add NOT NULL constraints on content/embedding/metadata, unique index on
  (store_id, file_id, chunk_index).
- Pgvector adapter: wrap chunk inserts in a DB transaction to prevent
  partial file writes. Override supported_extensions to match formats
  that extract_text can actually parse.
- Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE
  to avoid overflowing embedding model token limits.

* Bump schema version to include vector_store_chunks migration

CI uses db:schema:load which checks the version — without this bump,
the migration is detected as pending and tests fail to start.

* Update 20260316120000_create_vector_store_chunks.rb

---------

Co-authored-by: sokiee <sokysrm@gmail.com>
This commit is contained in:
Dream
2026-03-20 12:01:31 -04:00
committed by GitHub
parent 2cdddd28d7
commit 6d22514c01
9 changed files with 672 additions and 59 deletions

View File

@@ -0,0 +1,204 @@
require "test_helper"
# Unit tests for the VectorStore::Embeddable concern: text extraction
# (plain text and PDF), paragraph-aware chunking, and batched calls to an
# OpenAI-compatible /v1/embeddings endpoint. PDF parsing and the HTTP
# client are fully mocked with Mocha; no network access occurs.
class VectorStore::EmbeddableTest < ActiveSupport::TestCase
  # Bare host class so the concern's private helpers can be driven directly.
  class EmbeddableHost
    include VectorStore::Embeddable
    # Expose private methods for testing
    public :extract_text, :chunk_text, :embed, :embed_batch
  end

  setup do
    @host = EmbeddableHost.new
  end

  # --- extract_text ---

  test "extract_text returns plain text for .txt files" do
    result = @host.extract_text("Hello world", "notes.txt")
    assert_equal "Hello world", result
  end

  test "extract_text returns content for markdown files" do
    result = @host.extract_text("# Heading\n\nBody", "readme.md")
    assert_equal "# Heading\n\nBody", result
  end

  test "extract_text returns content for code files" do
    result = @host.extract_text("def foo; end", "app.rb")
    assert_equal "def foo; end", result
  end

  test "extract_text returns nil for unsupported binary formats" do
    assert_nil @host.extract_text("\x00\x01binary", "photo.png")
    assert_nil @host.extract_text("\x00\x01binary", "archive.zip")
  end

  test "extract_text handles PDF files" do
    # PDF::Reader is mocked entirely; the expectation pins down that the
    # concern wraps the raw bytes in a StringIO before handing them over.
    pdf_content = "fake pdf bytes"
    mock_page = mock("page")
    mock_page.stubs(:text).returns("Page 1 content")
    mock_reader = mock("reader")
    mock_reader.stubs(:pages).returns([ mock_page ])
    PDF::Reader.expects(:new).with(instance_of(StringIO)).returns(mock_reader)
    result = @host.extract_text(pdf_content, "document.pdf")
    assert_equal "Page 1 content", result
  end

  test "extract_text returns nil when PDF extraction fails" do
    # Extraction errors are expected to be swallowed and reported as nil,
    # not raised to the caller.
    PDF::Reader.expects(:new).raises(StandardError, "corrupt pdf")
    result = @host.extract_text("bad data", "broken.pdf")
    assert_nil result
  end

  # --- chunk_text ---

  test "chunk_text returns empty array for blank text" do
    assert_equal [], @host.chunk_text("")
    assert_equal [], @host.chunk_text(nil)
  end

  test "chunk_text returns single chunk for short text" do
    text = "Short paragraph."
    chunks = @host.chunk_text(text)
    assert_equal 1, chunks.size
    assert_equal "Short paragraph.", chunks.first
  end

  test "chunk_text splits on paragraph boundaries" do
    # Create text that exceeds CHUNK_SIZE when combined
    para1 = "A" * 1200
    para2 = "B" * 1200
    text = "#{para1}\n\n#{para2}"
    chunks = @host.chunk_text(text)
    assert_equal 2, chunks.size
    assert_includes chunks.first, "A" * 1200
    assert_includes chunks.last, "B" * 1200
  end

  test "chunk_text includes overlap between chunks" do
    para1 = "A" * 1500
    para2 = "B" * 1500
    text = "#{para1}\n\n#{para2}"
    chunks = @host.chunk_text(text)
    assert_equal 2, chunks.size
    # Second chunk should start with overlap from end of first chunk.
    # NOTE: String#last(n) is an ActiveSupport core extension, available
    # here because this is a Rails test environment.
    overlap = para1.last(VectorStore::Embeddable::CHUNK_OVERLAP)
    assert chunks.last.start_with?(overlap)
  end

  test "chunk_text keeps small paragraphs together" do
    paragraphs = Array.new(5) { |i| "Paragraph #{i} content." }
    text = paragraphs.join("\n\n")
    chunks = @host.chunk_text(text)
    assert_equal 1, chunks.size
  end

  test "chunk_text hard-splits oversized paragraphs" do
    # A single paragraph longer than CHUNK_SIZE with no paragraph breaks
    long_para = "X" * 5000
    chunks = @host.chunk_text(long_para)
    assert chunks.size > 1
    # The +2 slack presumably accounts for the paragraph separator that may
    # be rejoined into a chunk — TODO confirm against the concern.
    chunks.each do |chunk|
      assert chunk.length <= VectorStore::Embeddable::CHUNK_SIZE + VectorStore::Embeddable::CHUNK_OVERLAP + 2,
        "Chunk too large: #{chunk.length} chars"
    end
  end

  # --- embed ---

  test "embed calls embedding endpoint and returns vector" do
    expected_vector = [ 0.1, 0.2, 0.3 ]
    stub_response = { "data" => [ { "embedding" => expected_vector, "index" => 0 } ] }
    mock_client = mock("faraday")
    # yields(mock_request) feeds a stand-in request object to the block the
    # concern uses to set the POST body; the body itself is not inspected.
    mock_client.expects(:post).with("embeddings").yields(mock_request).returns(
      OpenStruct.new(body: stub_response)
    )
    @host.instance_variable_set(:@embedding_client, mock_client)
    result = @host.embed("test text")
    assert_equal expected_vector, result
  end

  test "embed raises on failed response" do
    # A response body without "data" should surface as VectorStore::Error.
    mock_client = mock("faraday")
    mock_client.expects(:post).with("embeddings").yields(mock_request).returns(
      OpenStruct.new(body: { "error" => "bad request" })
    )
    @host.instance_variable_set(:@embedding_client, mock_client)
    assert_raises(VectorStore::Error) { @host.embed("test text") }
  end

  # --- embed_batch ---

  test "embed_batch processes texts and returns ordered vectors" do
    texts = [ "first", "second", "third" ]
    vectors = [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ]
    stub_response = {
      "data" => [
        { "embedding" => vectors[0], "index" => 0 },
        { "embedding" => vectors[1], "index" => 1 },
        { "embedding" => vectors[2], "index" => 2 }
      ]
    }
    mock_client = mock("faraday")
    mock_client.expects(:post).with("embeddings").yields(mock_request).returns(
      OpenStruct.new(body: stub_response)
    )
    @host.instance_variable_set(:@embedding_client, mock_client)
    result = @host.embed_batch(texts)
    assert_equal vectors, result
  end

  test "embed_batch handles multiple batches" do
    # Override batch size constant for testing. remove_const avoids the
    # "already initialized constant" warning; the ensure clause restores the
    # original value even if an assertion fails mid-test.
    original = VectorStore::Embeddable::EMBED_BATCH_SIZE
    VectorStore::Embeddable.send(:remove_const, :EMBED_BATCH_SIZE)
    VectorStore::Embeddable.const_set(:EMBED_BATCH_SIZE, 2)
    texts = [ "a", "b", "c" ]
    batch1_response = {
      "data" => [
        { "embedding" => [ 0.1 ], "index" => 0 },
        { "embedding" => [ 0.2 ], "index" => 1 }
      ]
    }
    batch2_response = {
      "data" => [
        { "embedding" => [ 0.3 ], "index" => 0 }
      ]
    }
    mock_client = mock("faraday")
    # Mocha sequencing: returns(...).then.returns(...) serves batch1 on the
    # first invocation and batch2 on the second.
    mock_client.expects(:post).with("embeddings").twice
      .yields(mock_request)
      .returns(OpenStruct.new(body: batch1_response))
      .then.returns(OpenStruct.new(body: batch2_response))
    @host.instance_variable_set(:@embedding_client, mock_client)
    result = @host.embed_batch(texts)
    assert_equal [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ], result
  ensure
    VectorStore::Embeddable.send(:remove_const, :EMBED_BATCH_SIZE)
    VectorStore::Embeddable.const_set(:EMBED_BATCH_SIZE, original)
  end

  private

  # Fresh OpenStruct standing in for the request object Faraday yields to
  # the post block; tests here never assert on the body written to it.
  def mock_request
    request = OpenStruct.new(body: nil)
    request
  end
end

View File

@@ -0,0 +1,141 @@
require "test_helper"
# Tests for the pgvector-backed VectorStore adapter. Every database call goes
# through a Mocha-mocked connection, and text extraction / chunking /
# embedding are stubbed out where a test exercises the upload pipeline.
class VectorStore::PgvectorTest < ActiveSupport::TestCase
  setup do
    @adapter = VectorStore::Pgvector.new
  end

  test "create_store returns a UUID" do
    result = @adapter.create_store(name: "Test Store")
    assert result.success?
    assert_match(/\A[0-9a-f-]{36}\z/, result.data[:id])
  end

  test "delete_store executes delete query" do
    conn = stubbed_connection
    conn.expects(:exec_delete).with(
      "DELETE FROM vector_store_chunks WHERE store_id = $1",
      "VectorStore::Pgvector DeleteStore",
      anything
    ).returns(0)
    result = @adapter.delete_store(store_id: "store-123")
    assert result.success?
  end

  test "upload_file extracts text, chunks, embeds, and inserts" do
    # Stub the whole extract -> chunk -> embed pipeline for a one-chunk file.
    @adapter.expects(:extract_text).with("Hello world", "test.txt").returns("Hello world")
    @adapter.expects(:chunk_text).with("Hello world").returns([ "Hello world" ])
    @adapter.expects(:embed_batch).with([ "Hello world" ]).returns([ [ 0.1, 0.2, 0.3 ] ])
    conn = stubbed_connection
    conn.expects(:transaction).yields
    conn.expects(:exec_insert).once
    result = @adapter.upload_file(store_id: "store-123", file_content: "Hello world", filename: "test.txt")
    assert result.success?
    assert_match(/\A[0-9a-f-]{36}\z/, result.data[:file_id])
  end

  test "upload_file fails when text extraction returns nil" do
    @adapter.expects(:extract_text).returns(nil)
    result = @adapter.upload_file(store_id: "store-123", file_content: "\x00binary", filename: "photo.png")
    assert_not result.success?
    assert_match(/Could not extract text/, result.error.message)
  end

  test "upload_file fails when no chunks produced" do
    @adapter.expects(:extract_text).returns("some text")
    @adapter.expects(:chunk_text).returns([])
    result = @adapter.upload_file(store_id: "store-123", file_content: "some text", filename: "empty.txt")
    assert_not result.success?
    assert_match(/No chunks produced/, result.error.message)
  end

  test "upload_file inserts multiple chunks in a transaction" do
    # Two chunks must mean exactly two inserts, all inside one transaction.
    @adapter.expects(:extract_text).returns("chunk1\n\nchunk2")
    @adapter.expects(:chunk_text).returns([ "chunk1", "chunk2" ])
    @adapter.expects(:embed_batch).returns([ [ 0.1 ], [ 0.2 ] ])
    conn = stubbed_connection
    conn.expects(:transaction).yields
    conn.expects(:exec_insert).twice
    result = @adapter.upload_file(store_id: "store-123", file_content: "chunk1\n\nchunk2", filename: "doc.txt")
    assert result.success?
  end

  test "remove_file executes delete with store_id and file_id" do
    conn = stubbed_connection
    conn.expects(:exec_delete).with(
      "DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2",
      "VectorStore::Pgvector RemoveFile",
      anything
    ).returns(1)
    result = @adapter.remove_file(store_id: "store-123", file_id: "file-456")
    assert result.success?
  end

  test "search embeds query and returns scored results" do
    @adapter.expects(:embed).with("income").returns([ 0.1, 0.2, 0.3 ])
    rows = [
      { "content" => "Total income: $85,000", "filename" => "tax_return.pdf", "file_id" => "file-xyz", "score" => 0.95 }
    ]
    conn = stubbed_connection
    conn.expects(:exec_query).returns(rows)
    result = @adapter.search(store_id: "store-123", query: "income", max_results: 5)
    assert result.success?
    assert_equal 1, result.data.size
    hit = result.data.first
    assert_equal "Total income: $85,000", hit[:content]
    assert_equal "tax_return.pdf", hit[:filename]
    assert_equal 0.95, hit[:score]
    assert_equal "file-xyz", hit[:file_id]
  end

  test "search returns empty array when no results" do
    @adapter.expects(:embed).returns([ 0.1 ])
    conn = stubbed_connection
    conn.expects(:exec_query).returns([])
    result = @adapter.search(store_id: "store-123", query: "nothing")
    assert result.success?
    assert_empty result.data
  end

  test "wraps errors in failure response" do
    # Unexpected exceptions must come back as a failure response, not raise.
    @adapter.expects(:extract_text).raises(StandardError, "unexpected error")
    result = @adapter.upload_file(store_id: "store-123", file_content: "data", filename: "test.txt")
    assert_not result.success?
    assert_equal "unexpected error", result.error.message
  end

  test "supported_extensions matches extractable formats only" do
    extensions = @adapter.supported_extensions
    [ ".pdf", ".txt", ".csv" ].each { |ext| assert_includes extensions, ext }
    [ ".png", ".zip", ".docx" ].each { |ext| assert_not_includes extensions, ext }
  end

  private

  # Builds a Mocha mock connection and points the adapter at it, so each
  # test only declares the queries it expects.
  def stubbed_connection
    conn = mock("connection")
    @adapter.stubs(:connection).returns(conn)
    conn
  end
end

View File

@@ -43,6 +43,13 @@ class VectorStore::RegistryTest < ActiveSupport::TestCase
end
end
# Registry must resolve to the pgvector adapter when the provider env var
# selects it (env swapped safely via climate_control).
test "adapter returns VectorStore::Pgvector instance when pgvector configured" do
  ClimateControl.modify(VECTOR_STORE_PROVIDER: "pgvector") do
    assert_instance_of VectorStore::Pgvector, VectorStore::Registry.adapter
  end
end
test "configured? delegates to adapter presence" do
VectorStore::Registry.stubs(:adapter).returns(nil)
assert_not VectorStore.configured?