feat(vector-store): Implement pgvector adapter for self-hosted RAG (#1211)

* Add conditional migration for vector_store_chunks table

Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector.
Enables the vector extension, adds store_id/file_id indexes, and uses
vector(1024) column type for embeddings.

* Add VectorStore::Embeddable concern for text extraction and embedding

Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is),
paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch
via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via
EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars.

* Implement VectorStore::Pgvector adapter with raw SQL

Replaces the stub with a full implementation using
ActiveRecord::Base.connection with parameterized binds. Supports
create_store, delete_store, upload_file (extract+chunk+embed+insert),
remove_file, and cosine-similarity search via the <=> operator.

* Add registry test for pgvector adapter selection

* Configure pgvector in compose.example.ai.yml

Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER,
EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include
nomic-embed-text in Ollama's pre-loaded models.

* Update pgvector docs from scaffolded to ready

Document env vars, embedding model setup, pgvector Docker image
requirement, and Ollama pull instructions.

* Address PR review feedback

- Migration: remove env guard, use pgvector_available? check so it runs
  on plain Postgres (CI) but creates the table on pgvector-capable servers.
  Add NOT NULL constraints on content/embedding/metadata, unique index on
  (store_id, file_id, chunk_index).
- Pgvector adapter: wrap chunk inserts in a DB transaction to prevent
  partial file writes. Override supported_extensions to match formats
  that extract_text can actually parse.
- Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE
  to avoid overflowing embedding model token limits.

* Bump schema version to include vector_store_chunks migration

CI uses db:schema:load which checks the version — without this bump,
the migration is detected as pending and tests fail to start.

* Update 20260316120000_create_vector_store_chunks.rb

---------

Co-authored-by: sokiee <sokysrm@gmail.com>
This commit is contained in:
Dream
2026-03-20 12:01:31 -04:00
committed by GitHub
parent 2cdddd28d7
commit 6d22514c01
9 changed files with 672 additions and 59 deletions

View File

@@ -2,88 +2,137 @@
#
# This keeps all data on your own infrastructure — no external vector-store
# service required. You still need an embedding provider (e.g. OpenAI, or a
# local model served via an OpenAI-compatible endpoint such as Ollama) to turn
# text into vectors before insertion and at query time.
#
# Requirements (not yet wired up):
# - PostgreSQL with the `vector` extension enabled
# - gem "neighbor" (for ActiveRecord integration) or raw SQL
# - An embedding model endpoint (EMBEDDING_MODEL_URL / EMBEDDING_MODEL_NAME)
# - A chunking strategy (see #chunk_file below)
#
# Schema sketch (for reference — migration not included):
#
# create_table :vector_store_chunks do |t|
# t.string :store_id, null: false # logical namespace
# t.string :file_id, null: false
# t.string :filename
# t.text :content # the original text chunk
# t.vector :embedding, limit: 1536 # adjust dimensions to your model
# t.jsonb :metadata, default: {}
# t.timestamps
# end
# add_index :vector_store_chunks, :store_id
# add_index :vector_store_chunks, :file_id
# Requirements:
# - PostgreSQL with the `vector` extension enabled (use pgvector/pgvector Docker image)
# - An embedding model endpoint (EMBEDDING_URI_BASE / EMBEDDING_MODEL)
# - Migration: CreateVectorStoreChunks (run with VECTOR_STORE_PROVIDER=pgvector)
#
class VectorStore::Pgvector < VectorStore::Base
  include VectorStore::Embeddable

  # Extensions accepted by this adapter: the plain-text extensions declared by
  # VectorStore::Embeddable plus PDF.
  PGVECTOR_SUPPORTED_EXTENSIONS = (VectorStore::Embeddable::TEXT_EXTENSIONS + [ ".pdf" ]).uniq.freeze

  # One row per chunk; the embedding is bound as a "[v1,v2,...]" text literal.
  INSERT_CHUNK_SQL = <<~SQL.freeze
    INSERT INTO vector_store_chunks
      (id, store_id, file_id, filename, chunk_index, content, embedding, metadata, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
  SQL

  # `<=>` is used as the distance operator; score is reported as 1 - distance
  # so that larger means more similar.
  SEARCH_SQL = <<~SQL.freeze
    SELECT content, filename, file_id,
           1 - (embedding <=> $1::vector) AS score
    FROM vector_store_chunks
    WHERE store_id = $2
    ORDER BY embedding <=> $1::vector
    LIMIT $3
  SQL

  # @return [Array<String>] file extensions this adapter accepts
  def supported_extensions
    PGVECTOR_SUPPORTED_EXTENSIONS
  end

  # Creates a store. A "store" is only a logical namespace (a UUID); there is
  # no external resource to create and nothing is written to the database.
  #
  # @param name [String] part of the adapter interface; not used here
  # @return whatever with_response (defined outside this file view) makes of
  #   the block value `{ id: <uuid> }`
  def create_store(name:)
    with_response do
      { id: SecureRandom.uuid }
    end
  end

  # Deletes every chunk row that belongs to the given store.
  #
  # @param store_id [String] namespace whose rows are removed
  def delete_store(store_id:)
    with_response do
      connection.exec_delete(
        "DELETE FROM vector_store_chunks WHERE store_id = $1",
        "VectorStore::Pgvector DeleteStore",
        [ bind_param("store_id", store_id) ]
      )
    end
  end

  # Extracts text from the file, splits it into chunks, embeds the chunks and
  # stores one row per chunk under a freshly generated file id.
  #
  # extract_text, chunk_text and embed_batch are not defined in this class;
  # they are expected to come from VectorStore::Embeddable.
  #
  # @param store_id [String] namespace the chunks are stored under
  # @param file_content [String] raw file contents passed to extract_text
  # @param filename [String] stored with each chunk and used in error messages
  # @raise [VectorStore::Error] when no text or no chunks could be produced, or
  #   when the number of embeddings does not match the number of chunks
  #   (how with_response surfaces these is defined outside this file view)
  def upload_file(store_id:, file_content:, filename:)
    with_response do
      text = extract_text(file_content, filename)
      raise VectorStore::Error, "Could not extract text from #{filename}" if text.blank?

      chunks = chunk_text(text)
      raise VectorStore::Error, "No chunks produced from #{filename}" if chunks.empty?

      vectors = embed_batch(chunks)
      # Fail with a clear error instead of a NoMethodError on a missing vector.
      if vectors.size != chunks.size
        raise VectorStore::Error,
              "Embedding count mismatch for #{filename}: expected #{chunks.size}, got #{vectors.size}"
      end

      file_id = SecureRandom.uuid
      insert_chunks(store_id: store_id, file_id: file_id, filename: filename, chunks: chunks, vectors: vectors)

      { file_id: file_id }
    end
  end

  # Deletes the chunk rows of a single file within a store.
  #
  # @param store_id [String] namespace the file belongs to
  # @param file_id [String] id previously returned by #upload_file
  def remove_file(store_id:, file_id:)
    with_response do
      connection.exec_delete(
        "DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2",
        "VectorStore::Pgvector RemoveFile",
        [
          bind_param("store_id", store_id),
          bind_param("file_id", file_id)
        ]
      )
    end
  end

  # Embeds the query and returns the closest chunks of the given store.
  #
  # @param store_id [String] namespace to search in
  # @param query [String] text passed to embed (expected from VectorStore::Embeddable)
  # @param max_results [Integer] upper bound on returned rows (SQL LIMIT)
  # @return the with_response wrapping of an array of hashes with the keys
  #   :content, :filename, :score (Float) and :file_id
  def search(store_id:, query:, max_results: 10)
    with_response do
      rows = connection.exec_query(
        SEARCH_SQL,
        "VectorStore::Pgvector Search",
        [
          bind_param("embedding", vector_literal(embed(query)), ActiveRecord::Type::String.new),
          bind_param("store_id", store_id),
          bind_param("limit", max_results)
        ]
      )

      rows.map do |row|
        {
          content: row["content"],
          filename: row["filename"],
          score: row["score"].to_f,
          file_id: row["file_id"]
        }
      end
    end
  end

  private

    # Inserts all chunks of one file inside a single transaction so a failure
    # part-way through does not leave a partially written file behind.
    def insert_chunks(store_id:, file_id:, filename:, chunks:, vectors:)
      now = Time.current

      connection.transaction do
        chunks.zip(vectors).each_with_index do |(chunk_content, vector), index|
          connection.exec_insert(
            INSERT_CHUNK_SQL,
            "VectorStore::Pgvector InsertChunk",
            [
              bind_param("id", SecureRandom.uuid),
              bind_param("store_id", store_id),
              bind_param("file_id", file_id),
              bind_param("filename", filename),
              bind_param("chunk_index", index),
              bind_param("content", chunk_content),
              bind_param("embedding", vector_literal(vector), ActiveRecord::Type::String.new),
              bind_param("metadata", "{}"),
              bind_param("created_at", now),
              bind_param("updated_at", now)
            ]
          )
        end
      end
    end

    # Formats a numeric array as the "[v1,v2,...]" text literal bound for
    # vector parameters.
    def vector_literal(vector)
      "[#{vector.join(',')}]"
    end

    def connection
      ActiveRecord::Base.connection
    end

    # Builds a bind value for the raw exec_* calls; falls back to a
    # pass-through value type when no explicit type is given.
    def bind_param(name, value, type = nil)
      type ||= ActiveModel::Type::Value.new
      ActiveRecord::Relation::QueryAttribute.new(name, value, type)
    end
end