mirror of
https://github.com/we-promise/sure.git
synced 2026-04-09 15:24:48 +00:00
* Add conditional migration for vector_store_chunks table Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector. Enables the vector extension, adds store_id/file_id indexes, and uses vector(1024) column type for embeddings. * Add VectorStore::Embeddable concern for text extraction and embedding Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is), paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars. * Implement VectorStore::Pgvector adapter with raw SQL Replaces the stub with a full implementation using ActiveRecord::Base.connection with parameterized binds. Supports create_store, delete_store, upload_file (extract+chunk+embed+insert), remove_file, and cosine-similarity search via the <=> operator. * Add registry test for pgvector adapter selection * Configure pgvector in compose.example.ai.yml Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER, EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include nomic-embed-text in Ollama's pre-loaded models. * Update pgvector docs from scaffolded to ready Document env vars, embedding model setup, pgvector Docker image requirement, and Ollama pull instructions. * Address PR review feedback - Migration: remove env guard, use pgvector_available? check so it runs on plain Postgres (CI) but creates the table on pgvector-capable servers. Add NOT NULL constraints on content/embedding/metadata, unique index on (store_id, file_id, chunk_index). - Pgvector adapter: wrap chunk inserts in a DB transaction to prevent partial file writes. Override supported_extensions to match formats that extract_text can actually parse. - Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE to avoid overflowing embedding model token limits. 
* Bump schema version to include vector_store_chunks migration CI uses db:schema:load which checks the version — without this bump, the migration is detected as pending and tests fail to start. * Update 20260316120000_create_vector_store_chunks.rb --------- Co-authored-by: sokiee <sokysrm@gmail.com>
139 lines
4.2 KiB
Ruby
139 lines
4.2 KiB
Ruby
# Adapter that stores embeddings locally in PostgreSQL using the pgvector extension.
#
# This keeps all data on your own infrastructure — no external vector-store
# service required. You still need an embedding provider (e.g. OpenAI, or a
# local model served via an OpenAI-compatible endpoint such as Ollama) to turn
# text into vectors before insertion and at query time.
#
# Requirements:
# - PostgreSQL with the `vector` extension enabled (use pgvector/pgvector Docker image)
# - An embedding model endpoint (EMBEDDING_URI_BASE / EMBEDDING_MODEL)
# - Migration: CreateVectorStoreChunks (run with VECTOR_STORE_PROVIDER=pgvector)
#
class VectorStore::Pgvector < VectorStore::Base
  include VectorStore::Embeddable

  # Text formats handled by Embeddable, plus PDF (extracted via pdf-reader).
  PGVECTOR_SUPPORTED_EXTENSIONS = (VectorStore::Embeddable::TEXT_EXTENSIONS + [ ".pdf" ]).uniq.freeze

  # File extensions this adapter can ingest — only formats extract_text can parse.
  def supported_extensions
    PGVECTOR_SUPPORTED_EXTENSIONS
  end

  # Stores are purely logical in this adapter: a store is just a UUID that tags
  # rows in vector_store_chunks, so creation needs no database work.
  # Returns { id: <uuid> }. The name: argument is accepted for interface parity
  # but is not persisted.
  def create_store(name:)
    with_response do
      { id: SecureRandom.uuid }
    end
  end

  # Deletes every chunk belonging to the given store.
  def delete_store(store_id:)
    with_response do
      connection.exec_delete(
        "DELETE FROM vector_store_chunks WHERE store_id = $1",
        "VectorStore::Pgvector DeleteStore",
        [ bind_param("store_id", store_id) ]
      )
    end
  end

  # Extracts text from file_content, chunks it, embeds each chunk, and inserts
  # one row per chunk. All inserts run inside a single transaction so a failure
  # mid-way cannot leave a partially written file behind.
  #
  # Returns { file_id: <uuid> } on success.
  # Raises VectorStore::Error when no text or no chunks could be produced.
  def upload_file(store_id:, file_content:, filename:)
    with_response do
      text = extract_text(file_content, filename)
      # BUGFIX: error messages previously interpolated nothing ("#(unknown)");
      # include the offending filename so failures are diagnosable.
      raise VectorStore::Error, "Could not extract text from #{filename}" if text.blank?

      chunks = chunk_text(text)
      raise VectorStore::Error, "No chunks produced from #{filename}" if chunks.empty?

      vectors = embed_batch(chunks)
      file_id = SecureRandom.uuid
      now = Time.current

      connection.transaction do
        chunks.each_with_index do |chunk_content, index|
          # pgvector accepts the "[x,y,...]" text literal for vector columns.
          embedding_literal = "[#{vectors[index].join(',')}]"

          connection.exec_insert(
            <<~SQL,
              INSERT INTO vector_store_chunks
                (id, store_id, file_id, filename, chunk_index, content, embedding, metadata, created_at, updated_at)
              VALUES
                ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
            SQL
            "VectorStore::Pgvector InsertChunk",
            [
              bind_param("id", SecureRandom.uuid),
              bind_param("store_id", store_id),
              bind_param("file_id", file_id),
              bind_param("filename", filename),
              bind_param("chunk_index", index),
              bind_param("content", chunk_content),
              bind_param("embedding", embedding_literal, ActiveRecord::Type::String.new),
              bind_param("metadata", "{}"),
              bind_param("created_at", now),
              bind_param("updated_at", now)
            ]
          )
        end
      end

      { file_id: file_id }
    end
  end

  # Removes all chunks for a single file within a store.
  def remove_file(store_id:, file_id:)
    with_response do
      connection.exec_delete(
        "DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2",
        "VectorStore::Pgvector RemoveFile",
        [
          bind_param("store_id", store_id),
          bind_param("file_id", file_id)
        ]
      )
    end
  end

  # Embeds the query and returns up to max_results chunks ordered by cosine
  # similarity. pgvector's <=> operator is cosine distance, so score is
  # computed as 1 - distance (higher = more similar).
  #
  # Returns an array of { content:, filename:, score:, file_id: } hashes.
  def search(store_id:, query:, max_results: 10)
    with_response do
      query_vector = embed(query)
      vector_literal = "[#{query_vector.join(',')}]"

      results = connection.exec_query(
        <<~SQL,
          SELECT content, filename, file_id,
                 1 - (embedding <=> $1::vector) AS score
          FROM vector_store_chunks
          WHERE store_id = $2
          ORDER BY embedding <=> $1::vector
          LIMIT $3
        SQL
        "VectorStore::Pgvector Search",
        [
          bind_param("embedding", vector_literal, ActiveRecord::Type::String.new),
          bind_param("store_id", store_id),
          bind_param("limit", max_results)
        ]
      )

      results.map do |row|
        {
          content: row["content"],
          filename: row["filename"],
          score: row["score"].to_f,
          file_id: row["file_id"]
        }
      end
    end
  end

  private
    # The application's primary ActiveRecord connection.
    def connection
      ActiveRecord::Base.connection
    end

    # Builds a typed bind for exec_* calls. Defaults to an untyped value,
    # which lets the PG adapter serialize the Ruby object as-is.
    def bind_param(name, value, type = nil)
      type ||= ActiveModel::Type::Value.new
      ActiveRecord::Relation::QueryAttribute.new(name, value, type)
    end
end