mirror of
https://github.com/we-promise/sure.git
synced 2026-04-09 15:24:48 +00:00
* Add conditional migration for vector_store_chunks table
  Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector. Enables the vector extension, adds store_id/file_id indexes, and uses vector(1024) column type for embeddings.
* Add VectorStore::Embeddable concern for text extraction and embedding
  Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is), paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars.
* Implement VectorStore::Pgvector adapter with raw SQL
  Replaces the stub with a full implementation using ActiveRecord::Base.connection with parameterized binds. Supports create_store, delete_store, upload_file (extract+chunk+embed+insert), remove_file, and cosine-similarity search via the <=> operator.
* Add registry test for pgvector adapter selection
* Configure pgvector in compose.example.ai.yml
  Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER, EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include nomic-embed-text in Ollama's pre-loaded models.
* Update pgvector docs from scaffolded to ready
  Document env vars, embedding model setup, pgvector Docker image requirement, and Ollama pull instructions.
* Address PR review feedback
  - Migration: remove env guard, use pgvector_available? check so it runs on plain Postgres (CI) but creates the table on pgvector-capable servers. Add NOT NULL constraints on content/embedding/metadata, unique index on (store_id, file_id, chunk_index).
  - Pgvector adapter: wrap chunk inserts in a DB transaction to prevent partial file writes. Override supported_extensions to match formats that extract_text can actually parse.
  - Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE to avoid overflowing embedding model token limits.
* Bump schema version to include vector_store_chunks migration
  CI uses db:schema:load which checks the version — without this bump, the migration is detected as pending and tests fail to start.
* Update 20260316120000_create_vector_store_chunks.rb

---------

Co-authored-by: sokiee <sokysrm@gmail.com>
153 lines
4.2 KiB
Ruby
153 lines
4.2 KiB
Ruby
module VectorStore::Embeddable
|
|
extend ActiveSupport::Concern
|
|
|
|
# Target size, in characters, of the chunks assembled by chunk_text; also the
# slice length used by hard_split.
CHUNK_SIZE = 2_000
# Number of characters carried over from the end of one chunk (or hard-split
# slice) into the start of the next.
CHUNK_OVERLAP = 200
# Maximum number of texts sent per embeddings request by embed_batch.
EMBED_BATCH_SIZE = 50

# File extensions (lowercase, with leading dot) whose content extract_text
# returns as plain text.
TEXT_EXTENSIONS = [
  ".txt", ".md", ".csv", ".json", ".xml", ".html", ".css",
  ".js", ".ts", ".py", ".rb", ".go", ".java", ".php", ".c", ".cpp", ".sh", ".tex"
].freeze
|
|
|
|
private
|
|
|
|
# Extracts indexable text from a file, dispatching on the case-insensitive
# extension of +filename+:
#
# * ".pdf" is delegated to extract_pdf_text.
# * Extensions listed in TEXT_EXTENSIONS are returned as UTF-8 text.
# * Anything else is unsupported.
#
# @param file_content [String, #to_s] raw file contents
# @param filename [String, #to_s] file name; only its extension is inspected
# @return [String, nil] the extracted text, or nil for unsupported formats
#   (the PDF branch returns whatever extract_pdf_text returns)
def extract_text(file_content, filename)
  case File.extname(filename.to_s).downcase
  when ".pdf"
    extract_pdf_text(file_content)
  when *TEXT_EXTENSIONS
    text = file_content.to_s

    if text.encoding == Encoding::BINARY
      # Byte strings (ASCII-8BIT) carry no charset information. Transcoding
      # them to UTF-8 would replace every byte >= 0x80 and thereby destroy
      # valid multi-byte UTF-8 characters, so reinterpret the bytes as UTF-8
      # and scrub only the sequences that are actually invalid. The input is
      # duplicated so the caller's string keeps its encoding.
      text.dup.force_encoding(Encoding::UTF_8).scrub
    else
      text.encode("UTF-8", invalid: :replace, undef: :replace)
    end
  end
end
|
|
|
|
# Parses +file_content+ as a PDF from memory and returns the text of all
# pages, with consecutive pages separated by a blank line.
#
# Any StandardError raised while reading is logged through Rails.logger and
# swallowed; nil is returned in that case.
def extract_pdf_text(file_content)
  document = PDF::Reader.new(StringIO.new(file_content))
  page_texts = document.pages.map { |page| page.text }
  page_texts.join("\n\n")
rescue => error
  Rails.logger.error("VectorStore::Embeddable PDF extraction error: #{error.message}")
  nil
end
|
|
|
|
# Splits +text+ into chunks of at most CHUNK_SIZE characters for embedding.
#
# The text is split into paragraphs on blank lines. Paragraphs are stripped
# and packed greedily into the current chunk, joined by "\n\n". When the next
# piece does not fit, the current chunk is emitted and a new one is started
# with up to CHUNK_OVERLAP trailing characters of the emitted chunk as
# context. Paragraphs longer than CHUNK_SIZE are first cut up by hard_split.
#
# Two rules keep chunks bounded and free of duplicated text:
#
# * The carried-over overlap is shortened (or dropped) so that
#   overlap + separator + piece never exceeds CHUNK_SIZE.
# * Follow-up slices of a hard-split paragraph start a chunk on their own:
#   hard_split already repeats the tail of the previous slice, so prepending
#   the previous chunk's tail again would duplicate that text.
#
# @param text [String, nil] text to split
# @return [Array<String>] frozen chunks in document order; empty for blank input
def chunk_text(text)
  return [] if text.blank?

  separator = "\n\n"
  chunks = []
  current = +""

  text.split(/\n\s*\n/).each do |raw_paragraph|
    paragraph = raw_paragraph.strip
    next if paragraph.empty?

    slices = paragraph.length > CHUNK_SIZE ? hard_split(paragraph) : [ paragraph ]

    slices.each_with_index do |slice, index|
      if current.empty?
        current << slice
      elsif current.length + separator.length + slice.length <= CHUNK_SIZE
        current << separator << slice
      else
        finished = current.freeze
        chunks << finished
        current = +""

        # Space left for overlap once the slice and separator are accounted for.
        room = CHUNK_SIZE - slice.length - separator.length
        # index > 0 means the slice continues a hard-split paragraph and
        # therefore already begins with the tail of its predecessor.
        if index.zero? && room.positive?
          current << finished.last([ CHUNK_OVERLAP, room ].min) << separator
        end
        current << slice
      end
    end
  end

  chunks << current.freeze unless current.empty?
  chunks
end
|
|
|
|
# Cuts one long string into windows of CHUNK_SIZE characters, where each
# window starts CHUNK_SIZE - CHUNK_OVERLAP characters after the previous one,
# so consecutive windows share CHUNK_OVERLAP characters.
#
# Slicing stops as soon as a window reaches the end of the text. Continuing
# past that point would only produce a trailing slice made entirely of
# characters already contained in the previous window.
#
# @param text [String] text to slice
# @return [Array<String>] slices in order; empty for an empty string
def hard_split(text)
  return [] if text.empty?

  stride = CHUNK_SIZE - CHUNK_OVERLAP
  slices = []
  offset = 0

  loop do
    slices << text[offset, CHUNK_SIZE]
    break if offset + CHUNK_SIZE >= text.length

    offset += stride
  end

  slices
end
|
|
|
|
# Embeds a single text by POSTing to "embeddings" on embedding_client.
#
# @param text [String] text to embed
# @return [Object] the "embedding" value of the first entry in the response's
#   "data" list (presumably an Array of Floats; verify against the provider)
# @raise [VectorStore::Error] when the response body is not a Hash, has no
#   "data" list, the list is empty, or its first entry carries no embedding
def embed(text)
  response = embedding_client.post("embeddings") do |req|
    req.body = { model: embedding_model, input: text }
  end

  payload = response.body
  entries = payload["data"] if payload.is_a?(Hash)
  first = entries.first if entries.is_a?(Array)
  vector = first["embedding"] if first.is_a?(Hash)

  # Every malformed shape is reported as VectorStore::Error. An empty "data"
  # list must not surface as NoMethodError, and a missing embedding must not
  # leak out as nil.
  raise VectorStore::Error, "Embedding request failed: #{payload}" if vector.nil?

  vector
end
|
|
|
|
# Embeds many texts, sending them to "embeddings" on embedding_client in
# groups of at most EMBED_BATCH_SIZE.
#
# Within each response, entries are ordered by their "index" field. Entries
# without an index keep their position in the response.
#
# @param texts [Array<String>] texts to embed
# @return [Array] one "embedding" value per input text, in input order
# @raise [VectorStore::Error] when a response body is not a Hash with a
#   "data" list, or when that list does not contain exactly one entry per
#   input of the batch
def embed_batch(texts)
  texts.each_slice(EMBED_BATCH_SIZE).flat_map do |batch|
    response = embedding_client.post("embeddings") do |req|
      req.body = { model: embedding_model, input: batch }
    end

    payload = response.body
    entries = payload["data"] if payload.is_a?(Hash)
    raise VectorStore::Error, "Batch embedding request failed: #{payload}" unless entries.is_a?(Array)

    # A short or long response would silently pair vectors with the wrong
    # texts further down the line, so fail loudly instead.
    if entries.size != batch.size
      raise VectorStore::Error,
        "Batch embedding request failed: expected #{batch.size} embeddings, got #{entries.size}"
    end

    entries.each_with_index
           .sort_by { |entry, position| entry["index"] || position }
           .map { |entry, _position| entry["embedding"] }
  end
end
|
|
|
|
# Memoized Faraday connection to embedding_uri_base, configured with the JSON
# request and response middleware, a 120 request timeout and a 10 open
# timeout. An Authorization bearer header is set only when
# embedding_access_token is present.
def embedding_client
  return @embedding_client if @embedding_client

  @embedding_client = Faraday.new(url: embedding_uri_base) do |connection|
    connection.request :json
    connection.response :json

    token = embedding_access_token
    connection.headers["Authorization"] = "Bearer #{token}" if token.present?

    connection.options.timeout = 120
    connection.options.open_timeout = 10
  end
end
|
|
|
|
# Name of the embedding model sent with every embeddings request.
#
# Reads EMBEDDING_MODEL and falls back to "nomic-embed-text" when the variable
# is unset or blank. A set-but-empty variable would otherwise be sent as an
# empty model name; treating blank as unset matches how embedding_uri_base and
# embedding_access_token read their variables.
#
# @return [String]
def embedding_model
  ENV["EMBEDDING_MODEL"].presence || "nomic-embed-text"
end
|
|
|
|
# Embedding vector size taken from EMBEDDING_DIMENSIONS.
#
# Falls back to 1024 when the variable is unset, blank, non-numeric or not a
# positive number. Such values would otherwise come out of to_i as 0 (or a
# negative number), which is not a usable vector size.
#
# @return [Integer] a positive dimension count
def embedding_dimensions
  configured = ENV["EMBEDDING_DIMENSIONS"].to_i
  configured.positive? ? configured : 1024
end
|
|
|
|
# Base URL of the embeddings API: the first non-blank value among
# EMBEDDING_URI_BASE and OPENAI_URI_BASE, otherwise
# "https://api.openai.com/v1/".
def embedding_uri_base
  configured = %w[EMBEDDING_URI_BASE OPENAI_URI_BASE].filter_map { |name| ENV[name].presence }
  configured.first || "https://api.openai.com/v1/"
end
|
|
|
|
# Bearer token for the embeddings API: EMBEDDING_ACCESS_TOKEN when non-blank,
# otherwise OPENAI_ACCESS_TOKEN when non-blank, otherwise nil.
def embedding_access_token
  dedicated = ENV["EMBEDDING_ACCESS_TOKEN"].presence
  return dedicated if dedicated

  ENV["OPENAI_ACCESS_TOKEN"].presence
end
|
|
end
|