feat(vector-store): Implement pgvector adapter for self-hosted RAG (#1211)

* Add conditional migration for vector_store_chunks table Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector. Enables the vector extension, adds store_id/file_id indexes, and uses vector(1024) column type for embeddings. * Add VectorStore::Embeddable concern for text extraction and embedding Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is), paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars. * Implement VectorStore::Pgvector adapter with raw SQL Replaces the stub with a full implementation using ActiveRecord::Base.connection with parameterized binds. Supports create_store, delete_store, upload_file (extract+chunk+embed+insert), remove_file, and cosine-similarity search via the <=> operator. * Add registry test for pgvector adapter selection * Configure pgvector in compose.example.ai.yml Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER, EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include nomic-embed-text in Ollama's pre-loaded models. * Update pgvector docs from scaffolded to ready Document env vars, embedding model setup, pgvector Docker image requirement, and Ollama pull instructions. * Address PR review feedback - Migration: remove env guard, use pgvector_available? check so it runs on plain Postgres (CI) but creates the table on pgvector-capable servers. Add NOT NULL constraints on content/embedding/metadata, unique index on (store_id, file_id, chunk_index). - Pgvector adapter: wrap chunk inserts in a DB transaction to prevent partial file writes. Override supported_extensions to match formats that extract_text can actually parse. - Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE to avoid overflowing embedding model token limits. 
* Bump schema version to include vector_store_chunks migration CI uses db:schema:load which checks the version — without this bump, the migration is detected as pending and tests fail to start. * Update 20260316120000_create_vector_store_chunks.rb --------- Co-authored-by: sokiee <sokysrm@gmail.com>
2026-04-19 03:54:08 +00:00 · 2026-03-20 12:01:31 -04:00
parent 2cdddd28d7
commit 6d22514c01
9 changed files with 672 additions and 59 deletions
--- a/app/models/vector_store/embeddable.rb
+++ b/app/models/vector_store/embeddable.rb
@@ -0,0 +1,152 @@
+module VectorStore::Embeddable
+  extend ActiveSupport::Concern
+
+  # Target size of a chunk, in characters (see chunk_text / hard_split).
+  CHUNK_SIZE = 2000
+  # Number of trailing characters of a chunk repeated at the start of the next one.
+  CHUNK_OVERLAP = 200
+  # Number of texts sent per embeddings request by embed_batch.
+  EMBED_BATCH_SIZE = 50
+
+  # File extensions that extract_text treats as plain text.
+  TEXT_EXTENSIONS = %w[
+    .txt .md .csv .json .xml .html .css
+    .js .ts .py .rb .go .java .php .c .cpp .sh .tex
+  ].freeze
+
+  private
+
+    # Extract plain text from a file, dispatching on the file extension.
+    #
+    # PDFs go through extract_pdf_text; extensions listed in TEXT_EXTENSIONS are
+    # returned as valid UTF-8. Returns nil for any other (unsupported) format.
+    #
+    # file_content - raw file contents (anything responding to to_s)
+    # filename     - original file name; only its extension is inspected
+    def extract_text(file_content, filename)
+      ext = File.extname(filename.to_s).downcase
+
+      case ext
+      when ".pdf"
+        extract_pdf_text(file_content)
+      when *TEXT_EXTENSIONS
+        text = file_content.to_s
+        if text.encoding == Encoding::BINARY
+          # Bytes tagged ASCII-8BIT (e.g. raw bytes read from an upload) cannot be
+          # transcoded: String#encode would replace every non-ASCII byte.
+          # Reinterpret them as UTF-8 and scrub only the invalid sequences.
+          text.dup.force_encoding(Encoding::UTF_8).scrub
+        else
+          text.encode("UTF-8", invalid: :replace, undef: :replace)
+        end
+      end
+    end
+
+    # Read every page of a PDF and join the page texts with blank lines.
+    # Any extraction failure is logged and reported to the caller as nil.
+    def extract_pdf_text(file_content)
+      pages = PDF::Reader.new(StringIO.new(file_content)).pages
+      pages.map { |page| page.text }.join("\n\n")
+    rescue => error
+      Rails.logger.error("VectorStore::Embeddable PDF extraction error: #{error.message}")
+      nil
+    end
+
+    # Break text into chunks along blank-line paragraph boundaries.
+    #
+    # Paragraphs are packed into a chunk while the result stays within
+    # CHUNK_SIZE; when the next piece does not fit, the chunk is closed and the
+    # following chunk starts with the last CHUNK_OVERLAP characters of the
+    # closed one (so a chunk may exceed CHUNK_SIZE by the overlap). Paragraphs
+    # longer than CHUNK_SIZE are first cut up by hard_split.
+    #
+    # Returns an Array of frozen Strings ([] for blank input).
+    def chunk_text(text)
+      return [] if text.blank?
+
+      finished = []
+      buffer = +""
+
+      text.split(/\n\s*\n/).each do |raw_paragraph|
+        paragraph = raw_paragraph.strip
+        next if paragraph.empty?
+
+        pieces = paragraph.length > CHUNK_SIZE ? hard_split(paragraph) : [ paragraph ]
+
+        pieces.each do |piece|
+          if buffer.empty?
+            buffer << piece
+            next
+          end
+
+          if (buffer.length + piece.length + 2) > CHUNK_SIZE
+            # Close the current chunk and seed the next one with its tail.
+            finished << buffer.freeze
+            tail = buffer.last(CHUNK_OVERLAP)
+            buffer = +""
+            buffer << tail << "\n\n" << piece
+          else
+            buffer << "\n\n" << piece
+          end
+        end
+      end
+
+      finished << buffer.freeze unless buffer.empty?
+      finished
+    end
+
+    # Hard-split a single long string into overlapping fixed-size slices.
+    #
+    # Consecutive slices start (size - overlap) characters apart, so each slice
+    # repeats the last `overlap` characters of the previous one. Splitting stops
+    # as soon as a slice reaches the end of the text, so no trailing slice made
+    # up solely of already-covered overlap is emitted.
+    #
+    # text    - the String to split
+    # size    - maximum slice length in characters (defaults to CHUNK_SIZE)
+    # overlap - characters shared by neighbouring slices (defaults to CHUNK_OVERLAP)
+    #
+    # Returns an Array of Strings (empty for an empty text).
+    # Raises ArgumentError if overlap is not smaller than size, which would
+    # otherwise loop forever.
+    def hard_split(text, size: CHUNK_SIZE, overlap: CHUNK_OVERLAP)
+      step = size - overlap
+      raise ArgumentError, "overlap must be smaller than size" unless step.positive?
+
+      slices = []
+      offset = 0
+      while offset < text.length
+        slices << text[offset, size]
+        # This slice already covers the tail of the text; a further slice would
+        # contain nothing but repeated overlap.
+        break if offset + size >= text.length
+        offset += step
+      end
+      slices
+    end
+
+    # Embed a single text string and return its vector (Array of numbers).
+    #
+    # Posts { model:, input: } to the "embeddings" endpoint of embedding_client
+    # and reads the first entry of the response's "data" array.
+    #
+    # Raises VectorStore::Error when the response is not a Hash, has no "data"
+    # array, or the first entry carries no non-empty "embedding" array.
+    def embed(text)
+      response = embedding_client.post("embeddings") do |req|
+        req.body = {
+          model: embedding_model,
+          input: text
+        }
+      end
+
+      data = response.body
+      first = data["data"].first if data.is_a?(Hash) && data["data"].is_a?(Array)
+      vector = first["embedding"] if first.is_a?(Hash)
+
+      # An empty "data" list or a missing embedding is a provider failure, not
+      # something to surface later as a NoMethodError on nil.
+      unless vector.is_a?(Array) && !vector.empty?
+        raise VectorStore::Error, "Embedding request failed: #{data}"
+      end
+
+      vector
+    end
+
+    # Embed many texts, sending them in groups of EMBED_BATCH_SIZE.
+    #
+    # Returns one vector per input text, in input order. Entries of each
+    # response are ordered by their "index" field (falling back to response
+    # position when the field is absent).
+    #
+    # Raises VectorStore::Error when a response is malformed or does not
+    # contain exactly one embedding per input of the batch; without that check
+    # texts and vectors would silently fall out of alignment.
+    def embed_batch(texts)
+      vectors = []
+
+      texts.each_slice(EMBED_BATCH_SIZE) do |batch|
+        response = embedding_client.post("embeddings") do |req|
+          req.body = {
+            model: embedding_model,
+            input: batch
+          }
+        end
+
+        data = response.body
+        unless data.is_a?(Hash) && data["data"].is_a?(Array)
+          raise VectorStore::Error, "Batch embedding request failed: #{data}"
+        end
+
+        # Sort by index to preserve input order.
+        embeddings = data["data"].each_with_index
+          .sort_by { |entry, position| entry["index"] || position }
+          .map { |entry, _position| entry["embedding"] }
+
+        usable = embeddings.compact.size
+        unless embeddings.size == batch.size && usable == batch.size
+          raise VectorStore::Error,
+                "Batch embedding returned #{usable} embeddings for #{batch.size} inputs"
+        end
+
+        vectors.concat(embeddings)
+      end
+
+      vectors
+    end
+
+    # Memoized Faraday connection to the embeddings endpoint: JSON request and
+    # response handling, 120s request timeout, 10s connect timeout, and a
+    # Bearer Authorization header when an access token is configured.
+    def embedding_client
+      return @embedding_client if @embedding_client
+
+      @embedding_client = Faraday.new(url: embedding_uri_base) do |conn|
+        conn.request :json
+        conn.response :json
+
+        token = embedding_access_token
+        conn.headers["Authorization"] = "Bearer #{token}" if token.present?
+
+        conn.options.timeout = 120
+        conn.options.open_timeout = 10
+      end
+    end
+
+    # Embedding model name: EMBEDDING_MODEL, or "nomic-embed-text" when unset.
+    def embedding_model
+      ENV.fetch("EMBEDDING_MODEL") { "nomic-embed-text" }
+    end
+
+    # Embedding vector size: EMBEDDING_DIMENSIONS parsed with to_i, or 1024
+    # when the variable is unset.
+    def embedding_dimensions
+      configured = ENV["EMBEDDING_DIMENSIONS"] || "1024"
+      configured.to_i
+    end
+
+    # Base URL of the embeddings API: the first non-blank of EMBEDDING_URI_BASE
+    # and OPENAI_URI_BASE, otherwise the public OpenAI endpoint.
+    def embedding_uri_base
+      configured = [ ENV["EMBEDDING_URI_BASE"], ENV["OPENAI_URI_BASE"] ].filter_map(&:presence)
+      configured.first || "https://api.openai.com/v1/"
+    end
+
+    # API token for the embeddings endpoint: the first non-blank of
+    # EMBEDDING_ACCESS_TOKEN and OPENAI_ACCESS_TOKEN, or nil when neither is set.
+    def embedding_access_token
+      [ ENV["EMBEDDING_ACCESS_TOKEN"], ENV["OPENAI_ACCESS_TOKEN"] ].filter_map(&:presence).first
+    end
+end
--- a/app/models/vector_store/pgvector.rb
+++ b/app/models/vector_store/pgvector.rb
@@ -2,88 +2,137 @@
 #
 # This keeps all data on your own infrastructure — no external vector-store
 # service required. You still need an embedding provider (e.g. OpenAI, or a
-# local model served via an OpenAI-compatible endpoint) to turn text into
-# vectors before insertion and at query time.
+# local model served via an OpenAI-compatible endpoint such as Ollama) to turn
+# text into vectors before insertion and at query time.
 #
-# Requirements (not yet wired up):
-#   - PostgreSQL with the `vector` extension enabled
-#   - gem "neighbor" (for ActiveRecord integration) or raw SQL
-#   - An embedding model endpoint (EMBEDDING_MODEL_URL / EMBEDDING_MODEL_NAME)
-#   - A chunking strategy (see #chunk_file below)
-#
-# Schema sketch (for reference — migration not included):
-#
-#   create_table :vector_store_chunks do |t|
-#     t.string  :store_id,  null: false  # logical namespace
-#     t.string  :file_id,   null: false
-#     t.string  :filename
-#     t.text    :content                 # the original text chunk
-#     t.vector  :embedding, limit: 1536  # adjust dimensions to your model
-#     t.jsonb   :metadata,  default: {}
-#     t.timestamps
-#   end
-#   add_index :vector_store_chunks, :store_id
-#   add_index :vector_store_chunks, :file_id
+# Requirements:
+#   - PostgreSQL with the `vector` extension enabled (use pgvector/pgvector Docker image)
+#   - An embedding model endpoint (EMBEDDING_URI_BASE / EMBEDDING_MODEL)
+#   - Migration: CreateVectorStoreChunks (creates the table only when the pgvector extension is available)
 #
 class VectorStore::Pgvector < VectorStore::Base
+  include VectorStore::Embeddable
+
+  # Extensions this adapter accepts: the plain-text formats listed in
+  # VectorStore::Embeddable::TEXT_EXTENSIONS plus PDF.
+  PGVECTOR_SUPPORTED_EXTENSIONS = [ *VectorStore::Embeddable::TEXT_EXTENSIONS, ".pdf" ].uniq.freeze
+
+  # File extensions accepted by this adapter.
+  def supported_extensions
+    PGVECTOR_SUPPORTED_EXTENSIONS
+  end
+
  def create_store(name:)
+    # A store is only a logical namespace: a freshly generated UUID is
+    # returned as its id and nothing is written to the database here.
+    # `name` is accepted but not used.
    with_response do
-      # A "store" is just a logical namespace (a UUID).
-      # No external resource to create.
-      # { id: SecureRandom.uuid }
-      raise VectorStore::Error, "Pgvector adapter is not yet implemented"
+      { id: SecureRandom.uuid }
    end
  end

  def delete_store(store_id:)
    with_response do
-      # TODO: DELETE FROM vector_store_chunks WHERE store_id = ?
-      raise VectorStore::Error, "Pgvector adapter is not yet implemented"
+      # Deleting a store means deleting every chunk row filed under its id.
+      sql = "DELETE FROM vector_store_chunks WHERE store_id = $1"
+      binds = [ bind_param("store_id", store_id) ]
+
+      connection.exec_delete(sql, "VectorStore::Pgvector DeleteStore", binds)
    end
  end

  def upload_file(store_id:, file_content:, filename:)
    with_response do
-      # 1. chunk_file(file_content, filename) → array of text chunks
-      # 2. embed each chunk via the configured embedding model
-      # 3. INSERT INTO vector_store_chunks (store_id, file_id, filename, content, embedding)
-      raise VectorStore::Error, "Pgvector adapter is not yet implemented"
+      # Pipeline: extract text -> chunk -> embed -> insert one row per chunk
+      # under a freshly generated file_id, which is returned as { file_id: }.
+      # Raises VectorStore::Error when no text or chunks can be produced, or
+      # when the embedder does not return one vector per chunk.
+      text = extract_text(file_content, filename)
+      raise VectorStore::Error, "Could not extract text from #{filename}" if text.blank?
+
+      chunks = chunk_text(text)
+      raise VectorStore::Error, "No chunks produced from #{filename}" if chunks.empty?
+
+      vectors = embed_batch(chunks)
+      # Every chunk needs its own vector; check before touching the database
+      # so a short or malformed embedding response fails with a clear error
+      # instead of a NoMethodError on nil inside the insert loop.
+      usable = vectors.count { |vector| vector.is_a?(Array) }
+      unless vectors.size == chunks.size && usable == chunks.size
+        raise VectorStore::Error,
+              "Embedding provider returned #{usable} usable vectors for #{chunks.size} chunks of #{filename}"
+      end
+
+      file_id = SecureRandom.uuid
+      now = Time.current
+
+      # Single transaction: a failure part-way through leaves no partial file.
+      connection.transaction do
+        chunks.zip(vectors).each_with_index do |(chunk_content, vector), index|
+          # pgvector text representation of a vector: "[v1,v2,...]"
+          embedding_literal = "[#{vector.join(',')}]"
+
+          connection.exec_insert(
+            <<~SQL,
+              INSERT INTO vector_store_chunks
+                (id, store_id, file_id, filename, chunk_index, content, embedding, metadata, created_at, updated_at)
+              VALUES
+                ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
+            SQL
+            "VectorStore::Pgvector InsertChunk",
+            [
+              bind_param("id", SecureRandom.uuid),
+              bind_param("store_id", store_id),
+              bind_param("file_id", file_id),
+              bind_param("filename", filename),
+              bind_param("chunk_index", index),
+              bind_param("content", chunk_content),
+              bind_param("embedding", embedding_literal, ActiveRecord::Type::String.new),
+              bind_param("metadata", "{}"),
+              bind_param("created_at", now),
+              bind_param("updated_at", now)
+            ]
+          )
+        end
+      end
+
+      { file_id: file_id }
    end
  end

  def remove_file(store_id:, file_id:)
    with_response do
-      # TODO: DELETE FROM vector_store_chunks WHERE store_id = ? AND file_id = ?
-      raise VectorStore::Error, "Pgvector adapter is not yet implemented"
+      # Drop every chunk row of the given file within the given store.
+      sql = "DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2"
+      binds = [
+        bind_param("store_id", store_id),
+        bind_param("file_id", file_id)
+      ]
+
+      connection.exec_delete(sql, "VectorStore::Pgvector RemoveFile", binds)
    end
  end

  def search(store_id:, query:, max_results: 10)
    with_response do
-      # 1. embed(query) → vector
-      # 2. SELECT content, filename, file_id,
-      #           1 - (embedding <=> query_vector) AS score
-      #    FROM   vector_store_chunks
-      #    WHERE  store_id = ?
-      #    ORDER  BY embedding <=> query_vector
-      #    LIMIT  max_results
-      raise VectorStore::Error, "Pgvector adapter is not yet implemented"
+      # Embed the query, then rank the store's chunks by the <=> distance to
+      # it; the reported score is 1 minus that distance.
+      vector_literal = "[#{embed(query).join(',')}]"
+
+      sql = <<~SQL
+        SELECT content, filename, file_id,
+               1 - (embedding <=> $1::vector) AS score
+        FROM   vector_store_chunks
+        WHERE  store_id = $2
+        ORDER  BY embedding <=> $1::vector
+        LIMIT  $3
+      SQL
+
+      binds = [
+        bind_param("embedding", vector_literal, ActiveRecord::Type::String.new),
+        bind_param("store_id", store_id),
+        bind_param("limit", max_results)
+      ]
+
+      rows = connection.exec_query(sql, "VectorStore::Pgvector Search", binds)
+
+      rows.map do |row|
+        { content: row["content"], filename: row["filename"], score: row["score"].to_f, file_id: row["file_id"] }
+      end
    end
  end

  private

-    # Placeholder: split file content into overlapping text windows.
-    # A real implementation would handle PDFs, DOCX, etc. via
-    # libraries like `pdf-reader`, `docx`, or an extraction service.
-    def chunk_file(file_content, filename)
-      # TODO: implement format-aware chunking
-      []
+    # Database connection used by this adapter's raw SQL statements.
+    def connection
+      ActiveRecord::Base.connection
    end

-    # Placeholder: call an embedding API to turn text into a vector.
-    def embed(text)
-      # TODO: call EMBEDDING_MODEL_URL or OpenAI embeddings endpoint
-      raise VectorStore::Error, "Embedding model not configured"
+    # Build a bind parameter for a raw SQL statement. When no type is given
+    # the value is wrapped with the generic ActiveModel::Type::Value.
+    def bind_param(name, value, type = nil)
+      ActiveRecord::Relation::QueryAttribute.new(name, value, type || ActiveModel::Type::Value.new)
    end
 end