mirror of
https://github.com/we-promise/sure.git
synced 2026-04-09 07:14:47 +00:00
* Add conditional migration for vector_store_chunks table Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector. Enables the vector extension, adds store_id/file_id indexes, and uses vector(1024) column type for embeddings. * Add VectorStore::Embeddable concern for text extraction and embedding Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is), paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars. * Implement VectorStore::Pgvector adapter with raw SQL Replaces the stub with a full implementation using ActiveRecord::Base.connection with parameterized binds. Supports create_store, delete_store, upload_file (extract+chunk+embed+insert), remove_file, and cosine-similarity search via the <=> operator. * Add registry test for pgvector adapter selection * Configure pgvector in compose.example.ai.yml Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER, EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include nomic-embed-text in Ollama's pre-loaded models. * Update pgvector docs from scaffolded to ready Document env vars, embedding model setup, pgvector Docker image requirement, and Ollama pull instructions. * Address PR review feedback - Migration: remove env guard, use pgvector_available? check so it runs on plain Postgres (CI) but creates the table on pgvector-capable servers. Add NOT NULL constraints on content/embedding/metadata, unique index on (store_id, file_id, chunk_index). - Pgvector adapter: wrap chunk inserts in a DB transaction to prevent partial file writes. Override supported_extensions to match formats that extract_text can actually parse. - Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE to avoid overflowing embedding model token limits. 
* Bump schema version to include vector_store_chunks migration CI uses db:schema:load which checks the version — without this bump, the migration is detected as pending and tests fail to start. * Update 20260316120000_create_vector_store_chunks.rb --------- Co-authored-by: sokiee <sokysrm@gmail.com>
205 lines
6.1 KiB
Ruby
205 lines
6.1 KiB
Ruby
require "test_helper"
|
|
|
|
# Exercises the helpers provided by VectorStore::Embeddable through a bare
# host object: extract_text, chunk_text, embed and embed_batch.
class VectorStore::EmbeddableTest < ActiveSupport::TestCase
  # Smallest possible includer of the concern. The four helpers under test are
  # re-declared public so the tests can call them on the instance directly.
  class EmbeddableHost
    include VectorStore::Embeddable

    public :extract_text, :chunk_text, :embed, :embed_batch
  end

  setup { @host = EmbeddableHost.new }

  # --- extract_text ---

  test "extract_text returns plain text for .txt files" do
    assert_equal "Hello world", @host.extract_text("Hello world", "notes.txt")
  end

  test "extract_text returns content for markdown files" do
    assert_equal "# Heading\n\nBody", @host.extract_text("# Heading\n\nBody", "readme.md")
  end

  test "extract_text returns content for code files" do
    assert_equal "def foo; end", @host.extract_text("def foo; end", "app.rb")
  end

  test "extract_text returns nil for unsupported binary formats" do
    [ "photo.png", "archive.zip" ].each do |filename|
      assert_nil @host.extract_text("\x00\x01binary", filename)
    end
  end

  test "extract_text handles PDF files" do
    # Fake a one-page document so no real PDF parsing happens.
    page = mock("page")
    page.stubs(:text).returns("Page 1 content")
    reader = mock("reader")
    reader.stubs(:pages).returns([ page ])

    # The reader must be constructed from a StringIO, not the raw string.
    PDF::Reader.expects(:new).with(instance_of(StringIO)).returns(reader)

    extracted = @host.extract_text("fake pdf bytes", "document.pdf")
    assert_equal "Page 1 content", extracted
  end

  test "extract_text returns nil when PDF extraction fails" do
    PDF::Reader.expects(:new).raises(StandardError, "corrupt pdf")

    assert_nil @host.extract_text("bad data", "broken.pdf")
  end

  # --- chunk_text ---

  test "chunk_text returns empty array for blank text" do
    [ "", nil ].each do |blank|
      assert_equal [], @host.chunk_text(blank)
    end
  end

  test "chunk_text returns single chunk for short text" do
    pieces = @host.chunk_text("Short paragraph.")

    assert_equal 1, pieces.size
    assert_equal "Short paragraph.", pieces.first
  end

  test "chunk_text splits on paragraph boundaries" do
    # Two paragraphs of 1200 chars each, separated by a blank line; the test
    # expects them to land in separate chunks.
    leading = "A" * 1200
    trailing = "B" * 1200

    pieces = @host.chunk_text([ leading, trailing ].join("\n\n"))

    assert_equal 2, pieces.size
    assert_includes pieces.first, leading
    assert_includes pieces.last, trailing
  end

  test "chunk_text includes overlap between chunks" do
    leading = "A" * 1500
    trailing = "B" * 1500

    pieces = @host.chunk_text([ leading, trailing ].join("\n\n"))
    assert_equal 2, pieces.size

    # The second chunk is expected to begin with the tail of the first paragraph.
    carried_over = leading.last(VectorStore::Embeddable::CHUNK_OVERLAP)
    assert pieces.last.start_with?(carried_over)
  end

  test "chunk_text keeps small paragraphs together" do
    text = 5.times.map { |i| "Paragraph #{i} content." }.join("\n\n")

    assert_equal 1, @host.chunk_text(text).size
  end

  test "chunk_text hard-splits oversized paragraphs" do
    # 5000 characters with no paragraph break anywhere.
    pieces = @host.chunk_text("X" * 5000)
    assert pieces.size > 1

    # Upper bound used by this test: one chunk plus one overlap plus 2 chars.
    ceiling = VectorStore::Embeddable::CHUNK_SIZE + VectorStore::Embeddable::CHUNK_OVERLAP + 2
    pieces.each do |piece|
      assert piece.length <= ceiling, "Chunk too large: #{piece.length} chars"
    end
  end

  # --- embed ---

  test "embed calls embedding endpoint and returns vector" do
    vector = [ 0.1, 0.2, 0.3 ]
    install_embedding_client({ "data" => [ { "embedding" => vector, "index" => 0 } ] })

    assert_equal vector, @host.embed("test text")
  end

  test "embed raises on failed response" do
    install_embedding_client({ "error" => "bad request" })

    assert_raises(VectorStore::Error) { @host.embed("test text") }
  end

  # --- embed_batch ---

  test "embed_batch processes texts and returns ordered vectors" do
    vectors = [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ]
    entries = vectors.each_with_index.map do |vector, position|
      { "embedding" => vector, "index" => position }
    end
    install_embedding_client({ "data" => entries })

    assert_equal vectors, @host.embed_batch([ "first", "second", "third" ])
  end

  test "embed_batch handles multiple batches" do
    # Temporarily shrink the batch size so three inputs need two requests.
    saved_batch_size = VectorStore::Embeddable::EMBED_BATCH_SIZE
    replace_batch_size(2)

    first_body = {
      "data" => [
        { "embedding" => [ 0.1 ], "index" => 0 },
        { "embedding" => [ 0.2 ], "index" => 1 }
      ]
    }
    # Indexes restart at 0 in the second response.
    second_body = { "data" => [ { "embedding" => [ 0.3 ], "index" => 0 } ] }
    install_embedding_client(first_body, second_body)

    assert_equal [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ], @host.embed_batch([ "a", "b", "c" ])
  ensure
    replace_batch_size(saved_batch_size)
  end

  private

    # Request stand-in handed to the block given to the client's #post.
    def mock_request
      OpenStruct.new(body: nil)
    end

    # Builds a mock client that expects one POST to "embeddings" per given
    # body, answering with those bodies in order, and stores it in the host's
    # @embedding_client instance variable.
    def install_embedding_client(*bodies)
      replies = bodies.map { |body| OpenStruct.new(body: body) }

      client = mock("faraday")
      client.expects(:post).with("embeddings").times(bodies.size)
            .yields(mock_request)
            .returns(*replies)

      @host.instance_variable_set(:@embedding_client, client)
    end

    # Redefines VectorStore::Embeddable::EMBED_BATCH_SIZE; the constant is
    # removed first so Ruby does not warn about reassignment.
    def replace_batch_size(value)
      VectorStore::Embeddable.send(:remove_const, :EMBED_BATCH_SIZE)
      VectorStore::Embeddable.const_set(:EMBED_BATCH_SIZE, value)
    end
end
|