# sure/test/models/vector_store/embeddable_test.rb

require "test_helper"

# Tests for the VectorStore::Embeddable concern, exercised through a bare host
# class: text extraction, chunking, and the single/batched embedding calls.
class VectorStore::EmbeddableTest < ActiveSupport::TestCase
  # Minimal includer of the concern; it defines nothing of its own.
  class EmbeddableHost
    include VectorStore::Embeddable
    # Expose private methods for testing
    public :extract_text, :chunk_text, :embed, :embed_batch
  end

  setup do
    @host = EmbeddableHost.new
  end

  # --- extract_text ---

  test "extract_text returns plain text for .txt files" do
    result = @host.extract_text("Hello world", "notes.txt")
    assert_equal "Hello world", result
  end

  test "extract_text returns content for markdown files" do
    result = @host.extract_text("# Heading\n\nBody", "readme.md")
    assert_equal "# Heading\n\nBody", result
  end

  test "extract_text returns content for code files" do
    result = @host.extract_text("def foo; end", "app.rb")
    assert_equal "def foo; end", result
  end

  test "extract_text returns nil for unsupported binary formats" do
    assert_nil @host.extract_text("\x00\x01binary", "photo.png")
    assert_nil @host.extract_text("\x00\x01binary", "archive.zip")
  end

  test "extract_text handles PDF files" do
    pdf_content = "fake pdf bytes"
    mock_page = mock("page")
    mock_page.stubs(:text).returns("Page 1 content")

    mock_reader = mock("reader")
    mock_reader.stubs(:pages).returns([ mock_page ])

    # The bytes are expected to reach PDF::Reader wrapped in a StringIO.
    PDF::Reader.expects(:new).with(instance_of(StringIO)).returns(mock_reader)

    result = @host.extract_text(pdf_content, "document.pdf")
    assert_equal "Page 1 content", result
  end

  test "extract_text returns nil when PDF extraction fails" do
    PDF::Reader.expects(:new).raises(StandardError, "corrupt pdf")

    result = @host.extract_text("bad data", "broken.pdf")
    assert_nil result
  end

  # --- chunk_text ---

  test "chunk_text returns empty array for blank text" do
    assert_equal [], @host.chunk_text("")
    assert_equal [], @host.chunk_text(nil)
  end

  test "chunk_text returns single chunk for short text" do
    text = "Short paragraph."
    chunks = @host.chunk_text(text)
    assert_equal 1, chunks.size
    assert_equal "Short paragraph.", chunks.first
  end

  test "chunk_text splits on paragraph boundaries" do
    # Create text that exceeds CHUNK_SIZE when combined
    para1 = "A" * 1200
    para2 = "B" * 1200
    text = "#{para1}\n\n#{para2}"

    chunks = @host.chunk_text(text)
    assert_equal 2, chunks.size
    assert_includes chunks.first, "A" * 1200
    assert_includes chunks.last, "B" * 1200
  end

  test "chunk_text includes overlap between chunks" do
    para1 = "A" * 1500
    para2 = "B" * 1500
    text = "#{para1}\n\n#{para2}"

    chunks = @host.chunk_text(text)
    assert_equal 2, chunks.size
    # Second chunk should start with overlap from end of first chunk
    overlap = para1.last(VectorStore::Embeddable::CHUNK_OVERLAP)
    assert chunks.last.start_with?(overlap),
      "Expected second chunk to start with the last CHUNK_OVERLAP characters of the first paragraph"
  end

  test "chunk_text keeps small paragraphs together" do
    paragraphs = Array.new(5) { |i| "Paragraph #{i} content." }
    text = paragraphs.join("\n\n")

    chunks = @host.chunk_text(text)
    assert_equal 1, chunks.size
  end

  test "chunk_text hard-splits oversized paragraphs" do
    # A single paragraph longer than CHUNK_SIZE with no paragraph breaks
    long_para = "X" * 5000
    chunks = @host.chunk_text(long_para)

    # NOTE(review): the extra 2 presumably allows for a "\n\n" joiner between
    # the overlap and the chunk body -- confirm against chunk_text.
    max_length = VectorStore::Embeddable::CHUNK_SIZE + VectorStore::Embeddable::CHUNK_OVERLAP + 2

    assert_operator chunks.size, :>, 1
    chunks.each do |chunk|
      assert_operator chunk.length, :<=, max_length, "Chunk too large: #{chunk.length} chars"
    end
  end

  # --- embed ---

  test "embed calls embedding endpoint and returns vector" do
    expected_vector = [ 0.1, 0.2, 0.3 ]
    stub_response = { "data" => [ { "embedding" => expected_vector, "index" => 0 } ] }
    stub_embedding_client(stub_response)

    result = @host.embed("test text")
    assert_equal expected_vector, result
  end

  test "embed raises on failed response" do
    error_response = { "error" => "bad request" }
    stub_embedding_client(error_response)

    assert_raises(VectorStore::Error) { @host.embed("test text") }
  end

  # --- embed_batch ---

  test "embed_batch processes texts and returns ordered vectors" do
    texts = [ "first", "second", "third" ]
    vectors = [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ]
    stub_response = {
      "data" => [
        { "embedding" => vectors[0], "index" => 0 },
        { "embedding" => vectors[1], "index" => 1 },
        { "embedding" => vectors[2], "index" => 2 }
      ]
    }
    stub_embedding_client(stub_response)

    result = @host.embed_batch(texts)
    assert_equal vectors, result
  end

  test "embed_batch handles multiple batches" do
    texts = [ "a", "b", "c" ]

    # Each stubbed response numbers its embeddings from index 0.
    batch1_response = {
      "data" => [
        { "embedding" => [ 0.1 ], "index" => 0 },
        { "embedding" => [ 0.2 ], "index" => 1 }
      ]
    }
    batch2_response = {
      "data" => [
        { "embedding" => [ 0.3 ], "index" => 0 }
      ]
    }
    stub_embedding_client(batch1_response, batch2_response)

    # Three texts with a batch size of two => two requests are expected.
    result = with_embed_batch_size(2) { @host.embed_batch(texts) }
    assert_equal [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ], result
  end

  private

    # Installs a mock client on @host (via @embedding_client) that expects
    # exactly one POST to "embeddings" per given response body, yields a
    # request stand-in to the caller's block, and returns the bodies in order,
    # each wrapped in an object responding to #body.
    def stub_embedding_client(*response_bodies)
      responses = response_bodies.map { |body| OpenStruct.new(body: body) }

      client = mock("faraday")
      client.expects(:post).with("embeddings").times(responses.size)
        .yields(mock_request)
        .returns(*responses)
      @host.instance_variable_set(:@embedding_client, client)
      client
    end

    # Temporarily redefines VectorStore::Embeddable::EMBED_BATCH_SIZE while the
    # block runs and returns the block's value. The original value is put back
    # even if the block raises; restoration is only attempted once the swap
    # itself has succeeded.
    def with_embed_batch_size(size)
      concern = VectorStore::Embeddable
      original = concern::EMBED_BATCH_SIZE
      # remove_const first so const_set does not warn about redefinition.
      concern.send(:remove_const, :EMBED_BATCH_SIZE)
      concern.const_set(:EMBED_BATCH_SIZE, size)

      begin
        yield
      ensure
        concern.send(:remove_const, :EMBED_BATCH_SIZE)
        concern.const_set(:EMBED_BATCH_SIZE, original)
      end
    end

    # Stand-in for the request object yielded to the `post` block. An
    # OpenStruct accepts assignment of any attribute; only `body` is preset.
    def mock_request
      OpenStruct.new(body: nil)
    end
end