sure/app/models/vector_store/embeddable.rb

module VectorStore::Embeddable
  extend ActiveSupport::Concern

  # Character budget for one chunk: chunk_text packs paragraphs up to this
  # length and hard_split cuts slices of at most this many characters.
  CHUNK_SIZE = 2000
  # Number of trailing characters carried over from one chunk/slice into the next.
  CHUNK_OVERLAP = 200
  # Maximum number of input texts sent in a single request by embed_batch.
  EMBED_BATCH_SIZE = 50

  # File extensions (lowercase, with leading dot) that extract_text returns as
  # plain text; compared against the downcased extension of the filename.
  TEXT_EXTENSIONS = %w[
    .txt .md .csv .json .xml .html .css
    .js .ts .py .rb .go .java .php .c .cpp .sh .tex
  ].freeze

  private

    # Dispatch by extension: PDF via PDF::Reader, plain-text types as-is.
    # Returns nil for unsupported binary formats.
    #
    # file_content - raw file data; converted with to_s for plain-text types.
    # filename     - used only to determine the extension (case-insensitive);
    #                nil is treated as having no extension.
    #
    # Plain-text content is always returned as valid UTF-8 (see
    # normalize_to_utf8). PDF content is delegated to extract_pdf_text.
    def extract_text(file_content, filename)
      ext = File.extname(filename.to_s).downcase

      case ext
      when ".pdf"
        extract_pdf_text(file_content)
      when *TEXT_EXTENSIONS
        normalize_to_utf8(file_content.to_s)
      end
    end

    # Return a valid UTF-8 copy of +text+ without mutating the argument.
    def normalize_to_utf8(text)
      if [ Encoding::UTF_8, Encoding::BINARY ].include?(text.encoding)
        # String#encode is a no-op when source and target encodings match (so
        # malformed bytes would survive), and it treats every non-ASCII byte of
        # a binary string as undefined (so valid UTF-8 uploads would be
        # mangled). Reinterpret the bytes as UTF-8 and scrub instead.
        text.dup.force_encoding(Encoding::UTF_8).scrub
      else
        text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end
    end

    # Extract the text of every page of a PDF and join the pages with a blank
    # line between them.
    #
    # file_content - the PDF data as a String (wrapped in a StringIO for the reader).
    #
    # Best-effort: any StandardError raised while reading is logged through
    # Rails.logger and nil is returned instead.
    def extract_pdf_text(file_content)
      pdf = PDF::Reader.new(StringIO.new(file_content))
      page_texts = pdf.pages.map { |page| page.text }
      page_texts.join("\n\n")
    rescue => error
      Rails.logger.error("VectorStore::Embeddable PDF extraction error: #{error.message}")
      nil
    end

    # Split text on paragraph boundaries (~2000 char chunks, ~200 char overlap).
    # Paragraphs longer than CHUNK_SIZE are hard-split to avoid overflowing
    # embedding model token limits.
    #
    # Returns an Array of frozen Strings; empty when +text+ is blank.
    #
    # Paragraphs are packed into the current chunk (joined by a blank line)
    # while they fit within CHUNK_SIZE. When one does not fit, the chunk is
    # closed and the next chunk starts with the last CHUNK_OVERLAP characters
    # of the closed one. Continuation slices produced by hard_split are the
    # exception: each already begins with the tail of the previous slice, so
    # prepending the overlap again would duplicate that text and push the
    # chunk past CHUNK_SIZE.
    def chunk_text(text)
      return [] if text.blank?

      chunks = []
      current_chunk = +""

      text.split(/\n\s*\n/).each do |para|
        para = para.strip
        next if para.empty?

        # Hard-split oversized paragraphs into CHUNK_SIZE slices with overlap
        slices = para.length > CHUNK_SIZE ? hard_split(para) : [ para ]

        slices.each_with_index do |slice, index|
          if current_chunk.empty?
            current_chunk << slice
          elsif (current_chunk.length + slice.length + 2) <= CHUNK_SIZE
            current_chunk << "\n\n" << slice
          else
            chunks << current_chunk.freeze
            current_chunk = if index.zero?
              +"" << current_chunk.last(CHUNK_OVERLAP) << "\n\n" << slice
            else
              # Slice 1..n of a hard-split paragraph already carries the overlap.
              slice.dup
            end
          end
        end
      end

      chunks << current_chunk.freeze unless current_chunk.empty?
      chunks
    end

    # Hard-split a single long string into CHUNK_SIZE slices with CHUNK_OVERLAP.
    #
    # text    - String to split.
    # size    - maximum slice length in characters (defaults to CHUNK_SIZE).
    # overlap - characters shared by consecutive slices (defaults to
    #           CHUNK_OVERLAP); must satisfy 0 <= overlap < size.
    #
    # Returns an Array of Strings; empty for empty +text+. Stops as soon as a
    # slice reaches the end of +text+, so no trailing slice made up solely of
    # already-covered overlap is produced.
    # Raises ArgumentError when size/overlap would not advance through the text.
    def hard_split(text, size: CHUNK_SIZE, overlap: CHUNK_OVERLAP)
      unless size.positive? && overlap >= 0 && overlap < size
        raise ArgumentError, "overlap (#{overlap}) must be >= 0 and smaller than size (#{size})"
      end

      step = size - overlap
      slices = []
      offset = 0
      while offset < text.length
        slices << text[offset, size]
        # This slice already covers the end of the text; another one would
        # only repeat its tail.
        break if offset + size >= text.length

        offset += step
      end
      slices
    end

    # Embed a single text string → vector array.
    #
    # Posts +text+ with the configured model to the "embeddings" path of
    # embedding_client and returns the "embedding" value of the first entry in
    # the response's "data" array.
    #
    # Raises VectorStore::Error when the response body is not a Hash, has no
    # "data" array, the array is empty, or its first entry has no embedding.
    def embed(text)
      response = embedding_client.post("embeddings") do |req|
        req.body = {
          model: embedding_model,
          input: text
        }
      end

      data = response.body
      entries = data["data"] if data.is_a?(Hash)
      first_entry = entries.first if entries.is_a?(Array)
      vector = first_entry["embedding"] if first_entry.is_a?(Hash)

      # An empty or malformed "data" payload is a failed request too; surface
      # it as VectorStore::Error rather than a NoMethodError or a nil vector.
      raise VectorStore::Error, "Embedding request failed: #{data}" if vector.nil?

      vector
    end

    # Batch embed, processing in groups of EMBED_BATCH_SIZE.
    #
    # texts - Array of Strings to embed.
    #
    # Returns an Array with one embedding per input, in input order; empty for
    # empty input.
    #
    # Raises VectorStore::Error when a response body is not a Hash with a
    # "data" array, or when the number of returned entries differs from the
    # number of inputs sent (which would otherwise misalign vectors and texts).
    def embed_batch(texts)
      vectors = []

      texts.each_slice(EMBED_BATCH_SIZE) do |batch|
        response = embedding_client.post("embeddings") do |req|
          req.body = {
            model: embedding_model,
            input: batch
          }
        end

        data = response.body
        entries = data["data"] if data.is_a?(Hash)
        raise VectorStore::Error, "Batch embedding request failed: #{data}" unless entries.is_a?(Array)

        unless entries.size == batch.size
          raise VectorStore::Error,
                "Batch embedding request failed: expected #{batch.size} embeddings, got #{entries.size}"
        end

        # Sort by the reported index to preserve input order; entries without
        # an index keep their position in the response.
        ordered = entries.each_with_index.sort_by { |entry, position| entry["index"] || position }
        vectors.concat(ordered.map { |entry, _position| entry["embedding"] })
      end

      vectors
    end

    # Faraday connection used for embedding requests, built on first use and
    # memoized in @embedding_client.
    #
    # Configured with embedding_uri_base as its URL, JSON request and response
    # middleware, an "Authorization: Bearer ..." header when an access token is
    # present, timeout 120 and open_timeout 10.
    def embedding_client
      return @embedding_client if @embedding_client

      token = embedding_access_token
      @embedding_client = Faraday.new(url: embedding_uri_base) do |conn|
        conn.request :json
        conn.response :json
        conn.headers["Authorization"] = "Bearer #{token}" if token.present?
        conn.options.open_timeout = 10
        conn.options.timeout = 120
      end
    end

    # Name of the embedding model sent with each request: the EMBEDDING_MODEL
    # environment variable, or "nomic-embed-text" when it is unset or blank
    # (an empty value would otherwise be sent to the API as the model name).
    def embedding_model
      ENV["EMBEDDING_MODEL"].presence || "nomic-embed-text"
    end

    # Embedding vector size as an Integer: EMBEDDING_DIMENSIONS converted with
    # to_i, or 1024 when the variable is unset or blank (a blank value would
    # otherwise convert to 0).
    def embedding_dimensions
      (ENV["EMBEDDING_DIMENSIONS"].presence || "1024").to_i
    end

    # Base URL for embedding requests: the first non-blank value among
    # EMBEDDING_URI_BASE and OPENAI_URI_BASE, falling back to the public
    # OpenAI v1 endpoint.
    def embedding_uri_base
      configured = %w[EMBEDDING_URI_BASE OPENAI_URI_BASE].map { |key| ENV[key].presence }.compact.first
      configured || "https://api.openai.com/v1/"
    end

    # Access token for embedding requests: EMBEDDING_ACCESS_TOKEN when it is
    # non-blank, otherwise OPENAI_ACCESS_TOKEN; nil when neither is set.
    def embedding_access_token
      dedicated = ENV["EMBEDDING_ACCESS_TOKEN"].presence
      return dedicated if dedicated

      ENV["OPENAI_ACCESS_TOKEN"].presence
    end
end