Files
sure/app/models/eval/langfuse/dataset_exporter.rb
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove Data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

116 lines
3.0 KiB
Ruby

# Exports an eval dataset and all of its samples to Langfuse through
# Eval::Langfuse::Client: the dataset is created first, then every sample is
# uploaded as a dataset item.
class Eval::Langfuse::DatasetExporter
  # Number of exported samples between two progress messages.
  PROGRESS_INTERVAL = 25
  # Seconds to pause after each uploaded item to avoid rate limiting
  # (Langfuse free tier has limits).
  RATE_LIMIT_DELAY = 0.1
  private_constant :PROGRESS_INTERVAL, :RATE_LIMIT_DELAY

  attr_reader :dataset, :client

  # @param dataset [Object] the eval dataset to export; must respond to
  #   +name+, +description+, +eval_type+, +version+, +samples+ and
  #   +sample_count+
  # @param client [Object, nil] Langfuse API client; a new
  #   Eval::Langfuse::Client is built when none is given
  def initialize(dataset, client: nil)
    @dataset = dataset
    @client = client || Eval::Langfuse::Client.new
  end

  # Creates the dataset in Langfuse and uploads every sample as an item.
  #
  # @return [Hash] +:dataset_name+ (the Langfuse dataset name) and
  #   +:items_exported+ (number of samples uploaded)
  # @raise [Eval::Langfuse::Client::ApiError] when dataset creation fails
  #   with any status other than 409
  def export
    Rails.logger.info("[Langfuse] Exporting dataset '#{dataset.name}' to Langfuse...")

    # Create or update dataset in Langfuse
    create_langfuse_dataset

    # Export all samples as dataset items
    exported_count = export_samples

    Rails.logger.info("[Langfuse] Exported #{exported_count} items to dataset '#{langfuse_dataset_name}'")

    {
      dataset_name: langfuse_dataset_name,
      items_exported: exported_count
    }
  end

  private

    # Name of the dataset on the Langfuse side: the local name prefixed
    # with "eval_" so the naming convention stays consistent.
    def langfuse_dataset_name
      "eval_#{dataset.name}"
    end

    # Creates the Langfuse dataset together with descriptive metadata.
    # A 409 response is tolerated (the dataset already exists); any other
    # API error is re-raised.
    def create_langfuse_dataset
      client.create_dataset(
        name: langfuse_dataset_name,
        # `presence` makes blank descriptions (nil, "" or whitespace only)
        # fall back to the generated default instead of being sent as-is.
        description: dataset.description.presence || "Evaluation dataset: #{dataset.name}",
        metadata: {
          eval_type: dataset.eval_type,
          version: dataset.version,
          source: "sure_eval_framework",
          exported_at: Time.current.iso8601
        }
      )
    rescue Eval::Langfuse::Client::ApiError => e
      # Dataset might already exist (409 conflict), which is fine
      raise unless e.status == 409

      Rails.logger.info("[Langfuse] Dataset '#{langfuse_dataset_name}' already exists, updating items...")
    end

    # Uploads every sample of the dataset, reporting progress every
    # PROGRESS_INTERVAL items and pausing RATE_LIMIT_DELAY seconds after
    # each one.
    #
    # @return [Integer] number of samples uploaded
    def export_samples
      count = 0

      dataset.samples.find_each do |sample|
        export_sample(sample)
        count += 1

        report_progress(count) if (count % PROGRESS_INTERVAL).zero?

        # Small delay to avoid rate limiting (Langfuse free tier has limits)
        sleep(RATE_LIMIT_DELAY)
      end

      count
    end

    # Writes one progress message to the Rails log and to stdout. The
    # stdout line ends with a carriage return so the next progress message
    # overwrites it in place.
    def report_progress(count)
      progress = "Exported #{count}/#{dataset.sample_count} items..."

      Rails.logger.info("[Langfuse] #{progress}")
      print " #{progress}\r"
    end

    # Uploads a single sample as a Langfuse dataset item.
    def export_sample(sample)
      client.create_dataset_item(
        dataset_name: langfuse_dataset_name,
        id: sample.id, # Use the same ID for idempotency
        input: build_input(sample),
        expected_output: build_expected_output(sample),
        metadata: build_metadata(sample)
      )
    end

    # Builds the item input according to the dataset's eval type. Eval
    # types not listed here fall back to the sample's raw +input_data+.
    def build_input(sample)
      case dataset.eval_type
      when "categorization"
        {
          transaction: sample.input_data,
          categories: sample.categories_context
        }
      when "merchant_detection"
        {
          transaction: sample.input_data,
          merchants: sample.merchants_context
        }
      when "chat"
        {
          prompt: sample.chat_prompt,
          mock_data: sample.mock_data
        }
      else
        sample.input_data
      end
    end

    # The expected output is forwarded unchanged from the sample.
    def build_expected_output(sample)
      sample.expected_output
    end

    # Standard per-item metadata with the sample's own metadata (if any)
    # merged on top of it.
    def build_metadata(sample)
      {
        difficulty: sample.difficulty,
        tags: sample.tags,
        eval_type: dataset.eval_type,
        sample_id: sample.id
      }.merge(sample.metadata || {})
    end
end