mirror of
https://github.com/we-promise/sure.git
synced 2026-04-08 14:54:49 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
116 lines
3.0 KiB
Ruby
116 lines
3.0 KiB
Ruby
# Exports an eval dataset and all of its samples to Langfuse.
#
# The Langfuse dataset is named "eval_<dataset.name>"; every sample is sent as
# one dataset item whose id is the sample's own id.
#
# Usage:
#   Eval::Langfuse::DatasetExporter.new(dataset).export
#   # => { dataset_name: "eval_<name>", items_exported: <count> }
class Eval::Langfuse::DatasetExporter
  # Default pause (seconds) between item uploads. The original author added it
  # to stay under Langfuse free-tier rate limits.
  DEFAULT_ITEM_DELAY = 0.1

  # A progress message is emitted each time this many samples have been sent.
  PROGRESS_INTERVAL = 25

  attr_reader :dataset, :client

  # @param dataset the eval dataset to export; must respond to name,
  #   description, eval_type, version, samples and sample_count
  # @param client [Eval::Langfuse::Client, nil] API client; a new one is built
  #   when omitted
  # @param item_delay [Numeric, nil] seconds to pause after each item upload;
  #   nil, zero or a negative value disables the pause
  def initialize(dataset, client: nil, item_delay: DEFAULT_ITEM_DELAY)
    @dataset = dataset
    @client = client || Eval::Langfuse::Client.new
    @item_delay = item_delay
  end

  # Creates the dataset in Langfuse (tolerating "already exists") and uploads
  # every sample as a dataset item.
  #
  # @return [Hash] { dataset_name: String, items_exported: Integer }
  # @raise [Eval::Langfuse::Client::ApiError] when dataset creation fails with
  #   any status other than 409; errors raised while uploading items are not
  #   rescued here and propagate to the caller
  def export
    Rails.logger.info("[Langfuse] Exporting dataset '#{dataset.name}' to Langfuse...")

    # Create or update dataset in Langfuse
    create_langfuse_dataset

    # Export all samples as dataset items
    exported_count = export_samples

    Rails.logger.info("[Langfuse] Exported #{exported_count} items to dataset '#{langfuse_dataset_name}'")

    {
      dataset_name: langfuse_dataset_name,
      items_exported: exported_count
    }
  end

  private

    attr_reader :item_delay

    # Name used for the dataset on the Langfuse side (consistent "eval_" prefix).
    def langfuse_dataset_name
      "eval_#{dataset.name}"
    end

    # Asks the client to create the dataset. A 409 response is treated as
    # "dataset already exists" and only logged; any other ApiError is re-raised.
    def create_langfuse_dataset
      client.create_dataset(
        name: langfuse_dataset_name,
        description: dataset.description || "Evaluation dataset: #{dataset.name}",
        metadata: {
          eval_type: dataset.eval_type,
          version: dataset.version,
          source: "sure_eval_framework",
          exported_at: Time.current.iso8601
        }
      )
    rescue Eval::Langfuse::Client::ApiError => e
      # Dataset might already exist (409 conflict), which is fine
      raise unless e.status == 409

      Rails.logger.info("[Langfuse] Dataset '#{langfuse_dataset_name}' already exists, updating items...")
    end

    # Uploads every sample of the dataset, reporting progress periodically and
    # pausing between uploads.
    #
    # @return [Integer] number of samples uploaded
    def export_samples
      count = 0

      dataset.samples.find_each do |sample|
        export_sample(sample)
        count += 1

        report_progress(count) if (count % PROGRESS_INTERVAL).zero?
        throttle
      end

      count
    end

    # Writes the progress line to the Rails log and to stdout. The stdout line
    # ends with "\r" (no newline), so successive reports overwrite each other on
    # a terminal.
    def report_progress(count)
      total = dataset.sample_count

      Rails.logger.info("[Langfuse] Exported #{count}/#{total} items...")
      print " Exported #{count}/#{total} items...\r"
    end

    # Small delay to avoid rate limiting (Langfuse free tier has limits).
    # Skipped entirely when no positive delay is configured.
    def throttle
      sleep(item_delay) if item_delay&.positive?
    end

    # Sends a single sample as a dataset item.
    def export_sample(sample)
      client.create_dataset_item(
        dataset_name: langfuse_dataset_name,
        # Reuse the sample id so a re-export addresses the same item.
        # NOTE(review): idempotency relies on Langfuse upserting by id - confirm.
        id: sample.id,
        input: build_input(sample),
        expected_output: build_expected_output(sample),
        metadata: build_metadata(sample)
      )
    end

    # Builds the item input payload. The shape depends on the dataset's
    # eval_type; unknown types fall back to the sample's raw input_data.
    def build_input(sample)
      case dataset.eval_type
      when "categorization"
        {
          transaction: sample.input_data,
          categories: sample.categories_context
        }
      when "merchant_detection"
        {
          transaction: sample.input_data,
          merchants: sample.merchants_context
        }
      when "chat"
        {
          prompt: sample.chat_prompt,
          mock_data: sample.mock_data
        }
      else
        sample.input_data
      end
    end

    # The expected output is passed through unchanged.
    def build_expected_output(sample)
      sample.expected_output
    end

    # Builds the item metadata. Entries from sample.metadata are merged last, so
    # on an identical key they replace the values set here.
    # NOTE(review): the keys here are symbols; if sample.metadata carries string
    # keys they will sit alongside rather than replace these - verify how the
    # client serializes the hash.
    def build_metadata(sample)
      {
        difficulty: sample.difficulty,
        tags: sample.tags,
        eval_type: dataset.eval_type,
        sample_id: sample.id
      }.merge(sample.metadata || {})
    end
end
|