# Source: sure/lib/tasks/evals.rake
# Snapshot: 2025-12-20 00:08:10 +00:00 (762 lines, 22 KiB, Ruby)

namespace :evals do
# Prints every stored evaluation dataset, grouped by eval type, with its
# version, sample count, active/inactive flag and (when present) description.
# Prints an import hint instead when no datasets exist.
desc "List all evaluation datasets"
task list_datasets: :environment do
  all_datasets = Eval::Dataset.order(:eval_type, :name)

  if all_datasets.empty?
    puts "No datasets found. Import a dataset with: rake evals:import_dataset[path/to/file.yml]"
    next
  end

  rule = "=" * 80
  puts rule
  puts "Available Evaluation Datasets"
  puts rule
  puts

  all_datasets.group_by(&:eval_type).each do |eval_type, members|
    puts "#{eval_type.titleize}:"
    puts "-" * 40

    members.each do |ds|
      state = ds.active ? "active" : "inactive"
      puts " #{ds.name} (v#{ds.version}) - #{ds.sample_count} samples [#{state}]"
      puts " #{ds.description}" if ds.description.present?
    end

    puts
  end
end
# Imports an evaluation dataset from a YAML file and prints a short summary.
#
# The path is taken from the task argument, falling back to the FILE
# environment variable. Exits with status 1 when no path is given or the
# file does not exist.
desc "Import dataset from YAML file"
task :import_dataset, [ :file_path ] => :environment do |_t, args|
  yaml_path = args[:file_path] || ENV["FILE"]

  if yaml_path.blank?
    puts "Usage: rake evals:import_dataset[path/to/file.yml]"
    puts " or: FILE=path/to/file.yml rake evals:import_dataset"
    exit 1
  end

  unless File.exist?(yaml_path)
    puts "Error: File not found: #{yaml_path}"
    exit 1
  end

  puts "Importing dataset from #{yaml_path}..."
  imported = Eval::Dataset.import_from_yaml(yaml_path)

  puts "Successfully imported dataset:"
  puts " Name: #{imported.name}"
  puts " Type: #{imported.eval_type}"
  puts " Version: #{imported.version}"
  puts " Samples: #{imported.sample_count}"

  # Render the per-difficulty counts as "level=count" pairs.
  difficulty_pairs = imported.statistics[:by_difficulty].map { |level, total| "#{level}=#{total}" }
  puts " By difficulty: #{difficulty_pairs.join(', ')}"
end
# Runs one evaluation of a dataset against a single model and prints the
# resulting metrics.
#
# Arguments (each falls back to an environment variable):
#   dataset_name - dataset to evaluate (DATASET); required
#   model        - model identifier (MODEL); defaults to "gpt-4.1"
# PROVIDER selects the provider and defaults to "openai".
#
# Exits with status 1 when the dataset name is missing or unknown, or when
# running/reporting raises. Set DEBUG to print the first five backtrace
# lines on failure.
desc "Run evaluation against a model"
task :run, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV.fetch("MODEL", "gpt-4.1")
  provider = ENV.fetch("PROVIDER", "openai")

  if dataset_name.blank?
    puts "Usage: rake evals:run[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4 rake evals:run"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)
  unless dataset
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |available| puts " - #{available}" }
    exit 1
  end

  timestamp = Time.current.strftime("%Y%m%d_%H%M%S")
  run_name = "#{dataset_name}_#{model}_#{timestamp}"
  rule = "=" * 80

  puts rule
  puts "Starting Evaluation Run"
  puts rule
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts " Run Name: #{run_name}"
  puts

  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: provider,
    model: model,
    name: run_name,
    status: "pending"
  )
  runner = dataset.runner_class.new(eval_run)

  puts "Running evaluation..."
  started_at = Time.current

  begin
    outcome = runner.run
    elapsed = (Time.current - started_at).round(1)

    puts
    puts rule
    puts "Evaluation Complete"
    puts rule
    puts " Status: #{outcome.status}"
    puts " Duration: #{elapsed}s"
    puts " Run ID: #{outcome.id}"
    puts
    puts "Metrics:"
    outcome.metrics.each do |key, value|
      # Nested (Hash) metrics are left out of the flat summary.
      puts " #{key}: #{format_metric_value(value)}" unless value.is_a?(Hash)
    end

    # Show difficulty breakdown if available
    if outcome.metrics["by_difficulty"].present?
      puts
      puts "By Difficulty:"
      outcome.metrics["by_difficulty"].each do |level, stats|
        puts " #{level}: #{stats['accuracy']}% accuracy (#{stats['correct']}/#{stats['count']})"
      end
    end
  rescue => e
    puts
    puts "Evaluation FAILED: #{e.message}"
    puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
# Evaluates several models against one dataset and prints a comparison.
#
# Arguments / environment:
#   dataset_name - dataset to evaluate (or DATASET); required
#   MODELS       - comma-separated model list; defaults to "gpt-4.1,gpt-4o-mini"
#   PROVIDER     - provider name; defaults to "openai"
#   CSV          - when set, the comparison is also exported to this path
#
# Exits with status 1 when the dataset name is missing or unknown, or when
# MODELS contains no usable model name.
desc "Compare multiple models on a dataset"
task :compare, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  # Drop blank entries (e.g. "a,,b" or a trailing comma) so no run is created
  # with an empty model name.
  models = (ENV["MODELS"] || "gpt-4.1,gpt-4o-mini").split(",").map(&:strip).reject(&:empty?)
  provider = ENV["PROVIDER"] || "openai"

  if dataset_name.blank? || models.empty?
    puts "Usage: MODELS=model1,model2 rake evals:compare[dataset_name]"
    exit 1
  end

  # Same friendly "not found" handling as evals:run, instead of letting
  # find_by! surface a raw RecordNotFound.
  dataset = Eval::Dataset.find_by(name: dataset_name)
  if dataset.nil?
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
    exit 1
  end

  puts "=" * 80
  puts "Model Comparison"
  puts "=" * 80
  puts " Dataset: #{dataset.name}"
  puts " Models: #{models.join(', ')}"
  puts

  # One Eval::Run per model; the runner's return values feed the reporter.
  runs = models.map do |model|
    puts "Running evaluation for #{model}..."
    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: provider,
      model: model,
      name: "compare_#{model}_#{Time.current.to_i}",
      status: "pending"
    )
    dataset.runner_class.new(eval_run).run
  end

  puts
  puts "=" * 80
  puts "Comparison Results"
  puts "=" * 80
  puts

  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table

  summary = reporter.summary
  if summary.present?
    puts
    puts "Recommendations:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
    puts
    puts " #{summary[:recommendation]}"
  end

  # Export to CSV if requested
  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
# Prints a comparison report for the given runs.
#
# Run IDs come from the task arguments (`evals:report[id1,id2]`) or, when
# none are given, from the comma-separated RUN_IDS environment variable.
# With no IDs at all, the five most recently created completed runs are used.
# Set CSV to also export the report to that path.
#
# Exits with status 1 when no matching runs are found.
desc "Generate report for specific runs"
task :report, [ :run_ids ] => :environment do |_t, args|
  # Rake splits bracket arguments on commas: for `evals:report[a,b]` only "a"
  # is bound to :run_ids and "b" lands in args.extras. Collect both so every
  # ID passed on the command line is honoured.
  requested = [ args[:run_ids], *args.extras ].compact
  requested = ENV["RUN_IDS"].to_s.split(",") if requested.empty?
  run_ids = requested.flat_map { |id| id.to_s.split(",") }.map(&:strip).reject(&:empty?)

  runs = if run_ids.any?
    Eval::Run.where(id: run_ids)
  else
    Eval::Run.completed.order(created_at: :desc).limit(5)
  end

  if runs.empty?
    puts "No runs found."
    exit 1
  end

  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table

  summary = reporter.summary
  if summary.present?
    puts
    puts "Summary:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
  end

  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
# Sends a single auto-categorization request through the OpenAI provider to
# confirm it is configured and reachable. Exits with status 1 when the
# provider is missing or the request is not successful.
desc "Quick smoke test to verify provider configuration"
task smoke_test: :environment do
  puts "Running smoke test..."

  openai = Provider::Registry.get_provider(:openai)
  unless openai
    puts "FAIL: OpenAI provider not configured"
    puts "Set OPENAI_ACCESS_TOKEN environment variable or configure in settings"
    exit 1
  end

  puts " Provider: #{openai.provider_name}"
  puts " Model: #{openai.instance_variable_get(:@default_model)}"

  # Test with a single categorization sample
  sample_transactions = [
    { id: "test", amount: 10, classification: "expense", description: "McDonalds" }
  ]
  sample_categories = [
    { id: "1", name: "Food & Drink", classification: "expense" }
  ]
  response = openai.auto_categorize(
    transactions: sample_transactions,
    user_categories: sample_categories
  )

  unless response.success?
    puts "FAIL: #{response.error.message}"
    exit 1
  end

  category = response.data.first&.category_name
  puts " Test result: #{category || 'null'}"
  puts
  puts "PASS: Provider is working correctly"
end
# CI gate: runs a fresh evaluation and fails (exit 1) when accuracy regresses
# against the previous completed run for the same model, or falls below an
# absolute threshold.
#
# Environment:
#   EVAL_DATASET   - dataset name (default "categorization_golden_v1")
#   EVAL_MODEL     - model identifier (default "gpt-4.1-mini")
#   EVAL_THRESHOLD - minimum acceptable accuracy in percent (default 80)
#
# When the dataset does not exist the check is skipped with exit status 0.
desc "Run CI regression test"
task ci_regression: :environment do
  dataset_name = ENV["EVAL_DATASET"] || "categorization_golden_v1"
  model = ENV["EVAL_MODEL"] || "gpt-4.1-mini"
  threshold = (ENV["EVAL_THRESHOLD"] || "80").to_f
  # Largest tolerated drop versus the baseline, in percentage points.
  max_drop = 5

  dataset = Eval::Dataset.find_by(name: dataset_name)
  unless dataset
    puts "Dataset '#{dataset_name}' not found. Skipping CI regression test."
    exit 0
  end

  # Get baseline from last successful run. This lookup happens before the new
  # run is created, so the new run is never selected as its own baseline.
  baseline_run = dataset.runs.completed.for_model(model).order(created_at: :desc).first

  # Run new evaluation
  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: "openai",
    model: model,
    name: "ci_regression_#{Time.current.to_i}",
    status: "pending"
  )
  runner = dataset.runner_class.new(eval_run)
  result = runner.run
  current_accuracy = result.metrics["accuracy"] || 0

  puts "CI Regression Test Results:"
  puts " Model: #{model}"
  puts " Current Accuracy: #{current_accuracy}%"

  if baseline_run
    baseline_accuracy = baseline_run.metrics["accuracy"] || 0
    puts " Baseline Accuracy: #{baseline_accuracy}%"
    accuracy_diff = current_accuracy - baseline_accuracy

    if accuracy_diff < -max_drop
      puts
      puts "REGRESSION DETECTED!"
      # Rounded like the "Difference" line below, so float subtraction noise
      # (e.g. 5.099999999999994) does not leak into the CI log.
      puts "Accuracy dropped by #{accuracy_diff.abs.round(2)}% (threshold: #{max_drop}%)"
      exit 1
    end

    puts " Difference: #{accuracy_diff > 0 ? '+' : ''}#{accuracy_diff.round(2)}%"
  end

  if current_accuracy < threshold
    puts
    puts "BELOW THRESHOLD!"
    puts "Accuracy #{current_accuracy}% is below required #{threshold}%"
    exit 1
  end

  puts
  puts "CI Regression Test PASSED"
end
# Prints the 20 most recently created evaluation runs, one per line:
# status marker, first 8 characters of the ID, model, dataset name, accuracy
# and creation time.
desc "List recent evaluation runs"
task list_runs: :environment do
  # Eager-load datasets: every row prints run.dataset.name, which would
  # otherwise trigger one extra query per run.
  runs = Eval::Run.includes(:dataset).order(created_at: :desc).limit(20)

  if runs.empty?
    puts "No runs found."
    next
  end

  puts "=" * 100
  puts "Recent Evaluation Runs"
  puts "=" * 100

  runs.each do |run|
    status_icon = case run.status
    when "completed" then "[OK]"
    when "failed" then "[FAIL]"
    when "running" then "[...]"
    else "[?]"
    end

    # Metrics can be blank (evals:show_run guards for that as well), so do not
    # index into nil here; show "-" when no accuracy has been recorded.
    accuracy_value = (run.metrics || {})["accuracy"]
    accuracy = accuracy_value ? "#{accuracy_value}%" : "-"

    puts "#{status_icon} #{run.id[0..7]} | #{run.model.ljust(15)} | #{run.dataset.name.ljust(25)} | #{accuracy.rjust(8)} | #{run.created_at.strftime('%Y-%m-%d %H:%M')}"
  end
end
# Prints the details of one evaluation run: identifying fields, error message
# (if any), metrics, and up to five incorrect results.
#
# The run is looked up by exact ID first, then by ID prefix. The ID comes
# from the task argument or the RUN_ID environment variable. Exits with
# status 1 when no ID is given or no run matches.
desc "Show details for a specific run"
task :show_run, [ :run_id ] => :environment do |_t, args|
  run_id = args[:run_id] || ENV["RUN_ID"]

  if run_id.blank?
    puts "Usage: rake evals:show_run[run_id]"
    exit 1
  end

  # Exact match first; otherwise treat the input as a prefix of the textual ID.
  run = Eval::Run.find_by(id: run_id)
  run ||= Eval::Run.find_by("id::text LIKE ?", "#{run_id}%")

  if run.nil?
    puts "Run not found: #{run_id}"
    exit 1
  end

  rule = "=" * 80
  puts rule
  puts "Evaluation Run Details"
  puts rule
  puts
  puts "Run ID: #{run.id}"
  puts "Name: #{run.name}"
  puts "Dataset: #{run.dataset.name}"
  puts "Model: #{run.model}"
  puts "Provider: #{run.provider}"
  puts "Status: #{run.status}"
  puts "Created: #{run.created_at}"
  puts "Duration: #{run.duration_seconds}s" if run.duration_seconds

  if run.error_message.present?
    puts
    puts "Error: #{run.error_message}"
  end

  if run.metrics.present?
    puts
    puts "Metrics:"
    run.metrics.each do |key, value|
      unless value.is_a?(Hash)
        puts " #{key}: #{format_metric_value(value)}"
        next
      end

      # Nested metrics: print the group name, then each entry beneath it.
      puts " #{key}:"
      value.each { |nested_key, nested_value| puts " #{nested_key}: #{nested_value}" }
    end
  end

  # Show sample of incorrect results
  incorrect = run.results.incorrect.limit(5)
  if incorrect.any?
    puts
    puts "Sample Incorrect Results (#{run.results.incorrect.count} total):"
    incorrect.each do |miss|
      puts " Sample: #{miss.sample_id[0..7]}"
      puts " Expected: #{miss.sample.expected_output}"
      puts " Actual: #{miss.actual_output}"
      puts
    end
  end
end
# =============================================================================
# Langfuse Integration
# =============================================================================
namespace :langfuse do
# Verifies the Langfuse setup: builds a client, prints the obfuscated
# credentials and the resolved region, then performs a minimal API call.
# Exits with status 1 on a configuration or API error.
desc "Check Langfuse configuration"
task check: :environment do
  begin
    client = Eval::Langfuse::Client.new

    # Obfuscate keys for display
    public_key = ENV["LANGFUSE_PUBLIC_KEY"]
    secret_key = ENV["LANGFUSE_SECRET_KEY"]

    # Long keys show their first 8 and last 4 characters. Short keys (8
    # characters or fewer) show only a leading prefix of at most 4 characters
    # and never more than half the key, so the tail is not exposed.
    obfuscate_key = lambda do |key|
      return "null" if key.blank?

      if key.length <= 8
        visible = [ key.length / 2, 4 ].min
        return "#{key[0, visible]}***"
      end

      "#{key[0..7]}...#{key[-4..-1]}"
    end

    # Determine region
    region = if ENV["LANGFUSE_HOST"].present?
      "custom (#{ENV['LANGFUSE_HOST']})"
    elsif ENV["LANGFUSE_REGION"].present?
      ENV["LANGFUSE_REGION"]
    else
      "eu (default)"
    end

    puts "✓ Langfuse credentials configured"
    puts " 🔑 Public Key: #{obfuscate_key.call(public_key)}"
    puts " 🔐 Secret Key: #{obfuscate_key.call(secret_key)}"
    puts " 🌍 Region: #{region}"

    # Try to list datasets to verify connection. Only success or failure
    # matters here; the response body is not used.
    client.list_datasets(limit: 1)
    puts "✓ Successfully connected to Langfuse"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "#{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Failed to connect to Langfuse: #{e.message}"
    exit 1
  end
end
# Exports a local evaluation dataset to Langfuse.
#
# The dataset name comes from the task argument or the DATASET environment
# variable. Exits with status 1 when the name is missing or unknown, or when
# Langfuse reports a configuration or API error.
desc "Upload dataset to Langfuse"
task :upload_dataset, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]

  if dataset_name.blank?
    puts "Usage: rake evals:langfuse:upload_dataset[dataset_name]"
    puts " or: DATASET=name rake evals:langfuse:upload_dataset"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)
  unless dataset
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |available| puts " - #{available}" }
    exit 1
  end

  rule = "=" * 80
  puts rule
  puts "Uploading Dataset to Langfuse"
  puts rule
  puts " Dataset: #{dataset.name}"
  puts " Type: #{dataset.eval_type}"
  puts " Samples: #{dataset.sample_count}"
  puts

  begin
    export = Eval::Langfuse::DatasetExporter.new(dataset).export

    puts
    puts "✓ Successfully uploaded dataset to Langfuse"
    puts " Langfuse dataset name: #{export[:dataset_name]}"
    puts " Items exported: #{export[:items_exported]}"
    puts
    puts "View in Langfuse: https://cloud.langfuse.com/project/datasets"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "#{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Langfuse API error: #{e.message}"
    exit 1
  end
end
# Runs a Langfuse experiment for a local dataset and prints its metrics.
#
# Arguments (each falls back to an environment variable):
#   dataset_name - dataset to run (DATASET); required
#   model        - model identifier (MODEL); defaults to "gpt-4.1"
# PROVIDER defaults to "openai"; RUN_NAME is passed through to the runner.
#
# Exits with status 1 when the dataset name is missing or unknown, or on any
# error raised while running. Set DEBUG to print the first five backtrace
# lines for unexpected errors.
desc "Run experiment in Langfuse"
task :run_experiment, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV.fetch("MODEL", "gpt-4.1")
  provider = ENV.fetch("PROVIDER", "openai")
  run_name = ENV["RUN_NAME"]

  if dataset_name.blank?
    puts "Usage: rake evals:langfuse:run_experiment[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4.1 rake evals:langfuse:run_experiment"
    puts
    puts "Optional environment variables:"
    puts " PROVIDER=openai (default)"
    puts " RUN_NAME=custom_run_name"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)
  unless dataset
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |available| puts " - #{available}" }
    exit 1
  end

  rule = "=" * 80
  puts rule
  puts "Running Langfuse Experiment"
  puts rule
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts

  begin
    experiment = Eval::Langfuse::ExperimentRunner.new(
      dataset,
      model: model,
      provider: provider
    )

    started_at = Time.current
    outcome = experiment.run(run_name: run_name)
    elapsed = (Time.current - started_at).round(1)
    metrics = outcome[:metrics]

    puts
    puts rule
    puts "Experiment Complete"
    puts rule
    puts " Run Name: #{outcome[:run_name]}"
    puts " Duration: #{elapsed}s"
    puts
    puts "Results:"
    puts " Accuracy: #{metrics[:accuracy]}%"
    puts " Correct: #{metrics[:correct]}/#{metrics[:total]}"
    puts " Avg Latency: #{metrics[:avg_latency_ms]}ms"
    puts
    puts "View in Langfuse:"
    puts " Dataset: https://cloud.langfuse.com/project/datasets"
    puts " Traces: https://cloud.langfuse.com/project/traces"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "#{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Langfuse API error: #{e.message}"
    exit 1
  rescue => e
    puts "✗ Error: #{e.message}"
    puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
# Lists up to 100 datasets stored in Langfuse with their description,
# creation time and metadata. Exits with status 1 on a configuration or
# API error.
desc "List datasets in Langfuse"
task list_datasets: :environment do
  langfuse = Eval::Langfuse::Client.new
  payload = langfuse.list_datasets(limit: 100)
  remote_datasets = payload["data"] || []

  if remote_datasets.empty?
    puts "No datasets found in Langfuse."
    puts "Upload a dataset with: rake evals:langfuse:upload_dataset[dataset_name]"
    next
  end

  rule = "=" * 80
  puts rule
  puts "Langfuse Datasets"
  puts rule
  puts

  remote_datasets.each do |entry|
    puts " #{entry['name']}"
    puts " Description: #{entry['description']}" if entry["description"].present?
    puts " Created: #{entry['createdAt']}"
    puts " Metadata: #{entry['metadata']}" if entry["metadata"].present?
    puts
  end
rescue Eval::Langfuse::Client::ConfigurationError => e
  puts "#{e.message}"
  exit 1
rescue Eval::Langfuse::Client::ApiError => e
  puts "✗ Langfuse API error: #{e.message}"
  exit 1
end
end
# Exports a family's manually categorized transactions as a categorization
# dataset in YAML form.
#
# Arguments / environment:
#   family_id - family whose transactions are exported (or FAMILY_ID); required
#   OUTPUT    - destination file (default db/eval_data/categorization_manual_export.yml)
#   LIMIT     - maximum number of transactions (default 500)
#
# Exits with status 1 when the family ID is missing or unknown, and with
# status 0 (writing nothing) when no matching transactions exist.
desc "Export manually categorized transactions as golden data"
task :export_manual_categories, [ :family_id ] => :environment do |_t, args|
  family_id = args[:family_id] || ENV["FAMILY_ID"]
  output_path = ENV.fetch("OUTPUT", "db/eval_data/categorization_manual_export.yml")
  limit = ENV.fetch("LIMIT", 500).to_i

  if family_id.blank?
    puts "Usage: rake evals:export_manual_categories[family_id]"
    puts " or: FAMILY_ID=uuid rake evals:export_manual_categories"
    puts
    puts "Optional environment variables:"
    puts " OUTPUT=path/to/output.yml (default: db/eval_data/categorization_manual_export.yml)"
    puts " LIMIT=500 (default: 500)"
    exit 1
  end

  family = Family.find_by(id: family_id)
  unless family
    puts "Error: Family '#{family_id}' not found"
    exit 1
  end

  rule = "=" * 80
  puts rule
  puts "Exporting Manually Categorized Transactions"
  puts rule
  puts " Family: #{family.name}"
  puts " Output: #{output_path}"
  puts " Limit: #{limit}"
  puts

  # Find transactions that have:
  # 1. A category assigned
  # 2. locked_attributes contains "category_id" (meaning user manually set it)
  # 3. No DataEnrichment record for category_id (meaning it wasn't set by AI/rules/etc)
  enriched_transaction_ids = DataEnrichment
    .where(enrichable_type: "Transaction", attribute_name: "category_id")
    .select(:enrichable_id)

  manually_categorized = Transaction
    .joins(:entry)
    .joins("INNER JOIN accounts ON accounts.id = entries.account_id")
    .where(accounts: { family_id: family_id })
    .where.not(category_id: nil)
    .where("transactions.locked_attributes ? 'category_id'")
    .where.not(id: enriched_transaction_ids)
    .includes(:category, entry: :account)
    .limit(limit)

  count = manually_categorized.count
  if count.zero?
    puts "No manually categorized transactions found."
    puts
    puts "Manually categorized transactions are those where:"
    puts " - User set a category manually (locked_attributes contains 'category_id')"
    puts " - Category was NOT set by AI, rules, or data enrichment sources"
    exit 0
  end

  puts "Found #{count} manually categorized transactions"
  puts

  # Build category context from family's categories
  categories = family.categories.includes(:parent).map do |category|
    context = {
      "id" => category.id.to_s,
      "name" => category.name,
      "classification" => category.classification,
      "is_subcategory" => category.subcategory?,
      "parent_id" => category.parent_id&.to_s
    }
    # Drop nil entries (e.g. parent_id for top-level categories).
    context.compact
  end

  # Build samples, numbered from 1 in the order the query returned them.
  samples = manually_categorized.map.with_index(1) do |txn, position|
    entry = txn.entry
    category_name = txn.category.name

    {
      "id" => "manual_#{position}",
      "difficulty" => "manual",
      "tags" => [ category_name.parameterize.underscore, "manual_export" ],
      "input" => {
        "id" => txn.id.to_s,
        "amount" => entry.amount.to_f.abs,
        "classification" => entry.classification,
        "description" => entry.name
      },
      "expected" => {
        "category_name" => category_name
      }
    }
  end

  # Build output structure
  output = {
    "name" => "categorization_manual_export",
    "description" => "Golden dataset exported from manually categorized user transactions",
    "eval_type" => "categorization",
    "version" => "1.0",
    "metadata" => {
      "created_at" => Time.current.strftime("%Y-%m-%d"),
      "source" => "manual_export",
      "family_id" => family_id,
      "exported_count" => samples.size
    },
    "context" => {
      "categories" => categories
    },
    "samples" => samples
  }

  # Write to file, creating the destination directory if needed
  FileUtils.mkdir_p(File.dirname(output_path))
  File.write(output_path, output.to_yaml)

  puts "✓ Successfully exported #{samples.size} samples"
  puts " Difficulty: manual"
  puts
  puts "Output written to: #{output_path}"
  puts
  puts "To import this dataset, run:"
  puts " rake evals:import_dataset[#{output_path}]"
end
private

# Formats a metric for display.
#
# Float and BigDecimal values are converted to a Float rounded to four
# decimal places; every other value is returned unchanged.
def format_metric_value(value)
  case value
  when Float, BigDecimal then value.to_f.round(4)
  else value
  end
end
end