mirror of
https://github.com/we-promise/sure.git
synced 2026-04-07 14:31:25 +00:00
762 lines
22 KiB
Ruby
762 lines
22 KiB
Ruby
namespace :evals do
|
|
# Prints every stored evaluation dataset, grouped by eval type, together with
# its version, sample count and active/inactive flag.
desc "List all evaluation datasets"
task list_datasets: :environment do
  all_datasets = Eval::Dataset.order(:eval_type, :name)

  if all_datasets.empty?
    puts "No datasets found. Import a dataset with: rake evals:import_dataset[path/to/file.yml]"
    next # leaves only this task body
  end

  heavy_rule = "=" * 80
  puts heavy_rule
  puts "Available Evaluation Datasets"
  puts heavy_rule
  puts

  all_datasets.group_by(&:eval_type).each_pair do |type_key, members|
    puts "#{type_key.titleize}:"
    puts "-" * 40

    members.each do |ds|
      state_label = ds.active ? "active" : "inactive"
      puts " #{ds.name} (v#{ds.version}) - #{ds.sample_count} samples [#{state_label}]"

      summary_text = ds.description
      puts " #{summary_text}" if summary_text.present?
    end

    puts
  end
end
|
|
|
|
# Imports an evaluation dataset from a YAML file. The path comes from the task
# argument or, failing that, the FILE environment variable. Exits 1 when no
# path is given or the file does not exist; otherwise prints a short summary
# of the imported dataset.
desc "Import dataset from YAML file"
task :import_dataset, [ :file_path ] => :environment do |_t, args|
  file_path = args[:file_path] || ENV["FILE"]

  if file_path.blank?
    puts "Usage: rake evals:import_dataset[path/to/file.yml]"
    puts " or: FILE=path/to/file.yml rake evals:import_dataset"
    exit 1
  end

  if !File.exist?(file_path)
    puts "Error: File not found: #{file_path}"
    exit 1
  end

  puts "Importing dataset from #{file_path}..."

  imported = Eval::Dataset.import_from_yaml(file_path)

  puts "Successfully imported dataset:"
  puts " Name: #{imported.name}"
  puts " Type: #{imported.eval_type}"
  puts " Version: #{imported.version}"
  puts " Samples: #{imported.sample_count}"

  # One "difficulty=count" token per entry of the statistics breakdown
  difficulty_pairs = imported.statistics[:by_difficulty].map { |level, total| "#{level}=#{total}" }
  puts " By difficulty: #{difficulty_pairs.join(', ')}"
end
|
|
|
|
# Runs one dataset against one model and prints the resulting metrics.
#
# Inputs:
#   dataset_name / DATASET - dataset to evaluate (required)
#   model / MODEL          - model name (default "gpt-4.1")
#   PROVIDER               - provider name (default "openai")
#   DEBUG                  - when set, the first backtrace lines are printed on failure
#
# Exits 1 when the dataset is missing/unknown or when the runner raises.
desc "Run evaluation against a model"
task :run, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV.fetch("MODEL", "gpt-4.1")
  provider = ENV.fetch("PROVIDER", "openai")

  if dataset_name.blank?
    puts "Usage: rake evals:run[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4 rake evals:run"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)

  unless dataset
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |known_name| puts " - #{known_name}" }
    exit 1
  end

  timestamp = Time.current.strftime("%Y%m%d_%H%M%S")
  run_name = "#{dataset_name}_#{model}_#{timestamp}"
  heavy_rule = "=" * 80

  puts heavy_rule
  puts "Starting Evaluation Run"
  puts heavy_rule
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts " Run Name: #{run_name}"
  puts

  eval_run = Eval::Run.create!(dataset: dataset, provider: provider, model: model, name: run_name, status: "pending")
  runner = dataset.runner_class.new(eval_run)

  puts "Running evaluation..."
  started_at = Time.current

  begin
    outcome = runner.run
    elapsed = (Time.current - started_at).round(1)

    puts
    puts heavy_rule
    puts "Evaluation Complete"
    puts heavy_rule
    puts " Status: #{outcome.status}"
    puts " Duration: #{elapsed}s"
    puts " Run ID: #{outcome.id}"
    puts
    puts "Metrics:"

    outcome.metrics.each do |metric_name, metric|
      # Nested metrics are left out of the flat summary
      puts " #{metric_name}: #{format_metric_value(metric)}" unless metric.is_a?(Hash)
    end

    # Show difficulty breakdown if available
    breakdown = outcome.metrics["by_difficulty"]
    if breakdown.present?
      puts
      puts "By Difficulty:"
      breakdown.each do |level, stats|
        puts " #{level}: #{stats['accuracy']}% accuracy (#{stats['correct']}/#{stats['count']})"
      end
    end
  rescue => error
    puts
    puts "Evaluation FAILED: #{error.message}"
    puts error.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
|
|
|
|
# Runs the same dataset against several models and prints a comparison table.
#
# Inputs:
#   dataset_name / DATASET - dataset to evaluate (required)
#   MODELS                 - comma-separated model names (default "gpt-4.1,gpt-4o-mini")
#   PROVIDER               - provider name (default "openai")
#   CSV                    - when set, handed to the reporter's to_csv
#
# Exits 1 when the dataset name is missing/unknown or no model name remains
# after parsing MODELS.
desc "Compare multiple models on a dataset"
task :compare, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  # Blank entries are dropped so "a,,b" or a trailing comma never creates a run for "".
  models = (ENV["MODELS"] || "gpt-4.1,gpt-4o-mini").split(",").map(&:strip).reject(&:blank?)
  provider = ENV["PROVIDER"] || "openai"

  if dataset_name.blank? || models.empty?
    puts "Usage: MODELS=model1,model2 rake evals:compare[dataset_name]"
    exit 1
  end

  # Report an unknown dataset with a readable message and the list of known
  # names instead of letting a RecordNotFound backtrace escape.
  dataset = Eval::Dataset.find_by(name: dataset_name)

  if dataset.nil?
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
    exit 1
  end

  puts "=" * 80
  puts "Model Comparison"
  puts "=" * 80
  puts " Dataset: #{dataset.name}"
  puts " Models: #{models.join(', ')}"
  puts

  # One run per model; the value of runner.run is what the reporter receives.
  runs = models.map do |model|
    puts "Running evaluation for #{model}..."

    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: provider,
      model: model,
      name: "compare_#{model}_#{Time.current.to_i}",
      status: "pending"
    )

    runner = dataset.runner_class.new(eval_run)
    runner.run
  end

  puts
  puts "=" * 80
  puts "Comparison Results"
  puts "=" * 80
  puts

  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table

  summary = reporter.summary
  if summary.present?
    puts
    puts "Recommendations:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
    puts
    puts " #{summary[:recommendation]}"
  end

  # Export to CSV if requested
  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
|
|
|
|
# Prints a comparison report for the given runs, or for the five most recent
# completed runs when no ids are supplied.
#
# Inputs:
#   run_ids / RUN_IDS - comma-separated run ids (optional)
#   CSV               - when set, handed to the reporter's to_csv
#
# Exits 1 when no matching run exists.
desc "Generate report for specific runs"
task :report, [ :run_ids ] => :environment do |_t, args|
  # Rake splits bracket arguments on commas: for evals:report[a,b] only "a"
  # lands in args[:run_ids] and "b" ends up in args.extras. Collect both so a
  # comma-separated list works from the command line as well as from RUN_IDS.
  cli_ids = [ args[:run_ids], *args.extras ].compact
  raw_ids = cli_ids.any? ? cli_ids : ENV["RUN_IDS"].to_s.split(",")
  # Tolerate "a, b" and stray commas.
  run_ids = raw_ids.map { |id| id.to_s.strip }.reject(&:empty?)

  runs = if run_ids.any?
    Eval::Run.where(id: run_ids)
  else
    Eval::Run.completed.order(created_at: :desc).limit(5)
  end

  if runs.empty?
    puts "No runs found."
    exit 1
  end

  reporter = Eval::Reporters::ComparisonReporter.new(runs)

  puts reporter.to_table

  summary = reporter.summary
  if summary.present?
    puts
    puts "Summary:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
  end

  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
|
|
|
|
# Sends a single hard-coded categorization request through the OpenAI provider
# and reports PASS/FAIL. Exits 1 when the provider is not registered or the
# call is unsuccessful.
desc "Quick smoke test to verify provider configuration"
task smoke_test: :environment do
  puts "Running smoke test..."

  provider = Provider::Registry.get_provider(:openai)

  if !provider
    puts "FAIL: OpenAI provider not configured"
    puts "Set OPENAI_ACCESS_TOKEN environment variable or configure in settings"
    exit 1
  end

  puts " Provider: #{provider.provider_name}"
  puts " Model: #{provider.instance_variable_get(:@default_model)}"

  # Test with a single categorization sample
  probe_transaction = { id: "test", amount: 10, classification: "expense", description: "McDonalds" }
  probe_category = { id: "1", name: "Food & Drink", classification: "expense" }

  result = provider.auto_categorize(
    transactions: [ probe_transaction ],
    user_categories: [ probe_category ]
  )

  unless result.success?
    puts "FAIL: #{result.error.message}"
    exit 1
  end

  category = result.data.first&.category_name
  puts " Test result: #{category || 'null'}"
  puts
  puts "PASS: Provider is working correctly"
end
|
|
|
|
# Runs the dataset once and fails the build when accuracy regresses.
#
# Inputs (environment):
#   EVAL_DATASET   - dataset name (default "categorization_golden_v1")
#   EVAL_MODEL     - model name (default "gpt-4.1-mini")
#   EVAL_THRESHOLD - minimum acceptable accuracy in percent (default 80)
#
# Exit codes: 0 when the dataset does not exist (test skipped); 1 when accuracy
# fell more than 5 points below the previous completed run for the same model,
# or is below the threshold.
desc "Run CI regression test"
task ci_regression: :environment do
  dataset_name = ENV["EVAL_DATASET"] || "categorization_golden_v1"
  model = ENV["EVAL_MODEL"] || "gpt-4.1-mini"
  threshold = (ENV["EVAL_THRESHOLD"] || "80").to_f

  dataset = Eval::Dataset.find_by(name: dataset_name)

  unless dataset
    puts "Dataset '#{dataset_name}' not found. Skipping CI regression test."
    exit 0
  end

  # Get baseline from last successful run. This is looked up before the new
  # run is created, so the new run cannot be picked as its own baseline.
  baseline_run = dataset.runs.completed.for_model(model).order(created_at: :desc).first

  # Run new evaluation
  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: "openai",
    model: model,
    name: "ci_regression_#{Time.current.to_i}",
    status: "pending"
  )

  runner = dataset.runner_class.new(eval_run)
  result = runner.run

  # A missing accuracy metric counts as 0
  current_accuracy = result.metrics["accuracy"] || 0

  puts "CI Regression Test Results:"
  puts " Model: #{model}"
  puts " Current Accuracy: #{current_accuracy}%"

  if baseline_run
    baseline_accuracy = baseline_run.metrics["accuracy"] || 0
    puts " Baseline Accuracy: #{baseline_accuracy}%"

    accuracy_diff = current_accuracy - baseline_accuracy

    if accuracy_diff < -5
      puts
      puts "REGRESSION DETECTED!"
      # Rounded like the "Difference" line below so float noise
      # (e.g. 5.799999999999997) never reaches the CI log.
      puts "Accuracy dropped by #{accuracy_diff.abs.round(2)}% (threshold: 5%)"
      exit 1
    end

    puts " Difference: #{accuracy_diff > 0 ? '+' : ''}#{accuracy_diff.round(2)}%"
  end

  if current_accuracy < threshold
    puts
    puts "BELOW THRESHOLD!"
    puts "Accuracy #{current_accuracy}% is below required #{threshold}%"
    exit 1
  end

  puts
  puts "CI Regression Test PASSED"
end
|
|
|
|
# Prints the 20 most recently created evaluation runs, one per line: status
# marker, first 8 characters of the id, model, dataset name, accuracy and
# creation time.
desc "List recent evaluation runs"
task list_runs: :environment do
  # Eager-load the dataset: every row prints run.dataset.name, which would
  # otherwise cost one extra query per run.
  runs = Eval::Run.includes(:dataset).order(created_at: :desc).limit(20)

  if runs.empty?
    puts "No runs found."
    next
  end

  puts "=" * 100
  puts "Recent Evaluation Runs"
  puts "=" * 100

  runs.each do |run|
    status_icon = case run.status
    when "completed" then "[OK]"
    when "failed" then "[FAIL]"
    when "running" then "[...]"
    else "[?]"
    end

    # to_h makes the lookup safe if a run has no metrics stored yet
    # (NOTE(review): assumes metrics is a Hash or nil - confirm against the model).
    accuracy_value = run.metrics.to_h["accuracy"]
    accuracy = accuracy_value ? "#{accuracy_value}%" : "-"

    puts "#{status_icon} #{run.id[0..7]} | #{run.model.ljust(15)} | #{run.dataset.name.ljust(25)} | #{accuracy.rjust(8)} | #{run.created_at.strftime('%Y-%m-%d %H:%M')}"
  end
end
|
|
|
|
# Prints the details of one evaluation run: identity, status, metrics and up
# to five incorrect results. The run is selected by full id or by id prefix,
# taken from the task argument or RUN_ID. Exits 1 when no id is given or no
# run matches.
desc "Show details for a specific run"
task :show_run, [ :run_id ] => :environment do |_t, args|
  run_id = args[:run_id] || ENV["RUN_ID"]

  if run_id.blank?
    puts "Usage: rake evals:show_run[run_id]"
    exit 1
  end

  # Exact match first, then prefix match. The prefix is escaped so that "%" or
  # "_" in the argument are compared literally instead of acting as LIKE
  # wildcards.
  run = Eval::Run.find_by(id: run_id) ||
    Eval::Run.find_by("id::text LIKE ?", "#{Eval::Run.sanitize_sql_like(run_id)}%")

  unless run
    puts "Run not found: #{run_id}"
    exit 1
  end

  puts "=" * 80
  puts "Evaluation Run Details"
  puts "=" * 80
  puts
  puts "Run ID: #{run.id}"
  puts "Name: #{run.name}"
  puts "Dataset: #{run.dataset.name}"
  puts "Model: #{run.model}"
  puts "Provider: #{run.provider}"
  puts "Status: #{run.status}"
  puts "Created: #{run.created_at}"
  puts "Duration: #{run.duration_seconds}s" if run.duration_seconds

  if run.error_message.present?
    puts
    puts "Error: #{run.error_message}"
  end

  if run.metrics.present?
    puts
    puts "Metrics:"
    run.metrics.each do |key, value|
      if value.is_a?(Hash)
        # Nested metrics are listed one level deep under their key
        puts " #{key}:"
        value.each { |k, v| puts " #{k}: #{v}" }
      else
        puts " #{key}: #{format_metric_value(value)}"
      end
    end
  end

  # Show sample of incorrect results. The sample is eager-loaded because each
  # printed result reads result.sample.expected_output.
  incorrect_scope = run.results.incorrect
  incorrect = incorrect_scope.includes(:sample).limit(5)
  if incorrect.any?
    puts
    puts "Sample Incorrect Results (#{incorrect_scope.count} total):"
    incorrect.each do |result|
      puts " Sample: #{result.sample_id[0..7]}"
      puts " Expected: #{result.sample.expected_output}"
      puts " Actual: #{result.actual_output}"
      puts
    end
  end
end
|
|
|
|
# =============================================================================
|
|
# Langfuse Integration
|
|
# =============================================================================
|
|
|
|
namespace :langfuse do
|
|
# Checks the Langfuse setup: builds a client, prints the masked keys and the
# region in use, then lists one dataset to prove connectivity. Exits 1 on a
# configuration or API error.
desc "Check Langfuse configuration"
task check: :environment do
  begin
    client = Eval::Langfuse::Client.new

    # Obfuscate keys for display
    public_key = ENV["LANGFUSE_PUBLIC_KEY"]
    secret_key = ENV["LANGFUSE_SECRET_KEY"]

    # Masks a key: "null" when blank, the first four characters plus "***"
    # for keys of up to 8 characters, otherwise the first eight and last four
    # characters. The short form must start at index 0 - slicing from index 3
    # would print the tail of a short key (or nothing) instead of a prefix.
    obfuscate_key = lambda do |key|
      return "null" if key.blank?
      return "#{key[0..3]}***" if key.length <= 8
      "#{key[0..7]}...#{key[-4..-1]}"
    end

    # Determine region: an explicit host wins over a named region
    region = if ENV["LANGFUSE_HOST"].present?
      "custom (#{ENV['LANGFUSE_HOST']})"
    elsif ENV["LANGFUSE_REGION"].present?
      ENV["LANGFUSE_REGION"]
    else
      "eu (default)"
    end

    puts "✓ Langfuse credentials configured"
    puts " 🔑 Public Key: #{obfuscate_key.call(public_key)}"
    puts " 🔐 Secret Key: #{obfuscate_key.call(secret_key)}"
    puts " 🌍 Region: #{region}"

    # Try to list datasets to verify connection; only success or failure
    # matters here, the response body is not used.
    client.list_datasets(limit: 1)
    puts "✓ Successfully connected to Langfuse"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "✗ #{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Failed to connect to Langfuse: #{e.message}"
    exit 1
  end
end
|
|
|
|
# Exports a locally stored dataset to Langfuse. The dataset name comes from
# the task argument or DATASET. Exits 1 when the name is missing/unknown or
# when Langfuse reports a configuration or API error.
desc "Upload dataset to Langfuse"
task :upload_dataset, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]

  if dataset_name.blank?
    puts "Usage: rake evals:langfuse:upload_dataset[dataset_name]"
    puts " or: DATASET=name rake evals:langfuse:upload_dataset"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)

  unless dataset
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |known_name| puts " - #{known_name}" }
    exit 1
  end

  heavy_rule = "=" * 80
  puts heavy_rule
  puts "Uploading Dataset to Langfuse"
  puts heavy_rule
  puts " Dataset: #{dataset.name}"
  puts " Type: #{dataset.eval_type}"
  puts " Samples: #{dataset.sample_count}"
  puts

  begin
    export_info = Eval::Langfuse::DatasetExporter.new(dataset).export

    puts
    puts "✓ Successfully uploaded dataset to Langfuse"
    puts " Langfuse dataset name: #{export_info[:dataset_name]}"
    puts " Items exported: #{export_info[:items_exported]}"
    puts
    puts "View in Langfuse: https://cloud.langfuse.com/project/datasets"
  rescue Eval::Langfuse::Client::ConfigurationError => config_error
    puts "✗ #{config_error.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => api_error
    puts "✗ Langfuse API error: #{api_error.message}"
    exit 1
  end
end
|
|
|
|
# Runs a dataset as a Langfuse experiment and prints accuracy and latency.
#
# Inputs:
#   dataset_name / DATASET - dataset to evaluate (required)
#   model / MODEL          - model name (default "gpt-4.1")
#   PROVIDER               - provider name (default "openai")
#   RUN_NAME               - optional run name forwarded to the runner
#   DEBUG                  - when set, the first backtrace lines are printed on failure
#
# Exits 1 on missing/unknown dataset or on any error raised while running.
desc "Run experiment in Langfuse"
task :run_experiment, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV.fetch("MODEL", "gpt-4.1")
  provider = ENV.fetch("PROVIDER", "openai")
  run_name = ENV["RUN_NAME"]

  if dataset_name.blank?
    puts "Usage: rake evals:langfuse:run_experiment[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4.1 rake evals:langfuse:run_experiment"
    puts
    puts "Optional environment variables:"
    puts " PROVIDER=openai (default)"
    puts " RUN_NAME=custom_run_name"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)

  unless dataset
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |known_name| puts " - #{known_name}" }
    exit 1
  end

  heavy_rule = "=" * 80
  puts heavy_rule
  puts "Running Langfuse Experiment"
  puts heavy_rule
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts

  begin
    runner = Eval::Langfuse::ExperimentRunner.new(dataset, model: model, provider: provider)

    started_at = Time.current
    outcome = runner.run(run_name: run_name)
    elapsed = (Time.current - started_at).round(1)

    puts
    puts heavy_rule
    puts "Experiment Complete"
    puts heavy_rule
    puts " Run Name: #{outcome[:run_name]}"
    puts " Duration: #{elapsed}s"
    puts
    puts "Results:"
    puts " Accuracy: #{outcome[:metrics][:accuracy]}%"
    puts " Correct: #{outcome[:metrics][:correct]}/#{outcome[:metrics][:total]}"
    puts " Avg Latency: #{outcome[:metrics][:avg_latency_ms]}ms"
    puts
    puts "View in Langfuse:"
    puts " Dataset: https://cloud.langfuse.com/project/datasets"
    puts " Traces: https://cloud.langfuse.com/project/traces"
  rescue Eval::Langfuse::Client::ConfigurationError => config_error
    puts "✗ #{config_error.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => api_error
    puts "✗ Langfuse API error: #{api_error.message}"
    exit 1
  rescue => error
    # Anything else raised while running the experiment
    puts "✗ Error: #{error.message}"
    puts error.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
|
|
|
|
# Lists up to 100 datasets returned by the Langfuse client, printing name,
# description, creation time and metadata. Exits 1 on a configuration or API
# error.
desc "List datasets in Langfuse"
task list_datasets: :environment do
  begin
    langfuse = Eval::Langfuse::Client.new
    payload = langfuse.list_datasets(limit: 100)

    # A response without a "data" key is treated as an empty list
    remote_datasets = payload["data"] || []

    if remote_datasets.empty?
      puts "No datasets found in Langfuse."
      puts "Upload a dataset with: rake evals:langfuse:upload_dataset[dataset_name]"
      next
    end

    heavy_rule = "=" * 80
    puts heavy_rule
    puts "Langfuse Datasets"
    puts heavy_rule
    puts

    remote_datasets.each do |entry|
      puts " #{entry['name']}"
      puts " Description: #{entry['description']}" if entry["description"].present?
      puts " Created: #{entry['createdAt']}"
      puts " Metadata: #{entry['metadata']}" if entry["metadata"].present?
      puts
    end
  rescue Eval::Langfuse::Client::ConfigurationError => config_error
    puts "✗ #{config_error.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => api_error
    puts "✗ Langfuse API error: #{api_error.message}"
    exit 1
  end
end
|
|
end
|
|
|
|
# Writes a YAML golden dataset built from one family's manually categorized
# transactions.
#
# Inputs:
#   family_id / FAMILY_ID - family whose transactions are exported (required)
#   OUTPUT                - target file (default db/eval_data/categorization_manual_export.yml)
#   LIMIT                 - maximum number of transactions (default 500)
#
# Exits 1 when the family id is missing or unknown, and 0 when nothing
# qualifies for export.
desc "Export manually categorized transactions as golden data"
task :export_manual_categories, [ :family_id ] => :environment do |_t, args|
  family_id = args[:family_id] || ENV["FAMILY_ID"]
  output_path = ENV.fetch("OUTPUT", "db/eval_data/categorization_manual_export.yml")
  limit = ENV.fetch("LIMIT", 500).to_i

  if family_id.blank?
    puts "Usage: rake evals:export_manual_categories[family_id]"
    puts " or: FAMILY_ID=uuid rake evals:export_manual_categories"
    puts
    puts "Optional environment variables:"
    puts " OUTPUT=path/to/output.yml (default: db/eval_data/categorization_manual_export.yml)"
    puts " LIMIT=500 (default: 500)"
    exit 1
  end

  family = Family.find_by(id: family_id)

  unless family
    puts "Error: Family '#{family_id}' not found"
    exit 1
  end

  heavy_rule = "=" * 80
  puts heavy_rule
  puts "Exporting Manually Categorized Transactions"
  puts heavy_rule
  puts " Family: #{family.name}"
  puts " Output: #{output_path}"
  puts " Limit: #{limit}"
  puts

  # Transactions whose category_id has a DataEnrichment record are excluded
  # below (meaning the category was set by AI/rules/etc).
  enriched_transaction_ids = DataEnrichment
    .where(enrichable_type: "Transaction", attribute_name: "category_id")
    .select(:enrichable_id)

  # Find transactions that have:
  # 1. A category assigned
  # 2. locked_attributes contains "category_id" (meaning user manually set it)
  # 3. No DataEnrichment record for category_id
  manually_categorized = Transaction
    .joins(:entry)
    .joins("INNER JOIN accounts ON accounts.id = entries.account_id")
    .where(accounts: { family_id: family_id })
    .where.not(category_id: nil)
    .where("transactions.locked_attributes ? 'category_id'")
    .where.not(id: enriched_transaction_ids)
    .includes(:category, entry: :account)
    .limit(limit)

  count = manually_categorized.count

  if count.zero?
    puts "No manually categorized transactions found."
    puts
    puts "Manually categorized transactions are those where:"
    puts " - User set a category manually (locked_attributes contains 'category_id')"
    puts " - Category was NOT set by AI, rules, or data enrichment sources"
    exit 0
  end

  puts "Found #{count} manually categorized transactions"
  puts

  # Build category context from family's categories; compact drops a nil
  # parent_id so only categories with a parent carry that key.
  categories = family.categories.includes(:parent).map do |category|
    {
      "id" => category.id.to_s,
      "name" => category.name,
      "classification" => category.classification,
      "is_subcategory" => category.subcategory?,
      "parent_id" => category.parent_id&.to_s
    }.compact
  end

  # Build samples; ids are 1-based positions in the result set
  samples = manually_categorized.each_with_index.map do |txn, position|
    entry = txn.entry
    category_name = txn.category.name

    {
      "id" => "manual_#{position + 1}",
      "difficulty" => "manual",
      "tags" => [ category_name.parameterize.underscore, "manual_export" ],
      "input" => {
        "id" => txn.id.to_s,
        "amount" => entry.amount.to_f.abs,
        "classification" => entry.classification,
        "description" => entry.name
      },
      "expected" => {
        "category_name" => category_name
      }
    }
  end

  # Build output structure
  metadata = {
    "created_at" => Time.current.strftime("%Y-%m-%d"),
    "source" => "manual_export",
    "family_id" => family_id,
    "exported_count" => samples.size
  }

  output = {
    "name" => "categorization_manual_export",
    "description" => "Golden dataset exported from manually categorized user transactions",
    "eval_type" => "categorization",
    "version" => "1.0",
    "metadata" => metadata,
    "context" => { "categories" => categories },
    "samples" => samples
  }

  # Write to file, creating the target directory when needed
  FileUtils.mkdir_p(File.dirname(output_path))
  File.write(output_path, output.to_yaml)

  puts "✓ Successfully exported #{samples.size} samples"
  puts " Difficulty: manual"
  puts
  puts "Output written to: #{output_path}"
  puts
  puts "To import this dataset, run:"
  puts " rake evals:import_dataset[#{output_path}]"
end
|
|
|
|
private
|
|
|
|
# Formats a metric for display.
#
# @param value [Object] a metric value
# @return [Object] a Float rounded to 4 decimal places when value is a Float
#   or BigDecimal (BigDecimal is converted to Float first); any other value
#   is returned unchanged
def format_metric_value(value)
  case value
  when Float, BigDecimal
    value.to_f.round(4)
  else
    value
  end
end
|
|
end
|