mirror of
https://github.com/we-promise/sure.git
synced 2026-04-08 06:44:52 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
200 lines
7.5 KiB
Ruby
200 lines
7.5 KiB
Ruby
# Eval runner for transaction auto-categorization.
#
# Samples are sent to the provider in batches through +provider.auto_categorize+.
# Each returned category is compared with the sample's expected category (and
# its acceptable alternatives) and one result per sample is stored through
# +record_result+.
class Eval::Runners::CategorizationRunner < Eval::Runners::Base
  DEFAULT_BATCH_SIZE = 25 # Matches Provider::Openai limit

  # Share of expected-null samples above which a batch running in "auto" mode
  # is sent in "strict" mode instead (see #json_mode_for_batch).
  STRICT_MODE_NULL_RATIO = 0.5

  protected

    # Splits all samples into batches and processes them one batch at a time.
    def process_samples
      all_samples = samples.to_a
      batch_size = effective_batch_size
      total_batches = (all_samples.size.to_f / batch_size).ceil

      log_progress("Processing #{all_samples.size} samples in batches of #{batch_size}")

      all_samples.each_slice(batch_size).with_index do |batch, batch_idx|
        log_progress("Processing batch #{batch_idx + 1}/#{total_batches}")
        process_batch(batch)
      end
    end

    # Batch size for this run. The provider config may override the default,
    # e.g. smaller batches for custom providers (local LLMs) to reduce context length.
    #
    # A missing, non-numeric, zero or negative value falls back to
    # DEFAULT_BATCH_SIZE: `to_i` yields 0 for such input, 0 is truthy in Ruby,
    # and `each_slice(0)` raises ArgumentError.
    #
    # @return [Integer] a strictly positive batch size
    def effective_batch_size
      configured = eval_run.provider_config["batch_size"].to_i
      configured.positive? ? configured : DEFAULT_BATCH_SIZE
    end

    # Get JSON mode from provider config (optional override)
    # Valid values: "auto", "strict", "json_object", "none"
    def json_mode
      eval_run.provider_config["json_mode"]
    end

    # Delegates metric computation for this run to CategorizationMetrics.
    def calculate_metrics
      Eval::Metrics::CategorizationMetrics.new(eval_run).calculate
    end

  private

    # Sends one batch to the provider and records a result for every sample.
    #
    # A failed response or a raised exception is recorded as an error result
    # for each sample in the batch.
    def process_batch(batch_samples)
      return if batch_samples.empty?

      # Build inputs for the provider
      transactions = batch_samples.map do |sample|
        sample.to_transaction_input.merge(id: sample.id)
      end

      # Get categories from first sample's context (should be shared)
      # Symbolize keys since Provider::Openai::AutoCategorizer expects symbol keys
      categories = batch_samples.first.categories_context.map(&:deep_symbolize_keys)

      # If the batch has many expected nulls and we're using auto mode, strict mode is
      # forced so the auto-categorizer does not retry on nulls that are actually expected.
      effective_json_mode = json_mode_for_batch(batch_samples)

      started_at = batch_clock_ms

      begin
        response = provider.auto_categorize(
          transactions: transactions,
          user_categories: categories,
          model: model,
          json_mode: effective_json_mode
        )

        per_sample_latency = per_sample_latency_since(started_at, batch_samples.size)

        if response.success?
          record_batch_results(batch_samples, response.data, per_sample_latency)
        else
          record_batch_errors(batch_samples, response.error, per_sample_latency)
        end
      rescue => e
        # NOTE(review): this rescue also covers record_batch_results, so a failure
        # while recording may add error results for samples that were already
        # recorded - confirm this is acceptable.
        record_batch_errors(batch_samples, e, per_sample_latency_since(started_at, batch_samples.size))
      end
    end

    # Current monotonic clock reading in whole milliseconds. Used for durations
    # because it is not affected by wall-clock adjustments.
    def batch_clock_ms
      Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
    end

    # Elapsed milliseconds since +started_at+, spread evenly over the batch
    # (integer division).
    def per_sample_latency_since(started_at, sample_count)
      (batch_clock_ms - started_at) / sample_count
    end

    # Scores each sample against the categorization returned for it.
    #
    # A sample with no matching categorization, or with the literal string
    # "null" as category, is treated as having returned nil.
    def record_batch_results(batch_samples, categorizations, per_sample_latency)
      # Index once by transaction id; the first entry wins for duplicate ids,
      # which matches a linear `find`.
      by_transaction_id = categorizations.each_with_object({}) do |categorization, index|
        index[categorization.transaction_id.to_s] ||= categorization
      end

      batch_samples.each do |sample|
        actual_category = by_transaction_id[sample.id.to_s]&.category_name

        # Normalize "null" string to nil
        actual_category = nil if actual_category == "null"

        expected_category = sample.expected_category_name
        acceptable_categories = sample.all_acceptable_categories

        # Evaluate correctness - check primary expected and alternatives
        correct = evaluate_correctness_with_alternatives(actual_category, expected_category, acceptable_categories)
        exact_match = actual_category == expected_category
        alternative_match = acceptable_categories.include?(actual_category) && !exact_match
        hierarchical = evaluate_hierarchical_match(actual_category, expected_category, sample)

        record_result(
          sample: sample,
          actual_output: { "category_name" => actual_category },
          correct: correct,
          exact_match: exact_match,
          alternative_match: alternative_match,
          hierarchical_match: hierarchical,
          null_expected: expected_category.nil?,
          null_returned: actual_category.nil?,
          latency_ms: per_sample_latency
        )
      end
    end

    # Records an incorrect result carrying the error message for every sample.
    #
    # @param error [Exception, #to_s] exception or provider error object
    def record_batch_errors(batch_samples, error, per_sample_latency)
      error_message = error.is_a?(Exception) ? error.message : error.to_s

      batch_samples.each do |sample|
        record_result(
          sample: sample,
          actual_output: { "error" => error_message },
          correct: false,
          exact_match: false,
          alternative_match: false, # passed explicitly, like the success path does
          hierarchical_match: false,
          null_expected: sample.expected_category_name.nil?,
          null_returned: true,
          latency_ms: per_sample_latency,
          metadata: { "error" => error_message }
        )
      end
    end

    # Determine the effective JSON mode for a batch based on expected null ratio.
    # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected.
    #
    # @return [String] the configured mode when one other than "auto" is set,
    #   otherwise "strict" or "auto"
    def json_mode_for_batch(batch_samples)
      # If a specific mode is configured (not "auto"), always use it
      configured_mode = json_mode
      return configured_mode if configured_mode.present? && configured_mode != "auto"

      # Calculate expected null ratio for this batch
      expected_null_count = batch_samples.count { |s| s.expected_category_name.nil? }
      expected_null_ratio = expected_null_count.to_f / batch_samples.size

      # Use auto mode - let the auto-categorizer decide
      return "auto" unless expected_null_ratio > STRICT_MODE_NULL_RATIO

      # More than half of the batch is expected to return null, so force strict mode.
      # This is meant to match the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
      # and prevents unnecessary retries when nulls are legitimate.
      log_progress("Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode to prevent false retry")
      "strict"
    end

    # True when +actual+ equals +expected+; nil on both sides counts as correct.
    def evaluate_correctness(actual, expected)
      evaluate_correctness_with_alternatives(actual, expected, [ expected ])
    end

    # True when +actual+ is one of +acceptable_categories+ (primary or alternatives).
    def evaluate_correctness_with_alternatives(actual, expected, acceptable_categories)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = incorrect
      return false if expected.nil? && actual.present?
      # Expected value but got null = incorrect
      return false if actual.nil? && expected.present?

      # Check if actual matches any acceptable category (primary or alternatives)
      acceptable_categories.include?(actual)
    end

    # True when +actual+ and +expected+ are the same category or are direct
    # parent and child of each other (either direction) in the sample's
    # categories context.
    #
    # @return [Boolean] always true or false, never nil
    def evaluate_hierarchical_match(actual, expected, sample)
      return false if actual.nil? || expected.nil?
      return true if actual == expected

      categories = sample.categories_context

      # actual is the parent of expected, or actual is a subcategory of expected
      parent_name_in_context(categories, expected) == actual ||
        parent_name_in_context(categories, actual) == expected
    end

    # Name of the parent of the category called +name+, or nil when the
    # category is unknown, has no parent_id, or its parent is not in +categories+.
    # Reads the string keys "name", "id" and "parent_id".
    def parent_name_in_context(categories, name)
      category = categories.find { |c| c["name"] == name }
      parent_id = category && category["parent_id"]
      return nil unless parent_id

      parent = categories.find { |c| c["id"].to_s == parent_id.to_s }
      parent && parent["name"]
    end
end
|