mirror of
https://github.com/we-promise/sure.git
synced 2026-04-19 03:54:08 +00:00
Small LLMs improvements (#400)
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
118
test/models/eval/dataset_test.rb
Normal file
118
test/models/eval/dataset_test.rb
Normal file
@@ -0,0 +1,118 @@
|
||||
require "test_helper"
|
||||
|
||||
# Tests for Eval::Dataset covering validations, eval_type scopes,
# YAML import, sample statistics, and runner-class resolution.
class Eval::DatasetTest < ActiveSupport::TestCase
  test "validates presence of name and eval_type" do
    dataset = Eval::Dataset.new

    assert_not dataset.valid?
    assert_includes dataset.errors[:name], "can't be blank"
    assert_includes dataset.errors[:eval_type], "can't be blank"
  end

  test "validates eval_type is one of allowed values" do
    dataset = Eval::Dataset.new(name: "test", eval_type: "invalid")

    assert_not dataset.valid?
    assert_includes dataset.errors[:eval_type], "is not included in the list"

    dataset.eval_type = "categorization"
    dataset.valid? # re-run validations to refresh the errors collection
    assert_empty dataset.errors[:eval_type]
  end

  test "validates name uniqueness" do
    Eval::Dataset.create!(name: "unique_test", eval_type: "categorization")

    duplicate = Eval::Dataset.new(name: "unique_test", eval_type: "categorization")
    assert_not duplicate.valid?
    assert_includes duplicate.errors[:name], "has already been taken"
  end

  test "scopes filter by eval_type" do
    cat_dataset = Eval::Dataset.create!(name: "cat_test", eval_type: "categorization")
    merch_dataset = Eval::Dataset.create!(name: "merch_test", eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.create!(name: "chat_test", eval_type: "chat")

    assert_includes Eval::Dataset.for_categorization, cat_dataset
    assert_not_includes Eval::Dataset.for_categorization, merch_dataset

    assert_includes Eval::Dataset.for_merchant_detection, merch_dataset
    assert_not_includes Eval::Dataset.for_merchant_detection, cat_dataset

    assert_includes Eval::Dataset.for_chat, chat_dataset
    assert_not_includes Eval::Dataset.for_chat, cat_dataset
  end

  test "import_from_yaml creates dataset with samples" do
    yaml_content = <<~YAML
      name: test_import
      description: Test dataset
      eval_type: categorization
      version: "1.0"
      context:
        categories:
          - id: "food"
            name: "Food"
            classification: "expense"
      samples:
        - id: sample_1
          difficulty: easy
          tags: [test]
          input:
            id: txn_1
            amount: 10
            classification: expense
            description: "Test transaction"
          expected:
            category_name: "Food"
    YAML

    # Unique filename per run so parallel test workers sharing tmp/
    # cannot clobber (or delete, via ensure) each other's fixture file.
    file_path = Rails.root.join("tmp", "test_import_#{SecureRandom.hex(4)}.yml")
    File.write(file_path, yaml_content)

    dataset = Eval::Dataset.import_from_yaml(file_path)

    assert_equal "test_import", dataset.name
    assert_equal "categorization", dataset.eval_type
    assert_equal 1, dataset.samples.count
    assert_equal "easy", dataset.samples.first.difficulty
    assert_equal "Food", dataset.samples.first.expected_output["category_name"]
  ensure
    File.delete(file_path) if File.exist?(file_path)
  end

  test "statistics returns sample breakdown" do
    dataset = Eval::Dataset.create!(name: "stats_test", eval_type: "categorization")

    dataset.samples.create!(
      input_data: { id: "1" },
      expected_output: { category_name: "Food" },
      difficulty: "easy",
      tags: [ "food" ]
    )

    dataset.samples.create!(
      input_data: { id: "2" },
      expected_output: { category_name: "Travel" },
      difficulty: "medium",
      tags: [ "travel" ]
    )

    stats = dataset.statistics

    assert_equal 2, stats[:total_samples]
    assert_equal({ "easy" => 1, "medium" => 1 }, stats[:by_difficulty])
    assert_includes stats[:by_tags], "food"
    assert_includes stats[:by_tags], "travel"
  end

  test "runner_class returns correct class for each eval_type" do
    cat_dataset = Eval::Dataset.new(eval_type: "categorization")
    merch_dataset = Eval::Dataset.new(eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.new(eval_type: "chat")

    assert_equal Eval::Runners::CategorizationRunner, cat_dataset.runner_class
    assert_equal Eval::Runners::MerchantDetectionRunner, merch_dataset.runner_class
    assert_equal Eval::Runners::ChatRunner, chat_dataset.runner_class
  end
end
|
||||
212
test/models/eval/runners/categorization_runner_test.rb
Normal file
212
test/models/eval/runners/categorization_runner_test.rb
Normal file
@@ -0,0 +1,212 @@
|
||||
require "test_helper"
|
||||
|
||||
# Tests for Eval::Runners::CategorizationRunner covering end-to-end run
# metrics, exact / hierarchical / null category matching, and graceful
# handling of provider errors. Shared fixture setup lives in private
# helpers at the bottom to keep each test focused on its scenario.
class Eval::Runners::CategorizationRunnerTest < ActiveSupport::TestCase
  include ProviderTestHelper

  setup do
    # Two-level hierarchy (Fast Food under Food & Drink) so tests can
    # distinguish exact matches from hierarchical (parent) matches.
    @categories = [
      { "id" => "food", "name" => "Food & Drink", "classification" => "expense" },
      { "id" => "fast_food", "name" => "Fast Food", "classification" => "expense", "parent_id" => "food" }
    ]
  end

  test "run processes all samples and calculates metrics" do
    dataset = create_dataset("test_cat")
    sample1 = create_sample(dataset, id: "txn_1", amount: 10, description: "McDonalds",
                            expected: "Fast Food", difficulty: "easy")
    sample2 = create_sample(dataset, id: "txn_2", amount: 100, description: "ATM Withdrawal",
                            expected: nil, difficulty: "edge_case")
    eval_run = create_run(dataset)

    # "null" is the provider's sentinel string for "no category".
    stub_auto_categorize(
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample1.id, category_name: "Fast Food"),
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample2.id, category_name: "null")
    )

    result = Eval::Runners::CategorizationRunner.new(eval_run).run

    assert_equal "completed", result.status
    assert_equal 2, result.results.count
    assert result.metrics["accuracy"].present?
  end

  test "records correct result when category matches" do
    dataset = create_dataset("test_cat_match")
    sample = create_sample(dataset, id: "txn_1", amount: 10, description: "McDonalds",
                           expected: "Fast Food", difficulty: "easy")
    eval_run = create_run(dataset)

    stub_auto_categorize(
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Fast Food")
    )

    Eval::Runners::CategorizationRunner.new(eval_run).run

    result = eval_run.results.find_by(eval_sample_id: sample.id)

    assert result.correct
    assert result.exact_match
    assert_equal "Fast Food", result.actual_output["category_name"]
  end

  test "records hierarchical match when parent category returned" do
    dataset = create_dataset("test_cat_hier")
    sample = create_sample(dataset, id: "txn_3", amount: 50, description: "Olive Garden",
                           expected: "Fast Food", difficulty: "medium")
    eval_run = create_run(dataset, name: "test_hierarchical")

    # Model returns parent category instead of subcategory
    stub_auto_categorize(
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Food & Drink")
    )

    Eval::Runners::CategorizationRunner.new(eval_run).run

    result = eval_run.results.find_by(eval_sample_id: sample.id)

    assert_not result.exact_match
    assert result.hierarchical_match
  end

  test "handles null correctly when expected" do
    dataset = create_dataset("test_cat_null")
    sample = create_sample(dataset, id: "txn_2", amount: 100, description: "ATM Withdrawal",
                           expected: nil, difficulty: "edge_case")
    eval_run = create_run(dataset)

    stub_auto_categorize(
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "null")
    )

    Eval::Runners::CategorizationRunner.new(eval_run).run

    result = eval_run.results.find_by(eval_sample_id: sample.id)

    assert result.correct
    assert result.null_expected
    assert result.null_returned
  end

  test "records error results on provider error but completes run" do
    dataset = create_dataset("test_cat_err")
    sample = create_sample(dataset, id: "txn_1", amount: 10, description: "McDonalds",
                           expected: "Fast Food", difficulty: "easy")
    eval_run = create_run(dataset)

    Provider::Openai.any_instance.stubs(:auto_categorize).raises(StandardError.new("API Error"))

    result = Eval::Runners::CategorizationRunner.new(eval_run).run

    # Run completes but with error results
    assert_equal "completed", result.status
    assert_equal 1, result.results.count

    error_result = result.results.find_by(eval_sample_id: sample.id)
    assert_not error_result.correct
    assert_includes error_result.actual_output["error"], "API Error"
  end

  private

    # Minimal categorization dataset; unique name per test run so
    # repeated/parallel runs never hit the name-uniqueness validation.
    def create_dataset(prefix)
      Eval::Dataset.create!(
        name: "#{prefix}_#{SecureRandom.hex(4)}",
        eval_type: "categorization",
        version: "1.0"
      )
    end

    # Expense sample wired to the shared @categories context.
    # expected: nil marks a transaction the model should leave uncategorized.
    def create_sample(dataset, id:, amount:, description:, expected:, difficulty:)
      dataset.samples.create!(
        input_data: { "id" => id, "amount" => amount, "classification" => "expense", "description" => description },
        expected_output: { "category_name" => expected },
        context_data: { "categories" => @categories },
        difficulty: difficulty
      )
    end

    # Pending run against the (stubbed) OpenAI provider.
    def create_run(dataset, name: "test_run")
      Eval::Run.create!(
        dataset: dataset,
        provider: "openai",
        model: "gpt-4.1",
        name: name,
        provider_config: { "access_token" => "test-token" },
        status: "pending"
      )
    end

    # Stub the provider to succeed with the given categorizations.
    def stub_auto_categorize(*categorizations)
      Provider::Openai.any_instance
                      .stubs(:auto_categorize)
                      .returns(provider_success_response(categorizations))
    end
end
|
||||
Reference in New Issue
Block a user