sure/test/models/eval/runners/categorization_runner_test.rb

require "test_helper"
class Eval::Runners::CategorizationRunnerTest < ActiveSupport::TestCase
include ProviderTestHelper
setup do
@categories = [
{ "id" => "food", "name" => "Food & Drink", "classification" => "expense" },
{ "id" => "fast_food", "name" => "Fast Food", "classification" => "expense", "parent_id" => "food" }
]
end
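
  # NOTE: the category fixture is a two-level hierarchy ("Fast Food" is a child of
  # "Food & Drink"); the hierarchical-match test below relies on this parent/child link.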
test "run processes all samples and calculates metrics" do
dataset = Eval::Dataset.create!(
name: "test_cat_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample1 = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
sample2 = dataset.samples.create!(
input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
expected_output: { "category_name" => nil },
context_data: { "categories" => @categories },
difficulty: "edge_case"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample1.id, category_name: "Fast Food"),
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample2.id, category_name: "null")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
result = runner.run
assert_equal "completed", result.status
assert_equal 2, result.results.count
assert result.metrics["accuracy"].present?
end
test "records correct result when category matches" do
dataset = Eval::Dataset.create!(
name: "test_cat_match_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Fast Food")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert result.correct
assert result.exact_match
assert_equal "Fast Food", result.actual_output["category_name"]
end
test "records hierarchical match when parent category returned" do
dataset = Eval::Dataset.create!(
name: "test_cat_hier_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_3", "amount" => 50, "classification" => "expense", "description" => "Olive Garden" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "medium"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_hierarchical",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
# Model returns parent category instead of subcategory
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Food & Drink")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert_not result.exact_match
assert result.hierarchical_match
end
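
  # The stubbed provider signals "no suitable category" with the literal string "null"
  # rather than a Ruby nil; the runner is expected to record this as a null result.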
test "handles null correctly when expected" do
dataset = Eval::Dataset.create!(
name: "test_cat_null_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
expected_output: { "category_name" => nil },
context_data: { "categories" => @categories },
difficulty: "edge_case"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "null")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert result.correct
assert result.null_expected
assert result.null_returned
end
test "records error results on provider error but completes run" do
dataset = Eval::Dataset.create!(
name: "test_cat_err_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
Provider::Openai.any_instance.stubs(:auto_categorize).raises(StandardError.new("API Error"))
runner = Eval::Runners::CategorizationRunner.new(eval_run)
result = runner.run
# Run completes but with error results
assert_equal "completed", result.status
assert_equal 1, result.results.count
error_result = result.results.find_by(eval_sample_id: sample.id)
assert_not error_result.correct
assert_includes error_result.actual_output["error"], "API Error"
end
end