Files
sure/test/models/eval/dataset_test.rb
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove Data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

119 lines
3.9 KiB
Ruby

require "test_helper"
# Model tests for Eval::Dataset: validations, per-eval_type scopes, YAML
# import, sample statistics and runner class lookup.
class Eval::DatasetTest < ActiveSupport::TestCase
  # A blank record must report both required attributes as missing.
  test "validates presence of name and eval_type" do
    dataset = Eval::Dataset.new

    assert_not dataset.valid?
    assert_includes dataset.errors[:name], "can't be blank"
    assert_includes dataset.errors[:eval_type], "can't be blank"
  end

  # An unknown eval_type is rejected; switching to a known one clears the error.
  test "validates eval_type is one of allowed values" do
    dataset = Eval::Dataset.new(name: "test", eval_type: "invalid")

    assert_not dataset.valid?
    assert_includes dataset.errors[:eval_type], "is not included in the list"

    dataset.eval_type = "categorization"
    # Called only to re-run validations; the return value is irrelevant here
    # because the assertion below inspects the eval_type errors alone.
    dataset.valid?
    assert_empty dataset.errors[:eval_type]
  end

  # A second dataset with a name that is already persisted must be invalid.
  test "validates name uniqueness" do
    Eval::Dataset.create!(name: "unique_test", eval_type: "categorization")
    duplicate = Eval::Dataset.new(name: "unique_test", eval_type: "categorization")

    assert_not duplicate.valid?
    assert_includes duplicate.errors[:name], "has already been taken"
  end

  # Each for_* scope returns datasets of its own eval_type and excludes others.
  test "scopes filter by eval_type" do
    cat_dataset = Eval::Dataset.create!(name: "cat_test", eval_type: "categorization")
    merch_dataset = Eval::Dataset.create!(name: "merch_test", eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.create!(name: "chat_test", eval_type: "chat")

    assert_includes Eval::Dataset.for_categorization, cat_dataset
    assert_not_includes Eval::Dataset.for_categorization, merch_dataset

    assert_includes Eval::Dataset.for_merchant_detection, merch_dataset
    assert_not_includes Eval::Dataset.for_merchant_detection, cat_dataset

    assert_includes Eval::Dataset.for_chat, chat_dataset
    assert_not_includes Eval::Dataset.for_chat, cat_dataset
  end

  # Writes a one-sample YAML document to tmp/, imports it, and checks the
  # resulting dataset and sample attributes. The file is removed afterwards
  # whether or not the assertions pass.
  test "import_from_yaml creates dataset with samples" do
    # YAML nesting is expressed by indentation, so the sample's attributes
    # (difficulty, tags, input, expected) must be indented under the list
    # item, and categories under context. `<<~` strips only the common
    # leading whitespace, leaving this relative nesting intact.
    yaml_content = <<~YAML
      name: test_import
      description: Test dataset
      eval_type: categorization
      version: "1.0"
      context:
        categories:
          - id: "food"
            name: "Food"
            classification: "expense"
      samples:
        - id: sample_1
          difficulty: easy
          tags: [test]
          input:
            id: txn_1
            amount: 10
            classification: expense
            description: "Test transaction"
          expected:
            category_name: "Food"
    YAML

    file_path = Rails.root.join("tmp", "test_import.yml")
    File.write(file_path, yaml_content)

    dataset = Eval::Dataset.import_from_yaml(file_path)

    assert_equal "test_import", dataset.name
    assert_equal "categorization", dataset.eval_type
    assert_equal 1, dataset.samples.count
    assert_equal "easy", dataset.samples.first.difficulty
    assert_equal "Food", dataset.samples.first.expected_output["category_name"]
  ensure
    File.delete(file_path) if File.exist?(file_path)
  end

  # Two samples with different difficulties and tags should be reflected in
  # the total count, the per-difficulty breakdown and the tag breakdown.
  test "statistics returns sample breakdown" do
    dataset = Eval::Dataset.create!(name: "stats_test", eval_type: "categorization")

    dataset.samples.create!(
      input_data: { id: "1" },
      expected_output: { category_name: "Food" },
      difficulty: "easy",
      tags: [ "food" ]
    )
    dataset.samples.create!(
      input_data: { id: "2" },
      expected_output: { category_name: "Travel" },
      difficulty: "medium",
      tags: [ "travel" ]
    )

    stats = dataset.statistics

    assert_equal 2, stats[:total_samples]
    assert_equal({ "easy" => 1, "medium" => 1 }, stats[:by_difficulty])
    assert_includes stats[:by_tags], "food"
    assert_includes stats[:by_tags], "travel"
  end

  # runner_class maps each eval_type to its Eval::Runners class; unsaved
  # records are sufficient because only eval_type is set.
  test "runner_class returns correct class for each eval_type" do
    cat_dataset = Eval::Dataset.new(eval_type: "categorization")
    merch_dataset = Eval::Dataset.new(eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.new(eval_type: "chat")

    assert_equal Eval::Runners::CategorizationRunner, cat_dataset.runner_class
    assert_equal Eval::Runners::MerchantDetectionRunner, merch_dataset.runner_class
    assert_equal Eval::Runners::ChatRunner, chat_dataset.runner_class
  end
end