mirror of
https://github.com/we-promise/sure.git
synced 2026-04-07 22:34:47 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
114 lines
3.4 KiB
Ruby
114 lines
3.4 KiB
Ruby
# A named, versioned collection of evaluation samples stored in the
# "eval_datasets" table. Each dataset has one eval_type (see EVAL_TYPES), owns
# its samples and runs, and can be imported from / exported to YAML.
class Eval::Dataset < ApplicationRecord
  self.table_name = "eval_datasets"

  # Every eval_type accepted by the validation below. #runner_class and
  # #metrics_class have one branch per entry.
  EVAL_TYPES = %w[categorization merchant_detection chat].freeze

  has_many :samples, class_name: "Eval::Sample", foreign_key: :eval_dataset_id, dependent: :destroy
  has_many :runs, class_name: "Eval::Run", foreign_key: :eval_dataset_id, dependent: :destroy

  validates :name, presence: true, uniqueness: true
  validates :eval_type, presence: true, inclusion: { in: EVAL_TYPES }
  validates :version, presence: true

  scope :active, -> { where(active: true) }
  scope :for_categorization, -> { where(eval_type: "categorization") }
  scope :for_merchant_detection, -> { where(eval_type: "merchant_detection") }
  scope :for_chat, -> { where(eval_type: "chat") }

  # Import dataset from a YAML file.
  #
  # The dataset is looked up by its "name" key and created when missing. On a
  # re-import its attributes are overwritten and all existing samples are
  # replaced by the ones in the file. Everything runs in one transaction.
  #
  # @param file_path [String] path to the YAML file
  # @return [Eval::Dataset] the saved dataset
  # @raise [ArgumentError] when the file's top-level document is not a mapping
  # @raise [ActiveRecord::RecordInvalid] when the dataset or a sample is invalid
  def self.import_from_yaml(file_path)
    data = YAML.load_file(file_path, permitted_classes: [ Symbol, Date, Time ])

    # An empty file or a top-level list/scalar would otherwise fail further
    # down with an opaque NoMethodError on `data["name"]`.
    unless data.is_a?(Hash)
      raise ArgumentError, "Expected a YAML mapping at the top level of #{file_path}"
    end

    transaction do
      dataset = find_or_initialize_by(name: data["name"])
      dataset.assign_attributes(
        description: data["description"],
        eval_type: data["eval_type"],
        version: data["version"] || "1.0",
        metadata: data["metadata"] || {},
        active: true
      )
      dataset.save!

      # Clear existing samples if reimporting
      dataset.samples.destroy_all

      # Shared context for all samples; a sample's own "context" key wins.
      shared_context = data["context"] || {}

      # Import samples
      samples_data = data["samples"] || []
      samples_data.each do |sample_data|
        dataset.samples.create!(
          input_data: sample_data["input"],
          expected_output: sample_data["expected"],
          context_data: sample_data["context"] || shared_context,
          difficulty: sample_data["difficulty"] || "medium",
          tags: sample_data["tags"] || [],
          metadata: sample_data["metadata"] || {}
        )
      end

      dataset.update!(sample_count: dataset.samples.count)
      dataset
    end
  end

  # Export dataset to YAML format.
  #
  # The first sample's context is written once as the shared "context". A
  # sample whose context differs from it also gets its own "context" key, which
  # .import_from_yaml reads back, so such contexts survive a round trip
  # instead of being replaced by the shared one.
  #
  # @return [String] YAML document describing the dataset and its samples
  def export_to_yaml
    shared_context = samples.first&.context_data || {}

    exported_samples = samples.map do |sample|
      own_context = sample.context_data

      {
        "id" => sample.id,
        "difficulty" => sample.difficulty,
        "tags" => sample.tags,
        "input" => sample.input_data,
        "expected" => sample.expected_output,
        # nil (dropped by #compact) when the shared context already covers it.
        "context" => (own_context unless own_context == shared_context),
        "metadata" => sample.metadata
      }.compact
    end

    {
      "name" => name,
      "description" => description,
      "eval_type" => eval_type,
      "version" => version,
      "metadata" => metadata,
      "context" => shared_context,
      "samples" => exported_samples
    }.to_yaml
  end

  # Generate summary statistics.
  #
  # @return [Hash] :total_samples (count), :by_difficulty (difficulty => count)
  #   and :by_tags (tag => count, most frequent first)
  def statistics
    tag_counts = samples.flat_map(&:tags).tally

    {
      total_samples: samples.count,
      by_difficulty: samples.group(:difficulty).count,
      by_tags: tag_counts.sort_by { |_tag, count| -count }.to_h
    }
  end

  # Get the appropriate runner class for this dataset type.
  #
  # @return [Class] the runner class matching eval_type
  # @raise [RuntimeError] when eval_type has no runner
  def runner_class
    case eval_type
    when "categorization"
      Eval::Runners::CategorizationRunner
    when "merchant_detection"
      Eval::Runners::MerchantDetectionRunner
    when "chat"
      Eval::Runners::ChatRunner
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  # Get the appropriate metrics class for this dataset type.
  #
  # @return [Class] the metrics class matching eval_type
  # @raise [RuntimeError] when eval_type has no metrics class
  def metrics_class
    case eval_type
    when "categorization"
      Eval::Metrics::CategorizationMetrics
    when "merchant_detection"
      Eval::Metrics::MerchantDetectionMetrics
    when "chat"
      Eval::Metrics::ChatMetrics
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end
end
|