mirror of
https://github.com/we-promise/sure.git
synced 2026-04-19 12:04:08 +00:00
Small llms improvements (#400)
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
113
app/models/eval/dataset.rb
Normal file
113
app/models/eval/dataset.rb
Normal file
@@ -0,0 +1,113 @@
|
||||
# A named, versioned collection of evaluation samples for a single eval type.
# Datasets can be imported from and exported to YAML, and know which runner and
# metrics classes handle their eval type.
class Eval::Dataset < ApplicationRecord
  self.table_name = "eval_datasets"

  # Eval types accepted by the eval_type validation. runner_class and
  # metrics_class have one branch per entry.
  EVAL_TYPES = %w[categorization merchant_detection chat].freeze

  has_many :samples, class_name: "Eval::Sample", foreign_key: :eval_dataset_id, dependent: :destroy
  has_many :runs, class_name: "Eval::Run", foreign_key: :eval_dataset_id, dependent: :destroy

  validates :name, presence: true, uniqueness: true
  validates :eval_type, presence: true, inclusion: { in: EVAL_TYPES }
  validates :version, presence: true

  scope :active, -> { where(active: true) }
  scope :for_categorization, -> { where(eval_type: "categorization") }
  scope :for_merchant_detection, -> { where(eval_type: "merchant_detection") }
  scope :for_chat, -> { where(eval_type: "chat") }

  # Import dataset from a YAML file.
  #
  # Finds the dataset named in the file (or builds a new one), overwrites its
  # attributes, marks it active and replaces all of its samples with the ones
  # listed in the file. The whole import runs inside one transaction.
  #
  # A sample's own "context" key wins over the document-level "context".
  #
  # @param file_path [String, Pathname] path to the YAML document
  # @return [Eval::Dataset] the saved dataset
  # @raise [ArgumentError] if the document's top level is not a mapping
  # @raise [ActiveRecord::RecordInvalid] if the dataset or a sample is invalid
  def self.import_from_yaml(file_path)
    # safe_load_file only instantiates the listed classes, so a dataset file
    # cannot deserialize arbitrary Ruby objects.
    data = YAML.safe_load_file(file_path, permitted_classes: [ Symbol, Date, Time ])

    # An empty file parses to nil/false; fail with a clear message instead of
    # a NoMethodError further down.
    raise ArgumentError, "Expected a YAML mapping at the top level of #{file_path}" unless data.is_a?(Hash)

    transaction do
      dataset = find_or_initialize_by(name: data["name"])
      dataset.assign_attributes(
        description: data["description"],
        eval_type: data["eval_type"],
        version: data["version"] || "1.0",
        metadata: data["metadata"] || {},
        active: true
      )
      dataset.save!

      # Clear existing samples if reimporting
      dataset.samples.destroy_all

      # Shared context for all samples
      shared_context = data["context"] || {}

      # Import samples
      (data["samples"] || []).each do |sample_data|
        dataset.samples.create!(
          input_data: sample_data["input"],
          expected_output: sample_data["expected"],
          context_data: sample_data["context"] || shared_context,
          difficulty: sample_data["difficulty"] || "medium",
          tags: sample_data["tags"] || [],
          metadata: sample_data["metadata"] || {}
        )
      end

      dataset.update!(sample_count: dataset.samples.count)
      dataset
    end
  end

  # Export dataset to YAML format.
  #
  # The first sample's context is written as the document-level "context".
  # Any sample whose context differs from it also gets its own "context" key,
  # which import_from_yaml reads back as a per-sample override, so per-sample
  # contexts survive an export/import round trip.
  #
  # @return [String] the YAML document
  def export_to_yaml
    # Load once so the shared context and the sample list come from the same
    # set of records.
    records = samples.to_a
    shared_context = records.first&.context_data || {}

    {
      "name" => name,
      "description" => description,
      "eval_type" => eval_type,
      "version" => version,
      "metadata" => metadata,
      "context" => shared_context,
      "samples" => records.map { |sample| export_sample(sample, shared_context) }
    }.to_yaml
  end

  # Generate summary statistics.
  #
  # @return [Hash] with :total_samples, :by_difficulty (difficulty => count)
  #   and :by_tags (tag => count, most frequent first)
  def statistics
    {
      total_samples: samples.count,
      by_difficulty: samples.group(:difficulty).count,
      # Array() skips samples whose tags are nil instead of tallying a nil tag.
      by_tags: samples.flat_map { |sample| Array(sample.tags) }.tally.sort_by { |_tag, count| -count }.to_h
    }
  end

  # Get the appropriate runner class for this dataset type.
  #
  # @return [Class]
  # @raise [RuntimeError] if eval_type has no matching runner
  def runner_class
    case eval_type
    when "categorization"
      Eval::Runners::CategorizationRunner
    when "merchant_detection"
      Eval::Runners::MerchantDetectionRunner
    when "chat"
      Eval::Runners::ChatRunner
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  # Get the appropriate metrics class for this dataset type.
  #
  # @return [Class]
  # @raise [RuntimeError] if eval_type has no matching metrics class
  def metrics_class
    case eval_type
    when "categorization"
      Eval::Metrics::CategorizationMetrics
    when "merchant_detection"
      Eval::Metrics::MerchantDetectionMetrics
    when "chat"
      Eval::Metrics::ChatMetrics
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  private

    # Builds the exported hash for one sample. Keys with nil values are
    # dropped, which also omits "context" when it matches the shared context.
    def export_sample(sample, shared_context)
      sample_context = sample.context_data

      {
        "id" => sample.id,
        "difficulty" => sample.difficulty,
        "tags" => sample.tags,
        "input" => sample.input_data,
        "expected" => sample.expected_output,
        "context" => (sample_context unless sample_context == shared_context),
        "metadata" => sample.metadata
      }.compact
    end
end
|
||||
Reference in New Issue
Block a user