mirror of
https://github.com/we-promise/sure.git
synced 2026-04-07 14:31:25 +00:00
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
89 lines
2.8 KiB
Ruby
89 lines
2.8 KiB
Ruby
# A single evaluation sample belonging to an Eval::Dataset.
#
# The record exposes three JSON-like attributes that the readers below pull
# values out of by string key:
#   * input_data      - what is fed to the system under evaluation
#   * expected_output - the answer(s) the evaluation compares against
#   * context_data    - optional supporting data (categories, merchants, mock data)
class Eval::Sample < ApplicationRecord
  self.table_name = "eval_samples"

  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_sample_id, dependent: :destroy

  validates :input_data, :expected_output, presence: true
  validates :difficulty, inclusion: { in: %w[easy medium hard manual edge_case] }

  scope :easy, -> { where(difficulty: "easy") }
  scope :medium, -> { where(difficulty: "medium") }
  scope :hard, -> { where(difficulty: "hard") }
  scope :edge_cases, -> { where(difficulty: "edge_case") }

  # Samples whose `tags` array column contains the given tag.
  scope :with_tag, ->(tag) { where("? = ANY(tags)", tag) }

  # Samples whose `tags` array column shares at least one element with `tags`.
  scope :with_any_tags, ->(tags) { where("tags && ARRAY[?]::varchar[]", tags) }

  # Convert to format expected by AutoCategorizer.
  #
  # @return [Hash] input_data with all keys (nested included) symbolized
  def to_transaction_input
    input_data.deep_symbolize_keys
  end

  # Get categories from context (for categorization evals).
  #
  # context_data is not covered by the presence validation above, so it is
  # read with safe navigation: a missing context yields the empty default
  # instead of raising NoMethodError.
  #
  # @return [Array] the "categories" entry, or [] when absent
  def categories_context
    context_data&.dig("categories") || []
  end

  # Get merchants from context (for merchant detection evals).
  #
  # @return [Array] the "merchants" entry, or [] when absent
  def merchants_context
    context_data&.dig("merchants") || []
  end

  # Get mock data (for chat evals).
  #
  # Looks in context_data first, then falls back to input_data.
  #
  # @return [Hash] the "mock_data" entry, or {} when neither source has one
  def mock_data
    context_data&.dig("mock_data") || input_data["mock_data"] || {}
  end

  # Get the chat prompt (for chat evals).
  #
  # @return [Object, nil] the "prompt" entry of input_data
  def chat_prompt
    input_data["prompt"]
  end

  # Get expected functions (for chat evals).
  #
  # @return [Array] the "functions" entry of expected_output, or []
  def expected_functions
    expected_output["functions"] || []
  end

  # Get expected response keywords (for chat evals).
  #
  # @return [Array] the "response_contains" entry of expected_output, or []
  def expected_response_contains
    expected_output["response_contains"] || []
  end

  # Get expected category name (for categorization evals).
  #
  # @return [Object, nil] the "category_name" entry of expected_output
  def expected_category_name
    expected_output["category_name"]
  end

  # Get acceptable alternative category names (for categorization evals).
  # These are categories that are also considered correct answers.
  #
  # @return [Array] the "acceptable_alternatives" entry of expected_output, or []
  def acceptable_alternatives
    expected_output["acceptable_alternatives"] || []
  end

  # Get all acceptable category names (primary + alternatives).
  #
  # @return [Array] primary name followed by the alternatives, nils removed
  def all_acceptable_categories
    [ expected_category_name, *acceptable_alternatives ].compact
  end

  # Get expected merchant name (for merchant detection evals).
  #
  # @return [Object, nil] the "business_name" entry of expected_output
  def expected_business_name
    expected_output["business_name"]
  end

  # Get expected merchant URL (for merchant detection evals).
  #
  # @return [Object, nil] the "business_url" entry of expected_output
  def expected_business_url
    expected_output["business_url"]
  end

  # Check if null is expected.
  #
  # The answer depends on the owning dataset's eval_type:
  #   * "categorization"     - true when no category name is expected
  #   * "merchant_detection" - true when neither business name nor URL is expected
  #   * anything else        - always false
  #
  # @return [Boolean]
  def expects_null?
    case dataset.eval_type
    when "categorization"
      expected_category_name.nil?
    when "merchant_detection"
      expected_business_name.nil? && expected_business_url.nil?
    else
      false
    end
  end
end
|