Mirror of https://github.com/we-promise/sure.git, synced 2026-04-08 14:54:49 +00:00
* Initial implementation
* FIX keys
* Add langfuse evals support
* FIX trace upload
* Delete .claude/settings.local.json

  Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb
* Small LLMs improvements
* Keep batch size normal
* Update categorizer
* FIX json mode
* Add reasonable alternative to matching
* FIX thinking blocks for llms
* Implement json mode support with AUTO mode
* Make auto default for everyone
* FIX linter
* Address review
* Allow export manual categories
* FIX user export
* FIX oneshot example pollution
* Update categorization_golden_v1.yml
* Update categorization_golden_v1.yml
* Trim to 100 items
* Update auto_categorizer.rb
* FIX for auto retry in auto mode
* Separate the Eval Logic from the Auto-Categorizer

  The expected_null_count parameter conflates eval-specific logic with
  production categorization logic.

* Force json mode on evals
* Introduce a more mixed dataset

  150 items, performance from a local model:

  By Difficulty:
    easy: 93.22% accuracy (55/59)
    medium: 93.33% accuracy (42/45)
    hard: 92.86% accuracy (26/28)
    edge_case: 100.0% accuracy (18/18)

* Improve datasets

  Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
469 lines
14 KiB
Ruby
class Eval::Langfuse::ExperimentRunner
  attr_reader :dataset, :model, :provider, :client, :provider_config

  BATCH_SIZE = 25

  def initialize(dataset, model:, provider: "openai", client: nil, provider_config: {})
    @dataset = dataset
    @model = model
    @provider = provider
    @client = client || Eval::Langfuse::Client.new
    @provider_config = provider_config
  end
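
  # Example usage, a minimal sketch: it assumes a persisted eval dataset record
  # (the Eval::Dataset class name is illustrative) plus configured Langfuse and
  # OpenAI credentials, and the model name is a placeholder:
  #
  #   dataset = Eval::Dataset.find_by!(name: "categorization_golden_v1")
  #   runner = Eval::Langfuse::ExperimentRunner.new(dataset, model: "gpt-4o-mini")
  #   result = runner.run
  #   result[:metrics][:accuracy] # => e.g. 93.33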

  def run(run_name: nil)
    @run_name = run_name || generate_run_name

    Rails.logger.info("[Langfuse Experiment] Starting experiment '#{@run_name}'")
    Rails.logger.info("[Langfuse Experiment] Dataset: #{dataset.name} (#{dataset.sample_count} samples)")
    Rails.logger.info("[Langfuse Experiment] Model: #{model}")

    # Ensure dataset exists in Langfuse
    ensure_dataset_exported

    # Get dataset items from Langfuse
    items = fetch_langfuse_items

    # Run the experiment
    results = process_items(items)

    # Calculate and report metrics
    metrics = calculate_metrics(results)

    Rails.logger.info("[Langfuse Experiment] Experiment '#{@run_name}' complete")
    Rails.logger.info("[Langfuse Experiment] Accuracy: #{metrics[:accuracy]}%")

    {
      run_name: @run_name,
      dataset_name: langfuse_dataset_name,
      model: model,
      samples_processed: results.size,
      metrics: metrics
    }
  end

  private

  def generate_run_name
    "#{dataset.name}_#{model.gsub('/', '_')}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
  end

  def langfuse_dataset_name
    "eval_#{dataset.name}"
  end

  def ensure_dataset_exported
    exporter = Eval::Langfuse::DatasetExporter.new(dataset, client: client)
    exporter.export
  end

  def fetch_langfuse_items
    items = []
    page = 1

    loop do
      response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: 50)
      batch = response["data"] || []
      items.concat(batch)

      break if batch.size < 50

      page += 1
    end

    Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
    items
  end
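
  # A fetched item is expected to look roughly like the sketch below; the field
  # names mirror how items are read in the processing methods, while the values
  # are purely illustrative:
  #
  #   {
  #     "id" => "item-123",
  #     "input" => {
  #       "transaction" => { "name" => "Coffee shop", "amount" => -4.50 },
  #       "categories" => [ { "id" => "cat-1", "name" => "Dining" } ]
  #     },
  #     "expectedOutput" => { "category_name" => "Dining" }
  #   }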

  def process_items(items)
    results = []

    items.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
      Rails.logger.info("[Langfuse Experiment] Processing batch #{batch_idx + 1}/#{(items.size.to_f / BATCH_SIZE).ceil}")

      batch_results = process_batch(batch)
      results.concat(batch_results)
    end

    results
  end

  def process_batch(items)
    case dataset.eval_type
    when "categorization"
      process_categorization_batch(items)
    when "merchant_detection"
      process_merchant_detection_batch(items)
    when "chat"
      process_chat_batch(items)
    else
      raise "Unsupported eval type: #{dataset.eval_type}"
    end
  end

  def process_categorization_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end

    categories = items.first.dig("input", "categories") || []
    categories = categories.map(&:deep_symbolize_keys)

    # Determine effective JSON mode for this batch
    # If the batch has many expected nulls, force strict mode to prevent false retries
    effective_json_mode = json_mode_for_batch(items)

    start_time = Time.current

    response = llm_provider.auto_categorize(
      transactions: transactions,
      user_categories: categories,
      model: model,
      json_mode: effective_json_mode
    )

    latency_ms = ((Time.current - start_time) * 1000).to_i

    if response.success?
      items.map do |item|
        categorization = response.data.find { |c| c.transaction_id.to_s == item["id"].to_s }
        actual_category = normalize_null(categorization&.category_name)
        expected_category = item.dig("expectedOutput", "category_name")

        correct = actual_category == expected_category
        score_value = correct ? 1.0 : 0.0

        # Create trace and score in Langfuse
        trace_id = create_trace_for_item(item, actual_category, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)

        {
          item_id: item["id"],
          expected: expected_category,
          actual: actual_category,
          correct: correct,
          latency_ms: latency_ms / items.size # approximate per-item share of the batch call
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  def process_merchant_detection_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end

    merchants = items.first.dig("input", "merchants") || []
    merchants = merchants.map(&:deep_symbolize_keys)

    start_time = Time.current

    response = llm_provider.auto_detect_merchants(
      transactions: transactions,
      user_merchants: merchants,
      model: model
    )

    latency_ms = ((Time.current - start_time) * 1000).to_i

    if response.success?
      items.map do |item|
        detection = response.data.find { |m| m.transaction_id.to_s == item["id"].to_s }
        actual_name = normalize_null(detection&.business_name)
        actual_url = normalize_null(detection&.business_url)
        expected_name = item.dig("expectedOutput", "business_name")
        expected_url = item.dig("expectedOutput", "business_url")

        name_match = actual_name == expected_name
        url_match = normalize_url(actual_url) == normalize_url(expected_url)
        correct = name_match && url_match
        score_value = correct ? 1.0 : 0.0

        # Create trace and score in Langfuse
        actual_output = { business_name: actual_name, business_url: actual_url }
        trace_id = create_trace_for_item(item, actual_output, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_output, item["expectedOutput"])

        {
          item_id: item["id"],
          expected: { name: expected_name, url: expected_url },
          actual: { name: actual_name, url: actual_url },
          correct: correct,
          latency_ms: latency_ms / items.size # approximate per-item share of the batch call
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  def process_chat_batch(items)
    # Chat is processed one at a time due to function calling complexity
    items.map do |item|
      process_chat_item(item)
    end
  end

  def process_chat_item(item)
    prompt = item.dig("input", "prompt")
    expected_functions = item.dig("expectedOutput", "functions") || []

    start_time = Time.current

    response = llm_provider.chat_response(
      prompt,
      model: model,
      instructions: "You are a helpful personal finance assistant.",
      functions: build_available_functions
    )

    latency_ms = ((Time.current - start_time) * 1000).to_i

    actual_functions = extract_function_calls(response)
    correct = evaluate_function_match(actual_functions, expected_functions)
    score_value = correct ? 1.0 : 0.0

    # Create trace and score in Langfuse
    trace_id = create_trace_for_item(item, { functions: actual_functions }, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_functions, expected_functions)

    {
      item_id: item["id"],
      expected: expected_functions,
      actual: actual_functions,
      correct: correct,
      latency_ms: latency_ms
    }
  rescue => e
    handle_item_error(item, e)
  end

  def create_trace_for_item(item, output, latency_ms)
    trace_id = client.create_trace(
      name: "#{dataset.eval_type}_eval",
      input: item["input"],
      output: output,
      metadata: {
        run_name: @run_name,
        model: model,
        latency_ms: latency_ms,
        dataset_item_id: item["id"]
      }
    )

    Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item['id']}")
    trace_id
  end

  def score_result(trace_id, item_id, score_value, correct, actual, expected)
    return unless trace_id

    # Score the accuracy
    client.create_score(
      trace_id: trace_id,
      name: "accuracy",
      value: score_value,
      comment: correct ? "Correct" : "Expected: #{expected.inspect}, Got: #{actual.inspect}"
    )

    # Link to dataset run
    client.create_dataset_run_item(
      run_name: @run_name,
      dataset_item_id: item_id,
      trace_id: trace_id,
      metadata: {
        correct: correct,
        actual: actual,
        expected: expected
      }
    )
  rescue => e
    Rails.logger.warn("[Langfuse Experiment] Failed to score item #{item_id}: #{e.message}")
  end

  def handle_batch_error(items, error)
    error_message = error.is_a?(Exception) ? error.message : error.to_s
    Rails.logger.error("[Langfuse Experiment] Batch error: #{error_message}")

    items.map do |item|
      {
        item_id: item["id"],
        expected: item["expectedOutput"],
        actual: { error: error_message },
        correct: false,
        latency_ms: 0
      }
    end
  end

  def handle_item_error(item, error)
    Rails.logger.error("[Langfuse Experiment] Item #{item['id']} error: #{error.message}")

    {
      item_id: item["id"],
      expected: item["expectedOutput"],
      actual: { error: error.message },
      correct: false,
      latency_ms: 0
    }
  end

  def calculate_metrics(results)
    total = results.size

    # Guard against empty results to avoid division by zero
    if total.zero?
      return {
        accuracy: 0.0,
        total: 0,
        correct: 0,
        incorrect: 0,
        avg_latency_ms: 0
      }
    end

    correct = results.count { |r| r[:correct] }
    avg_latency = results.sum { |r| r[:latency_ms] } / total.to_f

    {
      accuracy: (correct.to_f / total * 100).round(2),
      total: total,
      correct: correct,
      incorrect: total - correct,
      avg_latency_ms: avg_latency.round(0)
    }
  end
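
  # Worked example: 3 results with 2 correct and latencies of 100/200/300 ms
  # yield accuracy (2.0 / 3 * 100).round(2) => 66.67, incorrect 1, and
  # avg_latency_ms (600 / 3.0).round(0) => 200.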

  def llm_provider
    @llm_provider ||= build_provider
  end

  def build_provider
    case provider
    when "openai"
      access_token = provider_config[:access_token] ||
        ENV["OPENAI_ACCESS_TOKEN"] ||
        Setting.openai_access_token

      raise "OpenAI access token not configured" unless access_token.present?

      uri_base = provider_config[:uri_base] ||
        ENV["OPENAI_URI_BASE"] ||
        Setting.openai_uri_base

      Provider::Openai.new(access_token, uri_base: uri_base, model: model)
    else
      raise "Unsupported provider: #{provider}"
    end
  end
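
  # Since uri_base is overridable, provider_config can point a run at a local
  # OpenAI-compatible endpoint (sketch; the URI and model name are illustrative):
  #
  #   Eval::Langfuse::ExperimentRunner.new(
  #     dataset,
  #     model: "local-model",
  #     provider_config: { uri_base: "http://localhost:11434/v1", access_token: "..." }
  #   )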

  # Determine the effective JSON mode for a batch based on expected null ratio
  # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
  def json_mode_for_batch(items)
    # If a specific mode is configured (not "auto"), always use it
    configured_mode = provider_config[:json_mode]
    return configured_mode if configured_mode.present? && configured_mode != "auto"

    # Calculate expected null ratio for this batch
    expected_null_count = items.count { |item| item.dig("expectedOutput", "category_name").nil? }
    expected_null_ratio = expected_null_count.to_f / items.size

    # If >50% of the batch is expected to return null, force strict mode
    # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
    # and prevents unnecessary retries when nulls are legitimate
    if expected_null_ratio > 0.5
      Rails.logger.info("[Langfuse Experiment] Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode")
      "strict"
    else
      # Use auto mode - let the auto-categorizer decide
      "auto"
    end
  end
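
  # E.g. a batch of 10 items where 6 have a nil expectedOutput.category_name
  # has a null ratio of 0.6 > 0.5, so the call runs with json_mode "strict".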

  def normalize_null(value)
    return nil if value.nil?
    return nil if value == "null"
    return nil if value.to_s.strip.empty?
    value
  end
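
  # Treats nil, the literal string "null", and blank strings as missing:
  #
  #   normalize_null(nil)    # => nil
  #   normalize_null("null") # => nil
  #   normalize_null("   ")  # => nil
  #   normalize_null("Food") # => "Food"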

  def normalize_url(url)
    return nil if url.nil?

    # Strip surrounding whitespace first so the anchored scheme/www prefix
    # match and the trailing-slash chomp still apply to padded inputs
    url.to_s.strip.downcase
       .gsub(/^(https?:\/\/)?(www\.)?/, "")
       .chomp("/")
  end
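
  # Scheme, "www." prefix, case, and a trailing slash are all ignored:
  #
  #   normalize_url("https://www.Example.com/") # => "example.com"
  #   normalize_url("example.com")              # => "example.com"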

  def build_available_functions
    # Simplified function definitions for chat eval
    [
      {
        name: "get_accounts",
        description: "Get user's financial accounts",
        params_schema: { type: "object", properties: {}, required: [] }
      },
      {
        name: "get_transactions",
        description: "Get transactions with optional filters",
        params_schema: {
          type: "object",
          properties: {
            account_id: { type: "string" },
            start_date: { type: "string" },
            end_date: { type: "string" },
            category: { type: "string" }
          }
        }
      },
      {
        name: "get_balance_summary",
        description: "Get balance summary across accounts",
        params_schema: { type: "object", properties: {} }
      },
      {
        name: "get_spending_by_category",
        description: "Get spending breakdown by category",
        params_schema: {
          type: "object",
          properties: {
            start_date: { type: "string" },
            end_date: { type: "string" }
          }
        }
      }
    ]
  end

  def extract_function_calls(response)
    return [] unless response.respond_to?(:messages)

    response.messages.flat_map do |msg|
      next [] unless msg.respond_to?(:function_calls)
      msg.function_calls.map do |fc|
        { name: fc.name, arguments: fc.arguments }
      end
    end.compact
  end

  def evaluate_function_match(actual, expected)
    return true if expected.empty? && actual.empty?
    return false if expected.empty? != actual.empty?

    expected_names = expected.map { |f| f["name"] || f[:name] }.sort
    actual_names = actual.map { |f| f["name"] || f[:name] }.sort

    expected_names == actual_names
  end
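
  # Matching compares the sorted list of called function names only, ignoring
  # order and arguments, and accepts string or symbol keys:
  #
  #   evaluate_function_match([ { name: "get_accounts" } ],
  #                           [ { "name" => "get_accounts" } ]) # => true
  #   evaluate_function_match([], [ { "name" => "get_accounts" } ]) # => false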
end