Small LLMs improvements (#400)
* Initial implementation
* FIX keys
* Add langfuse evals support
* FIX trace upload
* Delete .claude/settings.local.json
  Signed-off-by: soky srm <sokysrm@gmail.com>
* Update client.rb
* Small LLMs improvements
* Keep batch size normal
* Update categorizer
* FIX json mode
* Add reasonable alternative to matching
* FIX thinking blocks for LLMs
* Implement json mode support with AUTO mode
* Make auto default for everyone
* FIX linter
* Address review
* Allow export of manual categories
* FIX user export
* FIX oneshot example pollution
* Update categorization_golden_v1.yml
* Update categorization_golden_v1.yml
* Trim to 100 items
* Update auto_categorizer.rb
* FIX for auto retry in auto mode
* Separate the eval logic from the auto-categorizer
  The expected_null_count parameter conflates eval-specific logic with production categorization logic.
* Force json mode on evals
* Introduce a more mixed dataset
  150 items; performance from a local model, by difficulty:
    easy:      93.22% accuracy (55/59)
    medium:    93.33% accuracy (42/45)
    hard:      92.86% accuracy (26/28)
    edge_case: 100.0% accuracy (18/18)
* Improve datasets
  Remove data leakage from prompts
* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
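For orientation, a minimal invocation sketch for the runner added below. The constructor signature and the shape of the `run` return value come from the diff; the dataset lookup (`Eval::Dataset`) and the model name are illustrative assumptions, not part of this commit:

    # Hypothetical dataset model; the runner only needs an object responding
    # to name, sample_count, and eval_type.
    dataset = Eval::Dataset.find_by!(name: "categorization_golden_v1")
    runner = Eval::Langfuse::ExperimentRunner.new(dataset, model: "gpt-4o-mini") # model name illustrative
    result = runner.run
    puts result[:metrics][:accuracy] # accuracy percentage, per calculate_metrics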
app/models/eval/langfuse/experiment_runner.rb (Normal file, 468 lines)
@@ -0,0 +1,468 @@
class Eval::Langfuse::ExperimentRunner
  attr_reader :dataset, :model, :provider, :client, :provider_config

  BATCH_SIZE = 25

  def initialize(dataset, model:, provider: "openai", client: nil, provider_config: {})
    @dataset = dataset
    @model = model
    @provider = provider
    @client = client || Eval::Langfuse::Client.new
    @provider_config = provider_config
  end

  def run(run_name: nil)
    @run_name = run_name || generate_run_name

    Rails.logger.info("[Langfuse Experiment] Starting experiment '#{@run_name}'")
    Rails.logger.info("[Langfuse Experiment] Dataset: #{dataset.name} (#{dataset.sample_count} samples)")
    Rails.logger.info("[Langfuse Experiment] Model: #{model}")

    # Ensure dataset exists in Langfuse
    ensure_dataset_exported

    # Get dataset items from Langfuse
    items = fetch_langfuse_items

    # Run the experiment
    results = process_items(items)

    # Calculate and report metrics
    metrics = calculate_metrics(results)

    Rails.logger.info("[Langfuse Experiment] Experiment '#{@run_name}' complete")
    Rails.logger.info("[Langfuse Experiment] Accuracy: #{metrics[:accuracy]}%")

    {
      run_name: @run_name,
      dataset_name: langfuse_dataset_name,
      model: model,
      samples_processed: results.size,
      metrics: metrics
    }
  end

  private

  def generate_run_name
    "#{dataset.name}_#{model.gsub('/', '_')}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
  end

  def langfuse_dataset_name
    "eval_#{dataset.name}"
  end

  def ensure_dataset_exported
    exporter = Eval::Langfuse::DatasetExporter.new(dataset, client: client)
    exporter.export
  end

  def fetch_langfuse_items
    items = []
    page = 1

    loop do
      response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: 50)
      batch = response["data"] || []
      items.concat(batch)

      break if batch.size < 50

      page += 1
    end

    Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
    items
  end
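
  # NOTE: pagination above stops once a page returns fewer than the requested
  # 50 items; an exactly full final page therefore costs one extra, empty request.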

  def process_items(items)
    results = []

    items.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
      Rails.logger.info("[Langfuse Experiment] Processing batch #{batch_idx + 1}/#{(items.size.to_f / BATCH_SIZE).ceil}")

      batch_results = process_batch(batch)
      results.concat(batch_results)
    end

    results
  end

  def process_batch(items)
    case dataset.eval_type
    when "categorization"
      process_categorization_batch(items)
    when "merchant_detection"
      process_merchant_detection_batch(items)
    when "chat"
      process_chat_batch(items)
    else
      raise "Unsupported eval type: #{dataset.eval_type}"
    end
  end

  def process_categorization_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end

    categories = items.first.dig("input", "categories") || []
    categories = categories.map(&:deep_symbolize_keys)

    # Determine effective JSON mode for this batch
    # If the batch has many expected nulls, force strict mode to prevent false retries
    effective_json_mode = json_mode_for_batch(items)

    start_time = Time.current

    response = llm_provider.auto_categorize(
      transactions: transactions,
      user_categories: categories,
      model: model,
      json_mode: effective_json_mode
    )

    latency_ms = ((Time.current - start_time) * 1000).to_i

    if response.success?
      items.map do |item|
        categorization = response.data.find { |c| c.transaction_id.to_s == item["id"].to_s }
        actual_category = normalize_null(categorization&.category_name)
        expected_category = item.dig("expectedOutput", "category_name")

        correct = actual_category == expected_category
        score_value = correct ? 1.0 : 0.0

        # Create trace and score in Langfuse
        trace_id = create_trace_for_item(item, actual_category, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)

        {
          item_id: item["id"],
          expected: expected_category,
          actual: actual_category,
          correct: correct,
          latency_ms: latency_ms / items.size
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end
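
  # NOTE: the whole batch shares a single LLM call, so the per-item latency_ms
  # reported above is the batch latency divided evenly across items, not an
  # individually measured value.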

  def process_merchant_detection_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end

    merchants = items.first.dig("input", "merchants") || []
    merchants = merchants.map(&:deep_symbolize_keys)

    start_time = Time.current

    response = llm_provider.auto_detect_merchants(
      transactions: transactions,
      user_merchants: merchants,
      model: model
    )

    latency_ms = ((Time.current - start_time) * 1000).to_i

    if response.success?
      items.map do |item|
        detection = response.data.find { |m| m.transaction_id.to_s == item["id"].to_s }
        actual_name = normalize_null(detection&.business_name)
        actual_url = normalize_null(detection&.business_url)
        expected_name = item.dig("expectedOutput", "business_name")
        expected_url = item.dig("expectedOutput", "business_url")

        name_match = actual_name == expected_name
        url_match = normalize_url(actual_url) == normalize_url(expected_url)
        correct = name_match && url_match
        score_value = correct ? 1.0 : 0.0

        # Create trace and score in Langfuse
        actual_output = { business_name: actual_name, business_url: actual_url }
        trace_id = create_trace_for_item(item, actual_output, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_output, item["expectedOutput"])

        {
          item_id: item["id"],
          expected: { name: expected_name, url: expected_url },
          actual: { name: actual_name, url: actual_url },
          correct: correct,
          latency_ms: latency_ms / items.size
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  def process_chat_batch(items)
    # Chat is processed one item at a time due to function calling complexity
    items.map do |item|
      process_chat_item(item)
    end
  end

  def process_chat_item(item)
    prompt = item.dig("input", "prompt")
    expected_functions = item.dig("expectedOutput", "functions") || []

    start_time = Time.current

    response = llm_provider.chat_response(
      prompt,
      model: model,
      instructions: "You are a helpful personal finance assistant.",
      functions: build_available_functions
    )

    latency_ms = ((Time.current - start_time) * 1000).to_i

    actual_functions = extract_function_calls(response)
    correct = evaluate_function_match(actual_functions, expected_functions)
    score_value = correct ? 1.0 : 0.0

    # Create trace and score in Langfuse
    trace_id = create_trace_for_item(item, { functions: actual_functions }, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_functions, expected_functions)

    {
      item_id: item["id"],
      expected: expected_functions,
      actual: actual_functions,
      correct: correct,
      latency_ms: latency_ms
    }
  rescue => e
    handle_item_error(item, e)
  end

  def create_trace_for_item(item, output, latency_ms)
    trace_id = client.create_trace(
      name: "#{dataset.eval_type}_eval",
      input: item["input"],
      output: output,
      metadata: {
        run_name: @run_name,
        model: model,
        latency_ms: latency_ms,
        dataset_item_id: item["id"]
      }
    )

    Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item['id']}")
    trace_id
  end

  def score_result(trace_id, item_id, score_value, correct, actual, expected)
    return unless trace_id

    # Score the accuracy
    client.create_score(
      trace_id: trace_id,
      name: "accuracy",
      value: score_value,
      comment: correct ? "Correct" : "Expected: #{expected.inspect}, Got: #{actual.inspect}"
    )

    # Link to dataset run
    client.create_dataset_run_item(
      run_name: @run_name,
      dataset_item_id: item_id,
      trace_id: trace_id,
      metadata: {
        correct: correct,
        actual: actual,
        expected: expected
      }
    )
  rescue => e
    Rails.logger.warn("[Langfuse Experiment] Failed to score item #{item_id}: #{e.message}")
  end

  def handle_batch_error(items, error)
    error_message = error.is_a?(Exception) ? error.message : error.to_s
    Rails.logger.error("[Langfuse Experiment] Batch error: #{error_message}")

    items.map do |item|
      {
        item_id: item["id"],
        expected: item["expectedOutput"],
        actual: { error: error_message },
        correct: false,
        latency_ms: 0
      }
    end
  end

  def handle_item_error(item, error)
    Rails.logger.error("[Langfuse Experiment] Item #{item['id']} error: #{error.message}")

    {
      item_id: item["id"],
      expected: item["expectedOutput"],
      actual: { error: error.message },
      correct: false,
      latency_ms: 0
    }
  end

  def calculate_metrics(results)
    total = results.size

    # Guard against empty results to avoid division by zero
    if total.zero?
      return {
        accuracy: 0.0,
        total: 0,
        correct: 0,
        incorrect: 0,
        avg_latency_ms: 0
      }
    end

    correct = results.count { |r| r[:correct] }
    avg_latency = results.sum { |r| r[:latency_ms] } / total.to_f

    {
      accuracy: (correct.to_f / total * 100).round(2),
      total: total,
      correct: correct,
      incorrect: total - correct,
      avg_latency_ms: avg_latency.round(0)
    }
  end

  def llm_provider
    @llm_provider ||= build_provider
  end

  def build_provider
    case provider
    when "openai"
      access_token = provider_config[:access_token] ||
        ENV["OPENAI_ACCESS_TOKEN"] ||
        Setting.openai_access_token

      raise "OpenAI access token not configured" unless access_token.present?

      uri_base = provider_config[:uri_base] ||
        ENV["OPENAI_URI_BASE"] ||
        Setting.openai_uri_base

      Provider::Openai.new(access_token, uri_base: uri_base, model: model)
    else
      raise "Unsupported provider: #{provider}"
    end
  end

  # Determine the effective JSON mode for a batch based on its expected null ratio
  # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
  def json_mode_for_batch(items)
    # If a specific mode is configured (not "auto"), always use it
    configured_mode = provider_config[:json_mode]
    return configured_mode if configured_mode.present? && configured_mode != "auto"

    # Calculate the expected null ratio for this batch
    expected_null_count = items.count { |item| item.dig("expectedOutput", "category_name").nil? }
    expected_null_ratio = expected_null_count.to_f / items.size

    # If >50% of the batch is expected to return null, force strict mode
    # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
    # and prevents unnecessary retries when nulls are legitimate
    if expected_null_ratio > 0.5
      Rails.logger.info("[Langfuse Experiment] Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode")
      "strict"
    else
      # Use auto mode - let the auto-categorizer decide
      "auto"
    end
  end
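
  # Worked example (illustrative numbers): a 25-item batch with 13 expected
  # nulls has a null ratio of 13 / 25 = 0.52 > 0.5, so it runs in "strict"
  # mode; with 12 expected nulls (ratio 0.48) it stays on "auto".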

  def normalize_null(value)
    return nil if value.nil?
    return nil if value == "null"
    return nil if value.to_s.strip.empty?
    value
  end

  def normalize_url(url)
    return nil if url.nil?
    url.to_s.downcase
      .gsub(/^(https?:\/\/)?(www\.)?/, "")
      .chomp("/")
      .strip
  end
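
  # Example: "HTTPS://www.Example.com/" and "example.com" both normalize to
  # "example.com", so scheme, casing, a "www." prefix, and a trailing slash
  # do not affect the URL comparison.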

  def build_available_functions
    # Simplified function definitions for the chat eval
    [
      {
        name: "get_accounts",
        description: "Get user's financial accounts",
        params_schema: { type: "object", properties: {}, required: [] }
      },
      {
        name: "get_transactions",
        description: "Get transactions with optional filters",
        params_schema: {
          type: "object",
          properties: {
            account_id: { type: "string" },
            start_date: { type: "string" },
            end_date: { type: "string" },
            category: { type: "string" }
          }
        }
      },
      {
        name: "get_balance_summary",
        description: "Get balance summary across accounts",
        params_schema: { type: "object", properties: {} }
      },
      {
        name: "get_spending_by_category",
        description: "Get spending breakdown by category",
        params_schema: {
          type: "object",
          properties: {
            start_date: { type: "string" },
            end_date: { type: "string" }
          }
        }
      }
    ]
  end

  def extract_function_calls(response)
    return [] unless response.respond_to?(:messages)

    response.messages.flat_map do |msg|
      next [] unless msg.respond_to?(:function_calls)
      msg.function_calls.map do |fc|
        { name: fc.name, arguments: fc.arguments }
      end
    end.compact
  end

  def evaluate_function_match(actual, expected)
    return true if expected.empty? && actual.empty?
    return false if expected.empty? != actual.empty?

    expected_names = expected.map { |f| f["name"] || f[:name] }.sort
    actual_names = actual.map { |f| f["name"] || f[:name] }.sort

    expected_names == actual_names
  end
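
  # Matching compares sorted function names only: arguments and call order
  # are ignored, so a get_accounts call matches the expectation
  # [{ "name" => "get_accounts" }] regardless of its arguments.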
end