Files
sure/app/models/eval/langfuse/experiment_runner.rb
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove Data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

469 lines
14 KiB
Ruby

class Eval::Langfuse::ExperimentRunner
attr_reader :dataset, :model, :provider, :client, :provider_config
BATCH_SIZE = 25
# Builds a runner for one dataset/model pair.
#
# dataset         - eval dataset; its name, sample_count and eval_type are read later
# model           - model identifier forwarded to the LLM provider
# provider        - provider key (defaults to "openai")
# client          - Langfuse client; a new Eval::Langfuse::Client is built when none is given
# provider_config - optional overrides, read elsewhere via :access_token, :uri_base, :json_mode
def initialize(dataset, model:, provider: "openai", client: nil, provider_config: {})
  @dataset, @model, @provider = dataset, model, provider
  @provider_config = provider_config
  @client = client || Eval::Langfuse::Client.new
end
# Runs the full experiment: export the dataset, fetch its items from Langfuse,
# evaluate them in batches and summarise the results.
#
# run_name - optional name for the run; one is generated when omitted
#
# Returns a Hash with :run_name, :dataset_name, :model, :samples_processed and :metrics.
def run(run_name: nil)
  @run_name = run_name || generate_run_name
  log = Rails.logger

  log.info("[Langfuse Experiment] Starting experiment '#{@run_name}'")
  log.info("[Langfuse Experiment] Dataset: #{dataset.name} (#{dataset.sample_count} samples)")
  log.info("[Langfuse Experiment] Model: #{model}")

  # The dataset must exist in Langfuse before its items can be fetched
  ensure_dataset_exported

  # Fetch the items, then evaluate them
  outcomes = process_items(fetch_langfuse_items)

  # Summarise and report
  summary = calculate_metrics(outcomes)
  log.info("[Langfuse Experiment] Experiment '#{@run_name}' complete")
  log.info("[Langfuse Experiment] Accuracy: #{summary[:accuracy]}%")

  {
    run_name: @run_name,
    dataset_name: langfuse_dataset_name,
    model: model,
    samples_processed: outcomes.size,
    metrics: summary
  }
end
private
# Default run name: "<dataset name>_<model with '/' replaced by '_'>_<YYYYmmdd_HHMMSS>".
def generate_run_name
  timestamp = Time.current.strftime('%Y%m%d_%H%M%S')
  model_slug = model.gsub('/', '_')
  format("%s_%s_%s", dataset.name, model_slug, timestamp)
end
# Name under which this dataset is addressed in Langfuse: "eval_" + the dataset's own name.
def langfuse_dataset_name
  format("eval_%s", dataset.name)
end
# Exports the dataset through Eval::Langfuse::DatasetExporter using this runner's client.
# Returns whatever the exporter's #export returns.
def ensure_dataset_exported
  Eval::Langfuse::DatasetExporter.new(dataset, client: client).export
end
# Fetches every item of the Langfuse dataset, one page at a time.
#
# Pages are requested until one comes back with fewer items than the page size.
# A nil response, or one without a "data" key, counts as an empty page and ends
# the loop instead of raising.
#
# Returns an Array of the raw item hashes in the order received.
def fetch_langfuse_items
  page_size = 50
  items = []
  page = 1

  loop do
    response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: page_size)
    batch = (response && response["data"]) || []
    items.concat(batch)

    # A short (or empty) page means there is nothing left to fetch
    break if batch.size < page_size

    page += 1
  end

  Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
  items
end
# Evaluates all items in slices of BATCH_SIZE, logging progress per slice.
#
# Returns one flat Array holding the per-item results of every batch, in order.
def process_items(items)
  batch_total = (items.size.to_f / BATCH_SIZE).ceil

  items.each_slice(BATCH_SIZE).each_with_index.flat_map do |slice, position|
    Rails.logger.info("[Langfuse Experiment] Processing batch #{position + 1}/#{batch_total}")
    process_batch(slice)
  end
end
# Routes a batch to the handler matching the dataset's eval_type.
#
# Raises RuntimeError for an eval type with no handler.
def process_batch(items)
  eval_type = dataset.eval_type
  handler = {
    "categorization" => :process_categorization_batch,
    "merchant_detection" => :process_merchant_detection_batch,
    "chat" => :process_chat_batch
  }[eval_type]

  raise "Unsupported eval type: #{eval_type}" unless handler

  send(handler, items)
end
# Sends one batch of transactions through the provider's auto-categorizer and
# scores each item against its expected category.
#
# items - Langfuse dataset item hashes ("id", "input", "expectedOutput")
#
# Returns an Array with one result Hash per item (:item_id, :expected, :actual,
# :correct, :latency_ms). An unsuccessful response or any exception yields the
# output of handle_batch_error instead. An empty batch returns [] without
# calling the provider.
def process_categorization_batch(items)
  return [] if items.empty?

  transactions = items.map do |item|
    input = item["input"]
    txn = input["transaction"] || input
    txn.deep_symbolize_keys.merge(id: item["id"])
  end

  # The category list is read from the first item and used for the whole batch
  categories = (items.first.dig("input", "categories") || []).map(&:deep_symbolize_keys)

  # Determine effective JSON mode for this batch
  # If the batch has many expected nulls, force strict mode to prevent false retries
  effective_json_mode = json_mode_for_batch(items)

  # Monotonic clock: wall-clock adjustments during the call cannot skew the latency
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
  response = llm_provider.auto_categorize(
    transactions: transactions,
    user_categories: categories,
    model: model,
    json_mode: effective_json_mode
  )
  latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - started_ms

  return handle_batch_error(items, response.error) unless response.success?

  # Index once by transaction id; the first entry per id wins, as a linear find would
  by_transaction_id = response.data.each_with_object({}) do |categorization, index|
    index[categorization.transaction_id.to_s] ||= categorization
  end

  items.map do |item|
    categorization = by_transaction_id[item["id"].to_s]
    actual_category = normalize_null(categorization&.category_name)
    expected_category = item.dig("expectedOutput", "category_name")
    correct = actual_category == expected_category
    score_value = correct ? 1.0 : 0.0

    # Create trace and score in Langfuse (the trace records the whole-batch latency)
    trace_id = create_trace_for_item(item, actual_category, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)

    {
      item_id: item["id"],
      expected: expected_category,
      actual: actual_category,
      correct: correct,
      # Per-item share of the batch latency (integer division)
      latency_ms: latency_ms / items.size
    }
  end
rescue => e
  handle_batch_error(items, e)
end
# Sends one batch of transactions through the provider's merchant detector and
# scores each item on both business name and business URL.
#
# items - Langfuse dataset item hashes ("id", "input", "expectedOutput")
#
# Returns an Array with one result Hash per item (:item_id, :expected, :actual,
# :correct, :latency_ms). An unsuccessful response or any exception yields the
# output of handle_batch_error instead. An empty batch returns [] without
# calling the provider.
def process_merchant_detection_batch(items)
  return [] if items.empty?

  transactions = items.map do |item|
    input = item["input"]
    txn = input["transaction"] || input
    txn.deep_symbolize_keys.merge(id: item["id"])
  end

  # The merchant list is read from the first item and used for the whole batch
  merchants = (items.first.dig("input", "merchants") || []).map(&:deep_symbolize_keys)

  # Monotonic clock: wall-clock adjustments during the call cannot skew the latency
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
  response = llm_provider.auto_detect_merchants(
    transactions: transactions,
    user_merchants: merchants,
    model: model
  )
  latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - started_ms

  return handle_batch_error(items, response.error) unless response.success?

  # Index once by transaction id; the first entry per id wins, as a linear find would
  by_transaction_id = response.data.each_with_object({}) do |detection, index|
    index[detection.transaction_id.to_s] ||= detection
  end

  items.map do |item|
    detection = by_transaction_id[item["id"].to_s]
    actual_name = normalize_null(detection&.business_name)
    actual_url = normalize_null(detection&.business_url)
    expected_name = item.dig("expectedOutput", "business_name")
    expected_url = item.dig("expectedOutput", "business_url")

    # Names must match exactly; URLs are compared after normalize_url
    name_match = actual_name == expected_name
    url_match = normalize_url(actual_url) == normalize_url(expected_url)
    correct = name_match && url_match
    score_value = correct ? 1.0 : 0.0

    # Create trace and score in Langfuse (the trace records the whole-batch latency)
    actual_output = { business_name: actual_name, business_url: actual_url }
    trace_id = create_trace_for_item(item, actual_output, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_output, item["expectedOutput"])

    {
      item_id: item["id"],
      expected: { name: expected_name, url: expected_url },
      actual: { name: actual_name, url: actual_url },
      correct: correct,
      # Per-item share of the batch latency (integer division)
      latency_ms: latency_ms / items.size
    }
  end
rescue => e
  handle_batch_error(items, e)
end
# Chat items are evaluated individually (function calling makes batching impractical),
# so this maps each item through process_chat_item.
def process_chat_batch(items)
  items.map(&method(:process_chat_item))
end
# Evaluates a single chat item: sends its prompt to the provider with the eval's
# function definitions and checks the called functions against the expected ones.
#
# item - Langfuse dataset item hash ("id", "input" => { "prompt" }, "expectedOutput" => { "functions" })
#
# Returns a result Hash (:item_id, :expected, :actual, :correct, :latency_ms).
# Any exception, or a provider response that reports itself unsuccessful, yields
# the output of handle_item_error instead.
def process_chat_item(item)
  prompt = item.dig("input", "prompt")
  expected_functions = item.dig("expectedOutput", "functions") || []

  # Monotonic clock: wall-clock adjustments during the call cannot skew the latency
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
  response = llm_provider.chat_response(
    prompt,
    model: model,
    instructions: "You are a helpful personal finance assistant.",
    functions: build_available_functions
  )
  latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - started_ms

  # The batch evaluators check `success?` before scoring; do the same here so a
  # failed provider call is not scored as "no function calls" (a false positive
  # whenever no functions are expected). Only applies when the response exposes
  # `success?`.
  if response.respond_to?(:success?) && !response.success?
    failure = response.respond_to?(:error) ? response.error : nil
    unless failure.is_a?(Exception)
      message = failure.to_s
      message = "Provider returned an unsuccessful response" if message.empty?
      failure = RuntimeError.new(message)
    end
    return handle_item_error(item, failure)
  end

  # NOTE(review): extract_function_calls receives the provider's return value
  # as-is and expects it to expose #messages - confirm against chat_response.
  actual_functions = extract_function_calls(response)
  correct = evaluate_function_match(actual_functions, expected_functions)
  score_value = correct ? 1.0 : 0.0

  # Create trace and score in Langfuse
  trace_id = create_trace_for_item(item, { functions: actual_functions }, latency_ms)
  score_result(trace_id, item["id"], score_value, correct, actual_functions, expected_functions)

  {
    item_id: item["id"],
    expected: expected_functions,
    actual: actual_functions,
    correct: correct,
    latency_ms: latency_ms
  }
rescue => e
  handle_item_error(item, e)
end
# Records one evaluated item as a Langfuse trace.
#
# item       - dataset item hash; its "input" and "id" are attached to the trace
# output     - the model's (normalised) answer for the item
# latency_ms - latency figure stored in the trace metadata
#
# Returns the trace id from the client, or nil when the upload fails. Trace upload
# is best-effort: this runs inside the batch evaluators, whose rescue would
# otherwise turn a telemetry failure into an error result for every item in the
# batch. score_result skips nil trace ids.
def create_trace_for_item(item, output, latency_ms)
  trace_id = client.create_trace(
    name: "#{dataset.eval_type}_eval",
    input: item["input"],
    output: output,
    metadata: {
      run_name: @run_name,
      model: model,
      latency_ms: latency_ms,
      dataset_item_id: item["id"]
    }
  )
  Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item['id']}")
  trace_id
rescue => e
  Rails.logger.warn("[Langfuse Experiment] Failed to create trace for item #{item['id']}: #{e.message}")
  nil
end
# Attaches an "accuracy" score to a trace and links the trace to the current
# dataset run. Does nothing without a trace id. Failures are logged as warnings
# and never raised.
def score_result(trace_id, item_id, score_value, correct, actual, expected)
  return unless trace_id

  # Score the accuracy
  verdict =
    if correct
      "Correct"
    else
      "Expected: #{expected.inspect}, Got: #{actual.inspect}"
    end
  client.create_score(trace_id: trace_id, name: "accuracy", value: score_value, comment: verdict)

  # Link to dataset run
  run_item_metadata = { correct: correct, actual: actual, expected: expected }
  client.create_dataset_run_item(
    run_name: @run_name,
    dataset_item_id: item_id,
    trace_id: trace_id,
    metadata: run_item_metadata
  )
rescue => e
  Rails.logger.warn("[Langfuse Experiment] Failed to score item #{item_id}: #{e.message}")
end
# Logs a batch-level failure and returns one failed result per item.
#
# error - an Exception (its message is used) or any other object (its to_s is used)
def handle_batch_error(items, error)
  error_message =
    case error
    when Exception then error.message
    else error.to_s
    end
  Rails.logger.error("[Langfuse Experiment] Batch error: #{error_message}")

  items.map do |failed_item|
    {
      item_id: failed_item["id"],
      expected: failed_item["expectedOutput"],
      actual: { error: error_message },
      correct: false,
      latency_ms: 0
    }
  end
end
# Logs a single-item failure and returns a failed result for that item.
#
# error - must respond to #message
def handle_item_error(item, error)
  item_id = item["id"]
  Rails.logger.error("[Langfuse Experiment] Item #{item_id} error: #{error.message}")

  failure = { error: error.message }
  { item_id: item_id, expected: item["expectedOutput"], actual: failure, correct: false, latency_ms: 0 }
end
# Summarises per-item results into accuracy and latency figures.
#
# results - Array of result hashes carrying :correct and :latency_ms
#
# Returns a Hash with :accuracy (percentage, 2 decimals), :total, :correct,
# :incorrect and :avg_latency_ms (rounded to a whole number). An empty input
# yields all-zero metrics rather than dividing by zero.
def calculate_metrics(results)
  total = results.size
  return { accuracy: 0.0, total: 0, correct: 0, incorrect: 0, avg_latency_ms: 0 } if total.zero?

  hits, misses = results.partition { |row| row[:correct] }
  latency_sum = results.sum { |row| row[:latency_ms] }

  {
    accuracy: (hits.size.to_f / total * 100).round(2),
    total: total,
    correct: hits.size,
    incorrect: misses.size,
    avg_latency_ms: (latency_sum / total.to_f).round(0)
  }
end
# Memoised provider instance; built on first use via build_provider.
def llm_provider
  @llm_provider = build_provider unless @llm_provider
  @llm_provider
end
# Instantiates the LLM provider selected by `provider`.
#
# The access token is taken from the first non-blank source among
# provider_config[:access_token], ENV["OPENAI_ACCESS_TOKEN"] and
# Setting.openai_access_token. Blank values are skipped so that an empty env var
# or config entry no longer hides a token configured further down the chain.
# The URI base keeps the original precedence: config, then ENV, then Setting.
#
# Raises RuntimeError when no token is configured or the provider is unsupported.
def build_provider
  case provider
  when "openai"
    access_token = provider_config[:access_token].presence ||
      ENV["OPENAI_ACCESS_TOKEN"].presence ||
      Setting.openai_access_token

    raise "OpenAI access token not configured" unless access_token.present?

    uri_base = provider_config[:uri_base] ||
      ENV["OPENAI_URI_BASE"] ||
      Setting.openai_uri_base

    Provider::Openai.new(access_token, uri_base: uri_base, model: model)
  else
    raise "Unsupported provider: #{provider}"
  end
end
# Picks the JSON mode to request for a categorization batch.
#
# An explicitly configured mode other than "auto" always wins. Otherwise the share
# of items whose expected category is nil decides: above 50% the batch is forced
# to "strict" (mirrors AUTO_MODE_NULL_THRESHOLD in the auto-categorizer) so that
# legitimate nulls do not trigger retries; in every other case "auto" is returned
# and the auto-categorizer decides.
def json_mode_for_batch(items)
  explicit_mode = provider_config[:json_mode]
  return explicit_mode if explicit_mode.present? && explicit_mode != "auto"

  null_total = items.count { |item| item.dig("expectedOutput", "category_name").nil? }
  null_share = null_total.to_f / items.size

  # `unless >` (rather than `if <=`) keeps the empty-batch NaN case on the "auto" path
  return "auto" unless null_share > 0.5

  Rails.logger.info("[Langfuse Experiment] Batch has #{(null_share * 100).round}% expected nulls, forcing strict mode")
  "strict"
end
# Collapses every "no value" spelling to nil: nil itself, the string "null",
# and anything whose string form is empty or whitespace-only.
# Any other value is returned unchanged.
def normalize_null(value)
  missing = value.nil? || value == "null" || value.to_s.strip.empty?
  missing ? nil : value
end
# Canonicalises a URL for comparison: trims surrounding whitespace, lowercases,
# drops a leading "http://" or "https://" and/or "www.", and removes one trailing
# slash.
#
# Whitespace is stripped first. Stripping last left padded input untouched,
# because neither the prefix pattern nor the trailing-slash chomp matched. The
# prefix is anchored to the start of the string (\A), not to every line start.
#
# Returns nil for nil, otherwise a String.
def normalize_url(url)
  return nil if url.nil?

  url.to_s.strip.downcase
     .sub(%r{\A(https?://)?(www\.)?}, "")
     .chomp("/")
end
# Simplified function definitions offered to the model during chat evals.
#
# Returns an Array of Hashes, each with :name, :description and :params_schema.
def build_available_functions
  # Fresh hashes on every call so no schema fragment is shared between entries
  string_field = -> { { type: "string" } }
  date_window = -> { { start_date: string_field.call, end_date: string_field.call } }

  # [name, description, schema properties, extra schema keys]
  specs = [
    ["get_accounts", "Get user's financial accounts", {}, { required: [] }],
    ["get_transactions", "Get transactions with optional filters",
     { account_id: string_field.call }.merge(date_window.call).merge(category: string_field.call), {}],
    ["get_balance_summary", "Get balance summary across accounts", {}, {}],
    ["get_spending_by_category", "Get spending breakdown by category", date_window.call, {}]
  ]

  specs.map do |name, description, properties, schema_extras|
    {
      name: name,
      description: description,
      params_schema: { type: "object", properties: properties }.merge(schema_extras)
    }
  end
end
# Collects the function calls found on a chat response.
#
# Returns [] when the response has no #messages. Messages without
# #function_calls are skipped. Each call becomes { name:, arguments: }.
def extract_function_calls(response)
  return [] unless response.respond_to?(:messages)

  response.messages.each_with_object([]) do |message, collected|
    next unless message.respond_to?(:function_calls)

    message.function_calls.each do |call|
      collected << { name: call.name, arguments: call.arguments }
    end
  end
end
# True when the model called exactly the expected functions, compared by name only
# (order ignored, duplicates counted, arguments not inspected). Entries may use a
# "name" string key or a :name symbol key.
def evaluate_function_match(actual, expected)
  # If either side is empty, they match only when both are
  return expected.empty? && actual.empty? if expected.empty? || actual.empty?

  sorted_names = ->(calls) { calls.map { |fn| fn["name"] || fn[:name] }.sort }
  sorted_names.call(expected) == sorted_names.call(actual)
end
end