Files
sure/db/eval_data/chat_golden_v1.yml
soky srm 88952e4714 Small llms improvements (#400)
* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove Data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
2025-12-07 18:11:34 +01:00

826 lines
19 KiB
YAML

---
# Golden dataset for evaluating chat/assistant function calling.
#
# Layout of each entry under `samples`:
#   id          identifier of the form chat_<difficulty>_<nnn>
#   difficulty  easy | medium | hard | edge_case
#   tags        short labels describing the sample
#   input       mapping holding the `prompt` text
#   expected    `functions` (list of name + params; an empty list is used
#               for prompts where no function is needed) and
#               `response_contains`
#
# NOTE(review): `response_contains` is assumed to live under `expected`
# next to `functions` - confirm against the dataset loader.
# NOTE(review): the samples leave `params` as {} and `response_contains`
# as [] - presumably only the called function names are scored; confirm
# against the eval runner.
name: chat_golden_v1
description: Golden dataset for chat/assistant function calling evaluation
eval_type: chat
version: "1.0"
metadata:
  created_at: "2024-12-01"
  source: manual_curation
samples:
# ===== EASY - Simple single function calls =====
# Each prompt maps to exactly one expected function with empty params.
- id: chat_easy_001
  difficulty: easy
  tags: [get_accounts, simple]
  input:
    prompt: "What accounts do I have?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_easy_002
  difficulty: easy
  tags: [get_accounts, simple]
  input:
    prompt: "Show me my accounts"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_easy_003
  difficulty: easy
  tags: [get_accounts, balance]
  input:
    prompt: "What's my account balance?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_easy_004
  difficulty: easy
  tags: [get_transactions, simple]
  input:
    prompt: "Show me my recent transactions"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_easy_005
  difficulty: easy
  tags: [get_transactions, simple]
  input:
    prompt: "What are my latest transactions?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_easy_006
  difficulty: easy
  tags: [get_balance_sheet, simple]
  input:
    prompt: "What's my net worth?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_easy_007
  difficulty: easy
  tags: [get_balance_sheet, simple]
  input:
    prompt: "Show me my assets and liabilities"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_easy_008
  difficulty: easy
  tags: [get_income_statement, simple]
  input:
    prompt: "What were my expenses last month?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_easy_009
  difficulty: easy
  tags: [get_income_statement, simple]
  input:
    prompt: "How much income did I make this month?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_easy_010
  difficulty: easy
  tags: [get_accounts, simple]
  input:
    prompt: "How many accounts do I have?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_easy_011
  difficulty: easy
  tags: [get_transactions, simple]
  input:
    prompt: "List my transactions"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_easy_012
  difficulty: easy
  tags: [get_balance_sheet, simple]
  input:
    prompt: "How much do I owe?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_easy_013
  difficulty: easy
  tags: [get_balance_sheet, simple]
  input:
    prompt: "What are my total assets?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_easy_014
  difficulty: easy
  tags: [get_income_statement, simple]
  input:
    prompt: "Show my spending"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_easy_015
  difficulty: easy
  tags: [get_income_statement, simple]
  input:
    prompt: "How much did I spend?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
# ===== MEDIUM - With filtering or specific parameters =====
# NOTE(review): these prompts imply a filter (merchant, category, amount,
# date range) yet every expected call keeps `params: {}` - presumably the
# evaluator matches on function name only; confirm before adding params.
- id: chat_medium_001
  difficulty: medium
  tags: [get_transactions, filtering]
  input:
    prompt: "Show me my restaurant spending"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_002
  difficulty: medium
  tags: [get_transactions, filtering]
  input:
    prompt: "What did I spend on groceries?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_003
  difficulty: medium
  tags: [get_transactions, filtering]
  input:
    prompt: "Show transactions over $100"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_004
  difficulty: medium
  tags: [get_transactions, filtering]
  input:
    prompt: "What did I spend at Amazon?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_005
  difficulty: medium
  tags: [get_transactions, date_range]
  input:
    prompt: "Show me last week's transactions"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_006
  difficulty: medium
  tags: [get_income_statement, date_range]
  input:
    prompt: "What was my income in January?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_medium_007
  difficulty: medium
  tags: [get_income_statement, comparison]
  input:
    prompt: "How much did I save last month?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_medium_008
  difficulty: medium
  tags: [get_accounts, specific]
  input:
    prompt: "What's the balance in my checking account?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_medium_009
  difficulty: medium
  tags: [get_accounts, specific]
  input:
    prompt: "How much do I have in savings?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_medium_010
  difficulty: medium
  tags: [get_transactions, category]
  input:
    prompt: "Show me all my subscription payments"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_011
  difficulty: medium
  tags: [get_transactions, search]
  input:
    prompt: "Find transactions from Uber"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_012
  difficulty: medium
  tags: [get_income_statement, category]
  input:
    prompt: "How much do I spend on entertainment?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_medium_013
  difficulty: medium
  tags: [get_balance_sheet, trend]
  input:
    prompt: "How has my net worth changed over time?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_medium_014
  difficulty: medium
  tags: [get_transactions, amount]
  input:
    prompt: "What's my largest expense this month?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_015
  difficulty: medium
  tags: [get_income_statement, breakdown]
  input:
    prompt: "Break down my expenses by category"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_medium_016
  difficulty: medium
  tags: [get_transactions, recurring]
  input:
    prompt: "Show me my recurring payments"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_017
  difficulty: medium
  tags: [get_accounts, credit]
  input:
    prompt: "What's my credit card balance?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_medium_018
  difficulty: medium
  tags: [get_income_statement, specific]
  input:
    prompt: "How much did I spend on food last month?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_medium_019
  difficulty: medium
  tags: [get_transactions, date]
  input:
    prompt: "Show transactions from December"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_020
  difficulty: medium
  tags: [get_balance_sheet, liability]
  input:
    prompt: "What are my debts?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
# ===== HARD - Analysis, comparisons, insights =====
# Tags here describe the kind of question (analysis, comparison, advice,
# forecast) rather than the expected function name.
- id: chat_hard_001
  difficulty: hard
  tags: [analysis, spending_trend]
  input:
    prompt: "Am I spending more than I make?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_002
  difficulty: hard
  tags: [comparison, month_over_month]
  input:
    prompt: "How does my spending this month compare to last month?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_003
  difficulty: hard
  tags: [analysis, budget]
  input:
    prompt: "Where can I cut expenses?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_004
  difficulty: hard
  tags: [analysis, savings]
  input:
    prompt: "What's my savings rate?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_005
  difficulty: hard
  tags: [analysis, trend]
  input:
    prompt: "Are my expenses trending up or down?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_006
  difficulty: hard
  tags: [analysis, category]
  input:
    prompt: "What category do I spend the most on?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_007
  difficulty: hard
  tags: [analysis, unusual]
  input:
    prompt: "Are there any unusual transactions this month?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_hard_008
  difficulty: hard
  tags: [analysis, debt]
  input:
    prompt: "How long will it take to pay off my credit card?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_hard_009
  difficulty: hard
  tags: [analysis, financial_health]
  input:
    prompt: "What's my debt-to-income ratio?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_hard_010
  difficulty: hard
  tags: [analysis, goals]
  input:
    prompt: "Can I afford to save $500 more per month?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_011
  difficulty: hard
  tags: [comparison, year_over_year]
  input:
    prompt: "How does this year compare to last year?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_012
  difficulty: hard
  tags: [analysis, pattern]
  input:
    prompt: "Do I have any spending patterns I should know about?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_hard_013
  difficulty: hard
  tags: [advice, budget]
  input:
    prompt: "How should I allocate my income?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_014
  difficulty: hard
  tags: [analysis, efficiency]
  input:
    prompt: "Am I overspending on subscriptions?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_hard_015
  difficulty: hard
  tags: [forecast, projection]
  input:
    prompt: "At this rate, how much will I have saved by year end?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
# ===== EDGE CASES - Unclear intent, no function needed =====
# `no_function` samples expect an empty `functions` list; the `ambiguous`
# samples (006-008) expect a single get_balance_sheet call.
- id: chat_edge_001
  difficulty: edge_case
  tags: [no_function, greeting]
  input:
    prompt: "Hello"
  expected:
    functions: []
    response_contains: []
- id: chat_edge_002
  difficulty: edge_case
  tags: [no_function, thanks]
  input:
    prompt: "Thank you!"
  expected:
    functions: []
    response_contains: []
- id: chat_edge_003
  difficulty: edge_case
  tags: [no_function, general]
  input:
    prompt: "What can you help me with?"
  expected:
    functions: []
    response_contains: []
- id: chat_edge_004
  difficulty: edge_case
  tags: [no_function, advice]
  input:
    prompt: "Should I invest in stocks?"
  expected:
    functions: []
    response_contains: []
- id: chat_edge_005
  difficulty: edge_case
  tags: [no_function, external]
  input:
    prompt: "What's the weather like?"
  expected:
    functions: []
    response_contains: []
- id: chat_edge_006
  difficulty: edge_case
  tags: [ambiguous]
  input:
    prompt: "Tell me about my money"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_edge_007
  difficulty: edge_case
  tags: [ambiguous]
  input:
    prompt: "How am I doing financially?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_edge_008
  difficulty: edge_case
  tags: [ambiguous]
  input:
    prompt: "Give me a summary"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_edge_009
  difficulty: edge_case
  tags: [no_function, off_topic]
  input:
    prompt: "What's 2 + 2?"
  expected:
    functions: []
    response_contains: []
- id: chat_edge_010
  difficulty: edge_case
  tags: [no_function, general]
  input:
    prompt: "Who are you?"
  expected:
    functions: []
    response_contains: []
# Additional samples
# Continue the id numbering of the sections above, grouped in the order
# easy (016-020), medium (021-025), hard (016-020).
# --- easy ---
- id: chat_easy_016
  difficulty: easy
  tags: [get_transactions]
  input:
    prompt: "Pull up my transactions"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_easy_017
  difficulty: easy
  tags: [get_accounts]
  input:
    prompt: "Show all my bank accounts"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_easy_018
  difficulty: easy
  tags: [get_balance_sheet]
  input:
    prompt: "What do I own?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_easy_019
  difficulty: easy
  tags: [get_income_statement]
  input:
    prompt: "What's my income?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_easy_020
  difficulty: easy
  tags: [get_transactions]
  input:
    prompt: "Recent purchases"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
# --- medium ---
- id: chat_medium_021
  difficulty: medium
  tags: [get_transactions, merchant]
  input:
    prompt: "How much have I spent at Starbucks?"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_022
  difficulty: medium
  tags: [get_transactions, category]
  input:
    prompt: "Show transportation expenses"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
- id: chat_medium_023
  difficulty: medium
  tags: [get_income_statement, period]
  input:
    prompt: "Quarterly expense report"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_medium_024
  difficulty: medium
  tags: [get_accounts, type]
  input:
    prompt: "Show my investment accounts"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_medium_025
  difficulty: medium
  tags: [get_transactions, amount]
  input:
    prompt: "Transactions under $50"
  expected:
    functions:
    - name: "get_transactions"
      params: {}
    response_contains: []
# --- hard ---
- id: chat_hard_016
  difficulty: hard
  tags: [analysis, discretionary]
  input:
    prompt: "How much discretionary spending do I have?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_017
  difficulty: hard
  tags: [analysis, fixed_vs_variable]
  input:
    prompt: "What are my fixed vs variable expenses?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []
- id: chat_hard_018
  difficulty: hard
  tags: [analysis, emergency_fund]
  input:
    prompt: "Do I have enough for an emergency fund?"
  expected:
    functions:
    - name: "get_balance_sheet"
      params: {}
    response_contains: []
- id: chat_hard_019
  difficulty: hard
  tags: [analysis, liquidity]
  input:
    prompt: "How liquid are my assets?"
  expected:
    functions:
    - name: "get_accounts"
      params: {}
    response_contains: []
- id: chat_hard_020
  difficulty: hard
  tags: [comparison, benchmark]
  input:
    prompt: "Am I spending too much on housing?"
  expected:
    functions:
    - name: "get_income_statement"
      params: {}
    response_contains: []