Mirror of https://github.com/we-promise/sure.git, synced 2026-05-08 13:14:58 +00:00.
* feat(ci): improve LLM eval visibility in GitHub Actions - Add step summary output for each eval run (shows in GH UI) - Add new 'summarize_evals' job that aggregates results from all matrix runs - Generate markdown table with accuracy, cost, and duration for all evals - Add threshold checking (fails workflow if accuracy < 70%) - Include status icons (✅/❌) for quick visual assessment - Show overall pass/fail status at the end of summary * Fix LLM eval workflow summary --------- Co-authored-by: SureBot <sure-bot@we-promise.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
458 lines
15 KiB
YAML
458 lines
15 KiB
YAML
name: LLM Evals
|
|
|
|
on:
|
|
push:
|
|
tags:
|
|
- 'v*'
|
|
|
|
permissions:
|
|
contents: read
|
|
|
|
env:
|
|
EVAL_MODELS: gpt-4.1
|
|
RAILS_ENV: test
|
|
DATABASE_URL: postgres://postgres:postgres@localhost:5432
|
|
REDIS_URL: redis://localhost:6379
|
|
PLAID_CLIENT_ID: foo
|
|
PLAID_SECRET: bar
|
|
|
|
jobs:
|
|
check_openai:
|
|
name: Check OpenAI credentials
|
|
runs-on: ubuntu-latest
|
|
outputs:
|
|
should_run: ${{ steps.gate.outputs.should_run }}
|
|
reason: ${{ steps.gate.outputs.reason }}
|
|
|
|
steps:
|
|
- name: Validate OpenAI token and quota
|
|
id: gate
|
|
env:
|
|
OPENAI_ACCESS_TOKEN: ${{ secrets.OPENAI_ACCESS_TOKEN }}
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
if [ -z "${OPENAI_ACCESS_TOKEN:-}" ]; then
|
|
echo "OpenAI token is not configured; skipping eval workflow."
|
|
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
|
echo "reason=OPENAI_ACCESS_TOKEN secret is missing" >> "$GITHUB_OUTPUT"
|
|
exit 0
|
|
fi
|
|
|
|
TEST_MODEL=$(printf '%s' "$EVAL_MODELS" | cut -d',' -f1 | xargs)
|
|
if [ -z "$TEST_MODEL" ]; then
|
|
TEST_MODEL="gpt-4.1"
|
|
fi
|
|
|
|
echo "Checking API access with model: ${TEST_MODEL}"
|
|
|
|
RESPONSE_FILE="$(mktemp)"
|
|
STATUS_CODE=$(curl -sS -o "$RESPONSE_FILE" -w "%{http_code}" \
|
|
https://api.openai.com/v1/chat/completions \
|
|
-H "Authorization: Bearer ${OPENAI_ACCESS_TOKEN}" \
|
|
-H "Content-Type: application/json" \
|
|
-d "{\"model\":\"${TEST_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":1}")
|
|
|
|
if [ "$STATUS_CODE" = "200" ]; then
|
|
echo "OpenAI token check passed."
|
|
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
|
echo "reason=ok" >> "$GITHUB_OUTPUT"
|
|
exit 0
|
|
fi
|
|
|
|
ERROR_MESSAGE=$(ruby -rjson -e '
|
|
body = File.read(ARGV[0]) rescue ""
|
|
data = JSON.parse(body) rescue {}
|
|
message = data.dig("error", "message") || data["message"] || "unknown error"
|
|
puts message.gsub(/\s+/, " ").strip
|
|
' "$RESPONSE_FILE")
|
|
|
|
echo "OpenAI check failed (${STATUS_CODE}): ${ERROR_MESSAGE}"
|
|
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
|
echo "reason=OpenAI token invalid or insufficient quota (${STATUS_CODE})" >> "$GITHUB_OUTPUT"
|
|
exit 0
|
|
|
|
discover_datasets:
|
|
name: Discover eval datasets
|
|
needs: check_openai
|
|
if: needs.check_openai.outputs.should_run == 'true'
|
|
runs-on: ubuntu-latest
|
|
|
|
services:
|
|
postgres:
|
|
image: postgres:16
|
|
env:
|
|
POSTGRES_USER: postgres
|
|
POSTGRES_PASSWORD: postgres
|
|
ports:
|
|
- 5432:5432
|
|
options: --health-cmd="pg_isready" --health-interval=10s --health-timeout=5s --health-retries=3
|
|
|
|
redis:
|
|
image: redis:7.2
|
|
ports:
|
|
- 6379:6379
|
|
options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=3
|
|
|
|
outputs:
|
|
datasets: ${{ steps.datasets.outputs.datasets }}
|
|
models: ${{ steps.models.outputs.models }}
|
|
|
|
steps:
|
|
- name: Checkout code
|
|
uses: actions/checkout@v4
|
|
|
|
- name: Set up Ruby
|
|
uses: ruby/setup-ruby@v1
|
|
with:
|
|
ruby-version: .ruby-version
|
|
bundler-cache: true
|
|
|
|
- name: Prepare database
|
|
run: |
|
|
bin/rails db:create
|
|
bin/rails db:schema:load
|
|
|
|
- name: Import eval datasets
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
shopt -s nullglob
|
|
dataset_files=(db/eval_data/*.yml)
|
|
|
|
if [ ${#dataset_files[@]} -eq 0 ]; then
|
|
echo "::error::No eval dataset files found under db/eval_data/*.yml"
|
|
exit 1
|
|
fi
|
|
|
|
for dataset_file in "${dataset_files[@]}"; do
|
|
echo "Importing ${dataset_file}"
|
|
bundle exec rake "evals:import_dataset[${dataset_file}]"
|
|
done
|
|
|
|
|
|
- name: Resolve eval models
|
|
id: models
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
MODELS_JSON=$(bin/rails runner '
|
|
models = ENV.fetch("EVAL_MODELS", "").split(",").map(&:strip).reject(&:blank?)
|
|
puts models.to_json
|
|
')
|
|
|
|
if [ "$MODELS_JSON" = "[]" ]; then
|
|
echo "::error::EVAL_MODELS is empty. Set at least one model, for example: EVAL_MODELS=gpt-4.1"
|
|
exit 1
|
|
fi
|
|
|
|
{
|
|
echo "models<<EOF"
|
|
echo "$MODELS_JSON"
|
|
echo "EOF"
|
|
} >> "$GITHUB_OUTPUT"
|
|
|
|
- name: Resolve available eval datasets
|
|
id: datasets
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
DATASETS_JSON=$(bin/rails runner 'puts Eval::Dataset.order(:name).pluck(:name).to_json')
|
|
|
|
if [ "$DATASETS_JSON" = "[]" ]; then
|
|
echo "::error::No eval datasets found. Import one first with: rake evals:import_dataset[path/to/file.yml]"
|
|
exit 1
|
|
fi
|
|
|
|
{
|
|
echo "datasets<<EOF"
|
|
echo "$DATASETS_JSON"
|
|
echo "EOF"
|
|
} >> "$GITHUB_OUTPUT"
|
|
|
|
run_evals:
|
|
name: Run eval for ${{ matrix.dataset }} on ${{ matrix.model }}
|
|
needs: [check_openai, discover_datasets]
|
|
if: needs.check_openai.outputs.should_run == 'true'
|
|
runs-on: ubuntu-latest
|
|
|
|
strategy:
|
|
fail-fast: false
|
|
matrix:
|
|
dataset: ${{ fromJson(needs.discover_datasets.outputs.datasets) }}
|
|
model: ${{ fromJson(needs.discover_datasets.outputs.models) }}
|
|
|
|
services:
|
|
postgres:
|
|
image: postgres:16
|
|
env:
|
|
POSTGRES_USER: postgres
|
|
POSTGRES_PASSWORD: postgres
|
|
ports:
|
|
- 5432:5432
|
|
options: --health-cmd="pg_isready" --health-interval=10s --health-timeout=5s --health-retries=3
|
|
|
|
redis:
|
|
image: redis:7.2
|
|
ports:
|
|
- 6379:6379
|
|
options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=3
|
|
|
|
steps:
|
|
- name: Checkout code
|
|
uses: actions/checkout@v4
|
|
|
|
- name: Set up Ruby
|
|
uses: ruby/setup-ruby@v1
|
|
with:
|
|
ruby-version: .ruby-version
|
|
bundler-cache: true
|
|
|
|
- name: Prepare database
|
|
run: |
|
|
bin/rails db:create
|
|
bin/rails db:schema:load
|
|
|
|
- name: Import eval datasets
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
shopt -s nullglob
|
|
dataset_files=(db/eval_data/*.yml)
|
|
|
|
if [ ${#dataset_files[@]} -eq 0 ]; then
|
|
echo "::error::No eval dataset files found under db/eval_data/*.yml"
|
|
exit 1
|
|
fi
|
|
|
|
for dataset_file in "${dataset_files[@]}"; do
|
|
echo "Importing ${dataset_file}"
|
|
bundle exec rake "evals:import_dataset[${dataset_file}]"
|
|
done
|
|
|
|
- name: Prepare dataset artifact names
|
|
id: dataset_slug
|
|
env:
|
|
DATASET: ${{ matrix.dataset }}
|
|
MODEL: ${{ matrix.model }}
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
slug=$(printf '%s' "$DATASET" \
|
|
| tr '[:upper:]' '[:lower:]' \
|
|
| sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//; s/-{2,}/-/g')
|
|
|
|
if [ -z "$slug" ]; then
|
|
echo "::error::Could not generate dataset slug from '$DATASET'"
|
|
exit 1
|
|
fi
|
|
|
|
echo "slug=$slug" >> "$GITHUB_OUTPUT"
|
|
model_slug=$(printf '%s' "$MODEL" \
|
|
| tr '[:upper:]' '[:lower:]' \
|
|
| sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//; s/-{2,}/-/g')
|
|
|
|
if [ -z "$model_slug" ]; then
|
|
echo "::error::Could not generate model slug from '$MODEL'"
|
|
exit 1
|
|
fi
|
|
|
|
echo "model_slug=$model_slug" >> "$GITHUB_OUTPUT"
|
|
echo "log_path=tmp/evals/${slug}-${model_slug}.log" >> "$GITHUB_OUTPUT"
|
|
echo "json_path=tmp/evals/${slug}-${model_slug}.json" >> "$GITHUB_OUTPUT"
|
|
|
|
- name: Verify dataset exists
|
|
env:
|
|
DATASET: ${{ matrix.dataset }}
|
|
MODEL: ${{ matrix.model }}
|
|
run: |
|
|
bin/rails runner 'dataset = Eval::Dataset.find_by(name: ENV.fetch("DATASET")); abort("Dataset not found: #{ENV.fetch("DATASET")}") if dataset.nil?'
|
|
|
|
- name: Run eval
|
|
env:
|
|
DATASET: ${{ matrix.dataset }}
|
|
MODEL: ${{ matrix.model }}
|
|
OPENAI_ACCESS_TOKEN: ${{ secrets.OPENAI_ACCESS_TOKEN }}
|
|
run: |
|
|
set -euo pipefail
|
|
mkdir -p tmp/evals
|
|
bundle exec rake "evals:run[${DATASET},${MODEL}]" | tee "${{ steps.dataset_slug.outputs.log_path }}"
|
|
|
|
- name: Export run summary
|
|
id: export_summary
|
|
env:
|
|
DATASET: ${{ matrix.dataset }}
|
|
MODEL: ${{ matrix.model }}
|
|
JSON_PATH: ${{ steps.dataset_slug.outputs.json_path }}
|
|
run: |
|
|
set -euo pipefail
|
|
mkdir -p "$(dirname "$JSON_PATH")"
|
|
|
|
bin/rails runner '
|
|
dataset = Eval::Dataset.find_by!(name: ENV.fetch("DATASET"))
|
|
run = Eval::Run.where(dataset: dataset, model: ENV.fetch("MODEL")).order(created_at: :desc).first
|
|
abort("No eval run found for dataset #{dataset.name} and model #{ENV.fetch("MODEL")}") if run.nil?
|
|
payload = {
|
|
dataset: dataset.name,
|
|
dataset_metadata: dataset.metadata,
|
|
model: ENV.fetch("MODEL"),
|
|
run_id: run.id,
|
|
status: run.status,
|
|
created_at: run.created_at,
|
|
completed_at: run.completed_at,
|
|
total_prompt_tokens: run.total_prompt_tokens,
|
|
total_completion_tokens: run.total_completion_tokens,
|
|
total_cost: run.total_cost,
|
|
metrics: run.metrics,
|
|
accuracy: run.accuracy || 0.0,
|
|
duration_seconds: run.duration_seconds
|
|
}
|
|
File.write(ENV.fetch("JSON_PATH"), JSON.pretty_generate(payload))
|
|
'
|
|
|
|
echo "accuracy=$(jq -r '.accuracy // 0' "$JSON_PATH")" >> "$GITHUB_OUTPUT"
|
|
echo "status=$(jq -r '.status' "$JSON_PATH")" >> "$GITHUB_OUTPUT"
|
|
|
|
- name: Upload eval artifact
|
|
uses: actions/upload-artifact@v4
|
|
with:
|
|
name: llm-evals-${{ steps.dataset_slug.outputs.slug }}-${{ steps.dataset_slug.outputs.model_slug }}
|
|
path: |
|
|
${{ steps.dataset_slug.outputs.log_path }}
|
|
${{ steps.dataset_slug.outputs.json_path }}
|
|
if-no-files-found: error
|
|
retention-days: 30
|
|
|
|
- name: Output eval result
|
|
shell: bash
|
|
run: |
|
|
echo "### Eval Result: ${{ matrix.dataset }} / ${{ matrix.model }}" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "- **Status**: ${{ steps.export_summary.outputs.status }}" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "- **Accuracy**: ${{ steps.export_summary.outputs.accuracy }}%" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
summarize_evals:
|
|
name: Summarize LLM Evals
|
|
needs: [check_openai, run_evals]
|
|
if: always() && needs.check_openai.outputs.should_run == 'true'
|
|
runs-on: ubuntu-latest
|
|
|
|
steps:
|
|
- name: Download all artifacts
|
|
uses: actions/download-artifact@v4
|
|
with:
|
|
path: eval-artifacts
|
|
pattern: llm-evals-*
|
|
|
|
- name: Generate summary
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
echo "# 🧪 LLM Evals Results" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
printf "Triggered by: \`%s\`\n" "$GITHUB_REF" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "---" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
# Find all JSON result files
|
|
shopt -s globstar nullglob
|
|
json_files=(eval-artifacts/**/*.json)
|
|
|
|
if [ ${#json_files[@]} -eq 0 ]; then
|
|
echo "⚠️ No eval results found." >> "$GITHUB_STEP_SUMMARY"
|
|
exit 0
|
|
fi
|
|
|
|
# Table header
|
|
echo "| Dataset | Model | Status | Accuracy | Cost | Duration |" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "|---------|-------|--------|----------|------|----------|" >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
all_passed=true
|
|
accuracy_threshold=70
|
|
for json_file in "${json_files[@]}"; do
|
|
dataset=$(jq -r '.dataset' "$json_file")
|
|
model=$(jq -r '.model' "$json_file")
|
|
status=$(jq -r '.status' "$json_file")
|
|
accuracy=$(jq -r '.accuracy // 0' "$json_file")
|
|
cost=$(jq -r '.total_cost // 0' "$json_file")
|
|
duration=$(jq -r '.duration_seconds // 0' "$json_file")
|
|
|
|
if [ "$status" = "completed" ] && awk -v accuracy="$accuracy" -v threshold="$accuracy_threshold" 'BEGIN { exit !((accuracy + 0) >= threshold) }'; then
|
|
icon="✅"
|
|
else
|
|
icon="❌"
|
|
all_passed=false
|
|
fi
|
|
|
|
printf '| %s | %s | %s %s | %s%% | \\$%s | %ss |\n' \
|
|
"$dataset" "$model" "$icon" "$status" "$accuracy" "$cost" "$duration" >> "$GITHUB_STEP_SUMMARY"
|
|
done
|
|
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "---" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
if [ "$all_passed" = "true" ]; then
|
|
echo "✅ **All evals passed!**" >> "$GITHUB_STEP_SUMMARY"
|
|
else
|
|
echo "❌ **Some evals failed. Check the details above.**" >> "$GITHUB_STEP_SUMMARY"
|
|
fi
|
|
|
|
echo "" >> "$GITHUB_STEP_SUMMARY"
|
|
echo "📦 Artifacts with full logs are available for download." >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
- name: Check eval thresholds
|
|
shell: bash
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
shopt -s globstar nullglob
|
|
json_files=(eval-artifacts/**/*.json)
|
|
|
|
failed=0
|
|
accuracy_threshold=70
|
|
for json_file in "${json_files[@]}"; do
|
|
status=$(jq -r '.status' "$json_file")
|
|
accuracy=$(jq -r '.accuracy // 0' "$json_file")
|
|
dataset=$(jq -r '.dataset' "$json_file")
|
|
model=$(jq -r '.model' "$json_file")
|
|
|
|
if [ "$status" != "completed" ]; then
|
|
echo "::error::Eval for $dataset / $model did not complete successfully"
|
|
failed=$((failed + 1))
|
|
fi
|
|
|
|
# Fail if accuracy is below 70%
|
|
if awk -v accuracy="$accuracy" -v threshold="$accuracy_threshold" 'BEGIN { exit !((accuracy + 0) < threshold) }'; then
|
|
echo "::error::Accuracy for $dataset / $model is below threshold: ${accuracy}%"
|
|
failed=$((failed + 1))
|
|
fi
|
|
done
|
|
|
|
if [ $failed -gt 0 ]; then
|
|
echo "::error::$failed eval(s) failed or below threshold"
|
|
exit 1
|
|
fi
|
|
|
|
echo "All evals passed with acceptable accuracy."
|
|
|
|
skip_evals:
|
|
name: Skip evals (no valid OpenAI token/quota)
|
|
needs: check_openai
|
|
if: needs.check_openai.outputs.should_run != 'true'
|
|
runs-on: ubuntu-latest
|
|
steps:
|
|
- name: Report skip reason
|
|
run: |
|
|
echo "LLM evals were skipped gracefully."
|
|
echo "Reason: ${{ needs.check_openai.outputs.reason }}"
|