---
# LLM Evals workflow.
#
# Runs on version tags (v*). Flow:
#   check_openai      -> gate: verifies the OpenAI token works and has quota
#   discover_datasets -> imports eval datasets into the DB and emits the
#                        dataset/model matrix as JSON job outputs
#   run_evals         -> matrix job: one eval run per (dataset, model) pair,
#                        uploads a log + JSON summary artifact per cell
#   summarize_evals   -> aggregates all artifacts into the step summary and
#                        fails the workflow if any eval missed the threshold
#   skip_evals        -> reports why evals were skipped when the gate says no
name: LLM Evals

on:
  push:
    tags:
      - 'v*'

permissions:
  contents: read

# Shared environment for all jobs. The Plaid values are dummies: the app
# boots with them present but the evals never call Plaid.
env:
  EVAL_MODELS: gpt-4.1
  RAILS_ENV: test
  DATABASE_URL: postgres://postgres:postgres@localhost:5432
  REDIS_URL: redis://localhost:6379
  PLAID_CLIENT_ID: foo
  PLAID_SECRET: bar

jobs:
  # Gate job: decide whether the rest of the workflow should run at all.
  # Always exits 0 — a missing/invalid token is reported via outputs, not
  # as a job failure, so the workflow skips gracefully.
  check_openai:
    name: Check OpenAI credentials
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.gate.outputs.should_run }}
      reason: ${{ steps.gate.outputs.reason }}
    steps:
      - name: Validate OpenAI token and quota
        id: gate
        env:
          OPENAI_ACCESS_TOKEN: ${{ secrets.OPENAI_ACCESS_TOKEN }}
        shell: bash
        run: |
          set -euo pipefail

          if [ -z "${OPENAI_ACCESS_TOKEN:-}" ]; then
            echo "OpenAI token is not configured; skipping eval workflow."
            echo "should_run=false" >> "$GITHUB_OUTPUT"
            echo "reason=OPENAI_ACCESS_TOKEN secret is missing" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Probe with the first configured model (EVAL_MODELS is a
          # comma-separated list); fall back to a known default.
          TEST_MODEL=$(printf '%s' "$EVAL_MODELS" | cut -d',' -f1 | xargs)
          if [ -z "$TEST_MODEL" ]; then
            TEST_MODEL="gpt-4.1"
          fi

          echo "Checking API access with model: ${TEST_MODEL}"

          # One-token request: cheapest way to confirm both auth and quota.
          RESPONSE_FILE="$(mktemp)"
          STATUS_CODE=$(curl -sS -o "$RESPONSE_FILE" -w "%{http_code}" \
            https://api.openai.com/v1/chat/completions \
            -H "Authorization: Bearer ${OPENAI_ACCESS_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "{\"model\":\"${TEST_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":1}")

          if [ "$STATUS_CODE" = "200" ]; then
            echo "OpenAI token check passed."
            echo "should_run=true" >> "$GITHUB_OUTPUT"
            echo "reason=ok" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Extract a human-readable error from the API response body;
          # tolerate unreadable/non-JSON bodies.
          ERROR_MESSAGE=$(ruby -rjson -e '
            body = File.read(ARGV[0]) rescue ""
            data = JSON.parse(body) rescue {}
            message = data.dig("error", "message") || data["message"] || "unknown error"
            puts message.gsub(/\s+/, " ").strip
          ' "$RESPONSE_FILE")

          echo "OpenAI check failed (${STATUS_CODE}): ${ERROR_MESSAGE}"
          echo "should_run=false" >> "$GITHUB_OUTPUT"
          echo "reason=OpenAI token invalid or insufficient quota (${STATUS_CODE})" >> "$GITHUB_OUTPUT"
          exit 0

  # Import the dataset fixtures and emit the dataset/model matrix as JSON
  # outputs consumed by run_evals' strategy.matrix.
  discover_datasets:
    name: Discover eval datasets
    needs: check_openai
    if: needs.check_openai.outputs.should_run == 'true'
    runs-on: ubuntu-latest
    services:
      postgres:
        image: postgres:16
        env:
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
        ports:
          - 5432:5432
        options: --health-cmd="pg_isready" --health-interval=10s --health-timeout=5s --health-retries=3
      redis:
        image: redis:7.2
        ports:
          - 6379:6379
        options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=3
    outputs:
      datasets: ${{ steps.datasets.outputs.datasets }}
      models: ${{ steps.models.outputs.models }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: .ruby-version
          bundler-cache: true

      - name: Prepare database
        run: |
          bin/rails db:create
          bin/rails db:schema:load

      - name: Import eval datasets
        shell: bash
        run: |
          set -euo pipefail
          shopt -s nullglob
          dataset_files=(db/eval_data/*.yml)
          if [ ${#dataset_files[@]} -eq 0 ]; then
            echo "::error::No eval dataset files found under db/eval_data/*.yml"
            exit 1
          fi
          for dataset_file in "${dataset_files[@]}"; do
            echo "Importing ${dataset_file}"
            bundle exec rake "evals:import_dataset[${dataset_file}]"
          done

      - name: Resolve eval models
        id: models
        shell: bash
        run: |
          set -euo pipefail
          MODELS_JSON=$(bin/rails runner '
            models = ENV.fetch("EVAL_MODELS", "").split(",").map(&:strip).reject(&:blank?)
            puts models.to_json
          ')
          if [ "$MODELS_JSON" = "[]" ]; then
            echo "::error::EVAL_MODELS is empty. Set at least one model, for example: EVAL_MODELS=gpt-4.1"
            exit 1
          fi
          # Multiline-safe output via the heredoc syntax GitHub requires
          # for values that may contain special characters.
          {
            echo "models<<EOF"
            echo "$MODELS_JSON"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"

      - name: Resolve available eval datasets
        id: datasets
        shell: bash
        run: |
          set -euo pipefail
          DATASETS_JSON=$(bin/rails runner 'puts Eval::Dataset.order(:name).pluck(:name).to_json')
          if [ "$DATASETS_JSON" = "[]" ]; then
            echo "::error::No eval datasets found. Import one first with: rake evals:import_dataset[path/to/file.yml]"
            exit 1
          fi
          {
            echo "datasets<<EOF"
            echo "$DATASETS_JSON"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"

  # One matrix cell per (dataset, model). fail-fast is off so one bad cell
  # does not cancel the others; the threshold check at the end decides the
  # overall verdict.
  run_evals:
    name: Run eval for ${{ matrix.dataset }} on ${{ matrix.model }}
    needs: [check_openai, discover_datasets]
    if: needs.check_openai.outputs.should_run == 'true'
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        dataset: ${{ fromJson(needs.discover_datasets.outputs.datasets) }}
        model: ${{ fromJson(needs.discover_datasets.outputs.models) }}
    services:
      postgres:
        image: postgres:16
        env:
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
        ports:
          - 5432:5432
        options: --health-cmd="pg_isready" --health-interval=10s --health-timeout=5s --health-retries=3
      redis:
        image: redis:7.2
        ports:
          - 6379:6379
        options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=3
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: .ruby-version
          bundler-cache: true

      - name: Prepare database
        run: |
          bin/rails db:create
          bin/rails db:schema:load

      # Each matrix cell gets a fresh database, so datasets must be
      # re-imported here even though discover_datasets already did so.
      - name: Import eval datasets
        shell: bash
        run: |
          set -euo pipefail
          shopt -s nullglob
          dataset_files=(db/eval_data/*.yml)
          if [ ${#dataset_files[@]} -eq 0 ]; then
            echo "::error::No eval dataset files found under db/eval_data/*.yml"
            exit 1
          fi
          for dataset_file in "${dataset_files[@]}"; do
            echo "Importing ${dataset_file}"
            bundle exec rake "evals:import_dataset[${dataset_file}]"
          done

      # Slugify dataset/model names so they are safe for artifact names
      # and file paths (lowercase, [a-z0-9-] only).
      - name: Prepare dataset artifact names
        id: dataset_slug
        env:
          DATASET: ${{ matrix.dataset }}
          MODEL: ${{ matrix.model }}
        shell: bash
        run: |
          set -euo pipefail
          slug=$(printf '%s' "$DATASET" \
            | tr '[:upper:]' '[:lower:]' \
            | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//; s/-{2,}/-/g')
          if [ -z "$slug" ]; then
            echo "::error::Could not generate dataset slug from '$DATASET'"
            exit 1
          fi
          echo "slug=$slug" >> "$GITHUB_OUTPUT"
          model_slug=$(printf '%s' "$MODEL" \
            | tr '[:upper:]' '[:lower:]' \
            | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//; s/-{2,}/-/g')
          if [ -z "$model_slug" ]; then
            echo "::error::Could not generate model slug from '$MODEL'"
            exit 1
          fi
          echo "model_slug=$model_slug" >> "$GITHUB_OUTPUT"
          echo "log_path=tmp/evals/${slug}-${model_slug}.log" >> "$GITHUB_OUTPUT"
          echo "json_path=tmp/evals/${slug}-${model_slug}.json" >> "$GITHUB_OUTPUT"

      - name: Verify dataset exists
        env:
          DATASET: ${{ matrix.dataset }}
          MODEL: ${{ matrix.model }}
        run: |
          bin/rails runner 'dataset = Eval::Dataset.find_by(name: ENV.fetch("DATASET")); abort("Dataset not found: #{ENV.fetch("DATASET")}") if dataset.nil?'

      - name: Run eval
        env:
          DATASET: ${{ matrix.dataset }}
          MODEL: ${{ matrix.model }}
          OPENAI_ACCESS_TOKEN: ${{ secrets.OPENAI_ACCESS_TOKEN }}
        run: |
          set -euo pipefail
          mkdir -p tmp/evals
          # pipefail ensures a rake failure is not masked by tee.
          bundle exec rake "evals:run[${DATASET},${MODEL}]" | tee "${{ steps.dataset_slug.outputs.log_path }}"

      # Dump the most recent run for this (dataset, model) pair as JSON so
      # summarize_evals can aggregate without database access.
      - name: Export run summary
        id: export_summary
        env:
          DATASET: ${{ matrix.dataset }}
          MODEL: ${{ matrix.model }}
          JSON_PATH: ${{ steps.dataset_slug.outputs.json_path }}
        run: |
          set -euo pipefail
          mkdir -p "$(dirname "$JSON_PATH")"
          bin/rails runner '
            dataset = Eval::Dataset.find_by!(name: ENV.fetch("DATASET"))
            run = Eval::Run.where(dataset: dataset, model: ENV.fetch("MODEL")).order(created_at: :desc).first
            abort("No eval run found for dataset #{dataset.name} and model #{ENV.fetch("MODEL")}") if run.nil?
            payload = {
              dataset: dataset.name,
              dataset_metadata: dataset.metadata,
              model: ENV.fetch("MODEL"),
              run_id: run.id,
              status: run.status,
              created_at: run.created_at,
              completed_at: run.completed_at,
              total_prompt_tokens: run.total_prompt_tokens,
              total_completion_tokens: run.total_completion_tokens,
              total_cost: run.total_cost,
              metrics: run.metrics,
              accuracy: run.accuracy || 0.0,
              duration_seconds: run.duration_seconds
            }
            File.write(ENV.fetch("JSON_PATH"), JSON.pretty_generate(payload))
          '
          echo "accuracy=$(jq -r '.accuracy // 0' "$JSON_PATH")" >> "$GITHUB_OUTPUT"
          echo "status=$(jq -r '.status' "$JSON_PATH")" >> "$GITHUB_OUTPUT"

      - name: Upload eval artifact
        uses: actions/upload-artifact@v4
        with:
          name: llm-evals-${{ steps.dataset_slug.outputs.slug }}-${{ steps.dataset_slug.outputs.model_slug }}
          path: |
            ${{ steps.dataset_slug.outputs.log_path }}
            ${{ steps.dataset_slug.outputs.json_path }}
          if-no-files-found: error
          retention-days: 30

      - name: Output eval result
        shell: bash
        run: |
          echo "### Eval Result: ${{ matrix.dataset }} / ${{ matrix.model }}" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "- **Status**: ${{ steps.export_summary.outputs.status }}" >> "$GITHUB_STEP_SUMMARY"
          echo "- **Accuracy**: ${{ steps.export_summary.outputs.accuracy }}%" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"

  # Aggregate all per-cell artifacts. Runs even when some matrix cells
  # failed (if: always()) so partial results are still reported.
  summarize_evals:
    name: Summarize LLM Evals
    needs: [check_openai, run_evals]
    if: always() && needs.check_openai.outputs.should_run == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: eval-artifacts
          pattern: llm-evals-*

      - name: Generate summary
        shell: bash
        run: |
          set -euo pipefail
          echo "# 🧪 LLM Evals Results" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          printf "Triggered by: \`%s\`\n" "$GITHUB_REF" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "---" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"

          # Find all JSON result files
          shopt -s globstar nullglob
          json_files=(eval-artifacts/**/*.json)
          if [ ${#json_files[@]} -eq 0 ]; then
            echo "⚠️ No eval results found." >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi

          # Table header
          echo "| Dataset | Model | Status | Accuracy | Cost | Duration |" >> "$GITHUB_STEP_SUMMARY"
          echo "|---------|-------|--------|----------|------|----------|" >> "$GITHUB_STEP_SUMMARY"

          all_passed=true
          accuracy_threshold=70
          for json_file in "${json_files[@]}"; do
            dataset=$(jq -r '.dataset' "$json_file")
            model=$(jq -r '.model' "$json_file")
            status=$(jq -r '.status' "$json_file")
            accuracy=$(jq -r '.accuracy // 0' "$json_file")
            cost=$(jq -r '.total_cost // 0' "$json_file")
            duration=$(jq -r '.duration_seconds // 0' "$json_file")
            # awk handles the float comparison; shell [ ] only does ints.
            if [ "$status" = "completed" ] && awk -v accuracy="$accuracy" -v threshold="$accuracy_threshold" 'BEGIN { exit !((accuracy + 0) >= threshold) }'; then
              icon="✅"
            else
              icon="❌"
              all_passed=false
            fi
            # \$ keeps the dollar sign literal in the rendered markdown.
            printf '| %s | %s | %s %s | %s%% | \\$%s | %ss |\n' \
              "$dataset" "$model" "$icon" "$status" "$accuracy" "$cost" "$duration" >> "$GITHUB_STEP_SUMMARY"
          done

          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "---" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ "$all_passed" = "true" ]; then
            echo "✅ **All evals passed!**" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "❌ **Some evals failed. Check the details above.**" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "📦 Artifacts with full logs are available for download." >> "$GITHUB_STEP_SUMMARY"

      # The pass/fail verdict for the whole workflow: every run must have
      # completed AND met the accuracy threshold.
      - name: Check eval thresholds
        shell: bash
        run: |
          set -euo pipefail
          shopt -s globstar nullglob
          json_files=(eval-artifacts/**/*.json)
          # run_evals executed (this job is gated on should_run), so zero
          # result files means the eval runs produced nothing — fail loudly
          # instead of passing vacuously.
          if [ ${#json_files[@]} -eq 0 ]; then
            echo "::error::No eval result files found; eval runs likely failed before exporting summaries"
            exit 1
          fi
          failed=0
          accuracy_threshold=70
          for json_file in "${json_files[@]}"; do
            status=$(jq -r '.status' "$json_file")
            accuracy=$(jq -r '.accuracy // 0' "$json_file")
            dataset=$(jq -r '.dataset' "$json_file")
            model=$(jq -r '.model' "$json_file")
            if [ "$status" != "completed" ]; then
              echo "::error::Eval for $dataset / $model did not complete successfully"
              failed=$((failed + 1))
            fi
            # Fail if accuracy is below 70%
            if awk -v accuracy="$accuracy" -v threshold="$accuracy_threshold" 'BEGIN { exit !((accuracy + 0) < threshold) }'; then
              echo "::error::Accuracy for $dataset / $model is below threshold: ${accuracy}%"
              failed=$((failed + 1))
            fi
          done
          if [ $failed -gt 0 ]; then
            echo "::error::$failed eval(s) failed or below threshold"
            exit 1
          fi
          echo "All evals passed with acceptable accuracy."

  # Companion to the gate: surfaces WHY evals were skipped as a green job
  # instead of the workflow silently having no eval jobs.
  skip_evals:
    name: Skip evals (no valid OpenAI token/quota)
    needs: check_openai
    if: needs.check_openai.outputs.should_run != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Report skip reason
        run: |
          echo "LLM evals were skipped gracefully."
          echo "Reason: ${{ needs.check_openai.outputs.reason }}"