fix: correct class name reference in SDK evaluation script [remote-eval] #9
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Remote Evaluations (SDK)

# Runs Codebuff evaluations exclusively through the public SDK: the workflow
# creates a containerized backend environment and runs evaluations via
# CodebuffClient.
#
# Triggers:
#   - add [remote-eval] to the commit message, or start the workflow manually
#     (workflow_dispatch), to run a single evaluation
#   - add [remote-eval-all] to the commit message to run the parallel matrix

on:
  # Every push on every branch starts the workflow; the jobs themselves decide
  # whether to do any work based on the commit message.
  push:
    branches:
      - '**'
  workflow_dispatch:
    inputs:
      eval_file:
        description: 'Eval file to run (e.g., eval-codebuff.json)'
        required: false
        default: 'eval-codebuff.json'
        type: string
      commit_index:
        description: 'Commit index to evaluate (0-based)'
        required: false
        # Kept as a quoted string so it is passed through as text, not an int.
        default: '0'
        type: string
      mode:
        description: 'Auth mode (seed or bypass)'
        required: false
        default: 'bypass'
        type: choice
        options:
          - 'bypass'
          - 'seed'

# Least privilege for GITHUB_TOKEN: the steps in this workflow only check out
# the repository and upload artifacts.
# NOTE(review): widen this if the evaluation scripts (not visible here) need to
# write back to the repository or call other GitHub APIs.
permissions:
  contents: read
jobs:
  # Single evaluation run. The job always starts, but every step after the
  # commit-message check is gated on its `should_run_evals` output, so pushes
  # without the [remote-eval] marker finish after the first two steps.
  #
  # NOTE(review): the glyph prefixes in the echo strings below look like
  # mis-decoded emoji; they are kept verbatim here - confirm against the
  # committed workflow file.
  remote-evals:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Check commit message
        id: check_commit
        env:
          # Passed through the environment so the commit text is never spliced
          # into the script source.
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          # Case-insensitive match, so [Remote-Eval] etc. also trigger a run.
          shopt -s nocasematch
          # GITHUB_EVENT_NAME is the runner-provided equivalent of
          # github.event_name; manual dispatches always run.
          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
            echo "Will run remote evaluations"
          else
            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
          fi

      - name: Set up Bun
        if: steps.check_commit.outputs.should_run_evals == 'true'
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: bun install --frozen-lockfile

      # Fail fast with a clear message if any file the evaluation needs is
      # missing from the checkout.
      - name: Validate environment for SDK evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "π Validating SDK evaluation environment..."
          echo " Checking for required files..."
          test -f evals/scripts/run-remote-parameterized.sh || { echo "β Missing run-remote-parameterized.sh"; exit 1; }
          test -f evals/git-evals/run-single-eval.ts || { echo "β Missing run-single-eval.ts"; exit 1; }
          test -f evals/docker-compose.evals.yml || { echo "β Missing docker-compose.evals.yml"; exit 1; }
          echo " Checking SDK package..."
          bun --version
          echo "β Environment validation passed"

      - name: Run remote evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        env:
          # On push events `inputs.*` is empty, so fall back to the same
          # defaults the workflow_dispatch form declares.
          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
          MODE: ${{ inputs.mode || 'bypass' }}
          CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
          CODEBUFF_SKIP_BINARY_CHECK: '1'
        run: |
          echo "π Remote Evaluation Starting (SDK Mode)"
          echo "π GitHub Actions Environment:"
          # Runner-provided variables are used instead of inline expression
          # interpolation so ref names are never evaluated as shell source.
          echo " Runner: $RUNNER_OS"
          echo " SHA: $GITHUB_SHA"
          echo " Ref: $GITHUB_REF"
          echo " Event: $GITHUB_EVENT_NAME"
          echo " Eval File: $EVAL_FILE"
          echo " Commit Index: $COMMIT_INDEX"
          echo " Mode: $MODE"
          echo "π³ Docker Info:"
          docker --version
          docker compose version
          echo "πΎ Disk Space:"
          df -h
          echo "π§ Starting SDK-based evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

      # Best-effort diagnostics: every command is allowed to fail so one
      # missing tool or container does not hide the rest of the output.
      - name: Dump logs on failure
        if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
        env:
          # Step-level env is not inherited from the evaluation step, so the
          # values it ran with are repeated here; otherwise the report below
          # would always say "not set".
          CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
          CODEBUFF_SKIP_BINARY_CHECK: '1'
        run: |
          echo "β SDK Evaluation failed - dumping diagnostic information"
          echo "π§ SDK Environment:"
          echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
          echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
          # Report only whether the key is present. A ${VAR:-default} expansion
          # must not be used here: it yields the key itself when the key is set.
          if [[ -n "${CODEBUFF_API_KEY:-}" ]]; then
            echo " CODEBUFF_API_KEY: [SET]"
          else
            echo " CODEBUFF_API_KEY: [NOT SET]"
          fi
          echo "π³ Docker containers status:"
          docker ps -a || true
          echo "π Backend container logs:"
          docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
          echo "π Database container logs:"
          docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
          echo "πΎ Disk usage:"
          df -h || true
          echo "π§ Memory usage:"
          free -h || true
          echo "π Evaluation files:"
          ls -la evals/git-evals/ || true
          ls -la evals/scripts/ || true

      # Runs on success and failure alike (always()) once evals were attempted.
      - name: Upload evaluation logs
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-logs-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
            ~/.cache/bun/
          retention-days: 7

      - name: Cleanup containers
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "π§Ή Final cleanup - removing all containers and volumes..."
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true
          echo "β Cleanup completed"
# Optional: matrix job that runs several evaluations in parallel. It is skipped
# unless the head commit message contains [remote-eval-all].
  remote-evals-matrix:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    if: contains(github.event.head_commit.message, '[remote-eval-all]')
    strategy:
      # Let the remaining combinations finish even if one of them fails.
      fail-fast: false
      matrix:
        # Each entry is one (eval file, commit index) pair; the index is a
        # quoted string so it is passed to the script as text.
        eval:
          - file: 'eval-codebuff.json'
            index: '0'
          - file: 'eval-codebuff.json'
            index: '1'
          - file: 'eval-manifold.json'
            index: '0'
    # NOTE(review): the glyph prefixes in the echo strings below look like
    # mis-decoded emoji; they are kept verbatim here - confirm against the
    # committed workflow file.
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      # Fail fast with a clear message if any file the evaluation needs is
      # missing from the checkout.
      - name: Validate environment for SDK evaluation
        run: |
          echo "π Validating SDK evaluation environment for matrix job..."
          test -f evals/scripts/run-remote-parameterized.sh || { echo "β Missing run-remote-parameterized.sh"; exit 1; }
          test -f evals/git-evals/run-single-eval.ts || { echo "β Missing run-single-eval.ts"; exit 1; }
          test -f evals/docker-compose.evals.yml || { echo "β Missing docker-compose.evals.yml"; exit 1; }
          echo "β Matrix environment validation passed"

      - name: Run evaluation matrix
        env:
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
          CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
          CODEBUFF_SKIP_BINARY_CHECK: '1'
        run: |
          echo "π Running matrix evaluation (SDK Mode)..."
          # The matrix job always uses the "bypass" auth mode.
          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

      # Best-effort diagnostics: every docker command is allowed to fail.
      - name: Dump matrix logs on failure
        if: failure()
        env:
          # Step-level env is not inherited from the evaluation step, so the
          # matrix values are supplied again; without this the "Matrix job
          # details" line would print empty File/Index values.
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
        run: |
          echo "β Matrix SDK Evaluation failed - dumping diagnostic information"
          echo "π§ Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
          echo "π³ Docker containers status:"
          docker ps -a || true
          echo "π Container logs:"
          docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true

      # Artifact names embed the matrix pair and commit SHA so parallel jobs
      # do not collide.
      - name: Upload matrix evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      - name: Cleanup containers
        if: always()
        run: |
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true