Skip to content

feat: implement remote evaluation infrastructure with Docker Compose #5

feat: implement remote evaluation infrastructure with Docker Compose

feat: implement remote evaluation infrastructure with Docker Compose #5

Workflow file for this run

# Workflow identity and triggers. The jobs further down decide, from the
# event type and the head commit message, whether an evaluation actually runs.
name: Remote Evaluations

on:
  # Branch pushes only (the '**' glob matches every branch name).
  push:
    branches:
      - '**'

  # Manual runs. Every input is optional and carries a default.
  workflow_dispatch:
    inputs:
      eval_file:
        description: 'Eval file to run (e.g., eval-codebuff.json)'
        type: string
        required: false
        default: eval-codebuff.json
      commit_index:
        description: 'Commit index to evaluate (0-based)'
        # Declared as a string input; the default stays quoted so it is not
        # re-typed as an integer by the YAML parser.
        type: string
        required: false
        default: '0'
      mode:
        description: 'Auth mode (seed or bypass)'
        type: choice
        required: false
        default: bypass
        options:
          - bypass
          - seed
jobs:
  # Single remote evaluation run.
  # The first step after checkout is a gate: it sets `should_run_evals`, and
  # every later step is conditioned on that output. The gate passes when the
  # head commit message contains "[remote-eval]" (matched case-insensitively)
  # or when the workflow was started via workflow_dispatch.
  remote-evals:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Check commit message
        id: check_commit
        env:
          # Both context values are handed to the script as environment
          # variables instead of being expanded into the script text, so the
          # shell only ever sees them as data.
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          # nocasematch makes the [[ == ]] pattern match below case-insensitive.
          shopt -s nocasematch
          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
            echo "Will run remote evaluations"
          else
            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
          fi

      - name: Set up Bun
        if: steps.check_commit.outputs.should_run_evals == 'true'
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: bun install --frozen-lockfile

      - name: Run remote evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        env:
          # The `||` fallbacks supply the defaults whenever a dispatch input
          # is absent or empty (for example on push-triggered runs).
          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
          MODE: ${{ inputs.mode || 'bypass' }}
        run: |
          echo "🚀 Starting remote evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

      # always() keeps this step running after a failed evaluation so the
      # logs are still collected.
      - name: Upload evaluation logs
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-logs-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      # Best-effort teardown: `|| true` keeps a cleanup failure from failing
      # the job.
      - name: Cleanup containers
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "🧹 Cleaning up Docker containers..."
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true
# Optional: Matrix job to run multiple evaluations in parallel
  # Runs one evaluation per matrix entry, always in "bypass" mode, when the
  # head commit message contains "[remote-eval-all]".
  # NOTE(review): the `mode` dispatch input only offers 'bypass' and 'seed',
  # so the `inputs.mode == 'matrix'` branch below looks unreachable from a
  # manual run — confirm, and add a dedicated input if manual matrix runs are
  # wanted.
  remote-evals-matrix:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    if: >-
      contains(github.event.head_commit.message, '[remote-eval-all]')
      || (github.event_name == 'workflow_dispatch' && inputs.mode == 'matrix')
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        eval:
          - file: eval-codebuff.json
            index: '0'
          - file: eval-codebuff.json
            index: '1'
          - file: eval-manifold.json
            index: '0'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Run evaluation matrix
        env:
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
        run: |
          echo "🚀 Running matrix evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

      # Uploaded regardless of the evaluation outcome; the artifact name
      # embeds the matrix entry so parallel legs do not share a name.
      - name: Upload matrix evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      # Best-effort teardown; failures here are deliberately ignored.
      - name: Cleanup containers
        if: always()
        run: |
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true