This is based on the following observations:

* There is a lot of variation between runs with `n=1` and `n=3`.
* With `n=8`, two runs on the same branch give answers that seem close enough to be reasonably consistent.
* With higher concurrency, trying to run this many repetitions seems to lead language servers to time out a lot, causing evals to fail.

Release Notes:

- N/A
81 lines
2.4 KiB
YAML
81 lines
2.4 KiB
YAML
# Runs the agent eval on a nightly schedule, on pull requests, and on demand.
name: Run Agent Eval

on:
  # Nightly at 00:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: "0 0 * * *"
  # Pull requests against any branch. `labeled` is listed so that adding a
  # label triggers a run; the `run_eval` job itself checks for `run-eval`.
  pull_request:
    branches: ["**"]
    types:
      - opened
      - synchronize
      - reopened
      - labeled
  # Manual trigger.
  workflow_dispatch:
|
||
|
||
concurrency:
  cancel-in-progress: true
  # The group key ends in the commit SHA on `main` and in the constant
  # `anysha` on every other ref. Each `main` commit therefore gets its own
  # group, while all runs for a non-`main` ref share one group, so a newer
  # run cancels the one still in progress.
  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||
|
||
# Environment shared by every job in this workflow.
env:
  # Cargo / Rust settings: coloured output, no incremental compilation,
  # and backtraces on panic.
  CARGO_TERM_COLOR: always
  CARGO_INCREMENTAL: "0"
  RUST_BACKTRACE: "1"
  # Credentials taken from the workflow's secrets.
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
  # NOTE(review): presumably switches on telemetry reporting in the eval
  # binary; confirm against the `eval` package.
  ZED_EVAL_TELEMETRY: "1"
|
||
|
||
jobs:
  # This is a no-op job that we run to prevent GitHub from marking the workflow
  # as failed for PRs that don't have the `run-eval` label.
  noop:
    name: No-op
    runs-on: ubuntu-latest
    steps:
      - name: No-op
        run: echo "Nothing to do"

  # Builds and runs the `eval` package. Runs only when the repository owner is
  # `zed-industries`, and, for pull requests, only when the PR carries the
  # `run-eval` label. Scheduled and manual runs are not label-gated.
  run_eval:
    timeout-minutes: 60
    name: Run Agent Eval
    if: >
      github.repository_owner == 'zed-industries' &&
      (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
    runs-on:
      - buildjet-16vcpu-ubuntu-2204
    steps:
      - name: Add Rust to the PATH
        # The redirect target is quoted so the path is never word-split or
        # glob-expanded by the shell.
        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Checkout repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4
        with:
          clean: false

      - name: Cache dependencies
        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6  # v2
        with:
          # Only runs on `main` save the cache.
          save-if: ${{ github.ref == 'refs/heads/main' }}
          cache-provider: "buildjet"

      - name: Install Linux dependencies
        run: ./script/linux

      - name: Configure CI
        # Copies the repository's CI cargo config into a `.cargo` directory
        # one level above the checkout; the last step removes it again.
        run: |
          mkdir -p ./../.cargo
          cp ./.cargo/ci-config.toml ./../.cargo/config.toml

      - name: Compile eval
        run: cargo build --package=eval

      - name: Run eval
        # 8 repetitions: runs with 1 or 3 repetitions showed a lot of
        # variation, while two runs with 8 on the same branch were reasonably
        # consistent. Concurrency stays at 1 because higher concurrency with
        # this many repetitions made language servers time out, failing evals.
        run: cargo run --package=eval -- --repetitions=8 --concurrency=1

      # The Linux runner is not stateful, so in theory this cleanup is not
      # needed. It is kept as a precaution: if a stateful Linux runner is
      # adopted later, the config file written by "Configure CI" would
      # otherwise be left behind. `if: always()` runs it even when an earlier
      # step fails.
      - name: Clean CI config file
        if: always()
        run: rm -rf ./../.cargo
|