Source code for olmo_tap.final_evals.elo.elo_engine

"""Permutation-averaged Elo with K-factor sensitivity sweep.

Implements the robustness recipe from Boubdir et al. (2023), *Elo Uncovered:
Robustness and Best Practices in Language Model Evaluation*:

  - Initial rating ``R_0 = 1400`` for every entrant.
  - Update rule
    ``R'_A = R_A + K * (S_A - E_A)`` with
    ``E_A = 1 / (1 + 10^((R_B - R_A) / 400))`` (see the worked example
    after this list).
  - Ties are dropped from the match list (consistent with the paper's
    handling of inconsistent judge verdicts).
  - The match list is shuffled ``n_perms`` times; per-permutation Elo is
    computed independently and we report mean ± SEM across permutations
    rather than a single-pass score.
  - A K-factor sweep returns the same per-entrant statistics over a list
    of K values, enabling the sensitivity heatmap (Figure 3 of the paper).
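
As a worked example of the update rule: two entrants both at the initial
rating of 1400 have ``E_A = 1 / (1 + 10^0) = 0.5``, so a win for A at
``K = 16`` gives ``R'_A = 1400 + 16 * (1 - 0.5) = 1408`` and, symmetrically,
``R'_B = 1392``.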

The input is a flat list of ``(entrant_a, entrant_b, winner)`` triples where
``winner`` is either one of the entrant ids or ``None`` / ``"TIE"`` to mark a
tie. See :func:`compute_elo_permutation` for the full contract.
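
For example, ``("model-a", "model-b", "model-a")`` records a win for
``model-a`` over ``model-b``, while ``("model-a", "model-b", "TIE")`` is
normalised to ``None`` and dropped (the entrant ids are illustrative).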

Implementation notes:

  - All shuffling uses ``numpy.random.default_rng(seed)`` and the seed is
    surfaced on the result for reproducibility / logging.
  - The hot loop is written with simple Python and ``math.pow`` rather than
    NumPy because matches are processed serially per permutation; the
    speedup from vectorising would be marginal and would obscure the update
    rule.
  - Ratings are stored in a plain ``dict`` keyed by entrant id so the engine
    is agnostic to entrant ordering and to the entrant set.
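
Example (entrant ids and verdicts below are illustrative)::

    from olmo_tap.final_evals.elo.elo_engine import (
        compute_elo_permutation,
        rank_entrants,
    )

    matches = [
        ("model-a", "model-b", "model-a"),
        ("model-a", "model-c", "model-c"),
        ("model-b", "model-c", "TIE"),  # dropped before the shuffle
    ]
    results = compute_elo_permutation(matches, k=16.0, n_perms=100, seed=0)
    for entrant_id, mean in rank_entrants(results):
        print(entrant_id, round(mean, 1))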
"""

from __future__ import annotations

import math
from dataclasses import dataclass, field
from typing import Iterable, Sequence

import numpy as np

# Type alias for a single match record. ``winner`` is one of the two entrant
# ids on a decisive verdict, or ``None`` (canonical) / ``"TIE"`` for a tie.
Match = tuple[str, str, str | None]

DEFAULT_INITIAL_RATING: float = 1400.0
DEFAULT_K: float = 16.0
DEFAULT_N_PERMS: int = 500
DEFAULT_K_SWEEP: tuple[int, ...] = (1, 4, 8, 16, 32)
TIE_TOKEN: str = "TIE"


@dataclass(frozen=True)
class EloResult:
    """Per-entrant rating statistics across permutations.

    Attributes:
        entrant_id: The entrant the statistics are for.
        mean: Mean Elo rating across the permutations.
        sem: Standard error of the mean (``std(ddof=1) / sqrt(n_perms)``).
        ci95_low: ``mean - 1.96 * sem``.
        ci95_high: ``mean + 1.96 * sem``.
        per_perm_ratings: 1-D array of per-permutation final ratings; the
            full trace is returned so that downstream reports can plot
            distributions, not just summary statistics.
    """

    entrant_id: str
    mean: float
    sem: float
    ci95_low: float
    ci95_high: float
    per_perm_ratings: np.ndarray = field(repr=False)
def _normalise_match(match: Match) -> Match:
    """Coerce a tie marker into the canonical ``None`` form.

    Accepts ``None`` or the string ``"TIE"`` (case-insensitive) for the
    winner field; everything else must equal one of the two entrant ids and
    is returned unchanged.
    """
    a, b, winner = match
    if winner is None:
        return (a, b, None)
    if isinstance(winner, str) and winner.upper() == TIE_TOKEN:
        return (a, b, None)
    if winner != a and winner != b:
        raise ValueError(
            f"Match winner {winner!r} must be one of {a!r}, {b!r}, "
            f"None, or {TIE_TOKEN!r}."
        )
    return (a, b, winner)


def _drop_ties(matches: Iterable[Match]) -> list[Match]:
    """Filter out tied matches, returning a list of decisive matches."""
    out: list[Match] = []
    for m in matches:
        norm = _normalise_match(m)
        if norm[2] is None:
            continue
        out.append(norm)
    return out


def _expected_score(rating_a: float, rating_b: float) -> float:
    """Standard Elo expected score for player A.

    ``E_A = 1 / (1 + 10 ** ((R_B - R_A) / 400))``.
    """
    return 1.0 / (1.0 + math.pow(10.0, (rating_b - rating_a) / 400.0))


def _entrant_ids(matches: Iterable[Match]) -> list[str]:
    """Return the unique entrant ids referenced by a match list, sorted."""
    seen: set[str] = set()
    for a, b, _ in matches:
        seen.add(a)
        seen.add(b)
    return sorted(seen)


def _run_single_pass(
    matches: Sequence[Match],
    entrants: Sequence[str],
    *,
    k: float,
    initial_rating: float,
) -> dict[str, float]:
    """Run a single sequential pass of the Elo update over ``matches``."""
    ratings: dict[str, float] = {e: initial_rating for e in entrants}
    for a, b, winner in matches:
        r_a = ratings[a]
        r_b = ratings[b]
        e_a = _expected_score(r_a, r_b)
        # winner is guaranteed to be either ``a`` or ``b`` after _drop_ties
        s_a = 1.0 if winner == a else 0.0
        s_b = 1.0 - s_a
        ratings[a] = r_a + k * (s_a - e_a)
        ratings[b] = r_b + k * (s_b - (1.0 - e_a))
    return ratings
def compute_elo_permutation(
    matches: Iterable[Match],
    *,
    k: float = DEFAULT_K,
    initial_rating: float = DEFAULT_INITIAL_RATING,
    n_perms: int = DEFAULT_N_PERMS,
    seed: int = 0,
) -> dict[str, EloResult]:
    """Compute permutation-averaged Elo ratings.

    The match list is shuffled ``n_perms`` times with
    :func:`numpy.random.default_rng` seeded with ``seed``; for each
    permutation the Elo update is run sequentially and the final rating per
    entrant is recorded. The returned dict reports mean ± SEM and 95% CI
    per entrant alongside the full per-permutation trace.

    Args:
        matches: Iterable of ``(entrant_a, entrant_b, winner)`` triples.
            ``winner`` may be ``None`` or ``"TIE"`` to mark a tie; ties are
            dropped before the permutation loop.
        k: Elo K-factor.
        initial_rating: Starting rating for every entrant on every
            permutation. Boubdir et al. (2023) fix this at 1400.
        n_perms: Number of independent permutations to run. The paper
            recommends ``>= 100``; the headline run uses 500.
        seed: Seed for the NumPy ``Generator`` driving the shuffles. Logged
            on the result so runs are reproducible.

    Returns:
        Mapping ``entrant_id -> EloResult`` for every entrant referenced by
        ``matches``.

    Raises:
        ValueError: If ``n_perms < 1``, if no decisive matches remain after
            dropping ties, or if any match has a winner that is neither of
            the two entrants nor a recognised tie marker.
    """
    if n_perms < 1:
        raise ValueError(f"n_perms must be >= 1, got {n_perms}.")
    decisive = _drop_ties(matches)
    if not decisive:
        raise ValueError(
            "No decisive matches after dropping ties; "
            "cannot compute Elo over an empty match list."
        )

    entrants = _entrant_ids(decisive)
    rng = np.random.default_rng(seed)
    n = len(decisive)

    # Per-perm final ratings: row i = perm i, col j = entrants[j].
    final_ratings = np.empty((n_perms, len(entrants)), dtype=np.float64)
    for perm_idx in range(n_perms):
        order = rng.permutation(n)
        shuffled = [decisive[i] for i in order]
        ratings = _run_single_pass(
            shuffled, entrants, k=k, initial_rating=initial_rating
        )
        for col, entrant_id in enumerate(entrants):
            final_ratings[perm_idx, col] = ratings[entrant_id]

    results: dict[str, EloResult] = {}
    for col, entrant_id in enumerate(entrants):
        traces = final_ratings[:, col]
        mean = float(traces.mean())
        # ddof=1 for the unbiased estimator; SEM is undefined for n_perms == 1.
        if n_perms > 1:
            std = float(traces.std(ddof=1))
            sem = std / math.sqrt(n_perms)
        else:
            sem = 0.0
        results[entrant_id] = EloResult(
            entrant_id=entrant_id,
            mean=mean,
            sem=sem,
            ci95_low=mean - 1.96 * sem,
            ci95_high=mean + 1.96 * sem,
            per_perm_ratings=traces.copy(),
        )
    return results
def k_factor_sweep(
    matches: Iterable[Match],
    *,
    k_values: Sequence[int | float] = DEFAULT_K_SWEEP,
    initial_rating: float = DEFAULT_INITIAL_RATING,
    n_perms: int = DEFAULT_N_PERMS,
    seed: int = 0,
) -> dict[float, dict[str, EloResult]]:
    """Run :func:`compute_elo_permutation` across a list of K-factors.

    Returns a heatmap-ready ``{k: {entrant_id: EloResult}}`` structure. The
    same shuffle ``seed`` is used for every K so the difference across the
    sweep reflects the K-factor's effect rather than shuffle variance.

    Args:
        matches: Same contract as :func:`compute_elo_permutation`.
        k_values: K-factors to sweep. The default ``(1, 4, 8, 16, 32)``
            covers the range where Boubdir et al. (2023) observed ranking
            changes on their benchmark suites.
        initial_rating: Starting rating per entrant per permutation.
        n_perms: Permutations per K-factor.
        seed: Seed for every K's permutation loop.

    Returns:
        Mapping ``k -> {entrant_id: EloResult}``. K-factors are stored as
        ``float`` keys for stable hashing across int / float inputs.
    """
    # Materialise once so each K sees the same input regardless of generator
    # exhaustion or list mutation.
    materialised = list(matches)
    sweep: dict[float, dict[str, EloResult]] = {}
    for k in k_values:
        sweep[float(k)] = compute_elo_permutation(
            materialised,
            k=float(k),
            initial_rating=initial_rating,
            n_perms=n_perms,
            seed=seed,
        )
    return sweep
def rank_entrants(
    results: dict[str, EloResult],
) -> list[tuple[str, float]]:
    """Return ``[(entrant_id, mean_rating), ...]`` sorted high-to-low."""
    return sorted(
        ((eid, r.mean) for eid, r in results.items()),
        key=lambda x: x[1],
        reverse=True,
    )
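

if __name__ == "__main__":
    # Minimal smoke demo, not part of the evaluation pipeline. The match
    # list is synthetic (entrant ids and verdicts are made up) and exists
    # only to exercise the permutation loop, the K-factor sweep, and the
    # ranking helper end to end.
    demo_matches: list[Match] = [
        ("model-a", "model-b", "model-a"),
        ("model-a", "model-b", "model-a"),
        ("model-a", "model-c", "model-c"),
        ("model-b", "model-c", "TIE"),  # normalised to None and dropped
        ("model-b", "model-c", "model-b"),
    ]
    sweep = k_factor_sweep(demo_matches, k_values=(4, 16), n_perms=200)
    for k, per_k in sweep.items():
        # A ranking that is stable across K values suggests the ordering is
        # not an artefact of the chosen K-factor.
        ranking = rank_entrants(per_k)
        print(f"K={k:g}:", [(eid, round(mean, 1)) for eid, mean in ranking])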