Source code for olmo_tap.final_evals.elo.elo_engine

"""Permutation-averaged Elo with K-factor sensitivity sweep.

Implements the robustness recipe from Boubdir et al. (2023), *Elo Uncovered:
Robustness and Best Practices in Language Model Evaluation*:

  - Initial rating ``R_0 = 1400`` for every entrant.
  - Update rule
    ``R'_A = R_A + K * (S_A - E_A)`` with
    ``E_A = 1 / (1 + 10^((R_B - R_A) / 400))`` (see the worked example
    after this list).
  - Ties are dropped from the match list (consistent with the paper's
    handling of inconsistent judge verdicts).
  - The match list is shuffled ``n_perms`` times; per-permutation Elo is
    computed independently and we report mean ± SEM across permutations
    rather than a single-pass score.
  - A K-factor sweep returns the same per-entrant statistics over a list
    of K values, enabling the sensitivity heatmap (Figure 3 of the paper).
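
As a worked example of the update rule: two entrants both at the initial
rating of 1400 have ``E_A = 1 / (1 + 10^0) = 0.5``, so a win for A at
``K = 16`` gives ``R'_A = 1400 + 16 * (1 - 0.5) = 1408`` and, symmetrically,
``R'_B = 1392``.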

The input is a flat list of ``(entrant_a, entrant_b, winner)`` triples where
``winner`` is either one of the entrant ids or ``None`` / ``"TIE"`` to mark a
tie. See :func:`compute_elo_permutation` for the full contract.
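
For example, ``("model-a", "model-b", "model-a")`` records a win for
``model-a`` over ``model-b``, while ``("model-a", "model-b", "TIE")`` is
normalised to ``None`` and dropped (the entrant ids are illustrative).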

Implementation notes:

  - All shuffling uses ``numpy.random.default_rng(seed)`` and the seed is
    surfaced on the result for reproducibility / logging.
  - The hot loop is written with simple Python and ``math.pow`` rather than
    NumPy because matches are processed serially per permutation; the
    speedup from vectorising would be marginal and would obscure the update
    rule.
  - Ratings are stored in a plain ``dict`` keyed by entrant id so the engine
    is agnostic to entrant ordering and to the entrant set.
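
Example (entrant ids and verdicts below are illustrative)::

    from olmo_tap.final_evals.elo.elo_engine import (
        compute_elo_permutation,
        rank_entrants,
    )

    matches = [
        ("model-a", "model-b", "model-a"),
        ("model-a", "model-c", "model-c"),
        ("model-b", "model-c", "TIE"),  # dropped before the shuffle
    ]
    results = compute_elo_permutation(matches, k=16.0, n_perms=100, seed=0)
    for entrant_id, mean in rank_entrants(results):
        print(entrant_id, round(mean, 1))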
"""

from __future__ import annotations

import math
from dataclasses import dataclass, field
from typing import Iterable, Sequence

import numpy as np

# Type alias for a single match record. ``winner`` is one of the two entrant
# ids on a decisive verdict, or ``None`` (canonical) / ``"TIE"`` for a tie.
Match = tuple[str, str, str | None]

DEFAULT_INITIAL_RATING: float = 1400.0
DEFAULT_K: float = 16.0
DEFAULT_N_PERMS: int = 500
DEFAULT_K_SWEEP: tuple[int, ...] = (1, 4, 8, 16, 32)
TIE_TOKEN: str = "TIE"


@dataclass(frozen=True)
class EloResult:
    """Per-entrant rating statistics across permutations.

    Attributes:
        entrant_id: The entrant the statistics are for.
        mean: Mean Elo rating across the permutations.
        sem: Standard error of the mean (``std(ddof=1) / sqrt(n_perms)``).
        ci95_low: ``mean - 1.96 * sem``.
        ci95_high: ``mean + 1.96 * sem``.
        per_perm_ratings: 1-D array of per-permutation final ratings; the
            full trace is returned so that downstream reports can plot
            distributions, not just summary statistics.
    """

    entrant_id: str
    mean: float
    sem: float
    ci95_low: float
    ci95_high: float
    per_perm_ratings: np.ndarray = field(repr=False)
def _normalise_match(match: Match) -> Match:
    """Coerce a tie marker into the canonical ``None`` form.

    Accepts ``None`` or the string ``"TIE"`` (case-insensitive) for the
    winner field; everything else must equal one of the two entrant ids and
    is returned unchanged.
    """
    a, b, winner = match
    if winner is None:
        return (a, b, None)
    if isinstance(winner, str) and winner.upper() == TIE_TOKEN:
        return (a, b, None)
    if winner != a and winner != b:
        raise ValueError(
            f"Match winner {winner!r} must be one of {a!r}, {b!r}, "
            f"None, or {TIE_TOKEN!r}."
        )
    return (a, b, winner)


def _drop_ties(matches: Iterable[Match]) -> list[Match]:
    """Filter out tied matches, returning a list of decisive matches."""
    out: list[Match] = []
    for m in matches:
        norm = _normalise_match(m)
        if norm[2] is None:
            continue
        out.append(norm)
    return out


def _expected_score(rating_a: float, rating_b: float) -> float:
    """Standard Elo expected score for player A.

    ``E_A = 1 / (1 + 10 ** ((R_B - R_A) / 400))``.
    """
    return 1.0 / (1.0 + math.pow(10.0, (rating_b - rating_a) / 400.0))


def _entrant_ids(matches: Iterable[Match]) -> list[str]:
    """Return the unique entrant ids referenced by a match list, sorted."""
    seen: set[str] = set()
    for a, b, _ in matches:
        seen.add(a)
        seen.add(b)
    return sorted(seen)


def _run_single_pass(
    matches: Sequence[Match],
    entrants: Sequence[str],
    *,
    k: float,
    initial_rating: float,
) -> dict[str, float]:
    """Run a single sequential pass of the Elo update over ``matches``."""
    ratings: dict[str, float] = {e: initial_rating for e in entrants}
    for a, b, winner in matches:
        r_a = ratings[a]
        r_b = ratings[b]
        e_a = _expected_score(r_a, r_b)
        # winner is guaranteed to be either ``a`` or ``b`` after _drop_ties
        s_a = 1.0 if winner == a else 0.0
        s_b = 1.0 - s_a
        ratings[a] = r_a + k * (s_a - e_a)
        ratings[b] = r_b + k * (s_b - (1.0 - e_a))
    return ratings
def compute_elo_permutation(
    matches: Iterable[Match],
    *,
    k: float = DEFAULT_K,
    initial_rating: float = DEFAULT_INITIAL_RATING,
    n_perms: int = DEFAULT_N_PERMS,
    seed: int = 0,
) -> dict[str, EloResult]:
    """Compute permutation-averaged Elo ratings.

    The match list is shuffled ``n_perms`` times with
    :func:`numpy.random.default_rng` seeded with ``seed``; for each
    permutation the Elo update is run sequentially and the final rating per
    entrant is recorded. The returned dict reports mean ± SEM and 95% CI
    per entrant alongside the full per-permutation trace.

    Args:
        matches: Iterable of ``(entrant_a, entrant_b, winner)`` triples.
            ``winner`` may be ``None`` or ``"TIE"`` to mark a tie; ties are
            dropped before the permutation loop.
        k: Elo K-factor.
        initial_rating: Starting rating for every entrant on every
            permutation. Boubdir et al. (2023) fix this at 1400.
        n_perms: Number of independent permutations to run. The paper
            recommends ``>= 100``; the headline run uses 500.
        seed: Seed for the NumPy ``Generator`` driving the shuffles. Logged
            on the result so runs are reproducible.

    Returns:
        Mapping ``entrant_id -> EloResult`` for every entrant referenced by
        ``matches``.

    Raises:
        ValueError: If ``n_perms < 1``, if no decisive matches remain after
            dropping ties, or if any match has a winner that is neither of
            the two entrants nor a recognised tie marker.
    """
    if n_perms < 1:
        raise ValueError(f"n_perms must be >= 1, got {n_perms}.")
    decisive = _drop_ties(matches)
    if not decisive:
        raise ValueError(
            "No decisive matches after dropping ties; "
            "cannot compute Elo over an empty match list."
        )

    entrants = _entrant_ids(decisive)
    rng = np.random.default_rng(seed)
    n = len(decisive)

    # Per-perm final ratings: row i = perm i, col j = entrants[j].
    final_ratings = np.empty((n_perms, len(entrants)), dtype=np.float64)
    for perm_idx in range(n_perms):
        order = rng.permutation(n)
        shuffled = [decisive[i] for i in order]
        ratings = _run_single_pass(
            shuffled, entrants, k=k, initial_rating=initial_rating
        )
        for col, entrant_id in enumerate(entrants):
            final_ratings[perm_idx, col] = ratings[entrant_id]

    results: dict[str, EloResult] = {}
    for col, entrant_id in enumerate(entrants):
        traces = final_ratings[:, col]
        mean = float(traces.mean())
        # ddof=1 for the unbiased estimator; SEM is undefined for n_perms == 1.
        if n_perms > 1:
            std = float(traces.std(ddof=1))
            sem = std / math.sqrt(n_perms)
        else:
            sem = 0.0
        results[entrant_id] = EloResult(
            entrant_id=entrant_id,
            mean=mean,
            sem=sem,
            ci95_low=mean - 1.96 * sem,
            ci95_high=mean + 1.96 * sem,
            per_perm_ratings=traces.copy(),
        )
    return results
def k_factor_sweep(
    matches: Iterable[Match],
    *,
    k_values: Sequence[int | float] = DEFAULT_K_SWEEP,
    initial_rating: float = DEFAULT_INITIAL_RATING,
    n_perms: int = DEFAULT_N_PERMS,
    seed: int = 0,
) -> dict[float, dict[str, EloResult]]:
    """Run :func:`compute_elo_permutation` across a list of K-factors.

    Returns a heatmap-ready ``{k: {entrant_id: EloResult}}`` structure. The
    same shuffle ``seed`` is used for every K so the difference across the
    sweep reflects the K-factor's effect rather than shuffle variance.

    Args:
        matches: Same contract as :func:`compute_elo_permutation`.
        k_values: K-factors to sweep. The default ``(1, 4, 8, 16, 32)``
            covers the range where Boubdir et al. (2023) observed ranking
            changes on their benchmark suites.
        initial_rating: Starting rating per entrant per permutation.
        n_perms: Permutations per K-factor.
        seed: Seed for every K's permutation loop.

    Returns:
        Mapping ``k -> {entrant_id: EloResult}``. K-factors are stored as
        ``float`` keys for stable hashing across int / float inputs.
    """
    # Materialise once so each K sees the same input regardless of generator
    # exhaustion or list mutation.
    materialised = list(matches)
    sweep: dict[float, dict[str, EloResult]] = {}
    for k in k_values:
        sweep[float(k)] = compute_elo_permutation(
            materialised,
            k=float(k),
            initial_rating=initial_rating,
            n_perms=n_perms,
            seed=seed,
        )
    return sweep
def rank_entrants(
    results: dict[str, EloResult],
) -> list[tuple[str, float]]:
    """Return ``[(entrant_id, mean_rating), ...]`` sorted high-to-low."""
    return sorted(
        ((eid, r.mean) for eid, r in results.items()),
        key=lambda x: x[1],
        reverse=True,
    )
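

if __name__ == "__main__":
    # Minimal smoke demo, not part of the evaluation pipeline. The match
    # list is synthetic (entrant ids and verdicts are made up) and exists
    # only to exercise the permutation loop, the K-factor sweep, and the
    # ranking helper end to end.
    demo_matches: list[Match] = [
        ("model-a", "model-b", "model-a"),
        ("model-a", "model-b", "model-a"),
        ("model-a", "model-c", "model-c"),
        ("model-b", "model-c", "TIE"),  # normalised to None and dropped
        ("model-b", "model-c", "model-b"),
    ]
    sweep = k_factor_sweep(demo_matches, k_values=(4, 16), n_perms=200)
    for k, per_k in sweep.items():
        # A ranking that is stable across K values suggests the ordering is
        # not an artefact of the chosen K-factor.
        ranking = rank_entrants(per_k)
        print(f"K={k:g}:", [(eid, round(mean, 1)) for eid, mean in ranking])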