Source code for olmo_tap.final_evals.elo.report

"""Tournament reporting helpers.

Stubbed for now. Will produce:

  - ``elo_results.json`` — mean / SEM / 95% CI per entrant per dimension,
    plus the full per-permutation traces.
  - ``sensitivity_heatmap.png`` — K × entrant heatmap mirroring the
    ranking-stability figure from Boubdir et al. (2023).
  - ``pairwise_winrates.csv`` — raw win/loss/tie counts per pair per
    dimension (pre-Elo).
  - ``judge_log.jsonl`` — every judge query with full inputs, verdict,
    reasoning trace, and cache key.
  - ``run_manifest.json`` — timestamps, seeds, model versions, prompt-set
    hash, rubric version (so reviewers can verify reproducibility).
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Mapping

from olmo_tap.final_evals.elo.elo_engine import EloResult


def write_results_json(
    results_per_dim: Mapping[str, dict[str, EloResult]],
    out_path: Path,
) -> None:
    """Serialise per-dimension Elo results to ``out_path``."""
    raise NotImplementedError(
        "write_results_json is not yet implemented — pending the reporting build-out."
    )
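

# What a working ``write_results_json`` could look like. This is a minimal
# sketch, not the final implementation: it assumes ``EloResult`` is a
# dataclass (so ``dataclasses.asdict`` applies) whose fields carry the
# mean/SEM/CI values and per-permutation traces named in the module
# docstring. The helper name is hypothetical.
def _write_results_json_sketch(
    results_per_dim: Mapping[str, dict[str, EloResult]],
    out_path: Path,
) -> None:
    import dataclasses
    import json

    # Shape: {dimension: {entrant: {field: value, ...}}}
    payload = {
        dim: {
            entrant: dataclasses.asdict(result)
            for entrant, result in by_entrant.items()
        }
        for dim, by_entrant in results_per_dim.items()
    }
    out_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")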


def render_sensitivity_heatmap(
    sweep: Mapping[float, dict[str, EloResult]],
    out_path: Path,
    *,
    dimension: str,
) -> None:
    """Render the K × entrant heatmap (one PNG per dimension)."""
    raise NotImplementedError(
        "render_sensitivity_heatmap is not yet implemented — pending the "
        "reporting build-out."
    )
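

# What a working ``render_sensitivity_heatmap`` could look like. A minimal
# sketch assuming matplotlib is available and that ``EloResult`` exposes a
# ``rating`` attribute (hypothetical field name) holding the mean Elo score;
# the real figure may plot per-K ranks rather than raw ratings to mirror the
# Boubdir et al. (2023) ranking-stability view.
def _render_sensitivity_heatmap_sketch(
    sweep: Mapping[float, dict[str, EloResult]],
    out_path: Path,
    *,
    dimension: str,
) -> None:
    import matplotlib.pyplot as plt

    ks = sorted(sweep)  # rows: K values
    entrants = sorted(next(iter(sweep.values())))  # columns: entrant names
    grid = [[sweep[k][e].rating for e in entrants] for k in ks]

    fig, ax = plt.subplots()
    im = ax.imshow(grid, aspect="auto")
    ax.set_xticks(range(len(entrants)))
    ax.set_xticklabels(entrants, rotation=45, ha="right")
    ax.set_yticks(range(len(ks)))
    ax.set_yticklabels([f"{k:g}" for k in ks])
    ax.set_xlabel("entrant")
    ax.set_ylabel("K")
    ax.set_title(f"Elo sensitivity to K ({dimension})")
    fig.colorbar(im, ax=ax, label="mean Elo")
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)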


def write_run_manifest(
    config: Mapping[str, Any],
    out_path: Path,
) -> None:
    """Snapshot every reproducibility-relevant input for the report."""
    raise NotImplementedError(
        "write_run_manifest is not yet implemented — pending the reporting build-out."
    )
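

# What a working ``write_run_manifest`` could look like. A minimal sketch:
# it dumps whatever reproducibility inputs the caller supplies (seeds,
# model versions, prompt-set hash, rubric version, ...) plus a UTC
# timestamp, and assumes every value in ``config`` is JSON-serialisable.
def _write_run_manifest_sketch(
    config: Mapping[str, Any],
    out_path: Path,
) -> None:
    import json
    from datetime import datetime, timezone

    manifest: dict[str, Any] = {
        "written_at": datetime.now(timezone.utc).isoformat(),
    }
    manifest.update(config)
    out_path.write_text(json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8")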