Source code for olmo_tap.final_evals.elo.report
"""Tournament reporting helpers.
Stubbed for now. Will produce:
- ``elo_results.json`` — mean / SEM / 95% CI per entrant per dimension,
plus the full per-permutation traces.
- ``sensitivity_heatmap.png`` — K × entrant heatmap mirroring the
ranking-stability figure from Boubdir et al. (2023).
- ``pairwise_winrates.csv`` — raw win/loss/tie counts per pair per
dimension (pre-Elo).
- ``judge_log.jsonl`` — every judge query with full inputs, verdict,
reasoning trace, and cache key.
- ``run_manifest.json`` — timestamps, seeds, model versions, prompt-set
hash, rubric version (so reviewers can verify reproducibility).
"""
from __future__ import annotations

from pathlib import Path
from typing import Any, Mapping

from olmo_tap.final_evals.elo.elo_engine import EloResult
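
# Illustrative sketch only: one plausible layout for ``elo_results.json`` as
# described in the module docstring (mean / SEM / 95% CI per entrant per
# dimension, plus per-permutation traces). Every key name below is a
# hypothetical placeholder, not a committed schema.
#
#   {
#     "<dimension>": {
#       "<entrant>": {
#         "mean": <float>,
#         "sem": <float>,
#         "ci95": [<lower>, <upper>],
#         "permutation_traces": [<float>, ...]
#       }
#     }
#   }
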
def write_results_json(
    results_per_dim: Mapping[str, dict[str, EloResult]],
    out_path: Path,
) -> None:
    """Serialise per-dimension Elo results to ``out_path``."""
    raise NotImplementedError(
        "write_results_json is not yet implemented — pending the reporting build-out."
    )
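

# A minimal sketch of what ``write_results_json`` might look like once
# implemented, assuming ``EloResult`` is a dataclass (or a plain object whose
# ``__dict__`` mirrors its public fields). The helper name and its behaviour
# are illustrative, not the planned implementation.
def _write_results_json_sketch(
    results_per_dim: Mapping[str, dict[str, EloResult]],
    out_path: Path,
) -> None:
    import dataclasses
    import json

    payload = {
        dimension: {
            entrant: (
                dataclasses.asdict(result)
                if dataclasses.is_dataclass(result)
                else vars(result)
            )
            for entrant, result in per_entrant.items()
        }
        for dimension, per_entrant in results_per_dim.items()
    }
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # ``default=str`` keeps non-JSON-native values (e.g. numpy scalars) writable.
    out_path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
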
def render_sensitivity_heatmap(
    sweep: Mapping[float, dict[str, EloResult]],
    out_path: Path,
    *,
    dimension: str,
) -> None:
    """Render the K × entrant heatmap (one PNG per dimension)."""
    raise NotImplementedError(
        "render_sensitivity_heatmap is not yet implemented — pending the "
        "reporting build-out."
    )
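

# A minimal sketch of the heatmap rendering, assuming ``sweep`` maps each K
# value to per-entrant results, that ``EloResult`` exposes its mean rating via
# a ``mean`` attribute (a placeholder name), and that matplotlib is available.
# Illustrative only; not the planned implementation.
def _render_sensitivity_heatmap_sketch(
    sweep: Mapping[float, dict[str, EloResult]],
    out_path: Path,
    *,
    dimension: str,
) -> None:
    import matplotlib.pyplot as plt

    k_values = sorted(sweep)
    entrants = sorted({name for per_entrant in sweep.values() for name in per_entrant})

    # Rows = K values, columns = entrants; missing cells fall back to NaN.
    grid = []
    for k in k_values:
        per_entrant = sweep[k]
        grid.append(
            [getattr(per_entrant.get(name), "mean", float("nan")) for name in entrants]
        )

    fig, ax = plt.subplots(figsize=(1.5 + 0.6 * len(entrants), 1.5 + 0.4 * len(k_values)))
    image = ax.imshow(grid, aspect="auto")
    ax.set_xticks(range(len(entrants)))
    ax.set_xticklabels(entrants, rotation=45, ha="right")
    ax.set_yticks(range(len(k_values)))
    ax.set_yticklabels([str(k) for k in k_values])
    ax.set_xlabel("entrant")
    ax.set_ylabel("K factor")
    ax.set_title(f"Ranking sensitivity: {dimension}")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)
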
def write_run_manifest(
    config: Mapping[str, Any],
    out_path: Path,
) -> None:
    """Snapshot every reproducibility-relevant input for the report."""
    raise NotImplementedError(
        "write_run_manifest is not yet implemented — pending the reporting build-out."
    )
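

# A minimal sketch of the manifest writer, assuming ``config`` already carries
# the reproducibility fields named in the module docstring (seeds, model
# versions, prompt-set hash, rubric version). Only a write timestamp is added;
# the key name ``written_at`` is a placeholder. Illustrative only.
def _write_run_manifest_sketch(
    config: Mapping[str, Any],
    out_path: Path,
) -> None:
    import json
    from datetime import datetime, timezone

    manifest = {"written_at": datetime.now(timezone.utc).isoformat(), **dict(config)}
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        json.dumps(manifest, indent=2, default=str, sort_keys=True), encoding="utf-8"
    )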