Source code for olmo_tap.benchmarks.harness

import statistics
import torch


[docs] def get_l2_flush_buffer(): l2_bytes = torch.cuda.get_device_properties(0).L2_cache_size n_elements = l2_bytes // 4 return torch.empty(n_elements, dtype=torch.int32, device="cuda")
[docs] def timed_call(fn, setup=None, flush_buf=None): if setup is not None: setup() if flush_buf is not None: flush_buf.zero_() torch.cuda.synchronize() start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) start.record() fn() end.record() torch.cuda.synchronize() return start.elapsed_time(end)
[docs] def benchmark(fn, warmup_ms=100.0, rep_ms=1000.0, flush_l2=False, setup=None): if setup is not None: setup() fn() torch.cuda.synchronize() flush_buf = get_l2_flush_buffer() if flush_l2 else None estimate_ms = statistics.median( [timed_call(fn, setup, flush_buf) for _ in range(5)] ) n_warmup = max(1, int(warmup_ms / estimate_ms)) n_repeat = max(1, int(rep_ms / estimate_ms)) for _ in range(n_warmup): timed_call(fn, setup, flush_buf) return [timed_call(fn, setup, flush_buf) for _ in range(n_repeat)]
[docs] def filter_outliers_iqr(timings, factor=1.5): q1, _, q3 = statistics.quantiles(timings) iqr = q3 - q1 lo = q1 - factor * iqr hi = q3 + factor * iqr return [t for t in timings if lo <= t <= hi]
[docs] def compute_stats(timings): sorted_t = sorted(timings) n = len(timings) mean = sum(timings) / n med = sorted_t[n // 2] variance = sum((t - mean) ** 2 for t in sorted_t) / n return { "mean_ms": round(mean, 4), "median_ms": round(med, 4), "std_ms": round(variance**0.5, 4), "min_ms": round(sorted_t[0], 4), "max_ms": round(sorted_t[-1], 4), "p20_ms": round(sorted_t[int(n * 0.2)], 4), "p80_ms": round(sorted_t[int(n * 0.8)], 4), "n": n, }