# Source code for olmo_tap.benchmarks.harness
import statistics
import torch
def get_l2_flush_buffer():
    """Allocate an int32 CUDA tensor sized to cover the device's L2 cache.

    Zeroing this buffer between timed calls evicts the L2 cache so every
    measurement starts from a cold cache.

    Returns:
        torch.Tensor: 1-D int32 tensor on the current CUDA device whose
        byte size equals the device's L2 cache size.
    """
    # Query the *current* device rather than hard-coding device 0, so the
    # buffer size matches the device the timed kernels actually run on.
    device = torch.cuda.current_device()
    l2_bytes = torch.cuda.get_device_properties(device).L2_cache_size
    n_elements = l2_bytes // 4  # 4 bytes per int32 element
    return torch.empty(n_elements, dtype=torch.int32, device="cuda")
def timed_call(fn, setup=None, flush_buf=None):
    """Measure one GPU invocation of ``fn`` with CUDA events.

    Args:
        fn: Zero-argument callable to time.
        setup: Optional zero-argument callable invoked before timing.
        flush_buf: Optional tensor zeroed before timing to evict L2.

    Returns:
        float: Elapsed time of ``fn`` in milliseconds.
    """
    if setup is not None:
        setup()
    if flush_buf is not None:
        flush_buf.zero_()
    # Drain all pending GPU work so only ``fn`` lands between the events.
    torch.cuda.synchronize()
    start_evt = torch.cuda.Event(enable_timing=True)
    stop_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    fn()
    stop_evt.record()
    # Wait for the stop event before reading the elapsed time.
    torch.cuda.synchronize()
    return start_evt.elapsed_time(stop_evt)
def benchmark(fn, warmup_ms=100.0, rep_ms=1000.0, flush_l2=False, setup=None):
    """Repeatedly time ``fn`` on the GPU and return the raw samples.

    Args:
        fn: Zero-argument callable to benchmark.
        warmup_ms: Approximate total wall time (ms) to spend warming up.
        rep_ms: Approximate total wall time (ms) to spend on measured reps.
        flush_l2: If True, zero an L2-sized buffer before every timed call
            so each sample starts from a cold cache.
        setup: Optional zero-argument callable run before every timed call.

    Returns:
        list[float]: Per-call timings in milliseconds.
    """
    if setup is not None:
        setup()
    fn()  # one untimed call to trigger lazy init / JIT compilation
    torch.cuda.synchronize()
    flush_buf = get_l2_flush_buffer() if flush_l2 else None
    # Median of a few probe runs estimates per-call cost, which sizes the
    # warmup and measurement loops to the requested time budgets.
    estimate_ms = statistics.median(
        [timed_call(fn, setup, flush_buf) for _ in range(5)]
    )
    # elapsed_time can legitimately report 0.0 for sub-resolution kernels;
    # clamp to a tiny floor so the divisions below cannot raise
    # ZeroDivisionError (counts are capped implicitly by int()).
    estimate_ms = max(estimate_ms, 1e-6)
    n_warmup = max(1, int(warmup_ms / estimate_ms))
    n_repeat = max(1, int(rep_ms / estimate_ms))
    for _ in range(n_warmup):
        timed_call(fn, setup, flush_buf)
    return [timed_call(fn, setup, flush_buf) for _ in range(n_repeat)]
def filter_outliers_iqr(timings, factor=1.5):
    """Drop samples outside Tukey's fences ``[q1 - f*iqr, q3 + f*iqr]``.

    Args:
        timings: Sequence of timing samples (ms).
        factor: Multiplier on the interquartile range; 1.5 is the
            conventional Tukey fence.

    Returns:
        list: Samples lying within the fences, in original order.
    """
    # statistics.quantiles raises for < 2 points, and with very few samples
    # the exclusive-method quartiles extrapolate wildly; with fewer than 4
    # samples just pass everything through.
    if len(timings) < 4:
        return list(timings)
    q1, _, q3 = statistics.quantiles(timings)
    iqr = q3 - q1
    lo = q1 - factor * iqr
    hi = q3 + factor * iqr
    return [t for t in timings if lo <= t <= hi]
def compute_stats(timings):
    """Summarize a list of timing samples in milliseconds.

    Args:
        timings: Non-empty list of timings (ms).

    Returns:
        dict: mean/median/std (population)/min/max/p20/p80 in ms, each
        rounded to 4 decimal places, plus the sample count ``n``.

    Raises:
        ZeroDivisionError: If ``timings`` is empty.
    """
    sorted_t = sorted(timings)
    n = len(sorted_t)
    mean = sum(sorted_t) / n
    # statistics.median averages the two middle values for even n;
    # sorted_t[n // 2] would return only the upper of the two.
    med = statistics.median(sorted_t)
    # Population (not sample) variance: divide by n, not n - 1.
    variance = sum((t - mean) ** 2 for t in sorted_t) / n
    return {
        "mean_ms": round(mean, 4),
        "median_ms": round(med, 4),
        "std_ms": round(variance**0.5, 4),
        "min_ms": round(sorted_t[0], 4),
        "max_ms": round(sorted_t[-1], 4),
        "p20_ms": round(sorted_t[int(n * 0.2)], 4),
        "p80_ms": round(sorted_t[int(n * 0.8)], 4),
        "n": n,
    }