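"""
Input generator for the mm benchmark: samples random (m, k, n) shapes and
dtypes and compiles torch.mm with max-autotune for every transpose layout.
"""
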
import itertools
import os
import random
import sys
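
# Make benchmark_runner and benchmark_utils, which live one directory up,
# importable.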
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from typing import Any

from benchmark_runner import BenchmarkRunner  # type: ignore[import-not-found]
from benchmark_utils import (  # type: ignore[import-not-found]
    fits_in_memory,
    get_mm_tensors,
    get_random_between_pow2,
    set_precision,
)

import torch
from torch._inductor.utils import fresh_inductor_cache


class BenchmarkRunnerMM(BenchmarkRunner):  # type: ignore[misc, no-any-unimported]
    """
    BenchmarkRunner for mm.
    """

    def __init__(self) -> None:
        super().__init__("mm")

    def create_input(self) -> tuple[Any, ...]:
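        """
        Sample a random dtype and a random (m, k, n) shape for one benchmark run.
        """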
        dtype = random.choices([torch.float32, torch.float16, torch.bfloat16])[0]
        set_precision(dtype)
        m, k, n = self.get_m_k_n(dtype)
        return (m, k, n, dtype)

    def run_benchmark(
        self,
        m: int,
        k: int,
        n: int,
        dtype: Any,
    ) -> Any:
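        """
        Compile torch.mm with max-autotune for all four combinations of
        transposed/non-transposed operands of the given shape and dtype.
        """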
        # for a given shape, test all possible combinations of transpose_left and transpose_right
        for transpose_left, transpose_right in itertools.product(
            [False, True], repeat=2
        ):
            print(
                f"m: {m}, k: {k}, n: {n}, transpose_left: {transpose_left}, transpose_right: {transpose_right}, dtype: {dtype}"
            )
            a, b = get_mm_tensors(
                m,
                k,
                n,
                transpose_left,
                transpose_right,
                dtype_left=dtype,
                dtype_right=dtype,
            )
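            # A fresh cache ensures Inductor re-runs autotuning for this shape
            # instead of reusing results from a previous run.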
            with fresh_inductor_cache():

                def mixed_mm(A: Any, B: Any) -> Any:
                    return torch.mm(A, B)

                cf = torch.compile(mixed_mm, mode="max-autotune-no-cudagraphs")
                cf(a, b)
                torch.compiler.reset()

    def random_multiple_of_128(self, min_num: int = 7, max_num: int = 17) -> int:
        # generates a random number ran_pow2 between min_num and max_num - 1
        # and returns a random multiple of 128 between 2^ran_pow2 and 2^(ran_pow2+1)
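        # e.g. ran_pow2 = 10 yields randint(8, 16) * 128, i.e. a multiple of
        # 128 between 1024 (2^10) and 2048 (2^11)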
        ran_pow2 = random.randint(min_num, max_num - 1)
        start = (2**ran_pow2) // 128
        end = (2 ** (ran_pow2 + 1)) // 128
        random_multiple = random.randint(start, end)
        return random_multiple * 128

    def get_distr_type(self) -> str:
        # 85%: choose a random multiple of 128 between 2^10 and 2^17
        # 10%: choose a random power of 2 between 2^0 and 2^17
        # 4%: choose a random number between 1 and 131072
        # 1%: choose a random number between 2^i and 2^(i+1) with i in [1, 16]
        return random.choices(
            ["mult_128", "pow2", "uniform", "uniform-between-pow2"],
            [0.85, 0.1, 0.04, 0.01],
        )[0]

    def get_random_dim(self) -> int:
        distr_type = self.get_distr_type()
        if distr_type == "mult_128":
            return self.random_multiple_of_128(min_num=10, max_num=17)
        elif distr_type == "pow2":
            return int(2 ** random.randint(0, 17))
        elif distr_type == "uniform-between-pow2":
            # TODO(AlnisM): make mypy work for torchgen/_autoheuristic/
            return int(get_random_between_pow2(min_power2=1, max_power2=17))
        elif distr_type == "uniform":
            return random.randint(1, 131072)
        print(f"random_type {distr_type} not supported")
        sys.exit(1)

    def get_m_k_n(self, dtype: Any) -> tuple[int, int, int]:
        numel_max = 2**31
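        # operands with >= 2**31 elements are rejected below; 2**31 likely
        # corresponds to the 32-bit indexing limit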
        # repeat until tensors fit in memory
        while True:
            m = self.get_random_dim()
            k = self.get_random_dim()
            n = self.get_random_dim()
            if m * k >= numel_max or m * n >= numel_max or k * n >= numel_max:
                # autotuning will not happen for tensors that are this large
                continue
            if fits_in_memory(dtype, m, k, n):
                return (m, k, n)


if __name__ == "__main__":
    runner = BenchmarkRunnerMM()
    runner.run()