Skip to content
52 changes: 52 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# mkl_umath ASV Benchmarks

Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).

The `micro/` suite times individual ufuncs in isolation. The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure the end-to-end impact of MKL ufunc acceleration in realistic workloads.

## Coverage

| File | Ufuncs | Dtypes | Sizes/Presets |
|------|--------|--------|---------------|
| `micro/bench_micro.py` | 25 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M |
| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) |
| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) |
| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) |
| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) |

## Running Benchmarks

Prerequisites:

```bash
pip install asv psutil
```

Run benchmarks against the current commit:

```bash
asv run --python=same --quick HEAD^!
```

Compare two commits:

```bash
asv continuous --python=same HEAD~1 HEAD
```

View results in a browser:

```bash
asv publish
asv preview
```

## Threading

Set `MKL_NUM_THREADS` to control the thread count used by MKL:

```bash
MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
```

If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).
20 changes: 20 additions & 0 deletions benchmarks/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"version": 1,
"project": "mkl_umath",
"project_url": "https://github.com/IntelPython/mkl_umath",
"repo": "..",
"branches": [
"main"
],
"environment_type": "existing",
"benchmark_dir": "benchmarks",
"env_dir": ".asv/env",
"results_dir": ".asv/results",
"html_dir": ".asv/html",
"show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/",
"build_cache_size": 2,
"default_benchmark_timeout": 1500,
"regressions_thresholds": {
".*": 0.2
}
}
26 changes: 26 additions & 0 deletions benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""ASV benchmarks for mkl_umath"""

import os

import psutil

from ._patch_setup import _apply_patches

# Minimum number of physical cores required for multi-threaded mode. The same
# value is used as the default MKL thread count once that threshold is met.
_MIN_THREADS = 4


def _physical_cores():
    """Return the physical core count reported by psutil, or 1 if none."""
    detected = psutil.cpu_count(logical=False)
    if not detected:
        # psutil gave no usable count: fall back to the conservative value.
        return 1
    return detected


def _thread_count():
    """Return the default MKL thread count as a string.

    Yields ``str(_MIN_THREADS)`` when at least ``_MIN_THREADS`` physical
    cores are available and ``"1"`` otherwise.
    """
    if _physical_cores() < _MIN_THREADS:
        return "1"
    return str(_MIN_THREADS)


# Keep an explicitly exported MKL_NUM_THREADS; otherwise install the computed
# default so the environment variable is always set.
_THREADS = os.environ.setdefault("MKL_NUM_THREADS", _thread_count())

# Patch NumPy once at package import, then drop the helper from the namespace.
_apply_patches()
del _apply_patches
65 changes: 65 additions & 0 deletions benchmarks/benchmarks/_patch_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""MKL patch setup — executed once per ASV worker process at import time.

Patches NumPy with the Intel MKL umath implementation.
Hard-fails with a descriptive RuntimeError if mkl_umath is missing or the
patch does not take effect, so benchmarks never silently run on stock NumPy.
"""

# (module name, patch function name) pairs applied by _apply_patches().
_PATCH_MAP = [
    ("mkl_umath", "patch_numpy_umath"),
]


def _apply_patches():
    """Apply every patch listed in ``_PATCH_MAP`` and report the outcome.

    For each ``(module name, patch function name)`` pair the module is
    imported, the patch function is looked up and called, and, when the
    module exposes a callable ``is_patched``, its result is checked.
    Afterwards one status line per patched module is printed, followed by
    a final confirmation line.

    Raises:
        RuntimeError: if a module cannot be imported, does not expose the
            patch function, the patch function raises, or ``is_patched()``
            returns a falsy value after patching.
    """
    import importlib

    import numpy as np

    patched = {}

    for mod_name, patch_fn_name in _PATCH_MAP:
        try:
            # import_module returns the named module itself, whereas
            # __import__ would return the top-level package for a dotted name.
            mod = importlib.import_module(mod_name)
        except ImportError as exc:
            raise RuntimeError(
                f"[mkl-patch] Cannot import {mod_name}: {exc}\n"
                f" Ensure the conda env contains {mod_name} "
                f"from the Intel channel.\n"
                " Required channels: "
                "https://software.repos.intel.com/python/conda"
            ) from exc

        patch_fn = getattr(mod, patch_fn_name, None)
        if patch_fn is None:
            raise RuntimeError(
                f"[mkl-patch] {mod_name} has no {patch_fn_name}(). "
                f"Upgrade {mod_name} to a version that exposes "
                "the stock-numpy patch API."
            )

        try:
            patch_fn()
        except Exception as exc:
            # Re-raise with context so the failing patch call is identified.
            raise RuntimeError(
                f"[mkl-patch] {mod_name}.{patch_fn_name}() raised: {exc!r}"
            ) from exc

        # The verification hook is optional: it is checked only when present.
        is_patched_fn = getattr(mod, "is_patched", None)
        if callable(is_patched_fn) and not is_patched_fn():
            raise RuntimeError(
                f"[mkl-patch] {mod_name}.is_patched() returned False "
                "after patching. NumPy may have been imported before "
                "patching in a conflicting state."
            )

        patched[mod_name] = mod

    # Best-effort diagnostics: a failing probe is reported as "unknown"
    # rather than aborting, because patching itself already succeeded.
    _attr_checks = {
        "mkl_umath": lambda: np.exp.__module__,
    }
    for mod_name in patched:
        try:
            attr = _attr_checks[mod_name]()
        except Exception:
            attr = "unknown"
        print(f"[mkl-patch] {mod_name}: numpy dispatch -> {attr}")

    print("[mkl-patch] ALL OK -- mkl_umath active")
Empty file.
90 changes: 90 additions & 0 deletions benchmarks/benchmarks/micro/bench_micro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Micro-benchmarks for mkl_umath unary ufuncs.

Times each unary ufunc, plus the binary ufuncs arctan2 and power,
over a Cartesian product of
dtype in [float32, float64]
size in [10_000, 100_000, 1_000_000]

Arrays are pre-allocated in setup() and reused across timing calls.
Patching is applied once at package import via benchmarks._patch_setup.
"""

import numpy as np

# Unary ufuncs under test, each with the sampling interval [low, high) used
# to draw its input values. The ufunc object is looked up on numpy by name.
_UFUNC_CONFIGS = {
    name: {"func": getattr(np, name), "low": low, "high": high}
    for name, low, high in (
        ("exp", -10.0, 10.0),
        ("exp2", -10.0, 10.0),
        ("expm1", -10.0, 10.0),
        ("log", 1e-3, 1e3),
        ("log2", 1e-3, 1e3),
        ("log10", 1e-3, 1e3),
        ("log1p", 0.0, 10.0),
        ("sin", -np.pi, np.pi),
        ("cos", -np.pi, np.pi),
        ("tan", -1.4, 1.4),
        ("arcsin", -1.0, 1.0),
        ("arccos", -1.0, 1.0),
        ("arctan", -10.0, 10.0),
        ("sinh", -5.0, 5.0),
        ("cosh", -5.0, 5.0),
        ("tanh", -5.0, 5.0),
        ("arcsinh", -10.0, 10.0),
        ("arccosh", 1.0, 100.0),
        ("arctanh", -0.99, 0.99),
        ("sqrt", 0.0, 100.0),
        ("cbrt", -100.0, 100.0),
        ("square", -10.0, 10.0),
        ("fabs", -100.0, 100.0),
        ("absolute", -100.0, 100.0),
        ("reciprocal", 0.01, 100.0),
    )
}


class BenchMicro:
    """Time every unary ufunc in ``_UFUNC_CONFIGS`` per dtype and array size.

    ``params`` is the product of the sorted ufunc names, two float dtypes
    and three array sizes.
    """

    params = (
        sorted(_UFUNC_CONFIGS),
        ["float32", "float64"],
        [10_000, 100_000, 1_000_000],
    )
    param_names = ["ufunc", "dtype", "size"]

    def setup(self, ufunc, dtype, size):
        """Build the input array for one parameter combination.

        Values are drawn uniformly from the ufunc's configured
        ``[low, high)`` interval with a fixed seed and cast to ``dtype``.
        """
        cfg = _UFUNC_CONFIGS[ufunc]
        rng = np.random.default_rng(42)
        self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype)
        self._func = cfg["func"]
        # Untimed warm-up call so first-call overhead stays out of the timing.
        self._func(self.x)

    def time_micro(self, ufunc, dtype, size):
        """Timed body: one ufunc call on the pre-allocated array."""
        self._func(self.x)


class BenchArctan2:
    """Time the binary ufunc ``np.arctan2`` per dtype and array size."""

    param_names = ["dtype", "size"]
    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])

    def setup(self, dtype, size):
        """Draw both operands from [-1, 1) with a fixed seed, then warm up."""
        gen = np.random.default_rng(42)
        # y is drawn before x so the random stream is consumed in that order.
        self.y, self.x = (
            gen.uniform(-1.0, 1.0, size).astype(dtype) for _ in range(2)
        )
        np.arctan2(self.y, self.x)

    def time_arctan2(self, dtype, size):
        """Timed body: one ``np.arctan2`` call on the prepared operands."""
        np.arctan2(self.y, self.x)


class BenchPower:
    """Time the binary ufunc ``np.power`` with per-element exponents."""

    param_names = ["dtype", "size"]
    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])

    def setup(self, dtype, size):
        """Draw bases from [0.1, 10) and exponents from [0.5, 3), then warm up."""
        gen = np.random.default_rng(42)
        # Bases are drawn before exponents; casting happens after both draws.
        raw_base = gen.uniform(0.1, 10.0, size)
        raw_exp = gen.uniform(0.5, 3.0, size)
        self.base = raw_base.astype(dtype)
        self.exp = raw_exp.astype(dtype)
        np.power(self.base, self.exp)

    def time_power(self, dtype, size):
        """Timed body: one ``np.power`` call on the prepared operands."""
        np.power(self.base, self.exp)
Empty file.
54 changes: 54 additions & 0 deletions benchmarks/benchmarks/npbench/bench_arc_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt.

Preset sizes from npbench bench_info/arc_distance.json:
M: N=1_000_000
L: N=10_000_000
"""

import numpy as np


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py
def _initialize(N):
    """Return four length-``N`` arrays of uniform [0, 1) samples (seed 42).

    The arrays are drawn consecutively from one generator and returned as
    the tuple ``(t0, p0, t1, p1)``.
    """
    gen = np.random.default_rng(42)
    return tuple(gen.random((N,)) for _ in range(4))


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py
def _arc_distance(theta_1, phi_1, theta_2, phi_2):
    """Evaluate the arc-distance kernel element-wise on the four inputs.

    Computes ``temp = sin((theta_2 - theta_1) / 2)**2
    + cos(theta_1) * cos(theta_2) * sin((phi_2 - phi_1) / 2)**2`` and
    returns ``2 * arctan2(sqrt(temp), sqrt(1 - temp))``.

    NOTE(review): presumably the inputs are angles in radians; confirm
    against the upstream npbench kernel.
    """
    # This is the timed workload, kept exactly as inlined from upstream.
    temp = (
        np.sin((theta_2 - theta_1) / 2) ** 2
        + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2
    )
    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))


# Keyword arguments for _initialize(), keyed by preset label.
_PRESETS = {
    label: {"N": length}
    for label, length in (("M", 1_000_000), ("L", 10_000_000))
}


class BenchArcDistance:
    """Time the arc-distance kernel for the M and L presets."""

    param_names = ["preset"]
    params = (["M", "L"],)
    repeat = 20
    number = 1

    def setup_cache(self):
        """Generate the input arrays of every preset, keyed by preset label."""
        cached = {}
        for label, kwargs in _PRESETS.items():
            cached[label] = _initialize(**kwargs)
        return cached

    def setup(self, cache, preset):
        """Bind the preset's four arrays and run the kernel once untimed."""
        inputs = cache[preset]
        self.theta_1, self.phi_1, self.theta_2, self.phi_2 = inputs
        _arc_distance(*inputs)

    def time_arc_distance(self, cache, preset):
        """Timed body: one kernel evaluation on the bound arrays."""
        _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2)
76 changes: 76 additions & 0 deletions benchmarks/benchmarks/npbench/bench_go_fast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""npbench wrapper: GoFast — mkl_umath ops: tanh.

Preset sizes from npbench bench_info/go_fast.json:
M: N=6_000
L: N=20_000

Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop
(go_fast_loop). A vectorized variant (go_fast_vec) using np.tanh on the
full diagonal is included for direct MKL VM throughput measurement.
"""

import numpy as np


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py
def _initialize(N):
    """Return a 1-tuple holding an ``N x N`` array of uniform [0, 1) samples.

    The generator is seeded with 42 so every call yields the same matrix.
    """
    matrix = np.random.default_rng(42).random((N, N))
    return (matrix,)


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py
def _go_fast(a):
    """Add the sum of ``tanh`` over the diagonal of ``a`` to every element.

    The diagonal is walked one element at a time in a Python loop, calling
    ``np.tanh`` once per element; the result is ``a + trace``.
    """
    trace = 0.0
    # The per-element loop is the workload being timed, so it stays a loop.
    for i in range(a.shape[0]):
        trace += np.tanh(a[i, i])
    return a + trace


# Keyword arguments for _initialize(), keyed by preset label.
_PRESETS = {
    label: {"N": side}
    for label, side in (("M", 6_000), ("L", 20_000))
}


class BenchGoFastLoop:
    """Original npbench kernel: a Python loop calling np.tanh per element."""

    param_names = ["preset"]
    params = (["M", "L"],)
    repeat = 20
    number = 1

    def setup_cache(self):
        """Generate the input matrix of every preset, keyed by preset label."""
        cached = {}
        for label, kwargs in _PRESETS.items():
            cached[label] = _initialize(**kwargs)
        return cached

    def setup(self, cache, preset):
        """Bind the preset's matrix and call np.tanh once untimed."""
        (matrix,) = cache[preset]
        self.a = matrix
        np.tanh(matrix[0, 0])

    def time_go_fast_loop(self, cache, preset):
        """Timed body: one run of the loop kernel on the bound matrix."""
        _go_fast(self.a)


class BenchGoFastVec:
    """Vectorized variant: np.tanh on the full diagonal array at once.

    Only the diagonal is timed, so ``setup_cache`` returns the diagonal of
    each preset matrix rather than the matrix itself. ASV stores the
    ``setup_cache`` return value on disk and reloads it for the benchmark;
    caching the full matrices would mean serializing the ``N x N`` float64
    arrays (about 3.2 GB for preset L) to feed a 20_000-element input.
    """

    params = (["M", "L"],)
    param_names = ["preset"]
    number = 1
    repeat = 20

    def setup_cache(self):
        """Return ``{preset: contiguous diagonal of the preset matrix}``."""
        cached = {}
        for preset, kw in _PRESETS.items():
            (a,) = _initialize(**kw)
            # np.diag gives a view; the copy detaches it so the full matrix
            # can be released before the next preset is generated.
            cached[preset] = np.copy(np.diag(a))
            del a
        return cached

    def setup(self, cache, preset):
        """Bind the cached diagonal and call np.tanh once untimed."""
        self.diag = cache[preset]
        np.tanh(self.diag)

    def time_go_fast_vec(self, cache, preset):
        """Timed body: one vectorized ``np.tanh`` call on the diagonal."""
        np.tanh(self.diag)
Loading
Loading