IntelPython · vchamarthi · May 4, 2026 · May 11, 2026 · May 15, 2026 · May 18, 2026
@@ -0,0 +1,52 @@
+# mkl_umath ASV Benchmarks
+
+Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).
+
+The `micro/` suite times individual ufuncs in isolation across dtypes and array sizes. The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure the end-to-end impact of MKL ufunc acceleration in realistic workloads.
+
+## Coverage
+
+| File | Ufuncs | Dtypes | Sizes/Presets |
+|------|--------|--------|---------------|
+| `micro/bench_micro.py` | 25 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M |
+| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) |
+| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) |
+| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) |
+| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) |
+
+## Running Benchmarks
+
+Prerequisites:
+
+```bash
+pip install asv psutil
+```
+
+Run benchmarks against the current commit:
+
+```bash
+asv run --python=same --quick HEAD^!
+```
+
+Compare two commits:
+
+```bash
+asv continuous --python=same HEAD~1 HEAD
+```
+
+View results in a browser:
+
+```bash
+asv publish
+asv preview
+```
+
+## Threading
+
+Set `MKL_NUM_THREADS` to control the thread count used by MKL:
+
+```bash
+MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
+```
+
+If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).
@@ -0,0 +1,20 @@
+{
+    "version": 1,
+    "project": "mkl_umath",
+    "project_url": "https://github.com/IntelPython/mkl_umath",
+    "repo": "..",
+    "branches": [
+        "main"
+    ],
+    "environment_type": "existing",
+    "benchmark_dir": "benchmarks",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html",
+    "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/",
+    "build_cache_size": 2,
+    "default_benchmark_timeout": 1500,
+    "regressions_thresholds": {
+        ".*": 0.2
+    }
+}
@@ -0,0 +1,26 @@
+"""ASV benchmarks for mkl_umath"""
+
+import os
+
+import psutil
+
+from ._patch_setup import _apply_patches
+
+# Physical-core threshold for multi-threaded mode.  The same value doubles as
+# the default MKL thread count once the threshold is met (see _thread_count).
+_MIN_THREADS = 4  # minimum physical cores required for multi-threaded mode
+
+
+def _physical_cores():
+    """Return physical core count; fall back to 1 (conservative)."""
+    return psutil.cpu_count(logical=False) or 1
+
+
+def _thread_count():
+    physical = _physical_cores()
+    return str(_MIN_THREADS) if physical >= _MIN_THREADS else "1"
+
+
+_THREADS = os.environ.get("MKL_NUM_THREADS", _thread_count())
+os.environ["MKL_NUM_THREADS"] = _THREADS
+
+_apply_patches()
+del _apply_patches
@@ -0,0 +1,65 @@
+"""MKL patch setup — executed once per ASV worker process at import time.
+
+Patches NumPy with the Intel MKL umath implementation.
+Hard-fails with a descriptive RuntimeError if mkl_umath is missing or the
+patch does not take effect, so benchmarks never silently run on stock NumPy.
+"""
+
+# (module name, patch function name) pairs.  _apply_patches() imports each
+# module and calls the named function on it.
+_PATCH_MAP = [
+    ("mkl_umath", "patch_numpy_umath"),
+]
+
+
+def _apply_patches():
+    import numpy as np
+
+    patched = {}
+
+    for mod_name, patch_fn_name in _PATCH_MAP:
+        try:
+            mod = __import__(mod_name)
+        except ImportError as exc:
+            raise RuntimeError(
+                f"[mkl-patch] Cannot import {mod_name}: {exc}\n"
+                f"  Ensure the conda env contains {mod_name} "
+                f"from the Intel channel.\n"
+                "  Required channels: "
+                "https://software.repos.intel.com/python/conda"
+            ) from exc
+
+        patch_fn = getattr(mod, patch_fn_name, None)
+        if patch_fn is None:
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name} has no {patch_fn_name}(). "
+                f"Upgrade {mod_name} to a version that exposes "
+                "the stock-numpy patch API."
+            )
+
+        try:
+            patch_fn()
+        except Exception as exc:
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name}.{patch_fn_name}() raised: {exc!r}"
+            ) from exc
+
+        is_patched_fn = getattr(mod, "is_patched", None)
+        if callable(is_patched_fn) and not is_patched_fn():
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name}.is_patched() returned False "
+                "after patching. NumPy may have been imported before "
+                "patching in a conflicting state."
+            )
+
+        patched[mod_name] = mod
+
+    _attr_checks = {
+        "mkl_umath": lambda: np.exp.__module__,
+    }
+    for mod_name in patched:
+        try:
+            attr = _attr_checks[mod_name]()
+        except Exception:
+            attr = "unknown"
+        print(f"[mkl-patch] {mod_name}: numpy dispatch -> {attr}")
+
+    print("[mkl-patch] ALL OK -- mkl_umath active")
@@ -0,0 +1,90 @@
+"""Micro-benchmarks for mkl_umath unary and binary ufuncs.
+
+Times each ufunc over a Cartesian product of
+  dtype  in [float32, float64]
+  size   in [10_000, 100_000, 1_000_000]
+
+Arrays are pre-allocated in setup() and reused across timing calls.
+Patching is applied once at package import via benchmarks._patch_setup.
+"""
+
+import numpy as np
+
+# Per-ufunc benchmark configuration, keyed by the name used as the ASV
+# "ufunc" parameter.  "func" is the NumPy function to time; "low"/"high" are
+# the bounds passed to rng.uniform() in BenchMicro.setup().
+# NOTE(review): the bounds look chosen to keep inputs inside each function's
+# real domain and away from overflow -- confirm if a range is changed.
+_UFUNC_CONFIGS = {
+    "exp": {"func": np.exp, "low": -10.0, "high": 10.0},
+    "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0},
+    "expm1": {"func": np.expm1, "low": -10.0, "high": 10.0},
+    "log": {"func": np.log, "low": 1e-3, "high": 1e3},
+    "log2": {"func": np.log2, "low": 1e-3, "high": 1e3},
+    "log10": {"func": np.log10, "low": 1e-3, "high": 1e3},
+    "log1p": {"func": np.log1p, "low": 0.0, "high": 10.0},
+    "sin": {"func": np.sin, "low": -np.pi, "high": np.pi},
+    "cos": {"func": np.cos, "low": -np.pi, "high": np.pi},
+    "tan": {"func": np.tan, "low": -1.4, "high": 1.4},
+    "arcsin": {"func": np.arcsin, "low": -1.0, "high": 1.0},
+    "arccos": {"func": np.arccos, "low": -1.0, "high": 1.0},
+    "arctan": {"func": np.arctan, "low": -10.0, "high": 10.0},
+    "sinh": {"func": np.sinh, "low": -5.0, "high": 5.0},
+    "cosh": {"func": np.cosh, "low": -5.0, "high": 5.0},
+    "tanh": {"func": np.tanh, "low": -5.0, "high": 5.0},
+    "arcsinh": {"func": np.arcsinh, "low": -10.0, "high": 10.0},
+    "arccosh": {"func": np.arccosh, "low": 1.0, "high": 100.0},
+    "arctanh": {"func": np.arctanh, "low": -0.99, "high": 0.99},
+    "sqrt": {"func": np.sqrt, "low": 0.0, "high": 100.0},
+    "cbrt": {"func": np.cbrt, "low": -100.0, "high": 100.0},
+    "square": {"func": np.square, "low": -10.0, "high": 10.0},
+    "fabs": {"func": np.fabs, "low": -100.0, "high": 100.0},
+    "absolute": {"func": np.absolute, "low": -100.0, "high": 100.0},
+    "reciprocal": {"func": np.reciprocal, "low": 0.01, "high": 100.0},
+}
+
+
+class BenchMicro:
+    """Time every unary ufunc in ``_UFUNC_CONFIGS`` per dtype and size."""
+
+    # Parameter grid: ufunc name (sorted for a stable order) x dtype x
+    # array length.
+    params = (
+        sorted(_UFUNC_CONFIGS.keys()),
+        ["float32", "float64"],
+        [10_000, 100_000, 1_000_000],
+    )
+    param_names = ["ufunc", "dtype", "size"]
+
+    def setup(self, ufunc, dtype, size):
+        """Build the input array for one parameter combination.
+
+        The array is drawn from a fixed-seed generator over the ufunc's
+        configured range and cast to ``dtype``.
+        """
+        cfg = _UFUNC_CONFIGS[ufunc]
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype)
+        self._func = cfg["func"]
+        # Untimed warm-up call before the measured run.
+        self._func(self.x)
+
+    def time_micro(self, ufunc, dtype, size):
+        """Timed body: one call of the selected ufunc on the prepared array."""
+        self._func(self.x)
+
+
+class BenchArctan2:
+    """Binary ufunc arctan2, timed per dtype and array length."""
+
+    # Parameter grid: dtype x array length.
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        """Draw both operands from [-1, 1) with a fixed seed, cast to dtype."""
+        rng = np.random.default_rng(42)
+        self.y = rng.uniform(-1.0, 1.0, size).astype(dtype)
+        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
+        # Untimed warm-up call before the measured run.
+        np.arctan2(self.y, self.x)
+
+    def time_arctan2(self, dtype, size):
+        """Timed body: one ``np.arctan2`` call over the prepared operands."""
+        np.arctan2(self.y, self.x)
+
+
+class BenchPower:
+    """Binary ufunc power (arbitrary exponent via MKL vdPow)
+
+    Both the base and the exponent are arrays, so every element has its own
+    non-integer exponent.
+
+    NOTE(review): ``vdPow`` is the double-precision routine name; the float32
+    parameter presumably dispatches to the single-precision counterpart --
+    confirm against the mkl_umath sources.
+    """
+
+    # Parameter grid: dtype x array length.
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        """Draw bases from [0.1, 10) and exponents from [0.5, 3), fixed seed."""
+        rng = np.random.default_rng(42)
+        self.base = rng.uniform(0.1, 10.0, size).astype(dtype)
+        self.exp = rng.uniform(0.5, 3.0, size).astype(dtype)
+        # Untimed warm-up call before the measured run.
+        np.power(self.base, self.exp)
+
+    def time_power(self, dtype, size):
+        """Timed body: one ``np.power`` call over the prepared operands."""
+        np.power(self.base, self.exp)
@@ -0,0 +1,54 @@
+"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt.
+
+Preset sizes from npbench bench_info/arc_distance.json:
+  M: N=1_000_000
+  L: N=10_000_000
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py
+def _initialize(N):
+    """Return four length-``N`` arrays of uniform [0, 1) samples.
+
+    The arrays are drawn in the order ``t0, p0, t1, p1`` from a generator
+    seeded with 42, so repeated calls produce identical data.
+    """
+    from numpy.random import default_rng
+
+    rng = default_rng(42)
+    t0 = rng.random((N,))
+    p0 = rng.random((N,))
+    t1 = rng.random((N,))
+    p1 = rng.random((N,))
+    return t0, p0, t1, p1
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py
+def _arc_distance(theta_1, phi_1, theta_2, phi_2):
+    """Evaluate the npbench arc-distance kernel element-wise.
+
+    Computes ``temp = sin((theta_2 - theta_1) / 2)**2 + cos(theta_1) *
+    cos(theta_2) * sin((phi_2 - phi_1) / 2)**2`` and returns
+    ``2 * arctan2(sqrt(temp), sqrt(1 - temp))``.
+
+    Kept statement-for-statement as in the npbench source so the measured
+    work matches the upstream benchmark.
+    """
+    temp = (
+        np.sin((theta_2 - theta_1) / 2) ** 2
+        + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2
+    )
+    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
+
+
+# Preset name -> keyword arguments for _initialize(); N is the length of each
+# input array.
+_PRESETS = {
+    "M": {"N": 1_000_000},
+    "L": {"N": 10_000_000},
+}
+
+
+class BenchArcDistance:
+    """Time the arc-distance kernel for the M and L presets."""
+
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    # ASV timing attributes: one kernel call per sample, 20 samples.
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        """Generate the input arrays for every preset, keyed by preset name.
+
+        The returned mapping is what ``setup`` and the timed method receive
+        as their ``cache`` argument.
+        """
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        """Select the preset's four arrays and run one untimed warm-up."""
+        self.theta_1, self.phi_1, self.theta_2, self.phi_2 = cache[preset]
+        _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2)
+
+    def time_arc_distance(self, cache, preset):
+        """Timed body: one full evaluation of the arc-distance kernel."""
+        _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2)
@@ -0,0 +1,76 @@
+"""npbench wrapper: GoFast — mkl_umath ops: tanh.
+
+Preset sizes from npbench bench_info/go_fast.json:
+  M: N=6_000
+  L: N=20_000
+
+Note: the npbench ``go_fast`` kernel iterates over the diagonal elements in a
+Python loop (go_fast_loop).  A vectorized variant (go_fast_vec) that applies
+np.tanh to the full diagonal in one call is included for direct MKL VM
+throughput measurement.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py
+def _initialize(N):
+    """Return a 1-tuple holding an ``N`` x ``N`` array of uniform [0, 1) samples.
+
+    The generator is seeded with 42, so repeated calls produce identical
+    data.
+    """
+    from numpy.random import default_rng
+
+    rng = default_rng(42)
+    a = rng.random((N, N))
+    return (a,)
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py
+def _go_fast(a):
+    """Return ``a`` plus the sum of ``tanh`` over its main diagonal.
+
+    The diagonal is walked one element at a time in a Python loop, so
+    ``np.tanh`` is called once per row on a scalar.  Kept as in the npbench
+    source so the measured work matches the upstream benchmark.
+    """
+    trace = 0.0
+    for i in range(a.shape[0]):
+        trace += np.tanh(a[i, i])
+    # Adding the scalar produces a new array with the same shape as ``a``.
+    return a + trace
+
+
+# Preset name -> keyword arguments for _initialize(); N is the side length of
+# the square input matrix.
+_PRESETS = {
+    "M": {"N": 6_000},
+    "L": {"N": 20_000},
+}
+
+
+class BenchGoFastLoop:
+    """Original npbench kernel — Python loop calling np.tanh per element."""
+
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    # ASV timing attributes: one kernel call per sample, 20 samples.
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        """Generate the input matrix for every preset, keyed by preset name.
+
+        The returned mapping is what ``setup`` and the timed method receive
+        as their ``cache`` argument.
+        """
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        """Select the preset's matrix and warm up ``np.tanh`` on one scalar."""
+        (self.a,) = cache[preset]
+        np.tanh(self.a[0, 0])
+
+    def time_go_fast_loop(self, cache, preset):
+        """Timed body: one full run of the loop-based ``_go_fast`` kernel."""
+        _go_fast(self.a)
+
+
+class BenchGoFastVec:
+    """Vectorized variant — np.tanh on the full diagonal array at once."""
+
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.a,) = cache[preset]
+        self.diag = np.copy(np.diag(self.a))
+        np.tanh(self.diag)
+
+    def time_go_fast_vec(self, cache, preset):
+        np.tanh(self.diag)