Source code for rics.performance._multi_case_timer

import functools
import logging
import warnings
from collections.abc import Collection, Hashable, Mapping
from time import perf_counter
from timeit import Timer
from typing import Any, ClassVar, Generic, Self, TypeAlias

from rics.logs import LoggerArg, get_logger
from rics.misc import tname
from rics.strings import format_perf_counter as fmt_perf
from rics.strings import format_seconds as fmt_time

from ._autonumber import compute_candidate_numbers
from ._generated_data import GeneratedData
from ._progress import make_progress
from ._skip_if import SkipIfFunc, SkipIfParams
from ._strata import _AUTO_PROBE_SECONDS, Strata, estimate_label_costs, make_strata
from .types import CandFunc, DataFunc, DataType, ResultsDict, SetupFunc, StratifyArg, Ts

# Floor on the worst per-call timing (as reported by ``timeit``, i.e. seconds) below which ``MultiCaseTimer.run``
# never emits its "results may be unreliable" warning, no matter how large the worst/best ratio is.
UNRELIABLE_RESULTS_LIMIT = 1e-6  # Prevent spurious "4x" warnings.

# Accepted forms of `candidate_method`: a ``{label: function}`` mapping, a collection of functions, or one function.
CandidateMethodArg: TypeAlias = Mapping[str, CandFunc[DataType]] | Collection[CandFunc[DataType]] | CandFunc[DataType]
# Accepted forms of non-callable `test_data`: a ``{label: data}`` mapping, or a plain collection of data.
TestDataArg: TypeAlias = Mapping[Any, DataType] | Collection[DataType]



[docs]
class MultiCaseTimer(Generic[DataType, *Ts]):
    """Performance testing implementation for multiple candidates and data sets.

    Test data:
        * Typically a dict ``{label: data}`` to evaluate candidates.

          * Other collections are converted to ``dict`` using :meth:`process_test_data`. String label will then be based
            on sample data.
          * Labels may also be ``tuple``. This may then be used to plot different categories of data in different
            facets; see the :func:`.plot_run` function with the `names` argument.
          * For non-dict inputs, string labels will be generated automatically.

        * If `test_data` is :py:func:`callable`, test data will be generated from the `case_args`.

          * The `case_args` will be passed as positional arguments.
          * The `case_args` will be used as the output labels when using :meth:`run` (similar to the ``setup`` option
            provided by the built-in :py:mod:`timeit` module).

        Data access time is *not* measured by the ``run`` method.

    Timing model:
        :meth:`run` derives a single iteration ``number`` **per candidate** (shared across all test-data variants, so
        candidates stay comparable), calibrated so each repetition takes about ``time_per_candidate``. The total
        runtime is therefore approximately ``repeat * time_per_candidate * n_candidates``: adding more test-data
        variants does *not* increase it, it divides the per-candidate budget across the variants. Pass ``number``
        explicitly to bypass calibration.

        When variants differ wildly in cost (e.g. tiny and huge inputs in one run), the shared ``number`` is driven
        by the slowest variants, leaving the fast ones under-sampled and noisy. Stratification (below) fixes this.

    Stratification:
        The first ``run(stratify="auto")`` probes once and caches the resulting :class:`.Strata` on the instance, so
        later runs reuse it. Use :meth:`compute_strata` to derive a grouping without side effects -- to inspect what
        ``"auto"`` chose or share it across timers -- or :meth:`fit_strata` to derive *and* cache it (e.g. with a
        tuned probe). Either result, or any mapping, can be passed back as ``run(stratify=...)``.

    Args:
        candidate_method: A dict ``{label: function}``. Alternatively, you may pass a collection of functions or a
            single function.
        test_data: A ``{label: data}`` to evaluate candidates on. You may also pass a list of data, which will be
            converted to a dict as above. Data may also be generated by passing a callable.
        case_args: Collection of positional arguments for a `test_data` callable.
        kwargs: Shared keyword arguments for a `test_data` callable.
        setup: A callable ``(data) -> data`` invoked -- **not** measured -- before each timed repetition to produce a
            fresh input (mirrors :py:class:`timeit.Timer`'s ``setup``). Use for candidates that mutate their input, or
            to reset shared state (e.g. caches) between repetitions.
        warmup: Number of untimed calls per candidate/data pair before timing begins (warms caches/JIT/imports).
        logger: Logger instance to use.

    Raises:
        TypeError: If `case_args` or `kwargs` are set when `test_data` is not a callable.
        ValueError: If `case_args` is empty and `test_data` is a callable.
    """

    LOGGER: ClassVar[logging.Logger | logging.LoggerAdapter[Any]] = logging.getLogger(__package__)
    """Class logger instance."""  # TODO(7.0.0): Remove.

    def __init__(
        self,
        candidate_method: CandidateMethodArg[DataType],
        test_data: TestDataArg[DataType] | DataFunc[*Ts, DataType],  # DataFunc[DataFuncP, DataType]
        *,
        case_args: Collection[tuple[*Ts]] | None = None,
        kwargs: Any | None = None,
        setup: SetupFunc[DataType] | None = None,
        warmup: int = 0,
        logger: LoggerArg | None = None,  # TODO(7.0.0): None -> True (since None=disabled)
    ) -> None:
        if logger is None:
            self._logger = self.LOGGER  # Legacy behavior.
        else:
            self._logger = get_logger(logger)

        self._candidates = self.process_candidates(candidate_method)

        self._data: dict[Hashable, DataType] | GeneratedData[DataType, *Ts]
        if callable(test_data):
            self._data = GeneratedData(test_data, case_args, kwargs, self._logger)
        else:
            if case_args or kwargs:
                msg = "Cannot pass `case_args` or `kwargs` when `test_data` is not a callable."
                raise TypeError(msg)

            self._data = self.process_test_data(test_data)

        self._setup = setup
        self._warmup = warmup
        self._strata: Strata | None = None  # Lazily fit + cached on the first run(stratify="auto").


[docs]
    @classmethod
    def process_candidates(cls, candidates: CandidateMethodArg[DataType]) -> dict[str, CandFunc[DataType]]:
        """Convert input candidates to the internal format."""
        rv = cls._process_candidates(candidates)
        if rv:
            return rv
        raise ValueError("No candidates given.")  # pragma: no cover



[docs]
    @classmethod
    def process_test_data(cls, test_data: TestDataArg[DataType]) -> dict[Hashable, DataType]:
        """Convert input test data to the internal format."""
        rv = {**test_data} if isinstance(test_data, Mapping) else cls._dict_from_collection(test_data)
        if rv:
            return rv
        raise ValueError("No case data given.")  # pragma: no cover



[docs]
    def derive_names(self) -> list[str]:
        """Derive the names argument from generated test data.

        Delegates to the :class:`GeneratedData` wrapper that backs a callable `test_data`.

        Raises:
            TypeError: If `test_data` is not callable.
        """
        generated = self._data
        if isinstance(generated, GeneratedData):
            return generated.derive_names()

        raise TypeError("Cannot derive names without callable `test_data`.")


    @property
    def is_data_generated(self) -> bool:
        """Whether the test data comes from a callable `test_data` (``True``) or from static data (``False``)."""
        data = self._data
        return isinstance(data, GeneratedData)


[docs]
    def compute_strata(
        self,
        stratify: StratifyArg = "auto",
        *,
        min_probe_time: float = _AUTO_PROBE_SECONDS,
        skip_if: SkipIfFunc[DataType, *Ts] | None = None,
    ) -> Strata:
        """Derive a :class:`Strata` grouping for this timer's candidates and data, without side effects.

        Valid ``stratify`` input types:
            * A callable ``(data_label) -> stratum_key``.
            * An ``int`` ``case_args`` level (group by ``case_args[level]``).
            * Literal ``"full"`` -- one stratum per variant.
            * ``"auto"`` -- derive a single ``case_args`` level automatically; see below.
            * A precomputed ``{stratum_key: {data_label, ...}}`` mapping.

        Automatic stratification:
            For ``stratify="auto"`` a quick timing probe measures each variant's cost, then the single ``case_args``
            level whose strata best cluster variants of *comparable* cost is chosen -- formally, the level minimizing
            the worst within-stratum cost ratio (usually the input size/cost dimension).

            The probe is deliberately cheap; increase `min_probe_time` to increase accuracy.

        Use :meth:`fit_strata` to cache the result for later ``run(stratify="auto")`` calls, or set the
        :attr:`.MultiCaseTimer.strata` property.

        Args:
            stratify: Any :data:`.StratifyArg`. A :class:`Strata` is returned unchanged; any other mapping is wrapped
                (and validated to cover the data).
            min_probe_time: Per ``(candidate, variant)`` budget for the ``"auto"`` probe; larger is less noisy but
                slower. Ignored unless `stratify` is ``"auto"``.
            skip_if: Filter applied while probing (``"auto"`` only); recorded on the result.

        Returns:
            The grouping.
        """
        if isinstance(stratify, Strata):
            return stratify

        cost = None
        if stratify == "auto":
            cost = estimate_label_costs(
                self._candidates,
                self._data,
                skip_if=skip_if,
                make_timer=self._new_timer,
                logger=self._logger,
                min_probe_time=min_probe_time,
            )

        return make_strata(self._data, stratify, cost=cost, skip_if=skip_if)


    @property
    def strata(self) -> Strata:
        """Cached :class:`Strata` instance; see :meth:`fit_strata`."""
        if self._strata is None:
            raise RuntimeError("not fitted")
        return self._strata

    @strata.setter
    def strata(self, value: Strata | None) -> None:
        if value is not None and not isinstance(value, Strata):
            raise TypeError(f"expected {Strata.__name__} or None, got {type(value).__name__}")
        self._strata = value


[docs]
    def fit_strata(
        self,
        stratify: StratifyArg = "auto",
        *,
        min_probe_time: float = _AUTO_PROBE_SECONDS,
        skip_if: SkipIfFunc[DataType, *Ts] | None = None,
    ) -> Self:
        """:meth:`compute_strata`, then cache the result so later ``run(stratify="auto")`` calls reuse it.

        This is how :meth:`run` memoizes its first implicit ``"auto"`` fit; call it yourself to control the probe
        (`min_probe_time`) or to pin a grouping before running. Any previously cached grouping is overwritten
        silently. See :meth:`compute_strata` for the arguments and how ``"auto"`` is derived.

        Use :attr:`strata` to access the cached instance.

        Returns:
            Self, for chained assignment.
        """
        start = perf_counter()
        strata = self.compute_strata(stratify, min_probe_time=min_probe_time, skip_if=skip_if)
        self.strata = strata
        self._logger.info(f"Cached {strata!r} in {fmt_perf(start)}; subsequent run(stratify='auto') will reuse it.")
        return self


    def _resolve_strata(
        self,
        stratify: StratifyArg,
        *,
        skip_if: SkipIfFunc[DataType, *Ts] | None,
        number: int | None,
    ) -> Strata:
        if isinstance(stratify, Strata):
            uncovered = [label for label in self._data if label not in stratify.labels]
            if uncovered:
                raise ValueError(f"Reused strata does not cover every data label; missing: {uncovered}.")
            if stratify.skip_if is not skip_if:
                warnings.warn(
                    f"Reusing strata fit with skip_if={stratify.skip_if!r} under a run with skip_if={skip_if!r}; "
                    "the grouping is kept as-is (it depends only on the data labels, not on skip_if).",
                    UserWarning,
                    stacklevel=3,
                )
            return stratify

        if stratify == "auto" and number is None:
            # Probe once, then reuse across runs; an explicit `number` makes grouping moot, so fall through instead.
            if self._strata is None:
                self.fit_strata("auto", skip_if=skip_if)
            return self.strata

        # No probe needed: None/full/int/callable, or any stratify when `number` makes the grouping moot.
        return make_strata(self._data, stratify, skip_if=skip_if)

    def _new_timer(self, func: CandFunc[DataType], data: DataType) -> Timer:
        return self._make_timer(func, data, self._setup)


[docs]
    def run(
        self,
        *,
        time_per_candidate: float = 6.0,
        repeat: int = 5,
        number: int | None = None,
        stratify: StratifyArg = None,
        skip_if: SkipIfFunc[DataType, *Ts] | None = None,
        progress: bool = False,
    ) -> ResultsDict:
        """Run for all cases.

        Args:
            time_per_candidate: Minimum runtime per repetition and candidate label. When `stratify` is set this budget
                applies **per** ``(candidate, stratum)`` **instead**, so total runtime scales with the number of strata.
                Ignored if `number` is set.
            repeat: Number of times to repeat for all candidates per data label.
            number: Number of times to execute each candidate, per repetition.
            stratify: Groups variants of comparable cost so that ``number`` is calibrated once per ``(candidate,
                stratum)`` instead of once per candidate function. Using ``"auto"`` implicitly calls :meth:`fit_strata`
                the first time. Set to ``None`` to disable.
            skip_if: A callable ``(skip_if) -> bool``; see the :class:`params <SkipIfParams>` type.
            progress: If ``True``, display progress. Uses ``tqdm`` on a TTY and falls back to periodic logging
                otherwise (so ``tqdm`` is optional).

        Examples:
            If `repeat=5` and `time_per_candidate=3` for an instance with 2 candidates, the total runtime will be
            approximately ``5 * 3 * 2 = 30`` seconds -- regardless of how many test-data variants are used (unless
            `stratify` is set).

        Returns:
            A dict `run_results` on the form ``{candidate_label: {data_label: [runtime, ...]}}``.

        Notes:
            * Calibration is inaccurate for candidates where a single call already exceeds `time_per_candidate`; the
              derived ``number`` then bottoms out at 1.

        See Also:
            The :py:class:`timeit.Timer` class which this implementation depends on.

        """
        logger = self._logger
        n_cand = len(self._candidates)
        n_data = len(self._data)
        total = n_cand * n_data

        logger.debug("Begin evaluating %i combinations: %i candidates and %i test cases.", total, n_cand, n_data)

        strata = self._resolve_strata(stratify, skip_if=skip_if, number=number)

        candidate_to_stratum_to_number = compute_candidate_numbers(
            self._candidates,
            self._data,
            strata,
            number=number,
            repeat=repeat,
            time_allocation=time_per_candidate,
            skip_if=skip_if,
            make_timer=self._new_timer,
            progress=progress,
            logger=logger,
        )

        pbar = make_progress(total, enabled=progress, logger=logger)

        i = 0
        run_results: ResultsDict = {}
        for candidate_label, func in self._candidates.items():
            by_stratum = candidate_to_stratum_to_number[candidate_label]
            if by_stratum is None:
                continue

            run_results[candidate_label] = candidate_results = {}

            iters = ", ".join(f"{repeat}x{n}" for n, _ in by_stratum.values())
            logger.info(f"Evaluate candidate {candidate_label!r} {iters} times per datum..")
            for data_label, test_data in self._data.items():
                i += 1
                pbar.set_description(f"{candidate_label}({data_label})")

                entry = by_stratum.get(strata.stratum_of(data_label))
                candidate_number = None if entry is None else entry[0]
                candidate_est_time = None if entry is None else entry[1]

                if skip_if:
                    skip_if_params: SkipIfParams[DataType, *Ts] = SkipIfParams(
                        candidate=func,
                        candidate_label=candidate_label,
                        data=test_data,
                        data_label=data_label,
                        est_time=None if candidate_est_time is None else candidate_est_time * repeat,
                        results_so_far=run_results,
                    )

                    if skip_if(skip_if_params):
                        pbar.update()
                        logger.debug(f"Skip combination {i}/{total}: {candidate_label!r} @ {data_label!r}.")
                        continue

                if candidate_number is None:
                    # The whole stratum was skip_if-filtered during calibration, so no number was derived.
                    pbar.update()
                    continue

                logger.debug(f"Start evaluating combination {i}/{total}: {candidate_label!r} @ {data_label!r}.")

                raw_timings = self._get_raw_timings(
                    func,
                    test_data,
                    repeat,
                    candidate_number,
                    setup=self._setup,
                    warmup=self._warmup,
                )

                timings = [dt / candidate_number for dt in raw_timings]

                # Same heuristic as the IPython cell magic.
                best = min(timings)
                worst = max(timings)
                if best > 0 and worst >= best * 4 and worst > UNRELIABLE_RESULTS_LIMIT:
                    t = (candidate_label, data_label)
                    warnings.warn(
                        f"Results may be unreliable for {t}. The worst time {fmt_time(worst)} "
                        f"was ~{worst / best:.1f} times slower than the best time ({fmt_time(best)}).",
                        UserWarning,
                        stacklevel=1,
                    )

                candidate_results[data_label] = timings
                pbar.update()

        pbar.close()
        return run_results


    @classmethod
    def _get_raw_timings(
        cls,
        func: CandFunc[DataType],
        test_data: DataType,
        repeat: int,
        number: int,
        *,
        setup: SetupFunc[DataType] | None = None,
        warmup: int = 0,
    ) -> list[float]:
        """Exists so that it can be overridden for testing."""
        timer = cls._make_timer(func, test_data, setup)
        for _ in range(warmup):
            timer.timeit(1)
        return timer.repeat(repeat, number)

    @staticmethod
    def _make_timer(func: CandFunc[DataType], test_data: DataType, setup: SetupFunc[DataType] | None) -> Timer:
        """Build a :class:`timeit.Timer`. With `setup`, fresh input is produced (unmeasured) before each repetition."""
        if setup is None:
            return Timer(functools.partial(func, test_data))

        holder: dict[str, DataType] = {}

        def _setup() -> None:
            holder["data"] = setup(test_data)

        def _stmt() -> None:
            func(holder["data"])

        return Timer(_stmt, _setup)

    @staticmethod
    def _process_candidates(candidates: CandidateMethodArg[DataType]) -> dict[str, CandFunc[DataType]]:
        if isinstance(candidates, Mapping):
            return {**candidates}
        if callable(candidates):
            return {tname(candidates, prefix_classname=True): candidates}

        def make_label(a: Any) -> str:
            name = tname(a, prefix_classname=True)
            return name.removeprefix("candidate_")

        labeled_candidates = {make_label(c): c for c in candidates}
        if len(labeled_candidates) != len(candidates):
            raise ValueError(
                f"Derived names for input {candidates=} are not unique. Use a dict to assign candidate names.",
            )
        return labeled_candidates

    @staticmethod
    def _dict_from_collection(test_data: Collection[DataType]) -> dict[Hashable, DataType]:
        result: dict[Hashable, DataType] = {}
        for data in test_data:
            s = str(data)

            if isinstance(data, (bool, float, int, str, tuple)):
                key = s
            else:
                key = f"{s[:29]}..." if len(s) > 32 else s  # noqa: PLR2004
                key = f"Sample data: '{key}'"
            result[key] = data
        return result