Source code for rics.strings

"""Utility functions that act on or produce strings."""

import typing as _t



[docs]
def format_bytes(n: int, *, binary: bool = True, long: bool = False, decimals: int = 2) -> str:
    """Format bytes as a string.

    Args:
        n: Number of bytes. Must be positive.
        binary: Output `binary <https://en.wikipedia.org/wiki/Binary_prefix>`_ prefixes if ``True``,
            use `metric (SI) <https://en.wikipedia.org/wiki/Metric_prefix>`_ prefixes otherwise.
        long: Output full unit and prefix if ``True``, use abbreviated versions otherwise.
        decimals: Number of decimals to include. Ignored when `n <= base`.

    Returns:
        Formatted number of bytes.

    Examples:
        **Formatting on prefix bounds**

        The jump is made at `base / 2`, where `base` is one of 1024 and 1000 (when ``binary=False``).

        >>> format_bytes(512 * 1024)
        '512.00 KiB'
        >>> format_bytes(512 * 1024 + 1)
        '0.50 MiB'

        This rule does *not* apply when `n <= base`.

        >>> format_bytes(1024, long=True)
        '1024 bytes'
        >>> format_bytes(1024 + 1)
        '1.00 KiB'

        **Output flags**

        >>> format_bytes(20190511, binary=False, long=False)
        '20.19 MB'
        >>> format_bytes(20190511, binary=False, long=True)
        '20.19 megabytes'
        >>> format_bytes(20190511, binary=True, long=False)
        '19.26 MiB'
        >>> format_bytes(20190511, binary=True, long=True)
        '19.26 mebibytes'

        **Large outputs**

        Metric and binary have different upper limits. Values beyond the largest prefix are expressed as a
        (large) multiple of that prefix.

        >>> format_bytes(21**21, binary=True)
        '4832.87 YiB'
        >>> format_bytes(21**21, binary=True, long=True)
        '4832.87 yobibytes'
        >>> format_bytes(21**21, binary=False)
        '5.84 RB'
        >>> format_bytes(21**21, binary=False, long=True)
        '5.84 ronnabytes'

        If you ever see output like this, please let me know so that I can brag that someone important is using my
        little library.
    """
    base = 1024 if binary else 1000

    if n <= base:
        return f"{n} {'bytes' if long else 'B'}"

    if binary:
        # https://en.wikipedia.org/wiki/Binary_prefix
        if long:
            prefixes = ["kibi", "mebi", "gibi", "tebi", "pebi", "exbi", "zebi", "yobi"]
        else:
            prefixes = ["Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"]
    else:  # noqa: PLR5501  # BUG: this rule doesn't preserve comments: https://github.com/astral-sh/ruff/issues/9790
        # https://en.wikipedia.org/wiki/Metric_prefix
        if long:
            prefixes = ["kilo", "mega", "giga", "tera", "peta", "exa", "zetta", "yotta", "ronna", "quetta"]
        else:
            prefixes = ["k", "M", "G", "T", "P", "E", "Z", "Y", "R", "Q"]

    # Work on 2 * n so that the switch to the next prefix happens at base / 2 rather than at base.
    # The doubling is undone when the value is formatted below.
    doubled: float = n * 2.0
    n_divisions = -1  # n > base here, so the loop runs at least once and this ends up >= 0.
    while doubled > base:
        doubled /= base
        n_divisions += 1

    largest = len(prefixes) - 1
    if n_divisions > largest:
        # Ran out of prefixes: express n in the largest unit. The value must stay doubled, since it is
        # halved on output like every other case (forgetting this used to halve the reported size).
        n_divisions = largest
        doubled = 2 * n / base ** len(prefixes)

    unit = prefixes[n_divisions] + ("bytes" if long else "B")
    return f"{doubled / 2:.{decimals}f} {unit}"




[docs]
def format_perf_counter(start: float, *, end: float | None = None, full: bool = False) -> str:
    """Format the time elapsed since a performance counter reading.

    Computes ``end - start`` and delegates the formatting to :func:`~rics.strings.format_seconds`.

    Args:
        start: Start time.
        end: End time. Retrieved using :py:func:`time.perf_counter` if ``None``.
        full: Forwarded to :func:`~rics.strings.format_seconds`. If ``True``, show all non-zero components
            above four hours.

    Returns:
        A formatted performance counter time.

    Examples:
        Basic usage.

        >>> import time
        >>> start = time.perf_counter()
        >>> time.sleep(1219.0)  # doctest: +SKIP
        >>> format_perf_counter(start)  # doctest: +SKIP
        '20m 19s'

        With no `end` argument given, the current time is retrieved using :py:func:`time.perf_counter`.
    """
    import time

    if end is None:
        end = time.perf_counter()

    elapsed = end - start
    return format_seconds(elapsed, full=full)




[docs]
def format_seconds(t: float, *, allow_negative: bool = False, full: bool = False) -> str:
    """Format a duration given in seconds.

    Durations of at most 60 seconds are shown in (fractions of) seconds; longer durations use clock units
    (days, hours, minutes and seconds).

    Args:
        t: Time in seconds.
        allow_negative: If ``True``, format negative `t` with a leading minus sign.
        full: If ``True``, show all non-zero components above four hours.

    Returns:
        A formatted time.

    Examples:
        Basic usage.

        >>> format_seconds(0.0000154)
        '15 μs'
        >>> format_seconds(0.154)
        '154 ms'
        >>> format_seconds(31.39)
        '31.4 sec'

        Clock units are used for `t > 60` seconds.

        >>> format_seconds(59.99)
        '60.0 sec'
        >>> format_seconds(60.00)
        '60.0 sec'
        >>> format_seconds(60.01)
        '1m'
        >>> format_seconds(309623.49)
        '3d 14h'

        Large intervals are rounded by default. You may set ``full=True`` to show full output.

        >>> format_seconds(309623.49)
        '3d 14h'
        >>> format_seconds(309633.51, full=True)
        '3d 14h 0m 34s'

    Raises:
        ValueError: If ``t < 0`` and ``allow_negative=False`` (the default).

    """
    if t < 0 and not allow_negative:
        # Flip the flag only so that the ``{allow_negative=}`` specifier below renders the value the
        # caller needs to pass in order to get past this check.
        allow_negative = True
        raise ValueError(f"Refuse to format {t=} < 0; to allow, set {allow_negative=}")

    if t < 0:
        # Format the magnitude and put the sign back in front.
        return "-" + format_seconds(abs(t), full=full)

    if t <= 60.0:
        return _format_seconds(t)
    return _format_minutes(t, full)



def _format_minutes(t: float, full: bool) -> str:
    if full or t < 4 * 3600.0:
        total_seconds = round(t)
    else:
        total_seconds = 60 * round(t / 60)  # Drop seconds above four hours

    days, seconds = divmod(total_seconds, 86400)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)

    parts = (days, hours, minutes, seconds)
    nonzero = tuple(p > 0 for p in parts)
    start = nonzero.index(True)
    stop = len(nonzero) - nonzero[::-1].index(True)
    return " ".join(f"{parts[i]}{'dhms'[i]}" for i in range(start, stop))


def _format_seconds(t: float) -> str:
    single_decimal_limit: float = 1.0
    if t >= single_decimal_limit:
        return f"{t:.1f} sec"

    double_decimal_limit: float = 0.5
    if t > double_decimal_limit:
        return f"{t:.2f} sec"

    if t > 10**-3:
        return f"{t * 10**3:.0f} ms"
    if t > 10**-6:  # 1 μs
        return f"{t * 10**6:.0f} μs"
    if t > 10**-9:
        return f"{t * 10**9:.0f} ns"

    return f"{t:.3g} sec"



[docs]
def camel_to_snake(s: str) -> str:
    """Naive ``camelCase`` or ``PascalCase`` to ``snake_case`` conversion.

    An underscore is inserted before every uppercase character except the first character of `s`, after
    which the result is lowercased.

    Args:
        s: A string to convert.

    Returns:
        A ``snake_case`` string.

    Raises:
        IndexError: If `s` is empty.

    Examples:
        Converting camel case strings.

        >>> camel_to_snake("ClassName")
        'class_name'
        >>> camel_to_snake("variableName")
        'variable_name'

        Proper ``snake_case`` strings will not be changed.

        >>> camel_to_snake("already_snake_case")
        'already_snake_case'

    Notes:
        Passing ``SCREAMING_SNAKE_CASE`` strings is **not** supported.
    """
    head, tail = s[0], s[1:]  # The first character never gets a leading underscore.

    pieces = [head]
    pieces.extend(f"_{ch}" if ch.isupper() else ch for ch in tail)
    return "".join(pieces).lower()




[docs]
def snake_to_camel(s: str, *, lower: bool = True) -> str:
    """Naive ``snake_case`` to ``camelCase`` conversion.

    The input is title-cased using :py:meth:`str.title`, after which all underscores are removed.

    Args:
        s: A string to convert.
        lower: If ``False``, return ``PascalCase`` instead of ``camelCase``.

    Returns:
        A ``camelCase`` string.

    Raises:
        IndexError: If `s` is empty, or contains nothing but underscores.

    Examples:
        Converting snake case strings.

        >>> snake_to_camel("snake_case")
        'snakeCase'

        Passing ``SCREAMING_SNAKE_CASE`` strings is supported.

        >>> snake_to_camel("SCREAMING_SNAKE_CASE")
        'screamingSnakeCase'

        Set ``lower=False`` to convert to ``PascalCase`` or ``UpperCamelCase``.

        >>> snake_to_camel("SCREAMING_SNAKE_CASE", lower=False)
        'ScreamingSnakeCase'

    Notes:
        Passing ``camelCase`` strings is **not** supported.
    """
    pascal = s.title().replace("_", "")
    head, tail = pascal[0], pascal[1:]
    return (head.lower() if lower else head) + tail



# Normalized (stripped, lowercase) strings that str_as_bool maps to True and False, respectively.
TRUE = (
    "1",
    "true",
    "yes",
    "on",
    "enable",
    "enabled",
)
FALSE = (
    "0",
    "false",
    "no",
    "off",
    "disable",
    "disabled",
)



[docs]
def str_as_bool(s: str) -> bool:
    """Convert a string `s` to a boolean value.

    The output is determined by the content of `s`, as per the mapping shown below.

    Keys:
        * False: ``{false}``
        * True: ``{true}``

    Matching is case-insensitive.

    Args:
        s: A string.

    Returns:
        A ``bool`` value.

    Raises:
        TypeError: If `s` is not a string.
        ValueError: If `s` cannot be converted to ``bool`` using the keys above.

    Examples:
        Basic usage.

        >>> str_as_bool("true"), str_as_bool("false")
        (True, False)

        The input is cleaned and normalized.

        >>> str_as_bool(" TRUE"), str_as_bool("False")
        (True, False)

        Input strings are normalized using :py:meth:`str.strip` and :py:meth:`str.lower`.

    Notes:
        This differs from ``bool(<str>)``, which is equivalent to ``len(<str>) > 0``.
    """
    if not isinstance(s, str):
        msg = f"Input must be a string; got {type(s).__name__}."
        raise TypeError(msg)

    s = s.strip().lower()
    # FALSE is consulted before TRUE, same as the documented key order.
    for result, keys in ((False, FALSE), (True, TRUE)):
        if s in keys:
            return result

    # No match; the error shows the normalized input, with the accepted keys attached as notes.
    error = ValueError(f"Cannot cast {s!r} to `bool`.")
    for note in (f"{FALSE=}", f"{TRUE=}"):
        error.add_note(note)
    raise error



# Fill the ``{false}`` and ``{true}`` placeholders in the str_as_bool docstring with the actual key
# tuples, so that the documented keys cannot drift from the FALSE and TRUE constants. The guard skips
# this when the docstring is missing (``__doc__`` is ``None`` when docstrings are stripped, e.g. ``-OO``).
if str_as_bool.__doc__:
    str_as_bool.__doc__ = str_as_bool.__doc__.format(false=FALSE, true=TRUE)



[docs]
def format_kwargs(
    kwargs: _t.Mapping[str, _t.Any],
    *,
    max_value_length: int = 120,
    prefix_classname: bool = False,
    include_module: bool = False,
) -> str:
    """Format keyword arguments.

    Args:
        kwargs: Arguments to format.
        max_value_length: Replace value with the class name above this limit. 0=no limit.
        prefix_classname: If ``True``, prepend the class name if a value belongs to a class.
        include_module: If ``True``, prepend the public module (see :func:`.misc.get_public_module`).

    Returns:
        A string on the form `'key0=repr(value0), key1=repr(value1)'`.

    Raises:
        ValueError: For keys in `kwargs` that are not valid identifiers, as determined by
            :py:meth:`str.isidentifier`.

    Examples:
        Basic usage.

        >>> format_kwargs({"an_int": 1, "a_string": "Hello!"})
        "an_int=1, a_string='Hello!'"

    Notes:
        Uses :class:`ReprFormatter` to format values.
    """
    # Validate all keys up front so that the error lists every offending key at once.
    invalid = [key for key in kwargs if not key.isidentifier()]
    if invalid:
        raise ValueError(f"Got {len(invalid)} invalid identifiers: {invalid}.")

    formatter = ReprFormatter(
        max_value_length=max_value_length,
        prefix_classname=prefix_classname,
        include_module=include_module,
    )

    assignments = [f"{name}={formatter.format_value(value)}" for name, value in kwargs.items()]
    return ", ".join(assignments)




[docs]
class ReprFormatter:
    """Alternative :py:func:`repr` implementation.

    Values above `max_value_length` characters are replaced by stylized class names.

    Args:
        max_value_length: Use class name above this length. 0=no limit, -1=force class name.
        prefix_classname: If ``True``, prepend the class name if a value belongs to a class.
        include_module: If ``True``, prepend the public module (see :func:`.misc.get_public_module`).
        module_aliases: A mapping of module replacements, e.g. ``{"pandas": "pd"}``. Default is
            :attr:`DEFAULT_MODULE_ALIASES`. Trailing dots are added automatically. Ignored when `include_module` is
            ``False``.

    See Also:
        The :func:`format_kwargs`, :func:`.misc.tname`, and :func:`.misc.get_public_module` functions.
    """

    DEFAULT_MODULE_ALIASES: _t.Mapping[str, str] = {
        "numpy": "np",
        "pandas": "pd",
        "polars": "pl",
        "tensorflow": "tf",
        "matplotlib.pyplot": "plt",
    }

    def __init__(
        self,
        *,
        max_value_length: int = 120,
        prefix_classname: bool = False,
        include_module: bool = False,
        module_aliases: _t.Mapping[str, str] | None = None,
    ) -> None:
        self._max_value_length = max_value_length

        if module_aliases is None:
            module_aliases = self.DEFAULT_MODULE_ALIASES
        self._module_aliases = {k + ".": v + "." for k, v in module_aliases.items()}

        self._prefix_classname = prefix_classname
        self._include_module = include_module

        self._cache: dict[int, str] = {}


[docs]
    def format_value(self, value: _t.Any) -> str:
        """Convert any value to string."""
        value_id = id(value)
        value_repr = self._cache.get(value_id)
        if value_repr is None:
            value_repr = self._format_value(value)
            self._cache[value_id] = value_repr

        return value_repr


    def _format_value(self, value: _t.Any) -> str:
        """Convert any value to string."""
        if self._max_value_length == 0:
            return self._serialize_as_value(value)
        if self._max_value_length < 0:
            shape = self._get_shape(value)
            return self._serialize_as_class(value, shape)

        for serializer in [
            self._repr_str,
            self._repr_builtin_collection,
            self._format_ndim_array,
        ]:
            value_repr = serializer(value)
            if isinstance(value_repr, str):
                return value_repr
            elif value_repr is False:
                break

        value_repr = self._serialize_as_value(value)
        if len(value_repr) <= self._max_value_length:
            return value_repr

        return self._serialize_as_class(value, ())


[docs]
    def format_ndim_array(self, value: _t.Any) -> str:
        """Format shaped types, e.g. attr:`pandas.DataFrame.shape`."""
        shape = self._get_shape(value)
        if shape:
            return self._serialize_as_class(value, shape)

        msg = f"{type(value).__name__}.shape={shape} not valid"
        raise TypeError(msg)


    def _format_ndim_array(self, value: _t.Any) -> str | None:
        if shape := self._get_shape(value):
            return self._serialize_as_class(value, shape)
        return None

    @classmethod
    def _serialize_as_value(cls, value: _t.Any) -> str:
        from pprint import PrettyPrinter

        pp = PrettyPrinter(
            indent=2,
            width=120,
            depth=4,
            compact=True,
            sort_dicts=True,
            underscore_numbers=True,
        )
        return pp.pformat(value)

    def _serialize_as_class(self, value: _t.Any, shape: tuple[int, ...]) -> str:
        from rics.misc import tname

        value_cls = tname(value, prefix_classname=self._prefix_classname, include_module=self._include_module)

        if self._include_module:
            for module, alias in self._module_aliases.items():
                if value_cls.startswith(module):
                    value_cls = value_cls.replace(module, alias)

        if not shape:
            return value_cls

        dims = "x".join(map(str, shape))
        return f"{value_cls}[{dims}]"

    def _repr_str(self, value: _t.Any) -> str | bool:
        if isinstance(value, str):
            sz = len(value)
            if sz > self._max_value_length:
                return f"str[{sz}]"
            else:
                return repr(value)  # Might be longer than max_value_length if there's a lot of escaping.

        return True

    def _repr_builtin_collection(self, value: _t.Any) -> str | bool:
        if isinstance(value, (list, tuple, set)):
            if len(value) * 3 > self._max_value_length:
                shape = (len(value),)
                return self._serialize_as_class(value, shape)
            else:
                return False

        if isinstance(value, dict):
            if len(value) * 6 > self._max_value_length:
                shape = (len(value),)
                return self._serialize_as_class(value, shape)
            else:
                return False

        return True

    @classmethod
    def _get_shape(cls, value: _t.Any) -> tuple[int, ...]:
        if hasattr(value, "shape") and isinstance(value.shape, tuple):
            return value.shape
        elif hasattr(value, "__len__"):
            return (len(value),)
        return ()