Source code for rics.strings
"""Utility functions that act on or produce strings."""
import typing as _t
[docs]
def format_bytes(n: int, *, binary: bool = True, long: bool = False, decimals: int = 2) -> str:
    """Format bytes as a string.

    Args:
        n: Number of bytes. Must be positive.
        binary: Output `binary <https://en.wikipedia.org/wiki/Binary_prefix>`_ prefixes if ``True``,
            use `metric (SI) <https://en.wikipedia.org/wiki/Metric_prefix>`_ prefixes otherwise.
        long: Output the full unit and prefix names if ``True``, use abbreviated versions otherwise.
        decimals: Number of decimals to include. Ignored when `n <= base`.

    Returns:
        Formatted number of bytes.

    Examples:
        **Formatting on prefix bounds**

        The jump is made at `base / 2`, where `base` is one of 1024 and 1000 (when ``binary=False``).

        >>> format_bytes(512 * 1024)
        '512.00 KiB'
        >>> format_bytes(512 * 1024 + 1)
        '0.50 MiB'

        This rule does *not* apply when `n <= base`.

        >>> format_bytes(1024, long=True)
        '1024 bytes'
        >>> format_bytes(1024 + 1)
        '1.00 KiB'

        **Output flags**

        >>> format_bytes(20190511, binary=False, long=False)
        '20.19 MB'
        >>> format_bytes(20190511, binary=False, long=True)
        '20.19 megabytes'
        >>> format_bytes(20190511, binary=True, long=False)
        '19.26 MiB'
        >>> format_bytes(20190511, binary=True, long=True)
        '19.26 mebibytes'

        **Large outputs**

        Metric and binary have different upper limits. Values beyond the largest known prefix are expressed in
        that largest prefix.

        >>> format_bytes(21**21, binary=True)
        '4832.87 YiB'
        >>> format_bytes(21**21, binary=True, long=True)
        '4832.87 yobibytes'
        >>> format_bytes(21**21, binary=False)
        '5.84 RB'
        >>> format_bytes(21**21, binary=False, long=True)
        '5.84 ronnabytes'

        If you ever see output like this, please let me know so that I can brag that someone important is using my
        little library.
    """
    base = 1024 if binary else 1000
    if n <= base:
        return f"{n} {'bytes' if long else 'B'}"

    # Divide the *doubled* value so that the switch to the next prefix happens at `base / 2` instead of `base`.
    x: float = n * 2.0
    n_divisions = -1
    while x > base:
        x /= base
        n_divisions += 1

    if binary:
        # https://en.wikipedia.org/wiki/Binary_prefix
        if long:
            prefixes = ["kibi", "mebi", "gibi", "tebi", "pebi", "exbi", "zebi", "yobi"]
        else:
            prefixes = ["Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"]
    else:
        # https://en.wikipedia.org/wiki/Metric_prefix
        if long:
            prefixes = ["kilo", "mega", "giga", "tera", "peta", "exa", "zetta", "yotta", "ronna", "quetta"]
        else:
            prefixes = ["k", "M", "G", "T", "P", "E", "Z", "Y", "R", "Q"]

    if n_divisions < len(prefixes):
        prefix = prefixes[n_divisions]
        value = x / 2  # Undo the doubling applied before the division loop.
    else:
        # Larger than the largest known prefix: express `n` in that prefix. The value is computed from the
        # undoubled `n`, so it must not be halved again.
        prefix = prefixes[-1]
        value = n / base ** len(prefixes)

    prefix += "bytes" if long else "B"
    return f"{value:.{decimals}f} {prefix}"
[docs]
def format_perf_counter(start: float, *, end: float | None = None, full: bool = False) -> str:
    """Format the time elapsed since a performance counter reading.

    Computes ``end - start`` and hands the result to :func:`~rics.strings.format_seconds`, which does the
    actual formatting.

    Args:
        start: Start time.
        end: End time. Retrieved using :py:func:`time.perf_counter` if ``None``.
        full: If ``True``, show all non-zero components above four hours.

    Returns:
        A formatted performance counter time.

    Examples:
        Basic usage.

        >>> import time
        >>> start = time.perf_counter()
        >>> time.sleep(1219.0)  # doctest: +SKIP
        >>> format_perf_counter(start)  # doctest: +SKIP
        '20m 19s'

        With no `end` argument given, the current time is retrieved using :py:func:`time.perf_counter`.
    """
    from time import perf_counter

    if end is None:
        end = perf_counter()
    elapsed = end - start
    return format_seconds(elapsed, full=full)
[docs]
def format_seconds(t: float, *, allow_negative: bool = False, full: bool = False) -> str:
    """Format a duration given in seconds.

    Args:
        t: Time in seconds.
        allow_negative: If ``True``, format negative `t` with a leading minus sign.
        full: If ``True``, show all non-zero components above four hours.

    Returns:
        The formatted duration.

    Examples:
        Basic usage.

        >>> format_seconds(0.0000154)
        '15 μs'
        >>> format_seconds(0.154)
        '154 ms'
        >>> format_seconds(31.39)
        '31.4 sec'

        Clock units are used for `t > 60` seconds.

        >>> format_seconds(59.99)
        '60.0 sec'
        >>> format_seconds(60.00)
        '60.0 sec'
        >>> format_seconds(60.01)
        '1m'
        >>> format_seconds(309623.49)
        '3d 14h'

        Large intervals are rounded by default. You may set ``full=True`` to show full output.

        >>> format_seconds(309623.49)
        '3d 14h'
        >>> format_seconds(309633.51, full=True)
        '3d 14h 0m 34s'

    Raises:
        ValueError: If ``t < 0`` and ``allow_negative=False`` (the default).
    """
    if t < 0:
        if allow_negative:
            # Format the magnitude, then put the sign back in front.
            return "-" + format_seconds(abs(t), full=full)
        # Flip the flag so that the `allow_negative=` hint in the message shows the value the caller should pass.
        allow_negative = True
        raise ValueError(f"Refuse to format {t=} < 0; to allow, set {allow_negative=}")

    clock_threshold: float = 60.0
    if t <= clock_threshold:
        return _format_seconds(t)
    return _format_minutes(t, full)
def _format_minutes(t: float, full: bool) -> str:
if full or t < 4 * 3600.0:
total_seconds = round(t)
else:
total_seconds = 60 * round(t / 60) # Drop seconds above four hours
days, seconds = divmod(total_seconds, 86400)
hours, seconds = divmod(seconds, 3600)
minutes, seconds = divmod(seconds, 60)
parts = (days, hours, minutes, seconds)
nonzero = tuple(p > 0 for p in parts)
start = nonzero.index(True)
stop = len(nonzero) - nonzero[::-1].index(True)
return " ".join(f"{parts[i]}{'dhms'[i]}" for i in range(start, stop))
def _format_seconds(t: float) -> str:
single_decimal_limit: float = 1.0
if t >= single_decimal_limit:
return f"{t:.1f} sec"
double_decimal_limit: float = 0.5
if t > double_decimal_limit:
return f"{t:.2f} sec"
if t > 10**-3:
return f"{t * 10**3:.0f} ms"
if t > 10**-6: # 1 μs
return f"{t * 10**6:.0f} μs"
if t > 10**-9:
return f"{t * 10**9:.0f} ns"
return f"{t:.3g} sec"
[docs]
def camel_to_snake(s: str) -> str:
    """Naive ``camelCase`` or ``PascalCase`` to ``snake_case`` conversion.

    An underscore is inserted before every uppercase character except the first character, after which the
    whole string is lowercased.

    Args:
        s: A string to convert.

    Returns:
        A ``snake_case`` string.

    Raises:
        IndexError: If `s` is empty.

    Examples:
        Converting camel case strings.

        >>> camel_to_snake("ClassName")
        'class_name'
        >>> camel_to_snake("variableName")
        'variable_name'

        Proper ``snake_case`` strings will not be changed.

        >>> camel_to_snake("already_snake_case")
        'already_snake_case'

    Notes:
        Passing ``SCREAMING_SNAKE_CASE`` strings is **not** supported.
    """
    head, tail = s[0], s[1:]  # Indexing raises IndexError for empty input, as documented.
    separated = "".join("_" + ch if ch.isupper() else ch for ch in tail)
    return (head + separated).lower()
[docs]
def snake_to_camel(s: str, *, lower: bool = True) -> str:
    """Naive ``snake_case`` to ``camelCase`` conversion.

    Args:
        s: A string to convert.
        lower: If ``False``, return ``PascalCase`` instead of ``camelCase``.

    Returns:
        A ``camelCase`` string.

    Raises:
        IndexError: If `s` is empty.

    Examples:
        Converting snake case strings.

        >>> snake_to_camel("snake_case")
        'snakeCase'

        Passing ``SCREAMING_SNAKE_CASE`` strings is supported.

        >>> snake_to_camel("SCREAMING_SNAKE_CASE")
        'screamingSnakeCase'

        Set ``lower=False`` to convert to ``PascalCase`` or ``UpperCamelCase``.

        >>> snake_to_camel("SCREAMING_SNAKE_CASE", lower=False)
        'ScreamingSnakeCase'

    Notes:
        Passing ``camelCase`` strings is **not** supported.
    """
    # Title-case every word, then glue the words together without the underscores.
    pascal = "".join(s.title().split("_"))
    first, rest = pascal[0], pascal[1:]
    return (first.lower() if lower else first) + rest
# Normalized strings that `str_as_bool` accepts as true and false, respectively.
TRUE = ("1", "true", "yes", "on", "enable", "enabled")
FALSE = ("0", "false", "no", "off", "disable", "disabled")


def str_as_bool(s: str) -> bool:
    """Convert a string `s` to a boolean value.

    The output is determined by the content of `s`, as per the mapping shown below.

    Keys:
        * False: ``{false}``
        * True: ``{true}``

    Matching is case-insensitive.

    Args:
        s: A string.

    Returns:
        A ``bool`` value.

    Raises:
        TypeError: If `s` is not a string.
        ValueError: If `s` cannot be converted to ``bool`` using the keys above.

    Examples:
        Basic usage.

        >>> str_as_bool("true"), str_as_bool("false")
        (True, False)

        The input is cleaned and normalized.

        >>> str_as_bool(" TRUE"), str_as_bool("False")
        (True, False)

        Input strings are normalized using :py:meth:`str.strip` and :py:meth:`str.lower`.

    Notes:
        This is different from ``bool(<str>)``, which only checks whether the string is non-empty.
    """
    if not isinstance(s, str):
        raise TypeError(f"Input must be a string; got {type(s).__name__}.")

    normalized = s.strip().lower()
    for keys, result in ((FALSE, False), (TRUE, True)):
        if normalized in keys:
            return result

    error = ValueError(f"Cannot cast {normalized!r} to `bool`.")
    # List the accepted keys as notes on the exception.
    error.add_note(f"{FALSE=}")
    error.add_note(f"{TRUE=}")
    raise error


# Fill in the key placeholders of the docstring. The docstring is None when running with -OO.
if str_as_bool.__doc__:
    str_as_bool.__doc__ = str_as_bool.__doc__.format(false=FALSE, true=TRUE)
[docs]
def format_kwargs(
    kwargs: _t.Mapping[str, _t.Any],
    *,
    max_value_length: int = 120,
    prefix_classname: bool = False,
    include_module: bool = False,
) -> str:
    """Format keyword arguments.

    Args:
        kwargs: Arguments to format.
        max_value_length: Replace value with the class name above this limit. 0=no limit.
        prefix_classname: If ``True``, prepend the class name if a value belongs to a class.
        include_module: If ``True``, prepend the public module (see :func:`.misc.get_public_module`).

    Returns:
        A string on the form `'key0=repr(value0), key1=repr(value1)'`.

    Raises:
        ValueError: For keys in `kwargs` that are not valid Python argument names.

    Examples:
        Basic usage.

        >>> format_kwargs({"an_int": 1, "a_string": "Hello!"})
        "an_int=1, a_string='Hello!'"

    Notes:
        Uses :class:`ReprFormatter` to format values.
    """
    bad_keys = [key for key in kwargs if not key.isidentifier()]
    if bad_keys:
        raise ValueError(f"Got {len(bad_keys)} invalid identifiers: {bad_keys}.")

    # One formatter is shared by all values of this call.
    formatter = ReprFormatter(
        max_value_length=max_value_length,
        prefix_classname=prefix_classname,
        include_module=include_module,
    )
    pairs = [f"{key}={formatter.format_value(value)}" for key, value in kwargs.items()]
    return ", ".join(pairs)
[docs]
class ReprFormatter:
"""Alternative :py:func:`repr` implementation.
Values above `max_value_length` characters are replaced by stylized class names.
Args:
max_value_length: Use class name above this length. 0=no limit, -1=force class name.
prefix_classname: If ``True``, prepend the class name if a value belongs to a class.
include_module: If ``True``, prepend the public module (see :func:`.misc.get_public_module`).
module_aliases: A mapping of module replacements, e.g. ``{"pandas": "pd"}``. Default is
:attr:`DEFAULT_MODULE_ALIASES`. Trailing dots are added automatically. Ignored when `include_module` is
``False``.
See Also:
The :func:`format_kwargs`, :func:`.misc.tname`, and :func:`.misc.get_public_module` functions.
"""
DEFAULT_MODULE_ALIASES: _t.Mapping[str, str] = {
"numpy": "np",
"pandas": "pd",
"polars": "pl",
"tensorflow": "tf",
"matplotlib.pyplot": "plt",
}
def __init__(
self,
*,
max_value_length: int = 120,
prefix_classname: bool = False,
include_module: bool = False,
module_aliases: _t.Mapping[str, str] | None = None,
) -> None:
self._max_value_length = max_value_length
if module_aliases is None:
module_aliases = self.DEFAULT_MODULE_ALIASES
self._module_aliases = {k + ".": v + "." for k, v in module_aliases.items()}
self._prefix_classname = prefix_classname
self._include_module = include_module
self._cache: dict[int, str] = {}
[docs]
def format_value(self, value: _t.Any) -> str:
"""Convert any value to string."""
value_id = id(value)
value_repr = self._cache.get(value_id)
if value_repr is None:
value_repr = self._format_value(value)
self._cache[value_id] = value_repr
return value_repr
def _format_value(self, value: _t.Any) -> str:
"""Convert any value to string."""
if self._max_value_length == 0:
return self._serialize_as_value(value)
if self._max_value_length < 0:
shape = self._get_shape(value)
return self._serialize_as_class(value, shape)
for serializer in [
self._repr_str,
self._repr_builtin_collection,
self._format_ndim_array,
]:
value_repr = serializer(value)
if isinstance(value_repr, str):
return value_repr
elif value_repr is False:
break
value_repr = self._serialize_as_value(value)
if len(value_repr) <= self._max_value_length:
return value_repr
return self._serialize_as_class(value, ())
[docs]
def format_ndim_array(self, value: _t.Any) -> str:
"""Format shaped types, e.g. attr:`pandas.DataFrame.shape`."""
shape = self._get_shape(value)
if shape:
return self._serialize_as_class(value, shape)
msg = f"{type(value).__name__}.shape={shape} not valid"
raise TypeError(msg)
def _format_ndim_array(self, value: _t.Any) -> str | None:
if shape := self._get_shape(value):
return self._serialize_as_class(value, shape)
return None
@classmethod
def _serialize_as_value(cls, value: _t.Any) -> str:
from pprint import PrettyPrinter
pp = PrettyPrinter(
indent=2,
width=120,
depth=4,
compact=True,
sort_dicts=True,
underscore_numbers=True,
)
return pp.pformat(value)
def _serialize_as_class(self, value: _t.Any, shape: tuple[int, ...]) -> str:
from rics.misc import tname
value_cls = tname(value, prefix_classname=self._prefix_classname, include_module=self._include_module)
if self._include_module:
for module, alias in self._module_aliases.items():
if value_cls.startswith(module):
value_cls = value_cls.replace(module, alias)
if not shape:
return value_cls
dims = "x".join(map(str, shape))
return f"{value_cls}[{dims}]"
def _repr_str(self, value: _t.Any) -> str | bool:
if isinstance(value, str):
sz = len(value)
if sz > self._max_value_length:
return f"str[{sz}]"
else:
return repr(value) # Might be longer than max_value_length if there's a lot of escaping.
return True
def _repr_builtin_collection(self, value: _t.Any) -> str | bool:
if isinstance(value, (list, tuple, set)):
if len(value) * 3 > self._max_value_length:
shape = (len(value),)
return self._serialize_as_class(value, shape)
else:
return False
if isinstance(value, dict):
if len(value) * 6 > self._max_value_length:
shape = (len(value),)
return self._serialize_as_class(value, shape)
else:
return False
return True
@classmethod
def _get_shape(cls, value: _t.Any) -> tuple[int, ...]:
if hasattr(value, "shape") and isinstance(value.shape, tuple):
return value.shape
elif hasattr(value, "__len__"):
return (len(value),)
return ()