Source code for rics.mapping.filter_functions

"""Functions that remove candidates."""
import logging
import re
from typing import Collection, Iterable, List, Literal, Optional, Set, Tuple, Union

LOGGER = logging.getLogger(__name__)

WhereOptions = Literal["name", "context", "candidate"]
WHERE_OPTIONS = ("name", "candidate", "context")
WhereArg = Union[WhereOptions, Iterable[WhereOptions]]
"""Determines how where matches must be found during filtering operations."""

VERBOSE: bool = False
"""If ``True`` enable optional DEBUG-level log messages on each heuristic function invocation.

Notes:
    Not all functions have verbose messages.
"""


[docs]def require_regex_match( name: str, candidates: Iterable[str], context: Optional[str], regex: Union[str, re.Pattern], where: WhereArg, keep_if_match: bool = True, purpose: str = "matching", ) -> Set[str]: """Require a regex match in `name`, `context`, and/or `candidates`. Args: name: A name. candidates: Potential matches for `name`. context: Context in which the function is being called. regex: A regex pattern to pass to :py:func:`re.compile`. where: Which of ('name', 'candidate', 'context') to match in. keep_if_match: If ``False``, require that `regex` does _not_ match to keep candidates. purpose: A purpose-string used for logging. Returns: Approved candidates. Raises: ValueError: If `where` contains `'context'` when `context` is not given. See Also: The :meth:`banned_substring` method. """ where = _parse_where_args(where) pattern = re.compile(regex, flags=re.IGNORECASE) if isinstance(regex, str) else regex logger = LOGGER.getChild("require_regex_match") candidates = set(candidates) # Short-circuit full refusal if "name" in where: match = pattern.match(name) if keep_if_match and not match: if VERBOSE: logger.debug(f"Refuse {purpose} for {name=}: Does not match {pattern=}.") return set() if match and not keep_if_match: if VERBOSE: logger.debug(f"Refuse {purpose} for {name=}: Matches {pattern=}.") return set() if "context" in where: if context is None: # pragma: no cover raise ValueError(f"No context given but 'context' was found in {where=}.") match = pattern.match(context) if keep_if_match and not match: if VERBOSE: logger.debug(f"Refuse {purpose} for {context=}: Does not match {pattern=}.") return set() if match and not keep_if_match: if VERBOSE: logger.debug(f"Refuse {purpose} for {context=}: Matches {pattern=}.") return set() if "candidate" not in where: return candidates kept: List[str] = [] rejected: List[str] = [] for cand in candidates: lst = kept if (bool(pattern.match(cand)) is keep_if_match) else rejected lst.append(cand) if VERBOSE and logger.isEnabledFor(logging.DEBUG): logger.debug(f"Filtering with {keep_if_match=} and {pattern=}; kept {sorted(kept)}, rejected {rejected}.") return set(kept)
[docs]def banned_substring( name: str, candidates: Iterable[str], context: Optional[str], substrings: Collection[str], where: WhereArg, ) -> Set[str]: """Prevent mapping if banned substrings are found. Matching on `name` or `context` halts all mapping. Matching candidates excludes only those candidates. Args: name: An element to find matches for. candidates: Potential matches for `name` (not used). context: Context in which the function is being called. substrings: Substrings which may not be present in `name`. where: Which of ('name', 'candidate', 'context') to match in. Empty=all. Returns: Approved candidates. See Also: The :meth:`require_regex_match` method, which performs the actual work. """ where = _parse_where_args(where) remaining = set(candidates) for subs in substrings: if not remaining: break matches = require_regex_match( name, remaining, context, regex=re.compile(f".*{subs}.*", flags=re.IGNORECASE), where=where, keep_if_match=False, ) remaining = remaining.intersection(matches) return remaining
def _parse_where_args(args: WhereArg) -> Tuple[WhereOptions, ...]: if not args: raise ValueError(f"At least one of {WHERE_OPTIONS} must be given.") args_tuple = tuple((args,) if isinstance(args, str) else tuple(args)) for where in args_tuple: if where not in WHERE_OPTIONS: raise ValueError(f"Bad where-argument {repr(args)}; {where=} not in {WHERE_OPTIONS}.") return args_tuple