# Source code for rics.translation.fetching._pandas_fetcher

import logging
import os
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union

import pandas as pd

from rics._internal_support.types import PathLikeType
from rics.translation.fetching import AbstractFetcher
from rics.translation.fetching.types import FetchInstruction
from rics.translation.offline.types import PlaceholderTranslations
from rics.translation.types import IdType
from rics.utility.misc import get_by_full_name, tname

# Module logger: the child named "PandasFetcher" of the package-level logger.
LOGGER = logging.getLogger(__package__).getChild("PandasFetcher")
# Signature of a read function: takes a path plus additional arguments and returns a ``DataFrame``.
PandasReadFunction = Callable[[PathLikeType, Any, Any], pd.DataFrame]
# Signature of a function that converts a source into the path string handed to the read function.
FormatFn = Callable[[PathLikeType], str]


class PandasFetcher(AbstractFetcher[str, IdType]):
    """Fetcher implementation using pandas ``DataFrame`` s as the data format.

    Fetch data from serialized ``DataFrame`` s. How this is done is determined by the `read_function`. This is
    typically a Pandas function such as :func:`pandas.read_csv` or :func:`pandas.read_pickle`, but any function that
    accepts a string `source` as the first argument and returns a data frame can be used.

    Args:
        read_function: A Pandas `read`-function, or a string which is resolved by calling
            ``get_by_full_name(read_function, pd)``.
        read_path_format: A formatting string or a callable to apply to a source before passing them to
            `read_function`. A string must contain a single positional placeholder ``{}`` which receives the source
            name. Example: ``data/{}.pkl``. Leave as-is if ``None``.
        read_function_args: Additional positional arguments for `read_function`.
        read_function_kwargs: Additional keyword arguments for `read_function`.
        **kwargs: Forwarded to the parent class constructor.

    See Also:
        The official `Pandas IO documentation <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_
    """

    def __init__(
        self,
        read_function: Union[PandasReadFunction, str] = pd.read_pickle,
        read_path_format: Optional[Union[str, FormatFn]] = "data/{}.pkl",
        read_function_args: Optional[Iterable[Any]] = None,
        read_function_kwargs: Optional[Mapping[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self._read = get_by_full_name(read_function, pd) if isinstance(read_function, str) else read_function
        self._format_source: FormatFn = _make_format_fn(read_path_format)
        # Materialize the arguments: they are unpacked on every call to read(), so a one-shot iterable (such as a
        # generator) would otherwise be empty from the second call onwards.
        self._args = tuple(read_function_args) if read_function_args is not None else ()
        # Copy so that later changes to the caller's mapping do not leak into this fetcher.
        self._kwargs = dict(read_function_kwargs) if read_function_kwargs is not None else {}

        self._source_paths: Dict[str, Path] = {}
        self._sources: List[str] = []
        self._placeholders: Dict[str, List[str]] = {}

    def read(self, source_path: PathLikeType) -> pd.DataFrame:
        """Read a ``DataFrame`` from a source path.

        Args:
            source_path: Path to serialized ``DataFrame``.

        Returns:
            A deserialized ``DataFrame``.
        """
        return self._read(source_path, *self._args, **self._kwargs)

    def find_sources(self) -> Dict[str, Path]:
        """Search for source paths to pass to `read_function` using `read_path_format`.

        The path format is applied to an empty source name. The directory part of the result is the directory that
        is searched, and the file name part is the suffix that an entry must end with to be counted as a source. The
        source name is the entry name with that suffix removed.

        Returns:
            A dict ``{source, path}``.

        Raises:
            IOError: If files cannot be read.
        """
        # Split the string form rather than using Path.parent/Path.name: Path drops trailing separators and turns an
        # empty string into ".", both of which would make the directory itself look like the file name suffix.
        directory_name, suffix = os.path.split(self._format_source(""))
        directory = Path(directory_name).absolute()

        if not directory.is_dir():  # pragma: no cover
            problem = "is not a directory" if directory.exists() else "does not exist"
            raise IOError(f"Bad path format: {directory} {problem}.")

        source_paths = {}
        # Path.glob does not work with absolute directories.
        for file_name in sorted(os.listdir(directory)):  # Sorted to get a deterministic source order.
            if file_name.endswith(suffix):  # pragma: no cover
                # Strip the trailing suffix only; the same text may also occur earlier in the file name.
                source = file_name[: len(file_name) - len(suffix)]
                source_paths[source] = directory.joinpath(file_name)

        if not source_paths:  # pragma: no cover
            pattern = Path(self._format_source("*")).absolute()
            raise IOError(f"Bad path pattern: '{pattern}' did not match any files.")

        return source_paths

    def _source_path(self, source: str) -> Path:
        """Return the path of `source`, searching for source paths first if that has not been done yet."""
        if not self._source_paths:
            self._source_paths = self.find_sources()
        return self._source_paths[source]

    @property
    def sources(self) -> List[str]:
        """Names of the sources that were found. The search is performed on first access and then cached."""
        if not self._sources:  # pragma: no cover
            if not self._source_paths:
                self._source_paths = self.find_sources()
            self._sources = list(self._source_paths)
            LOGGER.debug("Sources initialized: %s", self._sources)

        return self._sources

    @property
    def placeholders(self) -> Dict[str, List[str]]:
        """Column names per source. Every source is read once on first access, after which the result is cached."""
        if not self._placeholders:
            self._placeholders = {source: list(self.read(self._source_path(source)).columns) for source in self.sources}
        return self._placeholders

    def fetch_translations(self, instr: FetchInstruction[str, IdType]) -> PlaceholderTranslations[str]:
        """Read the data frame of ``instr.source`` and pass it to ``PlaceholderTranslations.make``.

        Args:
            instr: A fetch instruction. Only its ``source`` attribute is used here.

        Returns:
            The result of ``PlaceholderTranslations.make`` for the source and its data frame.
        """
        return PlaceholderTranslations.make(
            instr.source,
            self.read(self._source_path(instr.source)),
        )

    def __repr__(self) -> str:
        read_path_format = self._format_source("{source}")
        return f"{tname(self)}(read_function={tname(self._read)}, {read_path_format=})"
def _format_source_name(template: str, source: PathLikeType) -> str:
    """Insert `source` into `template`, filling either a positional ``{}`` or a named ``{source}`` placeholder."""
    return template.format(source, source=source)


def _make_format_fn(read_path_format: Optional[Union[str, FormatFn]]) -> FormatFn:
    """Create the function used to convert a source into a path.

    Args:
        read_path_format: A callable, a format string, or ``None``. A format string may refer to the source either
            positionally (``data/{}.pkl``) or by name (``data/{source}.pkl``).

    Returns:
        `read_path_format` itself if it is callable. Otherwise a function which inserts the source into the format
        string. If the format is ``None`` or empty, the source is returned as a string without further changes.
    """
    from functools import partial

    if callable(read_path_format):  # pragma: no cover
        return read_path_format

    # At this point read_path_format is a string or None
    # A partial of a module-level function is used instead of a lambda or closure so that the result can be pickled.
    return partial(_format_source_name, read_path_format or "{}")