Source code for rics.translation.fetching._pandas_fetcher

import logging
import os
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union

import pandas as pd

from rics.translation.fetching import AbstractFetcher
from rics.translation.fetching.types import FetchInstruction
from rics.translation.offline.types import PlaceholderTranslations
from rics.translation.types import IdType
from rics.utility.misc import PathLikeType, get_by_full_name, tname

LOGGER = logging.getLogger(__package__).getChild("PandasFetcher")
PandasReadFunction = Callable[[PathLikeType, Any, Any], pd.DataFrame]
FormatFn = Callable[[PathLikeType], str]


[docs]class PandasFetcher(AbstractFetcher[str, IdType]): """Fetcher implementation using pandas ``DataFrame`` s as the data format. Fetch data from serialized ``DataFrame`` s. How this is done is determined by the `read_function`. This is typically a Pandas function such as :func:`pandas.read_csv` or :func:`pandas.read_pickle`, but any function that accepts a string `source` as the first argument and returns a data frame can be used. Args: read_function: A Pandas `read`-function. read_path_format: A formatting string or a callable to apply to a source before passing them to `read_function`. Must contain a `source` as its only placeholder. Example: ``data/{source}.pkl``. Leave as-is if ``None``. read_function_args: Additional positional arguments for `read_function`. read_function_kwargs: Additional keyword arguments for `read_function`. See Also: The official `Pandas IO documentation <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_ """ def __init__( self, read_function: Union[PandasReadFunction, str] = pd.read_pickle, read_path_format: Optional[Union[str, FormatFn]] = "data/{}.pkl", read_function_args: Iterable[Any] = None, read_function_kwargs: Mapping[str, Any] = None, **kwargs: Any, ) -> None: super().__init__(**kwargs) self._read = get_by_full_name(read_function, pd) if isinstance(read_function, str) else read_function self._format_source: FormatFn = _make_format_fn(read_path_format) self._args = read_function_args or () self._kwargs = read_function_kwargs or {} self._source_paths: Dict[str, Path] = {} self._sources: List[str] = [] self._placeholders: Dict[str, List[str]] = {}
[docs] def read(self, source_path: PathLikeType) -> pd.DataFrame: """Read a ``DataFrame`` from a source path. Args: source_path: Path to serialized ``DataFrame``. Returns: A deserialized `DataFrame``. """ return self._read(source_path, *self._args, **self._kwargs)
[docs] def find_sources(self) -> Dict[str, Path]: """Search for source paths to pass to `read_function` using `read_path_format`. Returns: A dict ``{source, path}``. Raises: IOError: If files cannot be read. """ abs_file = Path(self._format_source("")).absolute() directory = abs_file.parent file_pattern = abs_file.name if not directory.is_dir(): # pragma: no cover problem = "is not a directory" if directory.exists() else "does not exist" raise IOError(f"Bad path format: {directory} {problem}.") source_paths = {} # Path.glob does not work with absolute directories. for file in map(Path, os.listdir(directory)): if file.name.endswith(str(file_pattern)): # pragma: no cover source_paths[file.name.replace(file_pattern, "")] = directory.joinpath(file) if not source_paths: # pragma: no cover pattern = Path(self._format_source("*")).absolute() raise IOError(f"Bad path pattern: '{pattern}' did not match any files.") return source_paths
@property def sources(self) -> List[str]: if not self._sources: # pragma: no cover if not self._source_paths: self._source_paths = self.find_sources() self._sources = list(self._source_paths) LOGGER.debug("Sources initialized: %s", self._sources) return self._sources @property def placeholders(self) -> Dict[str, List[str]]: if not self._placeholders: self._placeholders = { source: list(self.read(self._source_paths[source]).columns) for source in self.sources } return self._placeholders
[docs] def fetch_translations(self, instr: FetchInstruction) -> PlaceholderTranslations: source_path = self._source_paths[instr.source] df = self.read(source_path) return PlaceholderTranslations(instr, tuple(df), list(df.to_records(index=False)))
def __repr__(self) -> str: read_path_format = self._format_source("{source}") return f"{tname(self)}(read_function={tname(self._read)}, {read_path_format=})"
def _make_format_fn(read_path_format: Optional[Union[str, FormatFn]]) -> FormatFn: if callable(read_path_format): # pragma: no cover return read_path_format # At this point read_path_format is a string or None return (read_path_format or "{}").format