import logging
import os
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
import pandas as pd
from rics.translation.fetching._fetch_instruction import FetchInstruction
from rics.translation.fetching._fetcher import Fetcher
from rics.translation.offline.types import IdType, NameType, PlaceholderTranslations
from rics.utility.misc import PathLikeType, tname
LOGGER = logging.getLogger(__package__).getChild("PandasFetcher")
PandasReadFunction = Callable[[PathLikeType, Any, Any], pd.DataFrame]
FormatFn = Callable[[PathLikeType], str]
[docs]class PandasFetcher(Fetcher[NameType, IdType, str]):
"""Fetcher using pandas DataFrames as the data format.
Fetch data from serialized DataFrames. How this is done is determined by the `read_function`. This is typically a
Pandas function such as :func:`pandas.read_csv` or :func:`pandas.read_pickle`, but any function that accepts a
string `source` as the first argument and returns a data frame can be used.
Args:
read_function: A Pandas `read`-function.
read_path_format: A formatting string or a callable to apply to a source before passing them to `read_function`.
Must contain a `source` as its only placeholder. Example: ``data/{source}.pkl``. None=leave as-is.
read_function_args: Additional positional arguments for `read_function`.
read_function_kwargs: Additional keyword arguments for `read_function`.
See Also:
The official `Pandas IO documentation <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_
"""
def __init__(
self,
read_function: Union[PandasReadFunction, str] = pd.read_pickle,
read_path_format: Optional[Union[str, FormatFn]] = "data/{}.pkl",
read_function_args: Iterable[Any] = None,
read_function_kwargs: Mapping[str, Any] = None,
**kwargs: Any,
) -> None:
super().__init__(**kwargs)
self._read = getattr(pd, read_function) if isinstance(read_function, str) else read_function
self._format_source: FormatFn = _make_format_fn(read_path_format)
self._args = read_function_args or ()
self._kwargs = read_function_kwargs or {}
self._source_paths: Dict[str, Path] = {}
self._sources: List[str] = []
[docs] def read(self, source_path: PathLikeType) -> pd.DataFrame:
"""Read a DataFrame from a source path.
Args:
source_path: Path to serialized DataFrame.
Returns:
A deserialized DataFrame.
"""
return self._read(source_path, *self._args, **self._kwargs)
[docs] def find_sources(self) -> Dict[str, Path]:
"""Search for source paths to pass to `read_function` using `read_path_format`.
Returns:
A dict {source, path}.
Raises:
IOError: If files cannot be read.
"""
abs_file = Path(self._format_source("")).absolute()
directory = abs_file.parent
file_pattern = abs_file.name
if not directory.is_dir():
problem = "is not a directory" if directory.exists() else "does not exist"
raise IOError(f"Bad path format: {directory} {problem}.")
source_paths = {}
# Path.glob does not work with absolute directories.
for file in map(Path, os.listdir(directory)):
if file.name.endswith(str(file_pattern)):
source_paths[file.name.replace(file_pattern, "")] = directory.joinpath(file)
if not source_paths:
pattern = Path(self._format_source("*")).absolute()
raise IOError(f"Bad path pattern: '{pattern}' did not match any files.")
return source_paths
@property
def sources(self) -> List[str]:
"""Source names known to the fetcher, such as ``cities`` or ``languages``."""
if not self._sources:
if not self._source_paths:
self._source_paths = self.find_sources()
self._sources = list(self._source_paths)
LOGGER.debug("Sources initialized: %s", self._sources)
return self._sources
[docs] def fetch_placeholders(self, instr: FetchInstruction) -> PlaceholderTranslations:
"""Read data from disk."""
source_path = self._source_paths[instr.source]
df = self.read(source_path)
return Fetcher.make_and_verify(instr, tuple(df), list(df.to_records(index=False)))
def __repr__(self) -> str:
return f"{tname(self)}(read_function={tname(self._read)})"
def _make_format_fn(read_path_format: Optional[Union[str, FormatFn]]) -> FormatFn:
if callable(read_path_format):
return read_path_format
# At this point read_path_format is a string or None
return (read_path_format or "{}").format