[1]:
import sys

import rics

# Print relevant versions
print(f"{rics.__version__=}")
print(f"{sys.version=}")
!git log --pretty=oneline --abbrev-commit -1
rics.__version__='0.14.0'
sys.version='3.9.12 (main, Jun  1 2022, 11:38:51) \n[GCC 7.5.0]'
120c623 (HEAD -> master) fixup! Add perftest tests and enable coverage
[2]:
from rics.utility.logs import basic_config, logging

basic_config(level=logging.DEBUG, matplotlib_level=logging.INFO)

In vs between#

What’s faster at various counts of requests IDs?

[3]:
import sqlalchemy

engine = sqlalchemy.create_engine(
    "postgresql+pg8000://postgres:your_password@localhost:5432/imdb"
)

Define candidates#

Define the functions or classes we’re testing. See candidates.py for the actual file, or the end of this notebook for a printout.

[4]:
# candidates.py is located in the same folder as this notebook
from candidates import Candidates

c = Candidates(engine)
candidates = [c.as_is, c.between, c.is_in, c.heuristic]

Verification#

Make sure candidates are equivalent.

[5]:
import numpy as np

all_ids = [int(row[0]) for row in c.as_is([])]
int_list = lambda size: list(
    map(int, np.random.choice(all_ids, size=size, replace=False))
)
test_data = {
    f"{size} ids": int_list(size) for size in [50, 250, 500, 1000, 2500, 5000, 10_000]
}
[6]:
first_case = next(iter(test_data.keys()))
must_include = set(test_data[first_case])
for cand in candidates:
    cand_output = set(row[0] for row in cand(test_data[first_case]))

    assert not must_include.difference(
        cand_output
    ), f"Bad candidate: {cand}. {cand_output=} != {reference_output=}"

Run perfomance comparison#

Run comparison, show results per candidate and test data set.

[7]:
from rics.performance import run_multivariate_test, format_perf_counter
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme("notebook")
data = run_multivariate_test(candidates, test_data, time_per_candidate=3)
2022-07-21T17:41:30.412 [rics.performance:INFO] Expected total runtime: 60.00 sec.
2022-07-21T17:41:30.413 [rics.performance:DEBUG] Run candidate 'as_is' 5x2 times...
2022-07-21T17:41:45.203 [rics.performance:DEBUG] Run candidate 'between' 5x2 times...
2022-07-21T17:41:59.897 [rics.performance:DEBUG] Run candidate 'is_in' 5x8 times...
2022-07-21T17:42:14.991 [rics.performance:DEBUG] Run candidate 'heuristic' 5x5 times...
../../../../_images/documentation_examples_notebooks_in-vs-between_In-vs-Between_10_1.png

Performance summary#

Summarized results per candidate.

[8]:
data.groupby("Candidate")["Time [ms]"].agg(["std", "mean", "min", "max"]).sort_values(
    "mean"
).round(1)
[8]:
std mean min max
Candidate
is_in 43.9 53.9 7.5 154.8
heuristic 86.3 83.0 7.5 241.1
between 32.6 209.9 181.9 368.5
as_is 22.0 211.1 183.7 273.6

Conclusion#

  • Probaby unfair in favour of fetching “too much” since the DB is running locally. We’re also fetching the same things over and over again which may lead to caching.

  • The heuristic method seems good and will only get better with tuning.

  • IDs in this data set (title_basics) are very sparse which causes poor performance for the heuristic on the “5000 ids” set; performance drops earlier than for pure as_in.

  • Heuristic method is suscptible to sparse ID collections

  • Results for is_in are not cached, it just performs horrendously bad for large amounts of “in”-values.

Use heuristic method.

Show run_multivariate_test signature#

Click here for docs.

[9]:
?run_multivariate_test
Signature:
run_multivariate_test(
    candidate_method: Union[Callable[[Any], NoneType], Collection[Callable[[Any], NoneType]], Dict[str, Callable[[Any], NoneType]]],
    test_data: Union[Any, Dict[str, Any]],
    time_per_candidate: float = 6.0,
    plot: bool = True,
    **figure_kwargs: Any,
) -> pandas.core.frame.DataFrame
Docstring:
Run performance tests for multiple candidate methods on collections of test data.

This is a convenience method which combines :meth:`MultiCaseTimer.run() <rics.performance.MultiCaseTimer.run>`,
:meth:`~rics.performance.to_dataframe` and -- if plotting is enabled -- :meth:`~rics.performance.plot_run`. For full
functionally these methods should be use directly.

Args:
    candidate_method: Candidate methods to evaluate.
    test_data: Test data to evaluate.
    time_per_candidate: Desired runtime for each repetition per candidate label.
    plot: If ``True``, plot a figure using :meth:`~rics.performance.plot_run`.
    **figure_kwargs: Keyword arguments for the barplot. Ignored if ``plot=False``.

Returns:
    A long-format DataFrame of results.

Raises:
    ModuleNotFoundError: If Seaborn isn't installed and ``plot=True``.
File:      ~/git/rics/src/rics/performance/_wrapper.py
Type:      function

Show candidates.py#

[10]:
!pygmentize candidates.py
import sqlalchemy

# This value is probably a bit off compared to other real-world scenarios since IDs are very sparse in the IMDb data
MAX_OVERFETCH_FACTOR = 50


class Candidates:
    def __init__(self, engine):

        self.engine = engine
        metadata = sqlalchemy.MetaData()
        metadata.reflect(engine)

        self.id_column = metadata.tables["title_basics"].columns["int_id_tconst"]
        extra_cols = (
            metadata.tables["title_basics"].columns["primaryTitle"],
            metadata.tables["title_basics"].columns["originalTitle"],
            metadata.tables["title_basics"].columns["runtimeMinutes"],
        )
        self.bare_select = sqlalchemy.select(self.id_column, *extra_cols)

    def _exec(self, stmt):
        return list(self.engine.execute(stmt))

    def as_is(self, ids):
        return self._exec(self.bare_select)

    def between(self, ids):
        min_id = min(ids)
        max_id = max(ids)
        select = self.bare_select.where(self.id_column.between(min_id, max_id))
        return self._exec(select)

    def is_in(self, ids):
        return self._exec(self.bare_select.where(self.id_column.in_(ids)))

    def heuristic(self, ids):
        stmt = self.bare_select
        num_ids = len(ids)

        in_clause = stmt.where(self.id_column.in_(ids))
        if num_ids < 1200:
            return self._exec(in_clause)

        min_id = min(ids)
        max_id = max(ids)
        between_clause = stmt.where(self.id_column.between(min_id, max_id))

        if num_ids > 4500:
            return self._exec(between_clause)

        overfetch_factor = (max_id - min_id) / num_ids
        return self._exec(in_clause if overfetch_factor > MAX_OVERFETCH_FACTOR else between_clause)
[ ]: